How to apply Scikit-learn transformers for feature engineering?

Example code for transforming a selected group of variables with feature_engine's SklearnTransformerWrapper. It also answers the following question:

  • How to read weather data directly from the Canadian government website?

Prepare data sample


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from feature_engine.encoding import RareLabelEncoder
from feature_engine.imputation import CategoricalImputer, MeanMedianImputer
from feature_engine.encoding import OrdinalEncoder
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.feature_selection import (
    f_regression,
    SelectKBest,
    SelectFromModel,
)
from sklearn.linear_model import Lasso
from feature_engine.wrappers import SklearnTransformerWrapper

#  create range of monthly dates
download_dates = pd.date_range(start='2019-01-01', end='2020-01-01', freq='MS')

#  URL from Chrome DevTools Console
base_url = ("https://climate.weather.gc.ca/climate_data/bulk_data_e.html?format=csv&"
              "stationID=51442&Year={}&Month={}&Day=7&timeframe=1&submit=Download+Data") #  add format option to year and month

#  create list of remote URL from base URL
list_of_url = [base_url.format(date.year, date.month) for date in download_dates]

#  download and combine multiple files into one DataFrame
df = pd.concat((pd.read_csv(url) for url in list_of_url))
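#  optionally cache the combined data locally so re-runs don't hit the website again
#  (the file name below is just an example)
# df.to_csv('station_51442_hourly_2019.csv', index=False)
# df = pd.read_csv('station_51442_hourly_2019.csv')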
keepcolumns = ['Date/Time (LST)', 'Longitude (x)', 'Year', 'Month', 'Day',
               'Temp (°C)', 'Dew Point Temp (°C)', 'Hmdx', 'Wind Spd (km/h)']
data = df[keepcolumns]
# note: the longitude column is kept under the short label 'Lat' throughout the examples
data = data.rename(columns={'Date/Time (LST)': 'dt_var', 'Temp (°C)': 'T',
                            'Dew Point Temp (°C)': 'Td', 'Longitude (x)': 'Lat',
                            'Wind Spd (km/h)': 'wind'})

datetime_series = pd.to_datetime(data['dt_var'])
datetime_index = pd.DatetimeIndex(datetime_series.values)
data1=data.set_index(datetime_index)
data1.drop('dt_var',axis=1,inplace=True)

OneHotEncoder


data1 = data.copy()
# bin wind speed on the 'wind' column only: <10 km/h -> 0, >30 km/h -> 2,
# and the remaining raw values (all > 2) -> 1
data1.loc[data1['wind'] < 10, 'wind'] = 0
data1.loc[data1['wind'] > 30, 'wind'] = 2
data1.loc[data1['wind'] > 2, 'wind'] = 1

ohe = SklearnTransformerWrapper(
        OneHotEncoder(sparse=False, drop='first'),  # use sparse_output=False with scikit-learn >= 1.2
        variables = ['wind'])
ohe.fit(data1)
data1_t = ohe.transform(data1)
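
A quick check of the result: with drop='first', the wrapper should replace 'wind' with one dummy column per remaining category (the exact dummy names depend on the wind dtype and the feature_engine version).

# columns added by the wrapped OneHotEncoder (the original 'wind' column is dropped)
print([col for col in data1_t.columns if col not in data1.columns])
print(data1.shape, data1_t.shape)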

SimpleImputer


data=data1.copy()
data = data.drop(columns=['Hmdx'])  # drop the Humidex column, which is mostly NaN
# add a few synthetic features derived from the existing columns (with random noise)
data['mm'] = data['Month']
data['mm1'] = data['mm'] + data['Day']*0.01 + np.random.normal(0, 1, data['T'].shape)
data['mm2'] = data['T'] + 0.5 + data['Td']*0.1 + np.random.normal(0, 1, data['T'].shape)

X=pd.DataFrame(data[['Lat', 'Year', 'Month', 'Day', 'T', 'Td', 'mm', 'mm1','mm2']])
Y=pd.DataFrame(data['wind'])
Y.loc[Y['wind']<10,'wind']=0
Y.loc[Y['wind']>0,'wind']=1
# Separate into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3, random_state=0)

# set up the wrapper with the SimpleImputer
imputer = SklearnTransformerWrapper(transformer = SimpleImputer(strategy='mean'),
                                    variables = ['T', 'Td'])

# fit the wrapper + SimpleImputer
imputer.fit(X_train)

# transform the data
X_train_t = imputer.transform(X_train)
X_test_t = imputer.transform(X_test)
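
As a sanity check (same idea as the null check near the end of the categorical encoder section), the imputed columns should contain no missing values after the transform:

# fraction of missing values left in the imputed columns -- should be 0.0
X_train_t[['T', 'Td']].isnull().mean()
X_test_t[['T', 'Td']].isnull().mean()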

StandardScaler


X=pd.DataFrame(data[['Lat', 'Year', 'Month', 'Day', 'T', 'Td', 'mm', 'mm1','mm2']])
Y=pd.DataFrame(data['wind'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3, random_state=0)
# set up the wrapper with the StandardScaler
scaler = SklearnTransformerWrapper(transformer = StandardScaler(),
                                    variables = ['T', 'Td'])

# fit the wrapper + StandardScaler
scaler.fit(X_train)

# transform the data
X_train_t = scaler.transform(X_train)
X_test_t = scaler.transform(X_test)
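
KBinsDiscretizer


KBinsDiscretizer is imported at the top but not demonstrated in the original code, so this is only a minimal sketch: it wraps KBinsDiscretizer the same way as the transformers above and bins the temperature variables into three ordinal-coded, equal-width bins (the bin settings are illustrative, not from the source).

# bin 'T' and 'Td' into 3 equal-width, ordinal-coded bins
discretiser = SklearnTransformerWrapper(
    transformer = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform'),
    variables = ['T', 'Td'])

discretiser.fit(X_train.fillna(0))

X_train_b = discretiser.transform(X_train.fillna(0))
X_test_b = discretiser.transform(X_test.fillna(0))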

SelectKBest


X=pd.DataFrame(data[['Lat', 'Year', 'Month', 'Day', 'T', 'Td', 'mm', 'mm1','mm2']])
Y=pd.DataFrame(data['wind'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3, random_state=0)
cols = [var for var in X_train.columns if X_train[var].dtypes !='O']

# let's select the 5 best features with SelectKBest and f_regression

selector = SklearnTransformerWrapper(
    transformer = SelectKBest(f_regression, k=5),
    variables = cols)

selector.fit(X_train.fillna(0), y_train)

# transform the data
X_train_t = selector.transform(X_train.fillna(0))
X_test_t = selector.transform(X_test.fillna(0))
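
Because a selector is wrapped here, the variables that SelectKBest does not keep should be dropped from the transformed frames, so comparing the columns (or the shapes) shows which 5 features survived:

# only the 5 selected features remain among the wrapped variables
print(X_train_t.columns)
print(X_train.shape, X_train_t.shape)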

Feature selectors


data=data1.copy()
data = data.drop(columns=['Hmdx'])  # drop the Humidex column, which is mostly NaN
data['mm']=data['Month']
data['mm1']=data['mm']+data['Day']*0.01+np.random.normal(0, 1, data['T'].shape)
data['mm2']=data['T']+0.5+data['Td']*0.1+np.random.normal(0, 1, data['T'].shape)
X=pd.DataFrame(data[['Lat', 'Year', 'Month', 'Day', 'T', 'Td', 'mm', 'mm1','mm2']])
Y=pd.DataFrame(data['wind'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3, random_state=0)
cols = [var for var in X_train.columns if X_train[var].dtypes !='O']


# let's select the best variables according to Lasso

lasso = Lasso(alpha=10000, random_state=0)

sfm = SelectFromModel(lasso, prefit=False)

selector = SklearnTransformerWrapper(
    transformer = sfm,
    variables = cols)

selector.fit(X_train.fillna(0), y_train)
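
The snippet above only fits the selector. To see what it chose, the fitted estimator is exposed by the wrapper (as the transformer_ attribute in recent feature_engine versions; check this against your version). Note that alpha=10000 is a very strong penalty, so the Lasso may keep very few, or even none, of the variables; transforming would then follow the same pattern as in the SelectKBest section.

# boolean mask of the variables kept by the Lasso-based selector
print(selector.transformer_.get_support())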

Categorical encoder


keepcolumns=['Date/Time (LST)','Longitude (x)','Year', 'Month', 'Day','Temp (°C)',
             'Dew Point Temp (°C)','Hmdx','Stn Press (kPa)','Wind Spd (km/h)','Weather']
data=df[keepcolumns]
data=data.rename(columns={'Longitude (x)':'Lat','Date/Time (LST)':'dt_var','Temp (°C)':'T','Dew Point Temp (°C)':'Td',
                          'Stn Press (kPa)':'ps','Wind Spd (km/h)':'wind'
})
data['mm']=data['Month']
data['mm1']=data['mm']+data['Day']*0.01+np.random.normal(0, 1, data['T'].shape)
data['mm2']=data['T']+0.5+data['Td']*0.1+np.random.normal(0, 1, data['T'].shape)

X=pd.DataFrame(data[['Lat', 'Year', 'Month', 'Day', 'T', 'Td', 'mm', 'mm1','mm2','Weather']])
Y=pd.DataFrame(data['wind'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3, random_state=0)

# let's remove rare labels to avoid errors when encoding
cols = ['Weather']
rare_label_enc = RareLabelEncoder(n_categories=2, variables=cols)
# fill missing categories in 'Weather' only, so the numeric columns keep their dtype
X_train['Weather'] = X_train['Weather'].fillna('Missing')
X_test['Weather'] = X_test['Weather'].fillna('Missing')
X_train = rare_label_enc.fit_transform(X_train)
X_test = rare_label_enc.transform(X_test)

# now let's replace categories by integers
encoder = SklearnTransformerWrapper(
    transformer = OrdinalEncoder(),
    variables = cols,
)
encoder.fit(X_train,y_train)
X_train = encoder.transform(X_train)
X_test = encoder.transform(X_test)

X_train[cols].isnull().mean()
X_train['Weather'].unique()
#array([4, 1, 3, 0, 5, 2])

Pipeline


keepcolumns=['Date/Time (LST)','Longitude (x)','Year', 'Month', 'Day','Temp (°C)',
             'Dew Point Temp (°C)','Hmdx','Stn Press (kPa)','Wind Spd (km/h)','Weather']
data=df[keepcolumns]
data=data.rename(columns={'Longitude (x)':'Lat','Date/Time (LST)':'dt_var','Temp (°C)':'T','Dew Point Temp (°C)':'Td',
                          'Stn Press (kPa)':'ps','Wind Spd (km/h)':'wind'
})

datetime_series = pd.to_datetime(data['dt_var'])
datetime_index = pd.DatetimeIndex(datetime_series.values)
data1=data.set_index(datetime_index)
data1.drop('dt_var',axis=1,inplace=True)
pipeline = Pipeline(steps = [
    # impute missing categories (e.g. 'Weather') with the most frequent label
    ('imputer_cat', CategoricalImputer(imputation_method='frequent')),
    # impute missing numerical variables (e.g. 'T', 'Td', 'Hmdx') with the mean
    ('imputer_num', MeanMedianImputer(imputation_method='mean')),
    # replace categories with arbitrary integers
    ('encoder_cat', OrdinalEncoder(encoding_method='arbitrary')),
    # add interaction terms between the listed variables
    ('interactions', SklearnTransformerWrapper(
        PolynomialFeatures(interaction_only=True, include_bias=False),
        variables=['T', 'Td', 'wind']))
])
pipeline.fit(data1)
data1_t = pipeline.transform(data1)
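
A quick way to confirm the pipeline ran end to end is to compare the input and output frames: the wrapped PolynomialFeatures step should append the interaction terms between T, Td and wind as extra columns (the exact new column names depend on the scikit-learn and feature_engine versions).

# the transformed frame should have extra interaction columns
print(data1.shape, data1_t.shape)
print([col for col in data1_t.columns if col not in data1.columns])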