Example code for handling outliers with three feature-engine transformers: Winsorizer, ArbitraryOutlierCapper and OutlierTrimmer.
Table of Contents
Winsorizer
Caps maximum and/or minimum values of a variable at automatically determined values.[ref:https://feature-engine.readthedocs.io/en/latest/user_guide/outliers/Winsorizer.html]
Code
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from feature_engine.outliers import Winsorizer
# Load dataset
def load_titanic(source='https://www.openml.org/data/get_csv/16826755/phpMYEkMl'):
    """Load the Titanic dataset and apply the basic cleaning used in the examples.

    Args:
        source: Anything ``pandas.read_csv`` accepts (URL, path or file-like
            object). Defaults to the OpenML CSV export of the Titanic data.

    Returns:
        A DataFrame in which '?' placeholders have been turned into NaN,
        'cabin' is reduced to the first character of its string form,
        'pclass' is stored as object dtype, missing 'embarked' values are
        set to 'C', and 'fare' / 'age' are floats with missing values
        replaced by the column median.
    """
    data = pd.read_csv(source)
    # Treat '?' as the missing-value marker.
    data = data.replace('?', np.nan)
    # Keep only the first character of the cabin; the column is passed
    # through astype(str) first, so every entry is handled as text.
    data['cabin'] = data['cabin'].astype(str).str[0]
    data['pclass'] = data['pclass'].astype('O')
    # Assign the filled column back instead of calling
    # data[col].fillna(..., inplace=True): that chained in-place form is
    # deprecated in pandas and does not update `data` under copy-on-write.
    data['embarked'] = data['embarked'].fillna('C')
    for column in ('fare', 'age'):
        data[column] = data[column].astype('float')
        data[column] = data[column].fillna(data[column].median())
    return data
# Build the cleaned Titanic frame.
data = load_titanic()

# Hold out 30% of the rows as a test set. 'survived' is the target; 'name'
# and 'ticket' are left out of the predictors together with the target.
predictors = data.drop(columns=['survived', 'name', 'ticket'])
target = data['survived']
X_train, X_test, y_train, y_test = train_test_split(
    predictors, target, test_size=0.3, random_state=0)

# Configure the capper.
# Active setup -- right tail only: mean + 3 * std
capper = Winsorizer(
    capping_method='gaussian',
    tail='right',
    fold=3,
    variables=['age', 'fare'],
)

# Alternative setups (uncomment one to replace the capper above).
#
# Both tails: mean - 2 * std and mean + 2 * std
# capper = Winsorizer(capping_method='gaussian', tail='both', fold=2, variables='fare')
#
# Right tail: 75th quantile + 3 * IQR
# Left tail: 25th quantile - 3 * IQR
# (IQR is the inter-quartile range: 75th quantile - 25th quantile.)
# NOTE(review): no fold is passed here, so the 3 * IQR figure assumes the
# installed feature-engine default -- confirm, or pass fold=3 explicitly.
# capper = Winsorizer(capping_method='iqr', tail='both', variables=['age', 'fare'])
#
# Right tail: 98th percentile
# Left tail: 2nd percentile
# capper = Winsorizer(capping_method='quantiles', tail='both',fold=0.02, variables=['age', 'fare'])

# Learn the capping values from the training set only ...
capper.fit(X_train)

# ... then apply the same caps to both splits.
train_t = capper.transform(X_train)
test_t = capper.transform(X_test)
Results
right tail: mean + 3* std
both tails: 3 * IQR
ArbitraryOutlierCapper
Code
# ArbitraryOutlierCapper must be imported explicitly; only Winsorizer is
# imported at the top of the file.
from feature_engine.outliers import ArbitraryOutlierCapper

# This example reuses X_train / X_test from the train/test split above.

# Alternative setups (uncomment one to replace the capper below).
#
# Capping of age and fare features at the right tail only:
# capper = ArbitraryOutlierCapper(max_capping_dict={'age': 50, 'fare': 200}, min_capping_dict=None)
#
# Capping outliers at the left tail only:
# capper = ArbitraryOutlierCapper(
#     max_capping_dict=None, min_capping_dict={'age': 10, 'fare': 100})

# Active setup -- capping outliers at both tails with user-supplied limits.
capper = ArbitraryOutlierCapper(
    min_capping_dict={'age': 5, 'fare': 5},
    max_capping_dict={'age': 60, 'fare': 150},
)

capper.fit(X_train)

# Inspect the caps stored on the fitted transformer.
print(capper.right_tail_caps_)
print(capper.left_tail_caps_)

# transforming train and test data
train_t = capper.transform(X_train)
test_t = capper.transform(X_test)
OutlierTrimmer
Code
# OutlierTrimmer must be imported explicitly; only Winsorizer is imported at
# the top of the file.
from feature_engine.outliers import OutlierTrimmer

# This example reuses X_train / X_test from the train/test split above.

# Set up the trimmer. Unlike the cappers above, OutlierTrimmer removes the
# observations that contain outliers instead of capping their values (the
# variable keeps the name `capper` only to mirror the earlier examples).
# We want the maximum values to be determined using the 75th quantile of the
# variable (param capping_method) plus 1.5 times the IQR (param fold).
capper = OutlierTrimmer(
    capping_method='iqr',
    tail='right',
    fold=1.5,
    variables=['age', 'fare'],
)

# Learn the limits from the training set.
capper.fit(X_train)

# Transform train and test data. Rows beyond the learned limits are dropped,
# so the outputs can have fewer rows than the inputs; re-align the targets
# afterwards if needed (e.g. y_train.loc[train_t.index]).
train_t = capper.transform(X_train)
test_t = capper.transform(X_test)