How to handle outliers with feature-engine?

  AI, Data, Deep Learning, Machine Learning, Numpy, Pandas, Python

Example code for handling outliers with three feature-engine transformers: Winsorizer, ArbitraryOutlierCapper, and OutlierTrimmer.

Winsorizer

Caps maximum and/or minimum values of a variable at automatically determined values.[ref:https://feature-engine.readthedocs.io/en/latest/user_guide/outliers/Winsorizer.html]

Code


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

from feature_engine.outliers import Winsorizer

# Load dataset
def load_titanic():
    """Download the Titanic dataset from OpenML and apply basic cleaning.

    Cleaning steps:
      * '?' placeholders are replaced with NaN.
      * 'cabin' is reduced to the first character of its string form
        (missing cabins therefore become 'n', from the string 'nan').
      * 'pclass' is cast to object so it is treated as categorical.
      * Missing 'embarked' values are filled with 'C'.
      * 'fare' and 'age' are cast to float and missing values are filled
        with the column median.

    Returns:
        pandas.DataFrame: the cleaned dataset.
    """
    data = pd.read_csv('https://www.openml.org/data/get_csv/16826755/phpMYEkMl')
    data = data.replace('?', np.nan)
    data['cabin'] = data['cabin'].astype(str).str[0]
    data['pclass'] = data['pclass'].astype('O')

    # Assign the result back instead of calling fillna(..., inplace=True) on
    # a column selection: the chained in-place form is deprecated and does
    # not modify `data` when pandas Copy-on-Write is enabled.
    data['embarked'] = data['embarked'].fillna('C')

    # Cast to float first so the median is computed on numeric values.
    for column in ('fare', 'age'):
        data[column] = data[column].astype('float')
        data[column] = data[column].fillna(data[column].median())
    return data

# Download and clean the Titanic data
data = load_titanic()

# Hold out 30% of the rows for testing. The target ('survived') and the
# 'name' and 'ticket' columns are left out of the feature matrix.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=['survived', 'name', 'ticket']),
    data['survived'],
    test_size=0.3,
    random_state=0,
)

# Configure the capper. Exactly one configuration should be active; the
# alternatives below are kept commented out for reference.

#! active -- right tail only: cap at mean + 3 * std
capper = Winsorizer(
    capping_method='gaussian',
    tail='right',
    fold=3,
    variables=['age', 'fare'],
)

#! both tails: cap at mean - 2 * std and mean + 2 * std
# capper = Winsorizer(capping_method='gaussian', tail='both', fold=2, variables='fare')

#! right tail: 75th quantile + 3 * IQR
#! left tail: 25th quantile - 3 * IQR
#! IQR is the inter-quartile range: 75th quantile - 25th quantile
#! (fold is given explicitly so the 3 * IQR rule does not depend on the library default)
# capper = Winsorizer(capping_method='iqr', tail='both', fold=3, variables=['age', 'fare'])

#! right tail: 98th percentile
#! left tail: 2nd percentile
# capper = Winsorizer(capping_method='quantiles', tail='both', fold=0.02, variables=['age', 'fare'])

# Learn the capping values from the training split only
capper.fit(X_train)

# Apply the learned caps to both splits
train_t = capper.transform(X_train)
test_t = capper.transform(X_test)

Results

right tail: mean + 3* std

both tails: 25th quantile - 3 * IQR and 75th quantile + 3 * IQR

ArbitraryOutlierCapper

Code


# The snippet needs its own import: only Winsorizer was imported earlier.
from feature_engine.outliers import ArbitraryOutlierCapper

# Three example configurations follow. Each assignment rebinds `capper`,
# so only the last one (both tails) is the one fitted below; keep just the
# configuration you need.

# capping of age and fare features at right tail
capper = ArbitraryOutlierCapper(
    max_capping_dict={'age': 50, 'fare': 200}, min_capping_dict=None)

# capping outliers at left tail
capper = ArbitraryOutlierCapper(
    max_capping_dict=None, min_capping_dict={'age': 10, 'fare': 100})

# capping outliers at both tails
capper = ArbitraryOutlierCapper(
    min_capping_dict={'age': 5, 'fare': 5},
    max_capping_dict={'age': 60, 'fare': 150})

# fit the capper, then show the caps it will apply on each tail
capper.fit(X_train)
print(capper.right_tail_caps_)
print(capper.left_tail_caps_)

# transforming train and test data
train_t = capper.transform(X_train)
test_t = capper.transform(X_test)

OutlierTrimmer

Code


# The snippet needs its own import: only Winsorizer was imported earlier.
from feature_engine.outliers import OutlierTrimmer

# set up the trimmer (the variable keeps the name `capper` used by the
# other snippets)
# We want the maximum values to be determined using the 75th quantile of the
# variable (param capping_method) plus 1.5 times the IQR (param fold).
capper = OutlierTrimmer(capping_method='iqr', tail='right', fold=1.5, variables=['age', 'fare'])

# fit the trimmer
capper.fit(X_train)

# transforming train and test data
# NOTE(review): per the feature-engine docs, OutlierTrimmer removes the rows
# that fall beyond the limits rather than capping them, so train_t / test_t
# can have fewer rows than X_train / X_test. Re-align the targets before
# modelling, e.g. y_train.loc[train_t.index] -- confirm against your version.
train_t = capper.transform(X_train)
test_t = capper.transform(X_test)