Example Python code for handling missing data (ref: Python Feature Engineering Cookbook). It also answers the following questions:
- How to calculate the percentage of missing values for each variable in a pandas table?
- How to remove the observations with missing data in any of the variables?
- How to remove observations if data is missing in all the variables?
- How to impute variables with the mean or median?
- How to impute all variables with the same number?
- How to impute different variables with the different number?
- How to replace missing data with a value at the end of the distribution?
- How to replace missing data in categorical variables with an arbitrary value?
- How to replace missing data with a random sample extracted from the variable?
- How to add a binary variable indicating if observations are missing?
- How to delete rows containing missing values?
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from feature_engine.missing_data_imputers import MeanMedianImputer
from feature_engine.imputation import ArbitraryNumberImputer
from feature_engine.imputation import EndTailImputer
from feature_engine.imputation import CategoricalImputer
from feature_engine.imputation import RandomSampleImputer
from feature_engine.imputation import AddMissingIndicator
from feature_engine.imputation import DropMissingData
#-------------------------------------------------------------------#
# Load the example data set into a DataFrame.
data = pd.read_csv('example.csv')
# Share of missing values per variable, expressed as a fraction between 0 and 1
# (multiply by 100 to read it as a percentage), sorted from lowest to highest.
# The result is not assigned to a name; it is only shown when run interactively.
data.isna().mean().sort_values()
#-------------------------------------------------------------------#
# Complete-case analysis: keep only the rows that have no missing value in
# any variable.
data_cca = data.dropna(how='any')
# Softer variant: discard a row only when every one of its variables is missing.
data_all = data.dropna(how='all')
#-------------------------------------------------------------------#
#! MeanMedianImputer
# Split into train and test sets: 30% of the rows go to the test set and the
# seed is fixed so the split is reproducible. The 'Id' and 'Predictand'
# columns are removed from the features; 'Predictand' is the target.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=['Id', 'Predictand']),
    data['Predictand'],
    random_state=0,
    test_size=0.3,
)
#-------------------------------------------------------------------#
# Median imputation of 'Predictor1' and 'Predictor2'.
median_imputer = MeanMedianImputer(
    variables=['Predictor1', 'Predictor2'],
    imputation_method='median',
)
# The imputer is fitted on the training set only ...
median_imputer.fit(X_train)
# ... and the same fitted imputer is then applied to both partitions.
train_t, test_t = (
    median_imputer.transform(X_train),
    median_imputer.transform(X_test),
)
#-------------------------------------------------------------------#
# ArbitraryNumberImputer: replace the missing values of every listed variable
# with one and the same number (-999 here).
arbitrary_imputer = ArbitraryNumberImputer(
    arbitrary_number=-999,
    variables=['Predictor1', 'Predictor2'],
)
# Fit on the training set.
arbitrary_imputer.fit(X_train)
# Apply the fitted imputer to the train and test sets.
train_t, test_t = (
    arbitrary_imputer.transform(X_train),
    arbitrary_imputer.transform(X_test),
)
#-------------------------------------------------------------------#
# Impute different variables with different numbers.
# `imputer_dict` maps each variable name to its own replacement value, so no
# shared `arbitrary_number` / `variables` arguments are passed.
transformer = ArbitraryNumberImputer(
    imputer_dict={'Predictor1': 1, 'Predictor2': -999},
)
# Fit the dictionary-based imputer itself; fitting any other imputer object
# here would leave the per-variable values above unused.
transformer.fit(X_train)
# Transform the data with that same fitted imputer.
train_t = transformer.transform(X_train)
test_t = transformer.transform(X_test)
#-------------------------------------------------------------------#
# End-of-distribution imputation: with the 'gaussian' method, the right tail
# and fold=3, the replacement value is the mean plus 3 times the standard
# deviation of each variable.
tail_imputer = EndTailImputer(
    variables=['Predictor1', 'Predictor2'],
    imputation_method='gaussian',
    fold=3,
    tail='right',
)
# Fit on the training set.
tail_imputer.fit(X_train)
# Apply the fitted imputer to the train and test sets.
train_t, test_t = (
    tail_imputer.transform(X_train),
    tail_imputer.transform(X_test),
)
#-------------------------------------------------------------------#
# Replace missing data in the categorical variables with an arbitrary label.
# No imputation method or fill value is passed, so the library defaults apply
# (NOTE(review): feature_engine documents the default fill label as the
# string 'Missing' -- confirm against the installed version).
imputer = CategoricalImputer(variables=['Predictor3', 'Predictor4'])
# fit the imputer on the training set
imputer.fit(X_train)
# transform the data
train_t = imputer.transform(X_train)
test_t = imputer.transform(X_test)
#-------------------------------------------------------------------#
# Random-sample imputation.
# Values are drawn at random, observation by observation; the seed for each
# observation is the value of 'Predictor1' plus the value of 'Predictor2'.
imputer = RandomSampleImputer(
    seed='observation',
    seeding_method='add',
    random_state=['Predictor1', 'Predictor2'],
)
# Fit on the training set.
imputer.fit(X_train)
# Apply the fitted imputer to the train and test sets.
train_t, test_t = imputer.transform(X_train), imputer.transform(X_test)
#-------------------------------------------------------------------#
# Add binary variables that flag whether an observation is missing in each of
# the four listed predictors.
addBinary_imputer = AddMissingIndicator(
    variables=[
        'Predictor1',
        'Predictor2',
        'Predictor3',
        'Predictor4',
    ],
)
# Fit on the training set.
addBinary_imputer.fit(X_train)
# Apply the fitted transformer to the train and test sets.
train_t, test_t = (
    addBinary_imputer.transform(X_train),
    addBinary_imputer.transform(X_test),
)
#-------------------------------------------------------------------#
# Delete the rows that contain missing values in 'Predictor1' or 'Predictor2'.
# Only the feature tables are passed through the transformer here; y_train and
# y_test are not touched by this section.
missingdata_imputer = DropMissingData(
    variables=['Predictor1', 'Predictor2'],
)
# Fit on the training set.
missingdata_imputer.fit(X_train)
# Apply the fitted transformer to the train and test sets.
train_t, test_t = (
    missingdata_imputer.transform(X_train),
    missingdata_imputer.transform(X_test),
)