How to handle missing data?

Example Python code for handling missing data (ref: Python Feature Engineering Cookbook). The code also answers the following questions:

  • How to calculate the percentage of missing values for each variable in a pandas table?
  • How to remove the observations with missing data in any of the variables?
  • How to remove observations if data is missing in all the variables?
  • How to impute variables with the mean or median?
  • How to impute all variables with the same number?
  • How to impute different variables with different numbers?
  • How to replace missing data with a value at the end of the distribution?
  • How to replace missing data in categorical variables with an arbitrary value?
  • How to replace missing data with a random sample extracted from the variable?
  • How to add a binary variable indicating if observations are missing?
  • How to delete rows containing missing values?

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from feature_engine.missing_data_imputers import MeanMedianImputer
from feature_engine.imputation import ArbitraryNumberImputer
from feature_engine.imputation import EndTailImputer
from feature_engine.imputation import CategoricalImputer
from feature_engine.imputation import RandomSampleImputer
from feature_engine.imputation import AddMissingIndicator
from feature_engine.imputation import DropMissingData

#-------------------------------------------------------------------#
# Load the example dataset.
data = pd.read_csv('example.csv')

# Percentage (0-100) of missing values for each variable, sorted in ascending
# order.  `isnull().mean()` on its own is a fraction in [0, 1], so it is scaled
# by 100 to give an actual percentage.  The result is stored and printed
# because a bare expression is silently discarded when the file runs as a
# script.
missing_pct = (data.isnull().mean() * 100).sort_values(ascending=True)
print(missing_pct)

#-------------------------------------------------------------------#
# Complete-case analysis: keep only the rows that have no missing value in
# any of the variables.
data_cca = data.dropna(axis=0, how='any')
# Drop a row only when every one of its variables is missing.
data_all = data.dropna(axis=0, how='all')

#-------------------------------------------------------------------#
#! MeanMedianImputer
# Split into train (70%) and test (30%) sets with a fixed random state.
# The 'Id' and 'Predictand' columns are removed from the predictors, and
# 'Predictand' is used as the target.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=['Id', 'Predictand']),
    data['Predictand'],
    random_state=0,
    test_size=0.3,
)
#-------------------------------------------------------------------#
# Median imputation of the two listed predictors.
median_imputer = MeanMedianImputer(
    variables=['Predictor1', 'Predictor2'],
    imputation_method='median',
)

# The imputer is fitted on the training set only ...
median_imputer.fit(X_train)

# ... and then applied to the training set and to the test set, in that order.
train_t, test_t = (
    median_imputer.transform(part) for part in (X_train, X_test)
)



#-------------------------------------------------------------------#
#! ArbitraryNumberImputer (assumes X includes the two variables listed below)
# Impute all the listed variables with the same number, -999.
arbitrary_imputer = ArbitraryNumberImputer(
    arbitrary_number=-999,
    variables=['Predictor1', 'Predictor2'],
)

# fit on the training set
arbitrary_imputer.fit(X_train)
# impute the training set first, then the test set
train_t, test_t = (
    arbitrary_imputer.transform(part) for part in (X_train, X_test)
)

#-------------------------------------------------------------------#
# Impute different variables with different numbers: the dictionary maps each
# variable name to the number used to replace its missing values.
transformer = ArbitraryNumberImputer(
    imputer_dict={'Predictor1': 1, 'Predictor2': -999},
)

# Fit and apply `transformer` itself -- not the `arbitrary_imputer` from the
# single-number example -- so that the per-variable dictionary is what
# actually gets used.
transformer.fit(X_train)
# transform the data
train_t = transformer.transform(X_train)
test_t = transformer.transform(X_test)
#-------------------------------------------------------------------#
# End-of-distribution imputation: the replacement value is the mean plus
# 3 times the standard deviation (gaussian method, right tail, fold=3).
tail_imputer = EndTailImputer(
    variables=['Predictor1', 'Predictor2'],
    imputation_method='gaussian',
    tail='right',
    fold=3,
)
# fit on the training set
tail_imputer.fit(X_train)
# impute the training set first, then the test set
train_t, test_t = (
    tail_imputer.transform(part) for part in (X_train, X_test)
)

#-------------------------------------------------------------------#
# Replace missing data in the categorical variables with an arbitrary label.
# No imputation method is passed, so CategoricalImputer runs with its
# defaults (per the feature_engine docs, missing entries become the string
# 'Missing' -- confirm against the installed version).
# The stray closing bracket after the variable list was a syntax error and
# has been removed.
imputer = CategoricalImputer(variables=['Predictor3', 'Predictor4'])

# fit the imputer
imputer.fit(X_train)

# transform the data
train_t = imputer.transform(X_train)
test_t = imputer.transform(X_test)

#-------------------------------------------------------------------#
# Random-sample imputation.
# Values are sampled at random, observation per observation; the seed for
# each observation is the value of 'Predictor1' plus the value of
# 'Predictor2' (seeding_method='add').
imputer = RandomSampleImputer(
    seeding_method='add',
    seed='observation',
    random_state=['Predictor1', 'Predictor2'],
)

# fit on the training set
imputer.fit(X_train)

# impute the training set first, then the test set
train_t, test_t = (imputer.transform(part) for part in (X_train, X_test))



#-------------------------------------------------------------------#
# Missing-indicator transformer configured for the four listed predictors.
addBinary_imputer = AddMissingIndicator(
    variables=['Predictor1', 'Predictor2', 'Predictor3', 'Predictor4'],
)

# fit on the training set
addBinary_imputer.fit(X_train)

# apply to the training set first, then to the test set
train_t, test_t = (
    addBinary_imputer.transform(part) for part in (X_train, X_test)
)

#-------------------------------------------------------------------#
# Row-deletion transformer configured for 'Predictor1' and 'Predictor2'.
missingdata_imputer = DropMissingData(
    variables=['Predictor1', 'Predictor2'],
)

# fit on the training set
missingdata_imputer.fit(X_train)
# Apply to the training set first, then to the test set.
# NOTE(review): only X_train / X_test are transformed here; y_train / y_test
# are not touched -- confirm whether the targets need to be re-aligned.
train_t, test_t = (
    missingdata_imputer.transform(part) for part in (X_train, X_test)
)