CS109A Introduction to Data Science
Missing Data¶
Harvard University
Fall 2021
Instructors: Pavlos Protopapas, Natesh Pillai
Handling Missing Data¶
- Visualising Missing Data
- Dealing with Missingness
- Types of Missingness
- Imputation Methods
Visualising missing data¶
We will use the library missingno.
https://github.com/ResidentMario/missingno
Also see this blog: https://datasciencechalktalk.wordpress.com/2019/09/02/handling-missing-values-with-missingo/
# You'll need to install missingno the first time you run this notebook
!pip install missingno
Make sure to restart the notebook kernel after installing missingno
or the import will not work.
import missingno as msno
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import sklearn
Dataset: the Missing Migrants Project (via Kaggle).
df = pd.read_csv('MissingMigrants-Global-2019-12-31_correct.csv.zip', compression='zip')
df.shape
(5987, 20)
df.dtypes
Web ID                                   int64
Region of Incident                      object
Reported Date                           object
Reported Year                            int64
Reported Month                          object
Number Dead                            float64
Minimum Estimated Number of Missing    float64
Total Dead and Missing                   int64
Number of Survivors                    float64
Number of Females                      float64
Number of Males                        float64
Number of Children                     float64
Cause of Death                          object
Location Description                    object
Information Source                      object
Location Coordinates                    object
Migration Route                         object
URL                                     object
UNSD Geographical Grouping              object
Source Quality                           int64
dtype: object
df.head()
Web ID | Region of Incident | Reported Date | Reported Year | Reported Month | Number Dead | Minimum Estimated Number of Missing | Total Dead and Missing | Number of Survivors | Number of Females | Number of Males | Number of Children | Cause of Death | Location Description | Information Source | Location Coordinates | Migration Route | URL | UNSD Geographical Grouping | Source Quality | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 52673 | Mediterranean | December 30, 2019 | 2019 | Dec | 1.0 | NaN | 1 | 11.0 | NaN | NaN | NaN | Hypothermia | Unspecififed location off the coast of Algeria | El Watan | 35.568972356329, -1.289773129748 | Western Mediterranean | https://bit.ly/2FqQHo4 | Uncategorized | 1 |
1 | 52666 | Mediterranean | December 30, 2019 | 2019 | Dec | 1.0 | NaN | 1 | NaN | NaN | 1.0 | NaN | Presumed drowning | Recoverd on Calamorcarro Beach, Ceuta | El Foro de Ceuta | 35.912383552874, -5.357673338898 | Western Mediterranean | https://bit.ly/39yKRyF | Uncategorized | 1 |
2 | 52663 | East Asia | December 27, 2019 | 2019 | Dec | 5.0 | NaN | 5 | NaN | NaN | 3.0 | NaN | Unknown | Bodies found on boat near Sado Island, Niigata... | Japan Times, Kyodo News, AFP | 38.154018233313, 138.086032653130 | NaN | http://bit.ly/2sCnBz1, http://bit.ly/2sEra83, ... | Eastern Asia | 3 |
3 | 52662 | Middle East | December 26, 2019 | 2019 | Dec | 7.0 | NaN | 7 | 64.0 | NaN | NaN | NaN | Drowning | Van lake near Adilcevaz, Bitlis, Turkey | EFE, BBC, ARYnews | 38.777228612085, 42.739257582031 | NaN | http://bit.ly/2ZG2Y19, http://bit.ly/2MLamDf, ... | Western Asia | 3 |
4 | 52661 | Middle East | December 24, 2019 | 2019 | Dec | 12.0 | NaN | 12 | NaN | NaN | NaN | NaN | Air strike | Al-Raqw market in Saada, Yemen | UN Humanitarian Coordinator in Yemen, Qatar Tr... | 17.245364805636, 43.239093360326 | NaN | http://bit.ly/2FjolvD, http://bit.ly/2sD42GR, ... | Western Asia | 4 |
print(f'Columns with at least one nan value: {df.isna().any(0).sum()}')
print(f'Rows with at least one nan value: {df.isna().any(1).sum()}')
Columns with at least one nan value: 12
Rows with at least one nan value: 5954
missingno library¶
Matrix
The msno.matrix nullity matrix is a data-dense display which lets you quickly visually pick out patterns in data completion.
msno.matrix(df.sample(100));
The sparkline on the right summarizes the general shape of the data completeness and points out the rows with the maximum and minimum nullity in the dataset.
Bar
msno.bar is a simple visualization of nullity by column:
msno.bar(df.sample(500));
Heatmap
The missingno correlation heatmap measures nullity correlation: how strongly the presence or absence of one variable affects the presence of another:
msno.heatmap(df.sample(500));
Minimum Estimated Number of Missing is negatively correlated with Number Dead (-0.6): when one of the two is present, the other is usually missing, and vice versa. Minimum Estimated Number of Missing is positively correlated with Number of Survivors (0.4): the two tend to be missing together and present together.
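Under the hood this is just the correlation matrix of the binary "is missing" indicators. A pandas-only sketch, restricted to the columns whose nullity actually varies:
# nullity correlation by hand: correlate the binary "is missing" indicators
nullity = df.isna().astype(int)
# columns that are never (or always) missing have a constant indicator,
# so their correlation is undefined; drop them
varying = nullity.loc[:, nullity.nunique() > 1]
varying.corr().round(2)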
Dendrogram
The dendrogram allows you to more fully correlate variable completion, revealing trends deeper than the pairwise ones visible in the correlation heatmap.
To interpret this graph, read it from a top-down perspective. Cluster leaves which linked together at a distance of zero fully predict one another's presence—one variable might always be empty when another is filled, or they might always both be filled or both empty, and so on. In this specific example the dendrogram glues together the variables which are required and therefore present in every record.
msno.dendrogram(df.sample(500));
So, just a few lines of missingno go a long way toward showing which data are missing and how the missingness of different columns is related. But we can also build similar views ourselves with tools like pandas, seaborn, or matplotlib.
NaN bars with seaborn
g = df[df.columns[df.isna().any(0)]].isna().sum().sort_values(ascending=False) / df.shape[0]
g = g.reset_index().rename(columns={'index': 'column', 0: 'nans'})
sns.barplot(x='nans', y='column', data=g, ax=plt.subplots(1,1,figsize=(12,5))[1], palette='rocket')
plt.xlabel('ratio of nans');
NaN heatmap with seaborn
sns.heatmap(df[df.columns[df.isna().any(0)]].isna().astype(int), ax=plt.subplots(1,1,figsize=(8,8))[1]);
nullity matrix using matplotlib
plt.figure(figsize=(16,10))
# we will plot only columns where there is at least one nan value
nan_columns = df.columns[df.isna().any(0)]
# save the number of nan values per column
count_nans = df[nan_columns].isna().sum()
values = df[nan_columns].sample(500).isna().astype(int).values
column_width = 100
columns = None
for i in range(values.shape[1]):
name = nan_columns[i]
print(f'{name:40}: {count_nans[name]/df.shape[0]:.2%}')
column = values[:, i]
column = np.array([column for _ in range(column_width)]).T
columns = column if columns is None else np.hstack([columns, column])
plt.imshow(columns, cmap='gray');
Number Dead                             : 3.76%
Minimum Estimated Number of Missing     : 90.38%
Number of Survivors                     : 84.73%
Number of Females                       : 82.33%
Number of Males                         : 45.11%
Number of Children                      : 87.42%
Location Description                    : 0.17%
Information Source                      : 0.03%
Location Coordinates                    : 0.02%
Migration Route                         : 51.04%
URL                                     : 36.36%
UNSD Geographical Grouping              : 0.77%
What are the simplest ways to handle missing data?¶
Naively handling missingness!¶
- Drop the observations that have any missing values: use pd.DataFrame.dropna(axis=0).
- Impute the mean/median (if quantitative) or the most common class (if categorical) for all missing values: use pd.DataFrame.fillna(value=x.mean()).
How do statsmodels and sklearn handle these NaNs?¶
What are some consequences in handling missingness in these fashions?¶
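For the first question: neither library fills the gaps for you. A minimal sketch of the default behavior (assuming recent versions of scikit-learn and statsmodels):
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm

X_demo = np.array([[1.0], [2.0], [np.nan], [4.0]])
y_demo = np.array([1.0, 2.0, 3.0, 4.0])

# scikit-learn estimators refuse NaNs outright
try:
    LinearRegression().fit(X_demo, y_demo)
except ValueError as err:
    print('sklearn:', err)

# statsmodels can silently drop incomplete rows if asked via missing='drop'
res = sm.OLS(y_demo, sm.add_constant(X_demo), missing='drop').fit()
print('statsmodels fitted on', int(res.nobs), 'of', len(y_demo), 'rows')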
To explore these questions we will work on a very simple dataset with missing data.
Let's write some helper methods
# sample data with nans
def get_data():
return pd.DataFrame(
data={
'A': [1, 2, 3, np.nan, np.nan, 6],
'B': [1, -1, 1, -1, 1, np.nan],
'C': [1, 2, np.nan, 8, 16, 32],
'D': ['male', 'female', 'male', 'female', None, 'male'],
'E': ['red', 'blue', 'red', 'green', None, 'yellow'],}
)
df = get_data()
df
A | B | C | D | E | |
---|---|---|---|---|---|
0 | 1.0 | 1.0 | 1.0 | male | red |
1 | 2.0 | -1.0 | 2.0 | female | blue |
2 | 3.0 | 1.0 | NaN | male | red |
3 | NaN | -1.0 | 8.0 | female | green |
4 | NaN | 1.0 | 16.0 | None | None |
5 | 6.0 | NaN | 32.0 | male | yellow |
# plot the original series next to a new one where the missing data has been handled
def plot_compare(df, column, title=''):
org_df = get_data()
nans = org_df.index[org_df[column].isna()]
fig, axes = plt.subplots(1, 2, figsize=(16,2))
org_df[[column]].plot(marker='o', ms=10, lw=0, ax=axes[0])
if len(nans) > 0:
for x in nans:
axes[0].axvline(x=x, lw=1, ls='dashed', label=f'NaN: {x}')
axes[0].set_title('original series')
df[[column]].plot(marker='o', ms=10, lw=0, ax = axes[1])
if len(nans) > 0:
df.loc[df.index.isin(nans), column].plot(marker='o', ms=10, lw=0, color='orange', ax = axes[1])
for x in nans:
axes[1].axvline(x=x, lw=1, ls='dashed', label=f'NaN: {x}')
if len(title) > 0:
title = f'new series ({title})'
else:
title = 'new series'
axes[1].set_title(title)
plt.suptitle(f'column {column}')
plt.show()
Method: dropna()¶
df = get_data()
df = df.dropna(axis=0)
df
A | B | C | D | E | |
---|---|---|---|---|---|
0 | 1.0 | 1.0 | 1.0 | male | red |
1 | 2.0 | -1.0 | 2.0 | female | blue |
plot_compare(df, 'A', 'method=dropna')
plot_compare(df, 'B', 'method=dropna')
plot_compare(df, 'C', 'method=dropna')
It looks like dropping was not a good idea here: from 6 samples we kept just two. Feature 'A' was missing at rows 3 and 4, 'B' at row 5, 'C' at row 2, and 'D'/'E' at row 4, so rows 2 through 5 were all dropped. If dropping every incomplete row is too aggressive, dropna has softer settings, as sketched below.
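A small sketch of two alternatives, using the thresh and axis parameters:
df = get_data()
# keep rows that have at least 4 non-null values (out of 5 columns)
print(df.dropna(thresh=4))
# drop columns (instead of rows) that contain any NaN;
# here every column has at least one NaN, so this drops them all
print(df.dropna(axis=1))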
Method: fillna with mean¶
df = get_data()
for c in df.select_dtypes('number'):
df[c] = df[c].fillna(df[c].mean())
df
A | B | C | D | E | |
---|---|---|---|---|---|
0 | 1.0 | 1.0 | 1.0 | male | red |
1 | 2.0 | -1.0 | 2.0 | female | blue |
2 | 3.0 | 1.0 | 11.8 | male | red |
3 | 3.0 | -1.0 | 8.0 | female | green |
4 | 3.0 | 1.0 | 16.0 | None | None |
5 | 6.0 | 0.2 | 32.0 | male | yellow |
plot_compare(df, 'A', 'method=fillna(mean)')
plot_compare(df, 'B', 'method=fillna(mean)')
plot_compare(df, 'C', 'method=fillna(mean)')
Method: fillna with median¶
df = get_data()
for c in df.select_dtypes('number'):
df[c] = df[c].fillna(df[c].median())
df
A | B | C | D | E | |
---|---|---|---|---|---|
0 | 1.0 | 1.0 | 1.0 | male | red |
1 | 2.0 | -1.0 | 2.0 | female | blue |
2 | 3.0 | 1.0 | 8.0 | male | red |
3 | 2.5 | -1.0 | 8.0 | female | green |
4 | 2.5 | 1.0 | 16.0 | None | None |
5 | 6.0 | 1.0 | 32.0 | male | yellow |
plot_compare(df, 'A', 'method=fillna(median)')
plot_compare(df, 'B', 'method=fillna(median)')
plot_compare(df, 'C', 'method=fillna(median)')
Forward and backward filling methods¶
df = get_data()
nans = df.isna()
# helper to highlight the original NaN cells
def highlight_nans(s):
return np.where(nans[s.name], 'color:white;background-color:black;text-align:center', '')
df.style.format(formatter=None, na_rep='').apply(highlight_nans, axis=0)
A | B | C | D | E | |
---|---|---|---|---|---|
0 | 1.000000 | 1.000000 | 1.000000 | male | red |
1 | 2.000000 | -1.000000 | 2.000000 | female | blue |
2 | 3.000000 | 1.000000 | male | red | |
3 | -1.000000 | 8.000000 | female | green | |
4 | 1.000000 | 16.000000 | |||
5 | 6.000000 | 32.000000 | male | yellow |
Method: ffill / pad¶
df_ffill = df.fillna(method='ffill')
df_ffill.style.apply(highlight_nans, axis=0)
A | B | C | D | E | |
---|---|---|---|---|---|
0 | 1.000000 | 1.000000 | 1.000000 | male | red |
1 | 2.000000 | -1.000000 | 2.000000 | female | blue |
2 | 3.000000 | 1.000000 | 2.000000 | male | red |
3 | 3.000000 | -1.000000 | 8.000000 | female | green |
4 | 3.000000 | 1.000000 | 16.000000 | female | green |
5 | 6.000000 | 1.000000 | 32.000000 | male | yellow |
plot_compare(df_ffill, 'A', 'method=fillna(ffill)')
plot_compare(df_ffill, 'B', 'method=fillna(ffill)')
plot_compare(df_ffill, 'C', 'method=fillna(ffill)')
Method: bfill / backfill¶
df_bfill = df.fillna(method='bfill')
df_bfill.style.apply(highlight_nans, axis=0)
A | B | C | D | E | |
---|---|---|---|---|---|
0 | 1.000000 | 1.000000 | 1.000000 | male | red |
1 | 2.000000 | -1.000000 | 2.000000 | female | blue |
2 | 3.000000 | 1.000000 | 8.000000 | male | red |
3 | 6.000000 | -1.000000 | 8.000000 | female | green |
4 | 6.000000 | 1.000000 | 16.000000 | male | yellow |
5 | 6.000000 | nan | 32.000000 | male | yellow |
Sometimes these techniques cannot, by definition, fill all the missing data: how do you fill a value from the future when there is no future data? Take a look at column B in the table above.
plot_compare(df_bfill, 'A', 'method=fillna(bfill)')
plot_compare(df_bfill, 'B', 'method=fillna(bfill)')
plot_compare(df_bfill, 'C', 'method=fillna(bfill)')
Imputation using the most frequent value¶
For numerical values it sometimes makes sense to use the mean or the median, but what about categorical variables? Pandas can do the counting for us.
df[['D', 'E']].describe()
D | E | |
---|---|---|
count | 5 | 5 |
unique | 2 | 4 |
top | male | red |
freq | 3 | 2 |
The mode of a set of values is the value that appears most often. It can be multiple values.
mode()
: Get the mode(s) of each element along the selected axis.
DataFrame.mode(axis=0, numeric_only=False, dropna=True)
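A tiny illustration of a tie, which is why we take .head(1) below:
# 'red' and 'blue' both appear twice, so mode() returns two values
pd.Series(['red', 'blue', 'red', 'blue', 'green']).mode()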
most_frequent = df[['D', 'E']].mode().head(1)
most_frequent
D | E | |
---|---|---|
0 | male | red |
Transform the dataframe's first row into a dictionary to be used with fillna():
most_frequent = {c: most_frequent[c].item() for c in most_frequent.columns}
most_frequent
{'D': 'male', 'E': 'red'}
df.fillna(most_frequent).style.apply(highlight_nans, axis=0)
A | B | C | D | E | |
---|---|---|---|---|---|
0 | 1.000000 | 1.000000 | 1.000000 | male | red |
1 | 2.000000 | -1.000000 | 2.000000 | female | blue |
2 | 3.000000 | 1.000000 | nan | male | red |
3 | nan | -1.000000 | 8.000000 | female | green |
4 | nan | 1.000000 | 16.000000 | male | red |
5 | 6.000000 | nan | 32.000000 | male | yellow |
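scikit-learn packages the same idea as SimpleImputer, which is convenient inside a Pipeline. A minimal sketch, assuming a reasonably recent scikit-learn (note that the missing entries in D and E are None, not np.nan):
from sklearn.impute import SimpleImputer

# strategy='most_frequent' also works on string columns
imputer = SimpleImputer(missing_values=None, strategy='most_frequent')
pd.DataFrame(imputer.fit_transform(df[['D', 'E']]), columns=['D', 'E'])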
Imputation through interpolation¶
Pandas offers the interpolate method, which can be used for imputation. Its power comes from making use of the scipy.interpolate module.
pd.DataFrame.interpolate()
: Fill NaN values using an interpolation method
Forward padding¶
df.interpolate(method='pad', limit=1).style.apply(highlight_nans, axis=0)
A | B | C | D | E | |
---|---|---|---|---|---|
0 | 1.000000 | 1.000000 | 1.000000 | male | red |
1 | 2.000000 | -1.000000 | 2.000000 | female | blue |
2 | 3.000000 | 1.000000 | 2.000000 | male | red |
3 | 3.000000 | -1.000000 | 8.000000 | female | green |
4 | nan | 1.000000 | 16.000000 | female | green |
5 | 6.000000 | 1.000000 | 32.000000 | male | yellow |
The limit parameter caps the number of consecutive NaN values we can fill with this method. There is another parameter, limit_direction, that can be used to fill forward, backward, or in both directions, depending on the selected method.
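For example, a linear fill capped at one consecutive NaN and working in both directions looks like this:
# fill at most 1 consecutive NaN, propagating both forward and backward
df.interpolate(method='linear', limit=1, limit_direction='both')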
df.interpolate(method='bfill', limit=2).style.apply(highlight_nans, axis=0)
A | B | C | D | E | |
---|---|---|---|---|---|
0 | 1.000000 | 1.000000 | 1.000000 | male | red |
1 | 2.000000 | -1.000000 | 2.000000 | female | blue |
2 | 3.000000 | 1.000000 | 8.000000 | male | red |
3 | 6.000000 | -1.000000 | 8.000000 | female | green |
4 | 6.000000 | 1.000000 | 16.000000 | male | yellow |
5 | 6.000000 | nan | 32.000000 | male | yellow |
Linear interpolation¶
- TIP 1: 'linear' is the only method that ignores the index; the values are treated as equally spaced.
- TIP 2: It is really worth it to read the interpolate documentation.
df.interpolate(method='linear').style.apply(highlight_nans, axis=0)
A | B | C | D | E | |
---|---|---|---|---|---|
0 | 1.000000 | 1.000000 | 1.000000 | male | red |
1 | 2.000000 | -1.000000 | 2.000000 | female | blue |
2 | 3.000000 | 1.000000 | 5.000000 | male | red |
3 | 4.000000 | -1.000000 | 8.000000 | female | green |
4 | 5.000000 | 1.000000 | 16.000000 | None | None |
5 | 6.000000 | 1.000000 | 32.000000 | male | yellow |
Quadratic interpolation¶
df.interpolate(method='quadratic').style.apply(highlight_nans, axis=0)
A | B | C | D | E | |
---|---|---|---|---|---|
0 | 1.000000 | 1.000000 | 1.000000 | male | red |
1 | 2.000000 | -1.000000 | 2.000000 | female | blue |
2 | 3.000000 | 1.000000 | 3.945946 | male | red |
3 | 4.000000 | -1.000000 | 8.000000 | female | green |
4 | 5.000000 | 1.000000 | 16.000000 | None | None |
5 | 6.000000 | nan | 32.000000 | male | yellow |
Take a look at the NaN in column B. It survives because we are doing interpolation, not extrapolation: the trailing NaN at index 5 has no later value to interpolate toward.
Remember that after using interpolation for imputation there can still be NaN values left.
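A common pattern is therefore to chain a directional fill after the interpolation to catch the leftovers:
# interpolate interior gaps, then forward fill whatever interpolation could
# not reach (note this also forward fills the categorical columns D and E)
df.interpolate(method='quadratic').fillna(method='ffill')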
Quadratic interpolation with a different index¶
df.set_index(np.square(df.index)).interpolate(method='quadratic').style.apply(highlight_nans, axis=0)
A | B | C | D | E | |
---|---|---|---|---|---|
0 | 1.000000 | 1.000000 | 1.000000 | male | red |
1 | 2.000000 | -1.000000 | 2.000000 | female | blue |
4 | 3.000000 | 1.000000 | 4.444847 | male | red |
9 | 3.841270 | -1.000000 | 8.000000 | female | green |
16 | 4.885714 | 1.000000 | 16.000000 | None | None |
25 | 6.000000 | nan | 32.000000 | male | yellow |
Polynomial interpolation of order 3¶
df.interpolate(method='polynomial', order=3).style.apply(highlight_nans, axis=0)
A | B | C | D | E | |
---|---|---|---|---|---|
0 | 1.000000 | 1.000000 | 1.000000 | male | red |
1 | 2.000000 | -1.000000 | 2.000000 | female | blue |
2 | 3.000000 | 1.000000 | 3.928571 | male | red |
3 | 4.000000 | -1.000000 | 8.000000 | female | green |
4 | 5.000000 | 1.000000 | 16.000000 | None | None |
5 | 6.000000 | nan | 32.000000 | male | yellow |
Polynomial interpolation of order 3 with a different index¶
df.set_index(np.square(df.index)).interpolate(method='polynomial', order=3).style.apply(highlight_nans, axis=0)
A | B | C | D | E | |
---|---|---|---|---|---|
0 | 1.000000 | 1.000000 | 1.000000 | male | red |
1 | 2.000000 | -1.000000 | 2.000000 | female | blue |
4 | 3.000000 | 1.000000 | 4.408574 | male | red |
9 | 0.285714 | -1.000000 | 8.000000 | female | green |
16 | -4.714286 | 1.000000 | 16.000000 | None | None |
25 | 6.000000 | nan | 32.000000 | male | yellow |
Take a look at column A: the index really affects the results.
Interpolation using nearest¶
We can use the method nearest, which will make use of the index to do its work.
TIP: if you think that there is a feature (column) that is related to the feature with missing values, you can make use of the feature as index to look for the nearest neighbor's value.
We are going to split an example procedure into separate steps; once you understand what's going on, you can fold them into a single reusable method (see the consolidated sketch after the walkthrough).
# reset_index turns the current index into a column that we will need later
temp = df.reset_index()
# we need B without nans to work as index for interpolation
temp = temp.loc[df.B.notna(), ['index', 'B', 'D']]
temp = temp.set_index('B')
temp
index | D | |
---|---|---|
B | ||
1.0 | 0 | male |
-1.0 | 1 | female |
1.0 | 2 | male |
-1.0 | 3 | female |
1.0 | 4 | None |
# use this dictionary to create a numeric encoding of column D
male_ohe = {'male': 1, 'female': 0, None: None}
temp['is_male'] = temp['D'].apply(lambda x: male_ohe[x]).astype(float)
temp
index | D | is_male | |
---|---|---|---|
B | |||
1.0 | 0 | male | 1.0 |
-1.0 | 1 | female | 0.0 |
1.0 | 2 | male | 1.0 |
-1.0 | 3 | female | 0.0 |
1.0 | 4 | None | NaN |
temp = temp.interpolate(method='nearest')
temp
index | D | is_male | |
---|---|---|---|
B | |||
1.0 | 0 | male | 1.0 |
-1.0 | 1 | female | 0.0 |
1.0 | 2 | male | 1.0 |
-1.0 | 3 | female | 0.0 |
1.0 | 4 | None | 1.0 |
# reverse the is_male encoding to fill column D
male_ohe_r = {v:k for k,v in male_ohe.items()}
temp['D'] = temp['is_male'].apply(lambda x: male_ohe_r[x])
temp
index | D | is_male | |
---|---|---|---|
B | |||
1.0 | 0 | male | 1.0 |
-1.0 | 1 | female | 0.0 |
1.0 | 2 | male | 1.0 |
-1.0 | 3 | female | 0.0 |
1.0 | 4 | male | 1.0 |
# recover original index to be used to return values to df
temp = temp.set_index('index')
temp
D | is_male | |
---|---|---|
index | ||
0 | male | 1.0 |
1 | female | 0.0 |
2 | male | 1.0 |
3 | female | 0.0 |
4 | male | 1.0 |
df.loc[df.index.isin(temp.index), 'D'] = temp.D
df
A | B | C | D | E | |
---|---|---|---|---|---|
0 | 1.0 | 1.0 | 1.0 | male | red |
1 | 2.0 | -1.0 | 2.0 | female | blue |
2 | 3.0 | 1.0 | NaN | male | red |
3 | NaN | -1.0 | 8.0 | female | green |
4 | NaN | 1.0 | 16.0 | male | None |
5 | 6.0 | NaN | 32.0 | male | yellow |
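For reference, the step-by-step procedure above can be folded into a single reusable helper. The function below is our own sketch (the name impute_nearest_by is ours, not pandas'):
def impute_nearest_by(data, target, by, encoding):
    """Fill NaNs in `target` via nearest-neighbor interpolation along `by`."""
    decoding = {v: k for k, v in encoding.items()}
    temp = data.reset_index()
    # keep rows where `by` is known and use it as the interpolation index
    temp = temp.loc[data[by].notna(), ['index', by, target]].set_index(by)
    encoded = temp[target].map(encoding).astype(float)
    temp[target] = encoded.interpolate(method='nearest').map(decoding)
    temp = temp.set_index('index')
    out = data.copy()
    out.loc[out.index.isin(temp.index), target] = temp[target]
    return out

# reproduces the step-by-step result above
impute_nearest_by(get_data(), target='D', by='B',
                  encoding={'male': 1.0, 'female': 0.0})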
We fixed column 'D', now let's fix column 'B'¶
We will use column D to fix column B, since, as we saw, they are related.
# reset_index turns the current index into a column that we will need later
temp = df.reset_index()
# we need D without nans to work as index for interpolation
temp = temp.loc[df.D.notna(), ['index', 'B', 'D']]
temp = temp.set_index('D')
temp
index | B | |
---|---|---|
D | ||
male | 0 | 1.0 |
female | 1 | -1.0 |
male | 2 | 1.0 |
female | 3 | -1.0 |
male | 4 | 1.0 |
male | 5 | NaN |
temp.reset_index(inplace=True)
temp
D | index | B | |
---|---|---|---|
0 | male | 0 | 1.0 |
1 | female | 1 | -1.0 |
2 | male | 2 | 1.0 |
3 | female | 3 | -1.0 |
4 | male | 4 | 1.0 |
5 | male | 5 | NaN |
# use the relationship between D and B to recreate B.
male_ohe = {'male': 1, 'female': -1, None: None}
temp['B'] = temp['D'].apply(lambda x: male_ohe[x]).astype(float)
temp
D | index | B | |
---|---|---|---|
0 | male | 0 | 1.0 |
1 | female | 1 | -1.0 |
2 | male | 2 | 1.0 |
3 | female | 3 | -1.0 |
4 | male | 4 | 1.0 |
5 | male | 5 | 1.0 |
df.loc[df.index.isin(temp.index), 'B'] = temp.B
df
A | B | C | D | E | |
---|---|---|---|---|---|
0 | 1.0 | 1.0 | 1.0 | male | red |
1 | 2.0 | -1.0 | 2.0 | female | blue |
2 | 3.0 | 1.0 | NaN | male | red |
3 | NaN | -1.0 | 8.0 | female | green |
4 | NaN | 1.0 | 16.0 | male | None |
5 | 6.0 | 1.0 | 32.0 | male | yellow |
We can select the best method for every feature¶
temp.head()
D | index | B | |
---|---|---|---|
0 | male | 0 | 1.0 |
1 | female | 1 | -1.0 |
2 | male | 2 | 1.0 |
3 | female | 3 | -1.0 |
4 | male | 4 | 1.0 |
df = df.assign(
A=df['A'].interpolate(method='linear'),
B=df['B'],
C=df['C'].interpolate(method='quadratic'),
D=df['D']
)
df.style.apply(highlight_nans, axis=0)
A | B | C | D | E | |
---|---|---|---|---|---|
0 | 1.000000 | 1.000000 | 1.000000 | male | red |
1 | 2.000000 | -1.000000 | 2.000000 | female | blue |
2 | 3.000000 | 1.000000 | 3.945946 | male | red |
3 | 4.000000 | -1.000000 | 8.000000 | female | green |
4 | 5.000000 | 1.000000 | 16.000000 | male | None |
5 | 6.000000 | 1.000000 | 32.000000 | male | yellow |
Column E still has missing data¶
Computers and algorithms can do a lot for us, but remember that we can do a lot for them too. We observe that column E denotes colors. Categorical variables can generally be converted into numerical ones using one-hot encoding, but this particular kind of categorical variable admits a better numerical representation. There are many ways to represent colors as numbers; here we will try the RGB format.
from matplotlib.patches import Rectangle
plt.figure(figsize=(16, 1))
ax = plt.subplot(111)
colors = {
'red': (1,0,0),
'green': (0,1,0),
'blue': (0,0,1),
'magenta': (1,0,1),
'yellow': (1,1,0),
'cyan': (0,1,1),
'black': (0,0,0),
'white': (1,1,1),
}
col = 0.0
for name, c in colors.items():
    # each swatch is 1/len(colors) wide so the strips don't overlap
    ax.add_patch(Rectangle((col, 0), 1/len(colors), 1, color=c))
    col += 1/len(colors)
plt.show()
def rgb_part(x, rgb_index):
    # return a single channel (r, g, or b) of a named color; None stays None
    if x is None:
        return None
    return colors[x][rgb_index]
df['E_r'] = df['E'].apply(lambda x: rgb_part(x, rgb_index=0))
df['E_g'] = df['E'].apply(lambda x: rgb_part(x, rgb_index=1))
df['E_b'] = df['E'].apply(lambda x: rgb_part(x, rgb_index=2))
df
A | B | C | D | E | E_r | E_g | E_b | |
---|---|---|---|---|---|---|---|---|
0 | 1.0 | 1.0 | 1.000000 | male | red | 1.0 | 0.0 | 0.0 |
1 | 2.0 | -1.0 | 2.000000 | female | blue | 0.0 | 0.0 | 1.0 |
2 | 3.0 | 1.0 | 3.945946 | male | red | 1.0 | 0.0 | 0.0 |
3 | 4.0 | -1.0 | 8.000000 | female | green | 0.0 | 1.0 | 0.0 |
4 | 5.0 | 1.0 | 16.000000 | male | None | NaN | NaN | NaN |
5 | 6.0 | 1.0 | 32.000000 | male | yellow | 1.0 | 1.0 | 0.0 |
Look at the samples where D is 'male' (shown below): the RED channel is always 1, so 1 is the natural pick for the red part of the missing color. The BLUE channel is 1 for just one sample (a female one), so it makes sense to select 0 for the blue part.
df[df.D == 'male']
A | B | C | D | E | E_r | E_g | E_b | |
---|---|---|---|---|---|---|---|---|
0 | 1.0 | 1.0 | 1.000000 | male | red | 1.0 | 0.0 | 0.0 |
2 | 3.0 | 1.0 | 3.945946 | male | red | 1.0 | 0.0 | 0.0 |
4 | 5.0 | 1.0 | 16.000000 | male | None | NaN | NaN | NaN |
5 | 6.0 | 1.0 | 32.000000 | male | yellow | 1.0 | 1.0 | 0.0 |
We have been insisting from the beginning that columns D and B were related, and now we claim that color is related to gender. Is this some kind of bias, or is the data telling us about this relationship?
Imputation through modeling¶
We can make use of K-Nearest Neighbors from the scikit-learn library to simplify things.
# one hot encode column D; with drop_first=True only D_male is kept
cat_dummies = pd.get_dummies(df[['D']], drop_first=True)
# drop the categorical columns and add the one hot encoded one
X = pd.concat([df.drop(columns=['D', 'E']), cat_dummies], axis=1)
X
A | B | C | E_r | E_g | E_b | D_male | |
---|---|---|---|---|---|---|---|
0 | 1.0 | 1.0 | 1.000000 | 1.0 | 0.0 | 0.0 | 1 |
1 | 2.0 | -1.0 | 2.000000 | 0.0 | 0.0 | 1.0 | 0 |
2 | 3.0 | 1.0 | 3.945946 | 1.0 | 0.0 | 0.0 | 1 |
3 | 4.0 | -1.0 | 8.000000 | 0.0 | 1.0 | 0.0 | 0 |
4 | 5.0 | 1.0 | 16.000000 | NaN | NaN | NaN | 1 |
5 | 6.0 | 1.0 | 32.000000 | 1.0 | 1.0 | 0.0 | 1 |
# fill the remaining NaN in B (index 5) with a random choice from its observed domain
temp.loc[temp.index == 5, 'B'] = np.random.choice(temp[temp.B.notna()].B.unique())
temp
D | index | B | |
---|---|---|---|
0 | male | 0 | 1.0 |
1 | female | 1 | -1.0 |
2 | male | 2 | 1.0 |
3 | female | 3 | -1.0 |
4 | male | 4 | 1.0 |
5 | male | 5 | -1.0 |
Scaled data is a must for most algorithms. Does KNN require scaling? Yes: KNN is distance based, so without scaling the features with larger ranges would dominate the neighbor search.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns = X.columns)
X
A | B | C | E_r | E_g | E_b | D_male | |
---|---|---|---|---|---|---|---|
0 | 0.0 | 1.0 | 0.000000 | 1.0 | 0.0 | 0.0 | 1.0 |
1 | 0.2 | 0.0 | 0.032258 | 0.0 | 0.0 | 1.0 | 0.0 |
2 | 0.4 | 1.0 | 0.095031 | 1.0 | 0.0 | 0.0 | 1.0 |
3 | 0.6 | 0.0 | 0.225806 | 0.0 | 1.0 | 0.0 | 0.0 |
4 | 0.8 | 1.0 | 0.483871 | NaN | NaN | NaN | 1.0 |
5 | 1.0 | 1.0 | 1.000000 | 1.0 | 1.0 | 0.0 | 1.0 |
We create our model for imputation
from sklearn.impute import KNNImputer
# impute based on the 3 nearest neighbors
imputer = KNNImputer(n_neighbors=3)
We do the imputation
X = pd.DataFrame(imputer.fit_transform(X),columns = X.columns)
We verify there are no NaN values left in any column or row.
X.isna().any(0).sum(), X.isna().any(1).sum()
(0, 0)
X = pd.DataFrame(scaler.inverse_transform(X), columns = X.columns)
X
A | B | C | E_r | E_g | E_b | D_male | |
---|---|---|---|---|---|---|---|
0 | 1.0 | 1.0 | 1.000000 | 1.0 | 0.000000 | 0.0 | 1.0 |
1 | 2.0 | -1.0 | 2.000000 | 0.0 | 0.000000 | 1.0 | 0.0 |
2 | 3.0 | 1.0 | 3.945946 | 1.0 | 0.000000 | 0.0 | 1.0 |
3 | 4.0 | -1.0 | 8.000000 | 0.0 | 1.000000 | 0.0 | 0.0 |
4 | 5.0 | 1.0 | 16.000000 | 1.0 | 0.333333 | 0.0 | 1.0 |
5 | 6.0 | 1.0 | 32.000000 | 1.0 | 1.000000 | 0.0 | 1.0 |
We can see that the model has selected the color (1, 1/3, 0) = #FF5500.
If we are not too strict with the numbers, we can round it to #FF0000, i.e., red.
plt.figure(figsize=(1,1))
plt.subplot(111).add_patch(Rectangle((0, 0), 1, 1, color=(1,1/3,0)));
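To make the rounding step explicit, one option (our own sketch, not part of the pipeline above) is to snap the imputed RGB triple to the nearest palette color by Euclidean distance:
# snap an imputed RGB triple to the nearest named color in our palette
def nearest_color(rgb, palette=colors):
    return min(palette,
               key=lambda name: sum((a - b) ** 2 for a, b in zip(rgb, palette[name])))

nearest_color((1, 1/3, 0))  # -> 'red'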
X['E'] = df['E']
X.loc[X.index == 4, 'E'] = 'red'
X.drop(columns=['E_r', 'E_g', 'E_b'], inplace=True)
X['D'] = X['D_male'].apply(lambda x: 'male' if x==1 else 'female')
X = X.drop(columns=['D_male'])
# we save it for later comparison
version_1 = X.copy()
X
A | B | C | E | D | |
---|---|---|---|---|---|
0 | 1.0 | 1.0 | 1.000000 | red | male |
1 | 2.0 | -1.0 | 2.000000 | blue | female |
2 | 3.0 | 1.0 | 3.945946 | red | male |
3 | 4.0 | -1.0 | 8.000000 | green | female |
4 | 5.0 | 1.0 | 16.000000 | red | male |
5 | 6.0 | 1.0 | 32.000000 | yellow | male |
Does the number of neighbors affect the color selected for this particular missing color?
def best_color_for(neighbors):
scaler = MinMaxScaler()
X = pd.concat([df.drop(columns=['D', 'E']), cat_dummies], axis=1)
X = pd.DataFrame(scaler.fit_transform(X), columns = X.columns)
imputer = KNNImputer(n_neighbors=neighbors)
X = pd.DataFrame(imputer.fit_transform(X),columns = X.columns)
c = tuple(X.loc[X.index == 4, ['E_r', 'E_g', 'E_b']].head(1).values.flatten())
print(f'Neighbors: {neighbors} Imputed color: {c}')
plt.figure(figsize=(1,1))
plt.subplot(111).add_patch(Rectangle((0, 0), 1, 1, color=c))
plt.show()
for neighbors in range(1, 6):
best_color_for(neighbors)
Neighbors: 1 Imputed color: (1.0, 1.0, 0.0) Neighbors: 2 Imputed color: (1.0, 0.5, 0.0) Neighbors: 3 Imputed color: (1.0, 0.3333333333333333, 0.0) Neighbors: 4 Imputed color: (0.75, 0.5, 0.0) Neighbors: 5 Imputed color: (0.6, 0.4, 0.2)
You can see that the imputed color changes with the number of neighbors selected, but everything stays around warm, reddish tones.
What about using KNN to fill everything from the start?¶
We have used this algorithm to fill just the last missing color value, and it worked great. It will work great in many situations, and you will find many examples on the internet that reach for it as the default tool. But it is good to question it.
df = get_data()
# one hot encode D
D_dummies = pd.get_dummies(df[['D']], drop_first=True)
# encode color as RGB channels, a richer representation than plain one hot encoding
df['E_r'] = df['E'].apply(lambda x: rgb_part(x, rgb_index=0))
df['E_g'] = df['E'].apply(lambda x: rgb_part(x, rgb_index=1))
df['E_b'] = df['E'].apply(lambda x: rgb_part(x, rgb_index=2))
# drop the categorical columns and add the one hot encoded one
X = pd.concat([df.drop(columns=['D', 'E']), D_dummies], axis=1)
scaler = MinMaxScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns = X.columns)
imputer = KNNImputer(n_neighbors=3)
X = pd.DataFrame(imputer.fit_transform(X), columns = X.columns)
X = pd.DataFrame(scaler.inverse_transform(X), columns = X.columns)
X['D'] = X['D_male'].apply(lambda x: 'male' if x==1 else 'female')
X = X.drop(columns=['D_male'])
X
A | B | C | E_r | E_g | E_b | D | |
---|---|---|---|---|---|---|---|
0 | 1.0 | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | male |
1 | 2.0 | -1.000000 | 2.000000 | 0.000000 | 0.000000 | 1.000000 | female |
2 | 3.0 | 1.000000 | 16.333333 | 1.000000 | 0.000000 | 0.000000 | male |
3 | 3.0 | -1.000000 | 8.000000 | 0.000000 | 1.000000 | 0.000000 | female |
4 | 2.0 | 1.000000 | 16.000000 | 0.333333 | 0.333333 | 0.333333 | female |
5 | 6.0 | 0.333333 | 32.000000 | 1.000000 | 1.000000 | 0.000000 | male |
The color filled in at index 4 is (1/3, 1/3, 1/3) = #555555 (dark gray).
X['E'] = df['E']
X.loc[X.index == 4, 'E'] = 'gray'
X.drop(columns=['E_r', 'E_g', 'E_b'], inplace=True)
version_2 = X.copy()
version_2.style.apply(highlight_nans, axis=0)
A | B | C | D | E | |
---|---|---|---|---|---|
0 | 1.000000 | 1.000000 | 1.000000 | male | red |
1 | 2.000000 | -1.000000 | 2.000000 | female | blue |
2 | 3.000000 | 1.000000 | 16.333333 | male | red |
3 | 3.000000 | -1.000000 | 8.000000 | female | green |
4 | 2.000000 | 1.000000 | 16.000000 | female | gray |
5 | 6.000000 | 0.333333 | 32.000000 | male | yellow |
So, the table above shows the results of brute-force imputation, even with a great algorithm like KNN:
- Column A breaks the pattern
- Column B breaks the variable domain (-1, 1)
- Column C breaks the pattern
- Column D loses relation with column B
- Column E loses the predominant red pattern and its relation to columns D and B.
plot_compare(version_2, 'A', 'method=global KNN')
plot_compare(version_2, 'B', 'method=global KNN')
plot_compare(version_2, 'C', 'method=global KNN')
We can compare these results with the table below, which comes from the step-by-step imputation using a different method for each feature.
version_1.style.apply(highlight_nans, axis=0)
A | B | C | E | D | |
---|---|---|---|---|---|
0 | 1.000000 | 1.000000 | 1.000000 | red | male |
1 | 2.000000 | -1.000000 | 2.000000 | blue | female |
2 | 3.000000 | 1.000000 | 3.945946 | red | male |
3 | 4.000000 | -1.000000 | 8.000000 | green | female |
4 | 5.000000 | 1.000000 | 16.000000 | red | male |
5 | 6.000000 | 1.000000 | 32.000000 | yellow | male |
plot_compare(version_1, 'A', 'method=linear interpolation')
plot_compare(version_1, 'B', 'method=random choice from domain')
plot_compare(version_1, 'C', 'method=quadratic interpolation')