## RUN THIS CELL TO GET THE RIGHT FORMATTING 
import requests
from IPython.core.display import HTML
# Fetch the course stylesheet. The timeout keeps the notebook from hanging on a
# stalled connection, and raise_for_status() prevents an HTTP error page from
# being rendered as if it were CSS.
response = requests.get("https://raw.githubusercontent.com/Harvard-IACS/2018-CS109A/master/content/styles/cs109.css", timeout=10)
response.raise_for_status()
styles = response.text
HTML(styles)


import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


# Load the quartets dataset, using the first CSV column as the row index.
quartets = pd.read_csv('quartets.csv', index_col=0)


# Summarize column dtypes and non-null counts.
quartets.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44 entries, 1 to 11
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   x        44 non-null     int64  
 1   y        44 non-null     float64
 2   quartet  44 non-null     object 
dtypes: float64(1), int64(1), object(1)
memory usage: 1.4+ KB


# Peek at the first rows.
quartets.head()


# Draw 5 random rows.
quartets.sample(5)


# List the distinct quartet labels.
quartets['quartet'].unique().tolist()

['I', 'II', 'III', 'IV']


# First 3 rows of every quartet.
quartets.groupby('quartet').head(3)


# 2 random rows from every quartet.
quartets.groupby('quartet').sample(2)


# Number of rows per quartet.
quartets.groupby('quartet').size()

quartet
I      11
II     11
III    11
IV     11
dtype: int64


# Per-quartet mean and standard deviation of each column, rounded to 3 decimals.
quartets.groupby('quartet').agg(['mean', 'std']).round(3)


# Rows belonging to quartet I.
quartets[quartets['quartet'] == 'I']


# Rows belonging to quartet II.
quartets[quartets['quartet'] == 'II']


# Rows belonging to quartet III.
quartets[quartets['quartet'] == 'III']


# Rows belonging to quartet IV.
quartets[quartets['quartet'] == 'IV']


# pandas boxplots grouped by quartet, drawn without grid lines.
quartets.groupby('quartet').boxplot(grid=False);


# Display seaborn's default color palette.
sns.color_palette()


# Display the 'pastel' palette.
sns.color_palette('pastel')


# Palette name passed as `palette=` to the seaborn calls below.
palette = 'pastel'


# One seaborn boxplot panel per quartet, laid out on a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(8,7))
panels = axes.ravel()
for idx, (label, subset) in enumerate(quartets.groupby('quartet')):
    panel = panels[idx]
    sns.boxplot(data=subset, ax=panel, palette=palette)
    panel.set_title(f'quartet {label}')
plt.suptitle("Quartets' boxplots");


# Reshape to long format (one row per quartet / variable / value) so a single
# call can draw both features split by quartet. The melted column is named
# 'variable' instead of 'x' so the axis label is not mistaken for feature x.
long_quartets = pd.melt(quartets, id_vars='quartet', var_name='variable', value_name='value')
fig, ax = plt.subplots(1, 1, figsize=(16,4))
sns.boxplot(x='variable', y='value', hue='quartet',
            data=long_quartets,
            ax=ax, palette=palette)
ax.set_title("quartets' features");


# One panel per feature, each comparing the four quartets side by side.
fig, axes = plt.subplots(1, 2, figsize=(16,4))
for panel, col in zip(axes, ('x', 'y')):
    sns.boxplot(x='quartet', y=col, data=quartets, ax=panel, palette=palette)
    panel.set_title(f'variable {col}')


# pandas histograms of the numeric columns, grouped by quartet.
quartets.groupby('quartet').hist();


# seaborn version: for every quartet, a row with the x and y histograms
# (10 bins each) overlaid with a KDE curve.
for label, subset in quartets.groupby('quartet'):
    fig, axes = plt.subplots(1 , 2, figsize=(8, 2.5))
    for panel, column in zip(axes, ("x", "y")):
        sns.histplot(data=subset, x=column, hue='quartet', ax=panel, palette=palette, bins=10, kde=True)
    plt.suptitle(f'Quartet {label}')


# some elements are 'bars' (default but too noisy when plotting so many features), 'step', 'poly'
element = 'step'
fig, axes = plt.subplots(1 , 2, figsize=(12, 5))
# Draw all quartets in ONE call per feature: hue then assigns a distinct palette
# color to each quartet and seaborn builds a matching legend. Plotting each
# quartet separately gave every histogram the first palette color, because each
# call only ever saw a single hue level.
# common_bins=False keeps the bin edges computed per quartet.
for ax, feature in zip(axes, ('x', 'y')):
    sns.histplot(data=quartets, x=feature, hue='quartet', ax=ax, palette=palette, bins=10,
                 kde=False, alpha=.2, element=element, common_bins=False);


for feature in ['x', 'y']:
    # create the grid with one panel per quartet
    g = sns.FacetGrid(quartets, col="quartet", palette=palette, col_wrap=4)
    # in every panel of the grid, draw a 10-bin histogram of column "feature"
    g.map(sns.histplot, feature, bins=10);
    
# col_wrap defines the number of columns. Change the value to 3 and 2 to see its behaviour visually


# Long-format copy of the quartets: one row per (quartet, variable, value).
melted = pd.melt(quartets, id_vars='quartet', var_name='variable', value_name='value')
melted


# create the grid with quartets as columns and variables as rows
g = sns.FacetGrid(melted, row="variable", col='quartet', palette=palette, sharex=False)
# draw a 10-bin histogram of 'value' in every panel
g.map(sns.histplot, 'value', bins=10);
# we need to set sharex to False to avoid distorting shapes between rows (you can try changing it to True)


# pandas scatter plots of y against x, grouped by quartet.
quartets.groupby('quartet').plot.scatter(x='x', y='y', s=50);


# The same scatter plots with seaborn, arranged on a 2x2 grid.
fig, axes = plt.subplots(2,2,figsize=(7,7))
panels = axes.ravel()
for idx, (label, subset) in enumerate(quartets.groupby('quartet')):
    sns.scatterplot(data=subset, x='x', y='y', ax=panels[idx])
    panels[idx].set_title(f'quartet {label}')
plt.subplots_adjust(hspace=0.3);


# FacetGrid lays out the panels itself (2 per row) with shared x and y axes.
g = sns.FacetGrid(quartets, col='quartet', palette=palette, col_wrap=2, sharex=True, sharey=True)
g.map(sns.scatterplot, 'x', 'y');


# Line plots of y against x (rows sorted by x first), grouped by quartet.
quartets.sort_values(by='x').groupby('quartet').plot(x='x', y='y', marker='o', lw=.7);


# create one figure of 1 x 1 size.
fig, ax = plt.subplots(1,1,figsize=(16,6))
# plot all 4 quartets in the same ax; each line is labelled with its quartet so
# the legend can tell them apart (groupby().plot() labels every line 'y')
for name, group in quartets.sort_values(by='x').groupby('quartet'):
    group.plot(x='x', y='y', label=f'quartet {name}', marker='o', ms=10, lw=.7, alpha=.7, ax=ax)
ax.set_ylabel('y')
ax.set_title('All in one quartets');


# seaborn line plot on a single axis, one line per quartet (hue).
fig, ax = plt.subplots(1,1,figsize=(16,6))
sns.lineplot(data=quartets, x='x', y='y', hue='quartet', marker='o', ms=10, lw=.7, alpha=.7, ax=ax)
plt.title('All in one quartets');


# Side by side: the same x/y columns, without any grouping, drawn by seaborn
# (left panel) and by the pandas plotting API (right panel).
fig, axes = plt.subplots(1,2,figsize=(16,4))
sns.lineplot(data=quartets, x='x', y='y', lw=.7, ax=axes[0])
axes[0].set_title('one line of seaborn')
quartets.plot(x='x', y='y', lw=.7, ax=axes[1])
axes[1].set_title('one line of matplotlib');


# Short aliases for the verbose column names of the students dataset.
short_names = {
    'race/ethnicity': 'group',
    'parental level of education': 'parental',
    'test preparation course': 'course',
    'math score': 'math',
    'reading score': 'reading',
    'writing score': 'writing',
}
df = pd.read_csv('StudentsPerformance.csv')
df = df.rename(columns=short_names)


# Summarize column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   gender    1000 non-null   object
 1   group     1000 non-null   object
 2   parental  1000 non-null   object
 3   lunch     1000 non-null   object
 4   course    1000 non-null   object
 5   math      1000 non-null   int64 
 6   reading   1000 non-null   int64 
 7   writing   1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB


# Peek at the first rows.
df.head()


# List the distinct values of the 'group' column.
df['group'].unique().tolist()

['group B', 'group C', 'group A', 'group D', 'group E']


# Keep only the last character of each label (e.g. 'group B' -> 'B').
df['group'] = df['group'].str[-1]
df['group'].unique().tolist()

['B', 'C', 'A', 'D', 'E']


# Check the simplified 'group' column.
df.head()


# List the distinct values of the 'course' column.
df['course'].unique()

array(['none', 'completed'], dtype=object)


# Encode the 'course' column as a boolean: True when the course was completed.
# The dtype guard makes the cell safe to re-run: once the column is boolean the
# comparison is skipped (comparing booleans with 'completed' would give all False).
# Comparing against the label directly also avoids astype(bool) on strings,
# which maps every non-empty string (including 'none') to True.
if not pd.api.types.is_bool_dtype(df['course']):
    df['course'] = df['course'] == 'completed'

df['course'].unique()

array([False,  True])


# Check the boolean 'course' column.
df.head()


# Count missing values per column.
df.isna().sum()

gender      0
group       0
parental    0
lunch       0
course      0
math        0
reading     0
writing     0
dtype: int64


# 5 random rows of the two selected score columns.
df[['reading','math']].sample(5)


# Summary statistics of the two selected score columns.
df[['reading','math']].describe()


# Rows with a math score of 0.
df[df['math'] == 0]


# pandas histograms (50 bins, no grid) of both scores.
df[['reading', 'math']].hist(bins=50, grid=False);


# seaborn histograms side by side: reading on the left, math on the right.
plt.figure(figsize=(12,4))
for position, column in ((121, 'reading'), (122, 'math')):
    sns.histplot(df[[column]], bins=50, ax=plt.subplot(position), palette=palette)


# Both scores overlaid in one histogram.
sns.histplot(df[['reading', 'math']], bins=50, palette=palette);


# Kernel density estimates of both scores with pandas.
df[['reading', 'math']].plot.kde()
plt.title('KDEs');


# Overlaid histograms again, now with a KDE curve on top.
sns.histplot(df[['reading', 'math']], bins=50, kde=True, palette=palette);


# pandas boxplot of both scores.
df[['reading', 'math']].boxplot();


# The same comparison with seaborn's boxplot.
sns.boxplot(data=df[['reading', 'math']], palette=palette);


# ... with seaborn's boxenplot.
sns.boxenplot(data=df[['reading', 'math']], palette=palette);


# ... and with seaborn's violinplot.
sns.violinplot(data=df[['reading', 'math']], palette=palette);


# Scatter plot of math against reading.
df.plot.scatter(x='reading', y='math', s=10, alpha=.5, figsize=(6,5))
plt.title('reading vs math');


# Correlation matrix of the two scores (pandas' default method).
df[['reading', 'math']].corr()


# Compare the three correlation methods pandas supports.
scores = df[['reading', 'math']]
for method in ('pearson', 'kendall', 'spearman'):
    # the off-diagonal entry of the 2x2 matrix, selected by label
    coefficient = scores.corr(method=method).loc['reading', 'math']
    print(f'{method} correlation: {coefficient:.3f}')

pearson correlation: 0.818
kendall correlation: 0.617
spearman correlation: 0.804


# Boxplot of the whole dataframe.
df.boxplot();


# Correlation between all numeric/boolean columns. numeric_only=True is required
# on pandas >= 2.0, where corr() no longer silently drops text columns and
# raises on them instead.
df.corr(numeric_only=True)


# Scatter every score against the next one in the list, wrapping around so the
# last score is paired with the first.
cols = ['math', 'reading', 'writing']
for i, c1 in enumerate(cols):
    c2 = cols[(i + 1) % len(cols)]
    df.plot.scatter(x=c1, y=c2, s=10, alpha=.5)
    plt.title(f'{c1} vs {c2}')


# All pairwise relationships between the numeric columns in one figure.
sns.pairplot(df.select_dtypes('number'), palette=palette);


# 5 random rows of gender and the three scores.
df[['gender','math', 'reading', 'writing']].sample(5)


# Summary statistics of the selected columns.
df[['gender','math', 'reading', 'writing']].describe()


# Share of each gender as a pie chart.
df['gender'].value_counts(normalize=True).plot.pie(figsize=(6,6));


# Per-gender averages. numeric_only=True restricts the mean to numeric/boolean
# columns; on pandas >= 2.0 the default raises a TypeError on the text columns.
df.groupby('gender').mean(numeric_only=True)


# pandas boxplots grouped by gender.
df.groupby('gender').boxplot();


# Reshape to long format: numeric score columns become (skill, score) pairs and
# every other column is kept as an identifier.
score_cols = df.select_dtypes('number').columns.tolist()
id_vars = [c for c in df.columns if c not in score_cols]
melted = pd.melt(df, id_vars=id_vars, value_vars=score_cols, var_name='skill', value_name='score')
melted.head()


# Score by gender for each skill (one panel per skill), drawn in turn with
# three different seaborn plot functions.
for func in [sns.boxplot, sns.boxenplot, sns.violinplot]:
    g = sns.FacetGrid(melted, col="skill")
    g.map(func, 'score', 'gender', order=None, palette=palette);


# Pairwise relationships, colored by gender.
sns.pairplot(df, palette=palette, hue='gender');


# pandas kernel density estimates, grouped by gender.
df.groupby('gender').plot.kde();


# Manual one-hot encoding of gender: one 0.0/1.0 indicator column per label.
for flag_column, label in (('is_female', 'female'), ('is_male', 'male')):
    df[flag_column] = df['gender'].eq(label).astype(float)

df.head()


# Correlation of each score with the female indicator.
df[['math', 'reading', 'writing']].corrwith(df['is_female'])

math      -0.167982
reading    0.244313
writing    0.301225
dtype: float64


# Correlation of each score with the male indicator.
df[['math', 'reading', 'writing']].corrwith(df['is_male'])

math       0.167982
reading   -0.244313
writing   -0.301225
dtype: float64


# One-hot encoding of gender with pandas.
pd.get_dummies(df['gender']).head()


# Scores plus the gender indicator columns. dtype=float pins the indicators to
# 0.0/1.0: the default dtype of get_dummies differs between pandas versions
# (integer before 2.0, boolean from 2.0), and the arithmetic applied to this
# frame later should not depend on that.
df_encoded = pd.concat([df[['math', 'reading', 'writing']], pd.get_dummies(df['gender'], dtype=float)], axis=1)
df_encoded.head()


# Correlation matrix including the gender indicators.
df_encoded.corr()


# Divide every column by its range (max - min).
# NOTE(review): the minimum is not subtracted first, so this is a pure
# rescaling rather than min-max normalization — confirm that is intended.
df_normalized = df_encoded.div(df_encoded.max() - df_encoded.min())
df_normalized.head()


# Element-wise check that the rescaled and original correlation matrices agree
# (both rounded to 14 decimals before comparing).
df_normalized.corr().round(14) == df_encoded.corr().round(14)


# Annotated heatmap of the correlation matrix.
fig, ax = plt.subplots(1, 1, figsize=(8,6))
sns.heatmap(df_normalized.corr(), annot=True, fmt='.2f', cmap='Blues', ax=ax);


# Minimum math score that counts as approved.
approval_threshold = 40


# Flag students whose math score reaches the threshold, stored as 0/1.
df['approved'] = df['math'] >= approval_threshold
df['approved'] = df['approved'].astype(int)
df.head()


# Share of each approval flag as a bar chart.
df['approved'].value_counts(normalize=True).plot.bar();


# Number of students per group.
sns.countplot(data=df, x='group', palette=palette);


# Share of each approval flag as a dataframe.
df['approved'].value_counts(normalize=True).to_frame()


# Percentage of students per approval flag as a tidy two-column frame.
# The index and the values are named explicitly instead of renaming whatever
# labels value_counts() produced: those labels differ between pandas 1.x
# ('index' / 'approved') and 2.x ('approved' / 'proportion').
approved_pct = (df['approved'].value_counts(normalize=True)
                  .mul(100)
                  .rename_axis('approved')
                  .rename('%')
                  .reset_index())
sns.barplot(data=approved_pct,
            x='approved',
            y='%',
            palette=palette);


# Per-gender correlation of the selected columns with the approval flag.
df[['gender', 'course', 'reading', 'writing', 'math']].groupby('gender').corrwith(df['approved'])


# Gender shares within each approval flag, as a pandas bar chart.
df.groupby('approved')['gender'].value_counts(normalize=True).plot.bar();


# Gender percentages within each approval flag, in long format for seaborn.
# The value column is named '%' directly: the name value_counts() gives it
# depends on the pandas version ('gender' on 1.x, 'proportion' on 2.x), so
# renaming a specific old label is not portable.
tmp = (df.groupby('approved')['gender'].value_counts(normalize=True)
         .mul(100)
         .rename('%')
         .reset_index())
sns.barplot(data=tmp, x='approved', y='%', hue='gender', palette=palette);


# Absolute counts per approval flag, split by gender.
sns.countplot(data=df, x='approved', hue='gender', palette=palette);


# Overlay the math histograms of every (approved, gender) combination on a single axis.
ax = plt.subplot()
for group, g in df.groupby(['approved','gender']):
    # group is the (approved, gender) key of the current subset; it is used to build the label
    g[['math']].hist(bins=50, ax=ax, alpha=.3, label=f'{group[0]} {group[1]}');
plt.legend();


# Histogram of each score on a grid with gender as rows and approved as columns.
for score in ('math', 'reading', 'writing'):
    g = sns.FacetGrid(df, col='approved', row='gender')
    g.map(sns.histplot, score, palette=palette)


# let's repeat the three features with violinplots
grid_options = dict(col='approved', row='gender', sharex=True, sharey=True)
for feature in ['reading', 'writing', 'math']:
    g = sns.FacetGrid(df, **grid_options)
    g.map(sns.violinplot, feature, order=None, palette=palette)


# Split violins (one half per gender) of each score, one panel per approval flag.
# The constant 'dummy' column gives violinplot a single x category to split by
# gender; it is added to a copy so df itself is never modified.
violin_data = df.assign(dummy='')
# Fix the hue order so both panels color the genders identically.
gender_levels = list(df['gender'].unique())
for feature in ['reading', 'writing', 'math']:
    g = sns.FacetGrid(violin_data, col='approved', sharey=True)
    # map_dataframe passes each panel only its own subset of rows. Passing
    # data=df through map() handed the FULL dataframe to every panel, so both
    # approval panels showed the same violins.
    g.map_dataframe(sns.violinplot, x='dummy', y=feature, hue='gender',
                    hue_order=gender_levels, split=True, palette=palette)
    g.add_legend() # we want to display the gender legend
    g.set_ylabels('score')
    g.fig.subplots_adjust(top=0.8)
    g.fig.suptitle(f'feature: {feature}', fontsize=12, font='verdana')


# this should do something similar to pairplot() but without drawing histograms on the diagonal
g = sns.PairGrid(df)
g.map(sns.scatterplot);


# Drop the manual gender indicator columns before building the next grid.
del df['is_female']
del df['is_male']


# Create a cubehelix colormap to use with kdeplot
cmap = sns.cubehelix_palette(start=0, light=.95, as_cmap=True)
g = sns.PairGrid(df, diag_sharey=False)
# filled KDE plots above and below the diagonal, using the colormap
g.map_upper(sns.kdeplot, cmap=cmap, fill=True)
g.map_lower(sns.kdeplot, cmap=cmap, fill=True)
# filled KDE plots on the diagonal, in a fixed color
g.map_diag(sns.kdeplot, color='#aa0000', fill=True);

	x	y	quartet
1	10	8.04	I
2	8	6.95	I
3	13	7.58	I
4	9	8.81	I
5	11	8.33	I

	x		y
	mean	std	mean	std
quartet
I	9	3.317	7.501	2.032
II	9	3.317	7.501	2.032
III	9	3.317	7.500	2.030
IV	9	3.317	7.501	2.031

	x	y	quartet
1	10	8.04	I
2	8	6.95	I
3	13	7.58	I
4	9	8.81	I
5	11	8.33	I
6	14	9.96	I
7	6	7.24	I
8	4	4.26	I
9	12	10.84	I
10	7	4.82	I
11	5	5.68	I

	quartet	variable	value
0	I	x	10.00
1	I	x	8.00
2	I	x	13.00
3	I	x	9.00
4	I	x	11.00
...	...	...	...
83	IV	y	5.25
84	IV	y	12.50
85	IV	y	5.56
86	IV	y	7.91
87	IV	y	6.89

	gender	group	parental	lunch	course	math	reading	writing
0	female	B	bachelor's degree	standard	none	72	72	74
1	female	C	some college	standard	completed	69	90	88
2	female	B	master's degree	standard	none	90	95	93
3	male	A	associate's degree	free/reduced	none	47	57	44
4	male	C	some college	standard	none	76	78	75

CS109A Introduction to Data Science

Lab 2: EDA with Pandas (+seaborn)¶

Load data¶

Exploration¶

Descriptive Statistics¶

Plot or not to plot?¶

BoxPlot¶

Histograms¶

FacetGrid¶

Scatter plots¶

Line plots¶

All in one¶

Lineplots with seaborn¶

Let's use a different dataset¶

Let's simplify the dataframe¶

Missing values¶

Histograms for our selected variables¶

Histograms for our selected variables (seaborn)¶

Kernel Density Estimate¶

BoxPlot¶

Scatter to the rescue¶

Correlation¶

Boxplot on the whole dataframe¶

Correlation between all variables¶

Pie plot¶

One Hot Encoding¶

Heatmap¶

Who will approve?¶

Years in a cell¶

PairGrid¶

Summary¶

Some important things about using SEABORN with PANDAS!¶

	x	y	quartet
5	11	8.33	I
10	7	4.82	I
3	13	8.74	II
7	6	6.13	II
10	7	6.42	III
2	8	6.77	III
10	8	7.91	IV
4	8	8.84	IV

	x	y	quartet
1	10	9.14	II
2	8	8.14	II
3	13	8.74	II
4	9	8.77	II
5	11	9.26	II
6	14	8.10	II
7	6	6.13	II
8	4	3.10	II
9	12	9.13	II
10	7	7.26	II
11	5	4.74	II

	x	y	quartet
1	10	7.46	III
2	8	6.77	III
3	13	12.74	III
4	9	7.11	III
5	11	7.81	III
6	14	8.84	III
7	6	6.08	III
8	4	5.39	III
9	12	8.15	III
10	7	6.42	III
11	5	5.73	III

	x	y	quartet
1	8	6.58	IV
2	8	5.76	IV
3	8	7.71	IV
4	8	8.84	IV
5	8	8.47	IV
6	8	7.04	IV
7	8	5.25	IV
8	19	12.50	IV
9	8	5.56	IV
10	8	7.91	IV
11	8	6.89	IV

	gender	group	parental	lunch	course	math	reading	writing
0	female	group B	bachelor's degree	standard	none	72	72	74
1	female	group C	some college	standard	completed	69	90	88
2	female	group B	master's degree	standard	none	90	95	93
3	male	group A	associate's degree	free/reduced	none	47	57	44
4	male	group C	some college	standard	none	76	78	75

	reading	math
count	1000.000000	1000.00000
mean	69.169000	66.08900
std	14.600192	15.16308
min	17.000000	0.00000
25%	59.000000	57.00000
50%	70.000000	66.00000
75%	79.000000	77.00000
max	100.000000	100.00000

	course	math	reading	writing
course	1.000000	0.177702	0.241780	0.312946
math	0.177702	1.000000	0.817580	0.802642
reading	0.241780	0.817580	1.000000	0.954598
writing	0.312946	0.802642	0.954598	1.000000

	course	math	reading	writing
gender
female	0.355212	63.633205	72.608108	72.467181
male	0.360996	68.728216	65.473029	63.311203

	math	reading	writing	female	male
0	0.72	0.867470	0.822222	1.0	0.0
1	0.69	1.084337	0.977778	1.0	0.0
2	0.90	1.144578	1.033333	1.0	0.0
3	0.47	0.686747	0.488889	0.0	1.0
4	0.76	0.939759	0.833333	0.0	1.0

	math	reading	writing	female	male
math	True	True	True	True	True
reading	True	True	True	True	True
writing	True	True	True	True	True
female	True	True	True	True	True
male	True	True	True	True	True

	reading	math
340	61	58
370	77	84
186	76	80
687	78	77
499	71	76

	reading	math
reading	1.00000	0.81758
math	0.81758	1.00000

	approved
1	0.96
0	0.04

	course	reading	writing	math
gender
female	0.102233	0.513812	0.549594	0.548292
male	0.071767	0.317447	0.331337	0.339371