Key Word(s): pandas
CS109A Introduction to PANDAS
Lecture 2, Exercise 2: PANDAS Intro 2¶
Harvard University
Fall 2021
Instructors: Pavlos Protopapas and Natesh Pillai
Exercise 2: PANDAS Intro 2
Let's get some more practice with a few of the core PANDAS
functions.
import pandas as pd
We'll continue working with the StudentsPerformance dataset from Exercise 1.
# import the CSV file
df = pd.read_csv("StudentsPerformance.csv")
df.head()
gender | race/ethnicity | parental level of education | lunch | test preparation course | math score | reading score | writing score | |
---|---|---|---|---|---|---|---|---|
0 | female | group B | bachelor's degree | standard | none | 72 | 72 | 74 |
1 | female | group C | some college | standard | completed | 69 | 90 | 88 |
2 | female | group B | master's degree | standard | none | 90 | 95 | 93 |
3 | male | group A | associate's degree | free/reduced | none | 47 | 57 | 44 |
4 | male | group C | some college | standard | none | 76 | 78 | 75 |
Indexing - iloc and loc¶
It's very important to understand the differences between loc and iloc. Looking at the code in the next cell, one might think they do the same thing.
(When you query just one row you obtain an object whose name is the index of the selected row.)
df.iloc[10] == df.loc[10]
gender True race/ethnicity True parental level of education True lunch True test preparation course True math score True reading score True writing score True Name: 10, dtype: bool
all(df.iloc[10] == df.loc[10])
True
The first time we loaded the CSV into a DataFrame we didn't tell pandas to interpret a specific column as an index, so pandas created an index for us. Whether the ordering imposed on our data by this index should be respected is a matter on which iloc and loc disagree.
To really learn the difference between iloc and loc we need to shuffle the rows.
To do that we can use the sample method to take a new sample of the DataFrame's original size (frac=1 means the sample contains 100% of the rows).
_Bonus: Stop and consider: what is the purpose of setting a random_state when we call the sample method?_
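As a quick aside on that bonus question (a toy sketch on a hypothetical frame, separate from the notebook's df): passing the same random_state makes the shuffle reproducible, so you, your graders, and your collaborators all see the same row order.

```python
import pandas as pd

toy = pd.DataFrame({"x": range(5)})

# The same seed yields the same shuffle order on every run
s1 = toy.sample(frac=1, random_state=109)
s2 = toy.sample(frac=1, random_state=109)
print(s1.index.tolist() == s2.index.tolist())  # True
```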
df = df.sample(frac=1, random_state=109)
df
gender | race/ethnicity | parental level of education | lunch | test preparation course | math score | reading score | writing score | |
---|---|---|---|---|---|---|---|---|
301 | male | group D | some high school | free/reduced | none | 56 | 54 | 52 |
895 | female | group E | some high school | free/reduced | none | 32 | 34 | 38 |
763 | female | group B | high school | standard | none | 62 | 62 | 63 |
854 | male | group C | some high school | standard | none | 62 | 64 | 55 |
49 | male | group C | high school | standard | completed | 82 | 84 | 82 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
399 | male | group D | some high school | standard | none | 60 | 59 | 54 |
141 | female | group C | some college | free/reduced | none | 59 | 62 | 64 |
757 | male | group E | bachelor's degree | free/reduced | completed | 70 | 68 | 72 |
245 | male | group C | associate's degree | standard | none | 85 | 76 | 71 |
262 | female | group C | some high school | free/reduced | none | 44 | 50 | 51 |
1000 rows × 8 columns
Now let's repeat our code from earlier.
df.iloc[10] == df.loc[10]
gender False race/ethnicity False parental level of education False lunch True test preparation course False math score False reading score False writing score False dtype: bool
all(df.iloc[10] == df.loc[10])
False
It turns out that loc filters by index value (something like where df.index == value).
That is, loc's results depend only on the indices (which are now scrambled after sampling). The actual positions of elements in the DataFrame are ignored.
df.loc[10]
gender male race/ethnicity group C parental level of education associate's degree lunch standard test preparation course none math score 58 reading score 54 writing score 52 Name: 10, dtype: object
By contrast, iloc filters by row position (something like where df.index == df.index[value]).
So iloc's results depend on the actual positions of elements in a pandas data structure. The indices of these elements are ignored.
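The distinction shows up clearly on a tiny toy frame (an illustrative sketch, not the notebook's df) whose index labels are deliberately out of order, mimicking a shuffled DataFrame:

```python
import pandas as pd

# Index labels deliberately out of positional order
toy = pd.DataFrame({"word": ["a", "b", "c"]}, index=[2, 0, 1])

print(toy.loc[0, "word"])    # 'b' -- matches the index *label* 0
print(toy.iloc[0]["word"])   # 'a' -- matches the *position* 0
```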
It's this difference that explains counterintuitive results like this:
df.index[10]
342
Consider selecting a single row with iloc:
df.iloc[10]
gender female race/ethnicity group B parental level of education high school lunch standard test preparation course completed math score 69 reading score 76 writing score 74 Name: 342, dtype: object
And take note of where you can find the index in output formatted like this.
Enough chat. Time for...
Exercise
In the cell below, fill in the blank so that the row5 variable stores the 5th row of df. To be clear, imagine our DataFrame looked as follows:
Index | Words |
---|---|
0 | this |
1 | is |
2 | not |
3 | easy |
We'd say the 1st row is the one with the word this, the 2nd row is the one with the word is, the 3rd row is the one with the word not, etc.
### edTest(test_a) ###
row5 = df.iloc[4]
row5
gender male race/ethnicity group C parental level of education high school lunch standard test preparation course completed math score 82 reading score 84 writing score 82 Name: 49, dtype: object
You can display the first rows to have a better understanding of what you did. Can you find the row you've just selected?
df.iloc[:5]
gender | race/ethnicity | parental level of education | lunch | test preparation course | math score | reading score | writing score | |
---|---|---|---|---|---|---|---|---|
301 | male | group D | some high school | free/reduced | none | 56 | 54 | 52 |
895 | female | group E | some high school | free/reduced | none | 32 | 34 | 38 |
763 | female | group B | high school | standard | none | 62 | 62 | 63 |
854 | male | group C | some high school | standard | none | 62 | 64 | 55 |
49 | male | group C | high school | standard | completed | 82 | 84 | 82 |
Notice how we can use familiar Python slice notation with iloc and loc!
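One subtlety worth noting when slicing (shown on a toy frame, not the notebook's df): iloc slices by position and excludes the stop, just like plain Python slices, while loc slices by label and includes the stop.

```python
import pandas as pd

toy = pd.DataFrame({"x": [10, 20, 30, 40]})  # default integer index 0..3

print(len(toy.iloc[0:2]))  # 2 -- positions 0 and 1, stop excluded
print(len(toy.loc[0:2]))   # 3 -- labels 0, 1 and 2, stop included
```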
Sorting¶
We scrambled our df earlier with sample. We should also know how to tidy things up.
Exercise
In the cell below, fill in the blank so that sorted_df stores df after sorting it by the math score column in decreasing order (HINT).
### edTest(test_b) ###
sorted_df = df.sort_values(by='math score', ascending=False)
sorted_df
gender | race/ethnicity | parental level of education | lunch | test preparation course | math score | reading score | writing score | |
---|---|---|---|---|---|---|---|---|
623 | male | group A | some college | standard | completed | 100 | 96 | 86 |
625 | male | group D | some college | standard | completed | 100 | 97 | 99 |
149 | male | group E | associate's degree | free/reduced | completed | 100 | 100 | 93 |
458 | female | group E | bachelor's degree | standard | none | 100 | 100 | 100 |
962 | female | group E | associate's degree | standard | none | 100 | 100 | 100 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
145 | female | group C | some college | free/reduced | none | 22 | 39 | 33 |
787 | female | group B | some college | standard | none | 19 | 38 | 32 |
17 | female | group B | some high school | free/reduced | none | 18 | 32 | 28 |
980 | female | group B | high school | free/reduced | none | 8 | 24 | 23 |
59 | female | group C | some high school | free/reduced | none | 0 | 17 | 10 |
1000 rows × 8 columns
Exercise
In the cell below, fill in the blank so that sorted_row5 stores the 5th row of sorted_df. To be clear, imagine our sorted DataFrame looked as follows:
Index | Words |
---|---|
3 | easy |
1 | is |
2 | not |
0 | this |
We'd say the 1st row is the one with the word easy, the 2nd row is the one with the word is, the 3rd row is the one with the word not, etc.
### edTest(test_c) ###
sorted_row5 = sorted_df.iloc[4]
sorted_row5
gender female race/ethnicity group E parental level of education associate's degree lunch standard test preparation course none math score 100 reading score 100 writing score 100 Name: 962, dtype: object
Can you find the row you've just selected?
# len('head()') < len('iloc[:5]') :)
sorted_df.head()
gender | race/ethnicity | parental level of education | lunch | test preparation course | math score | reading score | writing score | |
---|---|---|---|---|---|---|---|---|
623 | male | group A | some college | standard | completed | 100 | 96 | 86 |
625 | male | group D | some college | standard | completed | 100 | 97 | 99 |
149 | male | group E | associate's degree | free/reduced | completed | 100 | 100 | 93 |
458 | female | group E | bachelor's degree | standard | none | 100 | 100 | 100 |
962 | female | group E | associate's degree | standard | none | 100 | 100 | 100 |
Column Naming Conventions¶
How you've named your df columns can affect the amount of typing required to manipulate your data, the readability of your code, and even the syntax options you have. Let's take a look at some best practices for naming columns.
df
gender | race/ethnicity | parental level of education | lunch | test preparation course | math score | reading score | writing score | |
---|---|---|---|---|---|---|---|---|
301 | male | group D | some high school | free/reduced | none | 56 | 54 | 52 |
895 | female | group E | some high school | free/reduced | none | 32 | 34 | 38 |
763 | female | group B | high school | standard | none | 62 | 62 | 63 |
854 | male | group C | some high school | standard | none | 62 | 64 | 55 |
49 | male | group C | high school | standard | completed | 82 | 84 | 82 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
399 | male | group D | some high school | standard | none | 60 | 59 | 54 |
141 | female | group C | some college | free/reduced | none | 59 | 62 | 64 |
757 | male | group E | bachelor's degree | free/reduced | completed | 70 | 68 | 72 |
245 | male | group C | associate's degree | standard | none | 85 | 76 | 71 |
262 | female | group C | some high school | free/reduced | none | 44 | 50 | 51 |
1000 rows × 8 columns
We can see that some column names have lengths that are not comfortable for coding. What we can do to make our life easier:
- Work with short column names
- Avoid characters like spaces, so that we can access columns without brackets
- Work in only lower or upper case (preferably lower case; there's no need to shout)
Our df already conforms to this last suggestion.
We'll find that, after some renaming, things become easier for us. Expressions like this one:
condition = (df['test preparation course'] != 'completed') & (df['writing score'] > df['writing score'].median())
become:
condition = (df.course != 'completed') & (df.writing > df.writing.median())
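One caveat about attribute-style access (a sketch on a toy frame, not the notebook's df): it only works when the column name is a valid Python identifier that doesn't shadow an existing DataFrame attribute. Spaces force brackets, and so do names like count that collide with DataFrame methods.

```python
import pandas as pd

toy = pd.DataFrame({"math score": [70], "count": [1]})

# A space forces bracket access; `toy.math score` would be a SyntaxError
print(toy["math score"].iloc[0])  # 70

# A column named like an existing attribute still needs brackets:
# toy.count resolves to the DataFrame.count *method*, not the column
print(callable(toy.count))        # True
print(toy["count"].iloc[0])       # 1
```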
Exercise
In the cell below fill in the blank with these goals in mind:
- rename column race/ethnicity to race
- rename column parental level of education to peduc
- rename column test preparation course to course
- remove score (including the leading space) from the rest of the columns
HINT1: Don't be shy; check the documentation if you need some help.
HINT2: In many cases, it's faster to access docstrings using help:
help(df.rename)
HINT3: TAB autocomplete can be used to explore an object's available methods and attributes.
HINT4: Still more exciting, place your cursor after the opening parenthesis of a function call and press SHIFT+TAB once or twice. Instantly, you're presented with a docstring. It's a whole new world! 🌈
df.columns
Index(['gender', 'race/ethnicity', 'parental level of education', 'lunch', 'test preparation course', 'math score', 'reading score', 'writing score'], dtype='object')
### edTest(test_d) ###
df = df.rename(columns={'race/ethnicity': 'race',
                        'parental level of education': 'peduc',
                        'test preparation course': 'course',
                        'math score': 'math',
                        'reading score': 'reading',
                        'writing score': 'writing'})
df
gender | race | peduc | lunch | course | math | reading | writing | |
---|---|---|---|---|---|---|---|---|
301 | male | group D | some high school | free/reduced | none | 56 | 54 | 52 |
895 | female | group E | some high school | free/reduced | none | 32 | 34 | 38 |
763 | female | group B | high school | standard | none | 62 | 62 | 63 |
854 | male | group C | some high school | standard | none | 62 | 64 | 55 |
49 | male | group C | high school | standard | completed | 82 | 84 | 82 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
399 | male | group D | some high school | standard | none | 60 | 59 | 54 |
141 | female | group C | some college | free/reduced | none | 59 | 62 | 64 |
757 | male | group E | bachelor's degree | free/reduced | completed | 70 | 68 | 72 |
245 | male | group C | associate's degree | standard | none | 85 | 76 | 71 |
262 | female | group C | some high school | free/reduced | none | 44 | 50 | 51 |
1000 rows × 8 columns
Look for missing values¶
Missing values in our DataFrames can cause many issues. They can cause certain operations and function calls to fail and throw an error. Perhaps worse, these problems can happen 'silently,' affecting our results without us realizing there is a problem. Unless we take precautions, of course!
The first step is locating missingness. This dataset doesn't have any missing values, so we'll make some ourselves and 'poke holes' in the DataFrame.
# 'poking holes' in the data
df.iloc[0,5] = None
df.iloc[2,2] = None
Exercise
Fill in the blank to display whether or not entries in the first 5 rows are missing (HINT)
Solution
df.isna().head()
gender | race | peduc | lunch | course | math | reading | writing | |
---|---|---|---|---|---|---|---|---|
301 | False | False | False | False | False | True | False | False |
895 | False | False | False | False | False | False | False | False |
763 | False | False | True | False | False | False | False | False |
854 | False | False | False | False | False | False | False | False |
49 | False | False | False | False | False | False | False | False |
Exercise
Fill in the blanks to sum the total number of missing values in each of the dataset's columns.
### edTest(test_e) ###
resultE = df.isna().sum()
display(resultE)
gender 0 race 0 peduc 1 lunch 0 course 0 math 1 reading 0 writing 0 dtype: int64
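Two related summaries often come in handy alongside the per-column counts (a sketch on a toy frame, not the notebook's df): the total number of missing cells, and the number of rows containing at least one missing value.

```python
import numpy as np
import pandas as pd

toy = pd.DataFrame({"a": [1, np.nan, 3], "b": [np.nan, np.nan, 6]})

print(int(toy.isna().sum().sum()))        # 3 -- total missing cells
print(int(toy.isna().any(axis=1).sum()))  # 2 -- rows with any missing value
```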
Now let's deal with these 'holes' we've just made
Exercise
Fill the missing math
entry with that column's mean.
Hint: Select subsets of the data frame using a column name or names
Note: The blanks here represent just one way of doing this. Don't feel constrained by them.
Solution
df['math'] = df['math'].fillna(df['math'].mean())
df.head()
gender | race | peduc | lunch | course | math | reading | writing | |
---|---|---|---|---|---|---|---|---|
301 | male | group D | some high school | free/reduced | none | 66.099099 | 54 | 52 |
895 | female | group E | some high school | free/reduced | none | 32.000000 | 34 | 38 |
763 | female | group B | None | standard | none | 62.000000 | 62 | 63 |
854 | male | group C | some high school | standard | none | 62.000000 | 64 | 55 |
49 | male | group C | high school | standard | completed | 82.000000 | 84 | 82 |
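A variant worth knowing (a sketch on a hypothetical toy frame, not a required part of the exercise): instead of the global mean, fill each missing value with the mean of the row's own group, using transform to broadcast the group means back to the original rows.

```python
import numpy as np
import pandas as pd

toy = pd.DataFrame({"group": ["x", "x", "y", "y"],
                    "score": [10.0, np.nan, 30.0, 50.0]})

# transform("mean") returns a Series aligned with the original rows,
# holding each row's group mean
toy["score"] = toy["score"].fillna(
    toy.groupby("group")["score"].transform("mean"))
print(toy["score"].tolist())  # [10.0, 10.0, 30.0, 50.0]
```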
Exercise
Drop the row with the missing peduc
from the DataFrame.
Hint: to make it easier, consider that this is now the only remaining missing value
Solution
df = df.dropna()
df.head()
gender | race | peduc | lunch | course | math | reading | writing | |
---|---|---|---|---|---|---|---|---|
301 | male | group D | some high school | free/reduced | none | 66.099099 | 54 | 52 |
895 | female | group E | some high school | free/reduced | none | 32.000000 | 34 | 38 |
854 | male | group C | some high school | standard | none | 62.000000 | 64 | 55 |
49 | male | group C | high school | standard | completed | 82.000000 | 84 | 82 |
790 | female | group B | high school | standard | none | 48.000000 | 62 | 60 |
Categorical Columns - nunique() and unique()¶
If some of your data is categorical (i.e., taking on a discrete set of values) you'll often want to know exactly how many unique values you're dealing with. nunique and unique are here to help and can be used on a single column or across multiple columns.
Please consider: unique will return all unique values. What if you ask for the uniques of a column with 1 million different values? 🤔
This particular method should be used wisely.
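One safer pattern when a column might have many distinct values (a sketch, assuming you only need the most frequent ones): value_counts() sorts by frequency, so chaining .head() gives a bounded peek regardless of how many uniques exist.

```python
import pandas as pd

s = pd.Series(["a", "b", "a", "c", "a", "b"])

print(s.nunique())                              # 3
print(s.value_counts().head(2).index.tolist())  # ['a', 'b'] -- two most frequent
```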
df.nunique()
gender 2 race 5 peduc 6 lunch 2 course 2 math 82 reading 72 writing 77 dtype: int64
df['gender'].unique().tolist()
['male', 'female']
# the line below represents the usage that should be avoided
# df['math'].unique()
Exercise
Fill in the blanks using unique()
and nunique()
to complete the method print_uniques
to help us to learn more about the categorical variables in our data.
### edTest(test_f) ###
def print_uniques(df, col, limit=10):
    """Print a column's unique values when the number of uniques is at most limit."""
    n = df[col].nunique()
    if n <= limit:
        print(f'{col}:', df[col].unique().tolist())
    else:
        print(f'{col}:', f'more than {limit} uniques')
for col in df.columns:
print_uniques(df, col)
gender: ['male', 'female'] race: ['group D', 'group E', 'group C', 'group B', 'group A'] peduc: ['some high school', 'high school', "bachelor's degree", 'some college', "associate's degree", "master's degree"] lunch: ['free/reduced', 'standard'] course: ['none', 'completed'] math: more than 10 uniques reading: more than 10 uniques writing: more than 10 uniques
Descriptive statistics¶
In Pandas, DataFrames have a simple method for displaying summary statistics.
df.describe()
math | reading | writing | |
---|---|---|---|
count | 999.000000 | 999.000000 | 999.000000 |
mean | 66.103202 | 69.176176 | 68.059059 |
std | 15.166754 | 14.605740 | 15.202426 |
min | 0.000000 | 17.000000 | 10.000000 |
25% | 57.000000 | 59.000000 | 57.500000 |
50% | 66.000000 | 70.000000 | 69.000000 |
75% | 77.000000 | 79.000000 | 79.000000 |
max | 100.000000 | 100.000000 | 100.000000 |
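Note that describe() defaults to numeric columns, which is why the string columns are absent above. On a toy frame (an illustrative sketch, not the notebook's df), passing include="object" summarizes the categorical columns instead:

```python
import pandas as pd

toy = pd.DataFrame({"grade": ["a", "b", "a"], "score": [1, 2, 3]})

# Numeric-only by default
print(toy.describe().columns.tolist())                     # ['score']

# include="object" reports count, unique, top (most frequent), and freq
print(toy.describe(include="object").loc["top", "grade"])  # 'a'
```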
Exercise
Sometimes we don't want to access all these statistics. In the cell below fill in the blanks to get the mean and the standard deviation of the writing
and reading
columns (HINT).
### edTest(test_g) ###
resultG = df[['writing', 'reading']].aggregate(['mean', 'std'])
display(resultG)
writing | reading | |
---|---|---|
mean | 68.059059 | 69.176176 |
std | 15.202426 | 14.605740 |
Now we can group our DataFrame by the values of a specific column (or columns) and aggregate the other columns.
Hint: Try using agg() as an alternative to aggregate() to spare some typing.
Exercise
Group the dataframe by peduc and gender while aggregating math, reading, and writing with the mean and course with the mode.
Tip: Again, don't feel constrained by the blanks. This command may span multiple lines.
df.groupby(['peduc', 'gender']).agg({'math': 'mean',
'reading': 'mean',
'writing': 'mean'})
math | reading | writing | ||
---|---|---|---|---|
peduc | gender | |||
associate's degree | female | 65.250000 | 74.120690 | 74.000000 |
male | 70.764151 | 67.433962 | 65.405660 | |
bachelor's degree | female | 68.349206 | 77.285714 | 78.380952 |
male | 70.581818 | 68.090909 | 67.654545 | |
high school | female | 59.322581 | 68.268817 | 66.731183 |
male | 64.705882 | 61.480392 | 58.539216 | |
master's degree | female | 66.500000 | 76.805556 | 77.638889 |
male | 74.826087 | 73.130435 | 72.608696 | |
some college | female | 65.406780 | 73.550847 | 74.050847 |
male | 69.009259 | 64.990741 | 63.148148 | |
some high school | female | 59.296703 | 69.109890 | 68.285714 |
male | 67.955672 | 64.693182 | 61.375000 |
We would likely want to treat peduc as an 'ordinal' rather than a categorical variable, as it does have an inherent ordering. Feel free to try using indexing to sort the rows. But after the grouping we now have multiple indices for each row!
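If that MultiIndex gets in the way, one common fix (a sketch on a toy frame, not the notebook's df) is reset_index, which turns the group keys back into ordinary columns:

```python
import pandas as pd

toy = pd.DataFrame({"g": ["x", "x", "y"], "s": ["m", "f", "m"], "v": [1, 2, 3]})
grouped = toy.groupby(["g", "s"]).agg({"v": "mean"})

print(type(grouped.index).__name__)  # 'MultiIndex'

flat = grouped.reset_index()         # group keys become columns again
print(flat.columns.tolist())         # ['g', 's', 'v']
```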
There's still more Pandas to discover 🐼