Key Word(s): pandas
CS109a Introduction to PANDAS
Lecture 1, Pandas Intro¶
Harvard University
Fall 2021
Instructors: Pavlos Protopapas and Natesh Pillai
Pandas¶
PANDAS is a Python library that provides highly useful data structures, including the DataFrame, which make Exploratory Data Analysis (EDA) easy. Here we will see some of its elementary functions in practice.
Installing¶
Using conda
conda install pandas
Using pip
pip install pandas
TIP: You can install a library from a Jupyter notebook cell by prefixing the shell command with "!"
# using conda
!conda install pandas
# or using pip
!pip install pandas
PANDAS Basics¶
Let's get started with basic functionality of PANDAS!
Importing pandas¶
Importing pandas is as simple as the following line:
import pandas
But for brevity and convenience we usually import it as pd:
import pandas as pd
You can check the version of almost any library using __version__:
pd.__version__
'1.3.2'
Pandas data structures¶
The main data structures in pandas are the Series (useful for time series) and the DataFrame.
- Series
- Formal: One-dimensional ndarray with axis labels (including time series).
- Roughly: you can think of it as a spreadsheet column or a one-column relational database table
- DataFrame
- Formal: Two-dimensional, size-mutable, potentially heterogeneous tabular data.
- Roughly: analogous to a relational database table, where every column of a DataFrame is a Series.
Both DataFrames and Series always have an index.
pd.Series¶
pd.Series(data=None, index=None, dtype=None, name=None, copy=False)
When we don't provide an index, pandas creates one for us:
s1 = pd.Series(range(0, 50, 10))
s1
0 0
1 10
2 20
3 30
4 40
dtype: int64
The data can be strings, not just numbers.
The index can be anything, but the data and index must have the same length.
s = pd.Series(data=['A', 'B', 'C', 'D', 'E'], index=range(10, 5, -1))
s
10    A
9     B
8     C
7     D
6     E
dtype: object
We can independently access the series' values or its index
s.values
array(['A', 'B', 'C', 'D', 'E'], dtype=object)
s.index
RangeIndex(start=10, stop=5, step=-1)
pd.DataFrame¶
pd.DataFrame(data=None, index=None, columns=None, dtype=None, copy=None)
This data structure also contains labeled axes (rows and columns).
index | First Name | Last Name |
---|---|---|
0 | Ann | Gatton |
1 | John | Fosa |
2 | Zack | Kaufman |
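As a minimal sketch (reusing the names from the table above), one way to build that small DataFrame is:
# build the First Name / Last Name table shown above from a dict of columns
pd.DataFrame(data={'First Name': ['Ann', 'John', 'Zack'],
                   'Last Name': ['Gatton', 'Fosa', 'Kaufman']})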
The DataFrame class offers powerful ways to create DataFrames. For instance, the two code lines below generate the same DataFrame object.
# using rows
pd.DataFrame(data=[[1,2], [3,4], [5,6]], columns=['A','B'])
# using columns
pd.DataFrame(data={'A':[1,3,5], 'B': [2,4,6]})
A | B | |
---|---|---|
0 | 1 | 2 |
1 | 3 | 4 |
2 | 5 | 6 |
Loading data¶
It's possible to create DataFrames by hand, but usually we read data from external sources. This is easy to do in pandas, which offers a read_* function for many formats:
tpl = 'https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.{}.html'
for m in ['clipboard', 'csv', 'excel', 'feather', 'fwf', 'gbq',
'hdf', 'html', 'json', 'parquet', 'pickle', 'spss',
'sql', 'sql_query', 'sql_table', 'stata', 'table', 'xml']:
method = f'read_{m}'
url = tpl.format(method)
print(f'{method}\t{url}')
read_clipboard	https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_clipboard.html
read_csv	https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html
read_excel	https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_excel.html
read_feather	https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_feather.html
read_fwf	https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_fwf.html
read_gbq	https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_gbq.html
read_hdf	https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_hdf.html
read_html	https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_html.html
read_json	https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_json.html
read_parquet	https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_parquet.html
read_pickle	https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_pickle.html
read_spss	https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_spss.html
read_sql	https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_sql.html
read_sql_query	https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_sql_query.html
read_sql_table	https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_sql_table.html
read_stata	https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_stata.html
read_table	https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_table.html
read_xml	https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_xml.html
Example
The method read_html is powerful but requires a bit of experience.
- The first line fetches the URL and extracts every HTML table that matches the criterion into a list of DataFrames; we keep the first one.
- The header comes in as the first row of the DataFrame, so in the second line we use the first row's values as column names and then remove that row.
df = pd.read_html('https://en.wikipedia.org/wiki/Harvard_University', match='School')[0]
df
0 | 1 | |
---|---|---|
0 | School | Founded |
1 | Harvard College | 1636 |
2 | Medicine | 1782 |
3 | Divinity | 1816 |
4 | Law | 1817 |
5 | Dental Medicine | 1867 |
6 | Arts and Sciences | 1872 |
7 | Business | 1908 |
8 | Extension | 1910 |
9 | Design | 1914 |
10 | Education | 1920 |
11 | Public Health | 1922 |
12 | Government | 1936 |
13 | Engineering and Applied Sciences | 2007 |
df = pd.read_html('https://en.wikipedia.org/wiki/Harvard_University', match='School')[0]
df = df.rename(columns=df.iloc[0])[1:]
df
School | Founded | |
---|---|---|
1 | Harvard College | 1636 |
2 | Medicine | 1782 |
3 | Divinity | 1816 |
4 | Law | 1817 |
5 | Dental Medicine | 1867 |
6 | Arts and Sciences | 1872 |
7 | Business | 1908 |
8 | Extension | 1910 |
9 | Design | 1914 |
10 | Education | 1920 |
11 | Public Health | 1922 |
12 | Government | 1936 |
13 | Engineering and Applied Sciences | 2007 |
pd.read_csv¶
read_csv is the recommended starting point for anyone learning pandas. You can read its docs at https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html.
Let's use it to load Avocado prices¶
It is a well-known fact that Millennials LOVE Avocado Toast. It's also a well-known fact that all Millennials live in their parents' basements.
Clearly, they aren't buying homes because they are buying too much Avocado Toast!
But maybe there's hope… if a Millennial could find a city with cheap avocados, they could live out the Millennial American Dream.
The table below represents weekly 2018 retail scan data for National retail volume (units) and price. Retail scan data comes directly from retailers’ cash registers based on actual retail sales of Hass avocados. Starting in 2013, the table below reflects an expanded, multi-outlet retail data set. Multi-outlet reporting includes an aggregation of the following channels: grocery, mass, club, drug, dollar and military. The Average Price (of avocados) in the table reflects a per unit (per avocado) cost, even when multiple units (avocados) are sold in bags. The Product Lookup codes (PLU’s) in the table are only for Hass avocados. Other varieties of avocados (e.g. greenskins) are not included in this table.
Some relevant columns in the dataset:
- Date: The date of the observation
- AveragePrice: the average price of a single avocado
- type: conventional or organic
- year: the year
- Region: the city or region of the observation
- Total Volume: Total number of avocados sold
- 4046: Total number of avocados with PLU 4046 sold
- 4225: Total number of avocados with PLU 4225 sold
- 4770: Total number of avocados with PLU 4770 sold
Load dataset¶
Read a compressed CSV file. We ask pandas to use the first CSV column as the index, so it doesn't create a new one by default.
TIP: when you don't know what you are loading, or you already know it is a big dataset, you can limit the number of rows loaded with the parameter nrows (the default, nrows=None, loads everything).
df = pd.read_csv('avocado.csv.zip', index_col=0, compression='zip', nrows=None)
Roughly exploring the data¶
We can quickly see the DataFrame's dimensions
df.shape
(18249, 13)
The shape is a tuple with the number of rows and the number of columns
len(df.index), len(df.columns)
(18249, 13)
Show only the columns' names
df.columns
Index(['Date', 'AveragePrice', 'Total Volume', '4046', '4225', '4770', 'Total Bags', 'Small Bags', 'Large Bags', 'XLarge Bags', 'type', 'year', 'region'], dtype='object')
The columns attribute is not a Python list; it is a pandas Index.
type(df.columns) == pd.Index
True
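If you need a plain Python list of the column names, you can convert the Index — a quick sketch:
# convert the pandas Index into a regular Python list
df.columns.tolist()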
Show only the index
df.index
Int64Index([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, ... 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], dtype='int64', length=18249)
Sometimes a column's type is incorrect; possible ways to detect this are df.info() and df.dtypes.
df.dtypes
Date             object
AveragePrice    float64
Total Volume    float64
4046            float64
4225            float64
4770            float64
Total Bags      float64
Small Bags      float64
Large Bags      float64
XLarge Bags     float64
type             object
year              int64
region           object
dtype: object
For example, here Date is an object (the way pandas stores strings). We can use a more suitable column type for it.
df['Date'] = pd.to_datetime(df['Date'])
df.dtypes
Date            datetime64[ns]
AveragePrice           float64
Total Volume           float64
4046                   float64
4225                   float64
4770                   float64
Total Bags             float64
Small Bags             float64
Large Bags             float64
XLarge Bags            float64
type                    object
year                     int64
region                  object
dtype: object
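A datetime column also unlocks the .dt accessor for date-based work; as a small illustrative sketch:
# extract the year from each date (this simply mirrors the existing 'year' column)
df['Date'].dt.year.head()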
show first (by default: 5) rows
df.head()
Date | AveragePrice | Total Volume | 4046 | 4225 | 4770 | Total Bags | Small Bags | Large Bags | XLarge Bags | type | year | region | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2015-12-27 | 1.33 | 64236.62 | 1036.74 | 54454.85 | 48.16 | 8696.87 | 8603.62 | 93.25 | 0.0 | conventional | 2015 | Albany |
1 | 2015-12-20 | 1.35 | 54876.98 | 674.28 | 44638.81 | 58.33 | 9505.56 | 9408.07 | 97.49 | 0.0 | conventional | 2015 | Albany |
2 | 2015-12-13 | 0.93 | 118220.22 | 794.70 | 109149.67 | 130.50 | 8145.35 | 8042.21 | 103.14 | 0.0 | conventional | 2015 | Albany |
3 | 2015-12-06 | 1.08 | 78992.15 | 1132.00 | 71976.41 | 72.58 | 5811.16 | 5677.40 | 133.76 | 0.0 | conventional | 2015 | Albany |
4 | 2015-11-29 | 1.28 | 51039.60 | 941.48 | 43838.39 | 75.78 | 6183.95 | 5986.26 | 197.69 | 0.0 | conventional | 2015 | Albany |
show last 2 rows
df.tail(2)
Date | AveragePrice | Total Volume | 4046 | 4225 | 4770 | Total Bags | Small Bags | Large Bags | XLarge Bags | type | year | region | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
10 | 2018-01-14 | 1.93 | 16205.22 | 1527.63 | 2981.04 | 727.01 | 10969.54 | 10919.54 | 50.00 | 0.0 | organic | 2018 | WestTexNewMexico |
11 | 2018-01-07 | 1.62 | 17489.58 | 2894.77 | 2356.13 | 224.53 | 12014.15 | 11988.14 | 26.01 | 0.0 | organic | 2018 | WestTexNewMexico |
display some data info
Sometimes the DataFrame method info() is a great way to take a first snapshot of a dataset with few columns. It displays:
- column names
- number of rows (as entries)
- number of non null values
- data type per column (per Series)
- memory usage
TIP: if you know the number of columns is high (for example after checking df.shape[1]), you can pass verbose=False to info() to reduce the output to global information only.
few_columns = True
df.info(verbose=few_columns)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 18249 entries, 0 to 11
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   Date          18249 non-null  datetime64[ns]
 1   AveragePrice  18249 non-null  float64
 2   Total Volume  18249 non-null  float64
 3   4046          18249 non-null  float64
 4   4225          18249 non-null  float64
 5   4770          18249 non-null  float64
 6   Total Bags    18249 non-null  float64
 7   Small Bags    18249 non-null  float64
 8   Large Bags    18249 non-null  float64
 9   XLarge Bags   18249 non-null  float64
 10  type          18249 non-null  object
 11  year          18249 non-null  int64
 12  region        18249 non-null  object
dtypes: datetime64[ns](1), float64(9), int64(1), object(2)
memory usage: 1.9+ MB
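For a wide DataFrame the tip above applies: passing verbose=False keeps only the global summary. A quick sketch:
# per-column details suppressed; only class, index, dtype counts and memory usage are shown
df.info(verbose=False)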
Descriptive statistics¶
We can take a fast look at some data statistics with one line of code
df.describe()
AveragePrice | Total Volume | 4046 | 4225 | 4770 | Total Bags | Small Bags | Large Bags | XLarge Bags | year | |
---|---|---|---|---|---|---|---|---|---|---|
count | 18249.000000 | 1.824900e+04 | 1.824900e+04 | 1.824900e+04 | 1.824900e+04 | 1.824900e+04 | 1.824900e+04 | 1.824900e+04 | 18249.000000 | 18249.000000 |
mean | 1.405978 | 8.506440e+05 | 2.930084e+05 | 2.951546e+05 | 2.283974e+04 | 2.396392e+05 | 1.821947e+05 | 5.433809e+04 | 3106.426507 | 2016.147899 |
std | 0.402677 | 3.453545e+06 | 1.264989e+06 | 1.204120e+06 | 1.074641e+05 | 9.862424e+05 | 7.461785e+05 | 2.439660e+05 | 17692.894652 | 0.939938 |
min | 0.440000 | 8.456000e+01 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000 | 2015.000000 |
25% | 1.100000 | 1.083858e+04 | 8.540700e+02 | 3.008780e+03 | 0.000000e+00 | 5.088640e+03 | 2.849420e+03 | 1.274700e+02 | 0.000000 | 2015.000000 |
50% | 1.370000 | 1.073768e+05 | 8.645300e+03 | 2.906102e+04 | 1.849900e+02 | 3.974383e+04 | 2.636282e+04 | 2.647710e+03 | 0.000000 | 2016.000000 |
75% | 1.660000 | 4.329623e+05 | 1.110202e+05 | 1.502069e+05 | 6.243420e+03 | 1.107834e+05 | 8.333767e+04 | 2.202925e+04 | 132.500000 | 2017.000000 |
max | 3.250000 | 6.250565e+07 | 2.274362e+07 | 2.047057e+07 | 2.546439e+06 | 1.937313e+07 | 1.338459e+07 | 5.719097e+06 | 551693.650000 | 2018.000000 |
[] vs [[]]¶
Using a column name as the key returns the column values as a Series
# returns a Series with dataframe values for column 'my_col'
df['my_col']
# attribute access gives the same result but is not recommended: it fails when the name contains a space or other disallowed characters
df.my_col
Using a Python list of column names as the key returns a sub-DataFrame with those columns
# returns a DataFrame with the two columns
df[['my_col_A', 'my_col_B']]
# returns a one-column DataFrame with my_col_A values (note the double brackets)
df[['my_col_A']]
# this should be False because, as we just said, a column name inside single brackets returns a Series
type(df['AveragePrice']) == pd.DataFrame
False
type(df['AveragePrice']) == pd.Series
True
# this should be True because we say that a list of column names inside brackets returns a sub dataframe
type(df[['AveragePrice']]) == pd.DataFrame
True
Accessing column Series
df['AveragePrice'].head()
0    1.33
1    1.35
2    0.93
3    1.08
4    1.28
Name: AveragePrice, dtype: float64
Accessing a sub-DataFrame of one column
df[['AveragePrice']].head()
AveragePrice | |
---|---|
0 | 1.33 |
1 | 1.35 |
2 | 0.93 |
3 | 1.08 |
4 | 1.28 |
Let's try to visualize the difference once more using values, which returns the data as a NumPy array.
df['AveragePrice'].values
array([1.33, 1.35, 0.93, ..., 1.87, 1.93, 1.62])
df[['AveragePrice']].values
array([[1.33], [1.35], [0.93], ..., [1.87], [1.93], [1.62]])
This is because Series.values returns a one-dimensional array with the column values, while DataFrame.values returns a two-dimensional array that can be thought of as an array of rows.
df['AveragePrice'].values.shape, df[['AveragePrice']].values.shape
((18249,), (18249, 1))
Exercise
In the cell below, fill in the blanks to display the first 10 rows of a sub-DataFrame with the columns Date and AveragePrice. Remember that DataFrame methods can be chained.
df[['Date','AveragePrice']].head(10)
Date | AveragePrice | |
---|---|---|
0 | 2015-12-27 | 1.33 |
1 | 2015-12-20 | 1.35 |
2 | 2015-12-13 | 0.93 |
3 | 2015-12-06 | 1.08 |
4 | 2015-11-29 | 1.28 |
5 | 2015-11-22 | 1.26 |
6 | 2015-11-15 | 0.99 |
7 | 2015-11-08 | 0.98 |
8 | 2015-11-01 | 1.02 |
9 | 2015-10-25 | 1.07 |
Filtering¶
An expression like the one shown below represents a condition that returns a boolean Series with one value for every element of df['Date']; its length therefore equals the number of rows in the DataFrame df.
df['Date'] == '2015-10-25'
The boolean Series is True for rows where the condition holds and False otherwise. Such a boolean mask lets us filter a DataFrame based on the condition.
condition = df['Date'] == '2015-10-25'
condition
0     False
1     False
2     False
3     False
4     False
      ...
7     False
8     False
9     False
10    False
11    False
Name: Date, Length: 18249, dtype: bool
df[condition].head()
Date | AveragePrice | Total Volume | 4046 | 4225 | 4770 | Total Bags | Small Bags | Large Bags | XLarge Bags | type | year | region | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
9 | 2015-10-25 | 1.07 | 74338.76 | 842.40 | 64757.44 | 113.00 | 8625.92 | 8061.47 | 564.45 | 0.0 | conventional | 2015 | Albany |
9 | 2015-10-25 | 1.09 | 358478.08 | 236814.29 | 64607.97 | 304.36 | 56751.46 | 31826.88 | 24924.58 | 0.0 | conventional | 2015 | Atlanta |
9 | 2015-10-25 | 1.19 | 656892.03 | 53766.25 | 397911.35 | 49085.74 | 156128.69 | 149987.55 | 6141.14 | 0.0 | conventional | 2015 | BaltimoreWashington |
9 | 2015-10-25 | 1.11 | 59874.45 | 29521.58 | 10089.82 | 6551.57 | 13711.48 | 13660.98 | 0.00 | 50.5 | conventional | 2015 | Boise |
9 | 2015-10-25 | 1.02 | 534249.47 | 4005.39 | 430725.78 | 191.31 | 99326.99 | 94581.94 | 4745.05 | 0.0 | conventional | 2015 | Boston |
It's common to see this kind of expression written inline:
df[df['Date'] == '2015-10-25']
Logical expressions¶
Example of conditions
condition = df[col] > value
condition = df[col] <= value
condition = df[col] == value
condition = df[col] != value
# in list
condition = df[col].isin([value1, value2])
# not in list
condition = ~df[col].isin([value1, value2])
# between (inclusive)
condition = df[col].between(value1, value2)
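As an illustrative sketch with the avocado columns (before they are renamed later on), the same patterns look like this:
# rows for either of two regions
df[df['region'].isin(['Albany', 'Boston'])].head()

# rows whose average price falls in a closed interval
df[df['AveragePrice'].between(1.00, 1.25)].head()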
Then we can combine different conditions with logical operators like "&" or "|".
df.loc[cond1 & cond2]
df.loc[cond1 | cond2]
The expressions above can also be executed without the loc operator:
df[cond1 & cond2]
df[cond1 | cond2]
TIP: many problems can be avoided by wrapping each simple condition in parentheses when combining two or more conditions.
df[(df['Date'] == '2015-10-25') & (df['AveragePrice'] < .90)].head()
Date | AveragePrice | Total Volume | 4046 | 4225 | 4770 | Total Bags | Small Bags | Large Bags | XLarge Bags | type | year | region | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
9 | 2015-10-25 | 0.86 | 1010394.81 | 557469.46 | 301143.50 | 49959.10 | 101822.75 | 96417.63 | 5279.41 | 125.71 | conventional | 2015 | DallasFtWorth |
9 | 2015-10-25 | 0.88 | 933623.58 | 437329.85 | 313129.29 | 81274.85 | 101889.59 | 57577.21 | 44260.60 | 51.78 | conventional | 2015 | Houston |
9 | 2015-10-25 | 0.83 | 761261.71 | 435986.90 | 240689.98 | 19968.66 | 64616.17 | 64585.35 | 30.82 | 0.00 | conventional | 2015 | PhoenixTucson |
9 | 2015-10-25 | 0.86 | 4912068.04 | 2542914.87 | 1537781.45 | 247539.31 | 583832.41 | 475267.20 | 108231.39 | 333.82 | conventional | 2015 | SouthCentral |
9 | 2015-10-25 | 0.82 | 635873.60 | 363487.08 | 166607.85 | 31960.04 | 73818.63 | 72717.86 | 1100.77 | 0.00 | conventional | 2015 | WestTexNewMexico |
# be careful: without parentheses this expression fails because & binds more tightly than the comparisons
df[df['Date'] == '2015-10-25' & df['AveragePrice'] < .90]
.loc[] vs .iloc[]¶
Accessing rows
.loc[]¶
This operator allows us to access information by index label, but by definition it can also be used with a boolean array, as we saw with conditions:
- Access a group of rows and columns by label(s) or a boolean array.
- .loc[] is primarily label based, but may also be used with a boolean array.
From df.info() we saw that the index is made of integer labels, and these labels repeat across regions and years, so a label does not identify a single row. We can use .loc to select every row whose index label is 9.
df.loc[9]
Date | AveragePrice | Total Volume | 4046 | 4225 | 4770 | Total Bags | Small Bags | Large Bags | XLarge Bags | type | year | region | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
9 | 2015-10-25 | 1.07 | 74338.76 | 842.40 | 64757.44 | 113.00 | 8625.92 | 8061.47 | 564.45 | 0.00 | conventional | 2015 | Albany |
9 | 2015-10-25 | 1.09 | 358478.08 | 236814.29 | 64607.97 | 304.36 | 56751.46 | 31826.88 | 24924.58 | 0.00 | conventional | 2015 | Atlanta |
9 | 2015-10-25 | 1.19 | 656892.03 | 53766.25 | 397911.35 | 49085.74 | 156128.69 | 149987.55 | 6141.14 | 0.00 | conventional | 2015 | BaltimoreWashington |
9 | 2015-10-25 | 1.11 | 59874.45 | 29521.58 | 10089.82 | 6551.57 | 13711.48 | 13660.98 | 0.00 | 50.50 | conventional | 2015 | Boise |
9 | 2015-10-25 | 1.02 | 534249.47 | 4005.39 | 430725.78 | 191.31 | 99326.99 | 94581.94 | 4745.05 | 0.00 | conventional | 2015 | Boston |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
9 | 2018-01-21 | 1.27 | 3159.80 | 92.12 | 73.17 | 0.00 | 2994.51 | 2117.69 | 876.82 | 0.00 | organic | 2018 | Syracuse |
9 | 2018-01-21 | 1.52 | 6871.05 | 76.66 | 407.09 | 0.00 | 6387.30 | 6375.55 | 11.75 | 0.00 | organic | 2018 | Tampa |
9 | 2018-01-21 | 1.63 | 1283987.65 | 108705.28 | 259172.13 | 1490.02 | 914409.26 | 710654.40 | 203526.59 | 228.27 | organic | 2018 | TotalUS |
9 | 2018-01-21 | 1.83 | 189317.99 | 27049.44 | 33561.32 | 439.47 | 128267.76 | 76091.99 | 51947.50 | 228.27 | organic | 2018 | West |
9 | 2018-01-21 | 1.87 | 13766.76 | 1191.92 | 2452.79 | 727.94 | 9394.11 | 9351.80 | 42.31 | 0.00 | organic | 2018 | WestTexNewMexico |
432 rows × 13 columns
.iloc[]¶
This operator allows us to access information by index position, the way we usually do in other programming languages like C.
- Purely integer-location based indexing for selection by position.
- .iloc[] is primarily integer position based (from 0 to length-1 of the axis), but may also be used with a boolean array.
Using df.iloc[9] we access the 10th row of the DataFrame df. The returned value is a Series whose values are the row values (df.iloc[9].values) and whose index is the column names.
df.iloc[9]
Date            2015-10-25 00:00:00
AveragePrice                   1.07
Total Volume               74338.76
4046                          842.4
4225                       64757.44
4770                          113.0
Total Bags                  8625.92
Small Bags                  8061.47
Large Bags                   564.45
XLarge Bags                     0.0
type                   conventional
year                           2015
region                       Albany
Name: 9, dtype: object
type(df.iloc[9])
pandas.core.series.Series
The name of the Series is the index label of that row in the original DataFrame.
TIP: practice to really learn how and when to use .loc vs .iloc
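A tiny sketch (with a made-up Series) that contrasts the two accessors:
# label-based vs position-based access on a Series with a non-default index
s = pd.Series(['A', 'B', 'C'], index=[10, 20, 30])

s.loc[10]   # 'A' -- selects by the index *label* 10
s.iloc[0]   # 'A' -- selects by the integer *position* 0
# s.loc[0] would raise a KeyError, because 0 is not a label of this index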
Mathematical and other methods on a DataFrame¶
Pandas Series and DataFrames offer access to hundreds of methods such as sum(), mul(), mean(), std(), max(), min(), etc. By default these methods operate over columns, but they can also operate over rows (see the sketch a few cells below).
Look at the next cell results and try to think about what happened (take a look at fields like type or region).
df.sum()
/tmp/ipykernel_32/1703867807.py:1: FutureWarning: Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError. Select only valid columns before calling the reduction.
  df.sum()
AveragePrice                                              25657.7
Total Volume                                   15523402593.400002
4046                                                5347110739.26
4225                                                5386275717.93
4770                                                 416802342.13
Total Bags                                      4373175798.389999
Small Bags                                          3324870837.51
Large Bags                                           991615770.55
XLarge Bags                                           56689177.33
type            conventionalconventionalconventionalconvention...
year                                                     36792683
region          AlbanyAlbanyAlbanyAlbanyAlbanyAlbanyAlbanyAlba...
dtype: object
df['AveragePrice'].mean()
1.4059784097758825
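As an illustrative sketch of the axis parameter, the cell below sums the bag columns down each column (the default) and then across each row (axis=1), which roughly reproduces Total Bags:
# column-wise (default, axis=0): one total per column
df[['Small Bags', 'Large Bags', 'XLarge Bags']].sum()

# row-wise (axis=1): one total per row
df[['Small Bags', 'Large Bags', 'XLarge Bags']].sum(axis=1).head()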
Missing Data¶
This is a critical problem for any Data Scientist and deserves its own Lecture. What to do when some of the data are missing?
Pandas offers some options to explore a dataframe looking for missing data.
# returns a boolean dataframe of the same size with True values for cells where values are NaN
df.isna()
# returns a boolean dataframe of the same size with True values for cells where values aren't NaN
df.notna()
# alias of the above methods
df.isnull()
df.notnull()
Count the number of NaN values for every column
df.isna().sum()
Date            0
AveragePrice    0
Total Volume    0
4046            0
4225            0
4770            0
Total Bags      0
Small Bags      0
Large Bags      0
XLarge Bags     0
type            0
year            0
region          0
dtype: int64
Count the number of NaN values per row
df.isna().sum(axis=1)
0     0
1     0
2     0
3     0
4     0
     ..
7     0
8     0
9     0
10    0
11    0
Length: 18249, dtype: int64
Count the total number of NaN values in the dataframe
df.isna().sum().sum()
0
Select the rows with at least one NaN value
df[df.isna().any(axis=1)]
Date | AveragePrice | Total Volume | 4046 | 4225 | 4770 | Total Bags | Small Bags | Large Bags | XLarge Bags | type | year | region |
---|---|---|---|---|---|---|---|---|---|---|---|---|
There are specific methods for dealing with this problem, such as:
fillna()
bfill()
ffill()
dropna()
It's important to learn how to handle missing data; a small illustrative sketch follows.
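A minimal sketch on a tiny made-up DataFrame (the avocado data has no NaNs, so the names and values below are purely illustrative):
import numpy as np

tmp = pd.DataFrame({'a': [1.0, np.nan, 3.0], 'b': ['x', None, 'z']})

tmp.fillna({'a': tmp['a'].mean(), 'b': 'missing'})  # replace NaNs with chosen values
tmp.ffill()     # propagate the last valid observation forward to fill gaps
tmp.dropna()    # keep only the rows with no NaN at all
Since this dataset is complete, let's simplify it instead: the next cell drops the PLU and bag columns, which we won't need for the rest of the lecture.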
drop_columns = ['4046', '4225', '4770', 'Total Bags', 'Small Bags', 'Large Bags', 'XLarge Bags']
df = df.drop(columns=drop_columns)
df
Date | AveragePrice | Total Volume | type | year | region | |
---|---|---|---|---|---|---|
0 | 2015-12-27 | 1.33 | 64236.62 | conventional | 2015 | Albany |
1 | 2015-12-20 | 1.35 | 54876.98 | conventional | 2015 | Albany |
2 | 2015-12-13 | 0.93 | 118220.22 | conventional | 2015 | Albany |
3 | 2015-12-06 | 1.08 | 78992.15 | conventional | 2015 | Albany |
4 | 2015-11-29 | 1.28 | 51039.60 | conventional | 2015 | Albany |
... | ... | ... | ... | ... | ... | ... |
7 | 2018-02-04 | 1.63 | 17074.83 | organic | 2018 | WestTexNewMexico |
8 | 2018-01-28 | 1.71 | 13888.04 | organic | 2018 | WestTexNewMexico |
9 | 2018-01-21 | 1.87 | 13766.76 | organic | 2018 | WestTexNewMexico |
10 | 2018-01-14 | 1.93 | 16205.22 | organic | 2018 | WestTexNewMexico |
11 | 2018-01-07 | 1.62 | 17489.58 | organic | 2018 | WestTexNewMexico |
18249 rows × 6 columns
Sorting¶
sort_values and sort_index are common methods when using pandas.
sort_values: Sort by the values along either axis.
df.sort_values(by, axis=0, ascending=True, inplace=False, kind='quicksort',
na_position='last', ignore_index=False, key=None)
The next cell filters the data for a particular Date and type, then sorts the values by region in ascending order and finally displays only the first 10 rows.
condition = (df['Date'] == '2018-01-07') & (df['type'] == 'organic')
df[condition].sort_values(by='region').head(10)
Date | AveragePrice | Total Volume | type | year | region | |
---|---|---|---|---|---|---|
11 | 2018-01-07 | 1.54 | 4816.90 | organic | 2018 | Albany |
11 | 2018-01-07 | 1.53 | 15714.11 | organic | 2018 | Atlanta |
11 | 2018-01-07 | 1.15 | 82282.71 | organic | 2018 | BaltimoreWashington |
11 | 2018-01-07 | 1.77 | 2553.90 | organic | 2018 | Boise |
11 | 2018-01-07 | 1.91 | 30096.00 | organic | 2018 | Boston |
11 | 2018-01-07 | 1.17 | 9115.92 | organic | 2018 | BuffaloRochester |
11 | 2018-01-07 | 1.95 | 156341.57 | organic | 2018 | California |
11 | 2018-01-07 | 1.08 | 28741.11 | organic | 2018 | Charlotte |
11 | 2018-01-07 | 1.83 | 41573.25 | organic | 2018 | Chicago |
11 | 2018-01-07 | 1.71 | 13141.82 | organic | 2018 | CincinnatiDayton |
Sorting can use multiple columns by passing Python lists to the relevant parameters.
df[condition].sort_values(by=['region', 'AveragePrice'], ascending=[True, False]).head()
Date | AveragePrice | Total Volume | type | year | region | |
---|---|---|---|---|---|---|
11 | 2018-01-07 | 1.54 | 4816.90 | organic | 2018 | Albany |
11 | 2018-01-07 | 1.53 | 15714.11 | organic | 2018 | Atlanta |
11 | 2018-01-07 | 1.15 | 82282.71 | organic | 2018 | BaltimoreWashington |
11 | 2018-01-07 | 1.77 | 2553.90 | organic | 2018 | Boise |
11 | 2018-01-07 | 1.91 | 30096.00 | organic | 2018 | Boston |
sort_index: Sort object by labels (along an axis)
df.sort_index(axis=0, level=None, ascending=True, inplace=False, kind='quicksort', na_position='last', sort_remaining=True, ignore_index=False, key=None)
The next cell sorts rows based on the index values (in ascending order)
df.sort_index()
Date | AveragePrice | Total Volume | type | year | region | |
---|---|---|---|---|---|---|
0 | 2015-12-27 | 1.33 | 64236.62 | conventional | 2015 | Albany |
0 | 2016-12-25 | 1.85 | 8657.87 | organic | 2016 | PhoenixTucson |
0 | 2015-12-27 | 1.25 | 73109.90 | conventional | 2015 | Pittsburgh |
0 | 2016-12-25 | 1.90 | 11376.97 | organic | 2016 | Philadelphia |
0 | 2016-12-25 | 1.27 | 5601.65 | organic | 2016 | Orlando |
... | ... | ... | ... | ... | ... | ... |
52 | 2017-01-01 | 2.06 | 39260.55 | organic | 2017 | NewYork |
52 | 2017-01-01 | 1.11 | 476239.03 | conventional | 2017 | NorthernNewEngland |
52 | 2017-01-01 | 2.00 | 115256.09 | organic | 2017 | Northeast |
52 | 2017-01-01 | 0.93 | 547565.88 | conventional | 2017 | Atlanta |
52 | 2017-01-01 | 0.97 | 142347.90 | conventional | 2017 | Roanoke |
18249 rows × 6 columns
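The rename method lets us relabel columns; the next cell shortens AveragePrice and Total Volume to price and volume (inplace=True modifies df directly).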
df.rename(columns={'AveragePrice': 'price', 'Total Volume': 'volume'}, inplace=True)
df
Date | price | volume | type | year | region | |
---|---|---|---|---|---|---|
0 | 2015-12-27 | 1.33 | 64236.62 | conventional | 2015 | Albany |
1 | 2015-12-20 | 1.35 | 54876.98 | conventional | 2015 | Albany |
2 | 2015-12-13 | 0.93 | 118220.22 | conventional | 2015 | Albany |
3 | 2015-12-06 | 1.08 | 78992.15 | conventional | 2015 | Albany |
4 | 2015-11-29 | 1.28 | 51039.60 | conventional | 2015 | Albany |
... | ... | ... | ... | ... | ... | ... |
7 | 2018-02-04 | 1.63 | 17074.83 | organic | 2018 | WestTexNewMexico |
8 | 2018-01-28 | 1.71 | 13888.04 | organic | 2018 | WestTexNewMexico |
9 | 2018-01-21 | 1.87 | 13766.76 | organic | 2018 | WestTexNewMexico |
10 | 2018-01-14 | 1.93 | 16205.22 | organic | 2018 | WestTexNewMexico |
11 | 2018-01-07 | 1.62 | 17489.58 | organic | 2018 | WestTexNewMexico |
18249 rows × 6 columns
Counting¶
Counting the number of values without NaNs
We already saw different ways to get the number of rows. But what if you want to count only the non-NaN values? The count() method counts non-NA cells for each column or row.
df.count()
Date      18249
price     18249
volume    18249
type      18249
year      18249
region    18249
dtype: int64
Counting the number of unique values per column (Series) in the DataFrame
Number of unique values for one Series
df.region.nunique()
54
Number of unique values for every Series in the DataFrame
df.nunique()
Date        169
price       259
volume    18237
type          2
year          4
region       54
dtype: int64
Unique values
df['type'].unique()
array(['conventional', 'organic'], dtype=object)
Remember that we can easily convert a DataFrame or Series output into a Python list
df['type'].unique().tolist()
['conventional', 'organic']
Counting rows based on unique values
value_counts() returns a Series containing counts of unique rows in the DataFrame:
DataFrame.value_counts(subset=None, normalize=False, sort=True, ascending=False, dropna=True)
This method is simple but powerful for quick exploration. Let's look at some examples.
df.value_counts(subset='type')
type
conventional    9126
organic         9123
dtype: int64
df.value_counts(subset='year', sort=False)
year
2015    5615
2016    5616
2017    5722
2018    1296
dtype: int64
We can use a bigger subset for a more detailed view
df.value_counts(subset=['year', 'type'], sort=False)
year  type
2015  conventional    2808
      organic         2807
2016  conventional    2808
      organic         2808
2017  conventional    2862
      organic         2860
2018  conventional     648
      organic          648
dtype: int64
And we just need to add one more flag to get the same counts as normalized values (here multiplied by 100 to read as percentages).
df.value_counts(subset=['year', 'type'], sort=False, normalize=True)*100
year  type
2015  conventional    15.387145
      organic         15.381665
2016  conventional    15.387145
      organic         15.387145
2017  conventional    15.683051
      organic         15.672092
2018  conventional     3.550880
      organic          3.550880
dtype: float64
Grouping¶
Have you been looking for power? Meet groupby().
A groupby operation involves some combination of splitting the object, applying a function, and combining the results. This can be used to group large amounts of data and compute operations on these groups.
DataFrame.groupby(by=None, axis=0, level=None, as_index=True, sort=True,
group_keys=True, squeeze=<no_default>, observed=False, dropna=True)
The next cell mimics, in some way, the value_counts() behaviour:
- Split the dataset into sub-DataFrames, one for each unique year
- Count the number of non-NaN values for every column in each group
df.groupby('year').count()
Date | price | volume | type | region | |
---|---|---|---|---|---|
year | |||||
2015 | 5615 | 5615 | 5615 | 5615 | 5615 |
2016 | 5616 | 5616 | 5616 | 5616 | 5616 |
2017 | 5722 | 5722 | 5722 | 5722 | 5722 |
2018 | 1296 | 1296 | 1296 | 1296 | 1296 |
The above rows present the same values because the original dataset is free of NaN values. It's a great dataset: No NaNs and Avocados everywhere.
Functions¶
What about max()?
df.groupby('year').max()
Date | price | volume | type | region | |
---|---|---|---|---|---|
year | |||||
2015 | 2015-12-27 | 2.79 | 44655461.51 | organic | WestTexNewMexico |
2016 | 2016-12-25 | 3.25 | 52288697.89 | organic | WestTexNewMexico |
2017 | 2017-12-31 | 3.17 | 61034457.10 | organic | WestTexNewMexico |
2018 | 2018-03-25 | 2.30 | 62505646.52 | organic | WestTexNewMexico |
And mean()?
df.groupby('year').mean()
price | volume | |
---|---|---|
year | ||
2015 | 1.375590 | 7.810274e+05 |
2016 | 1.338640 | 8.584206e+05 |
2017 | 1.515128 | 8.623393e+05 |
2018 | 1.347531 | 1.066928e+06 |
Do you find something different between using max and mean? What are your thoughts?
df.groupby('year').describe()
price | volume | |||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | mean | std | min | 25% | 50% | 75% | max | count | mean | std | min | 25% | 50% | 75% | max | |
year | ||||||||||||||||
2015 | 5615.0 | 1.375590 | 0.375595 | 0.49 | 1.07 | 1.300 | 1.67 | 2.79 | 5615.0 | 7.810274e+05 | 3.171256e+06 | 84.56 | 6931.6300 | 76146.82 | 400176.6800 | 44655461.51 |
2016 | 5616.0 | 1.338640 | 0.393708 | 0.51 | 1.04 | 1.300 | 1.56 | 3.25 | 5616.0 | 8.584206e+05 | 3.478732e+06 | 385.55 | 10643.6850 | 109597.29 | 451107.2925 | 52288697.89 |
2017 | 5722.0 | 1.515128 | 0.432906 | 0.44 | 1.22 | 1.490 | 1.77 | 3.17 | 5722.0 | 8.623393e+05 | 3.481957e+06 | 515.01 | 13790.6975 | 122915.75 | 426454.5125 | 61034457.10 |
2018 | 1296.0 | 1.347531 | 0.305858 | 0.56 | 1.13 | 1.345 | 1.56 | 2.30 | 1296.0 | 1.066928e+06 | 4.285501e+06 | 2064.90 | 17690.9825 | 157175.09 | 529462.2450 | 62505646.52 |
There are other methods that can be chained to groupby(). For example, first and last return the first and the last row of each group, respectively.
df.groupby('year').first()
Date | price | volume | type | region | |
---|---|---|---|---|---|
year | |||||
2015 | 2015-12-27 | 1.33 | 64236.62 | conventional | Albany |
2016 | 2016-12-25 | 1.52 | 73341.73 | conventional | Albany |
2017 | 2017-12-31 | 1.47 | 113514.42 | conventional | Albany |
2018 | 2018-03-25 | 1.57 | 149396.50 | conventional | Albany |
The previous call is similar to using head, but head() keeps the original index whereas first() sets a new index: the year.
df.groupby('year').head(1)
Date | price | volume | type | year | region | |
---|---|---|---|---|---|---|
0 | 2015-12-27 | 1.33 | 64236.62 | conventional | 2015 | Albany |
0 | 2016-12-25 | 1.52 | 73341.73 | conventional | 2016 | Albany |
0 | 2017-12-31 | 1.47 | 113514.42 | conventional | 2017 | Albany |
0 | 2018-03-25 | 1.57 | 149396.50 | conventional | 2018 | Albany |
Aggregate¶
aggregate: Aggregate using one or more operations over the specified axis (agg is an alias).
The next cell shows the aggregated avocado price and volume values for the year 2018.
condition = (df['year'] == 2018)
df[condition][['price','volume']].agg(['min', 'mean', 'std', 'max'])
price | volume | |
---|---|---|
min | 0.560000 | 2.064900e+03 |
mean | 1.347531 | 1.066928e+06 |
std | 0.305858 | 4.285501e+06 |
max | 2.300000 | 6.250565e+07 |
aggregate can be applied directly to DataFrames, and it can also be chained with groupby().
df.groupby('year')[['price','volume']].agg(['min', 'mean', 'std', 'max'])
price | volume | |||||||
---|---|---|---|---|---|---|---|---|
min | mean | std | max | min | mean | std | max | |
year | ||||||||
2015 | 0.49 | 1.375590 | 0.375595 | 2.79 | 84.56 | 7.810274e+05 | 3.171256e+06 | 44655461.51 |
2016 | 0.51 | 1.338640 | 0.393708 | 3.25 | 385.55 | 8.584206e+05 | 3.478732e+06 | 52288697.89 |
2017 | 0.44 | 1.515128 | 0.432906 | 3.17 | 515.01 | 8.623393e+05 | 3.481957e+06 | 61034457.10 |
2018 | 0.56 | 1.347531 | 0.305858 | 2.30 | 2064.90 | 1.066928e+06 | 4.285501e+06 | 62505646.52 |
Suppose you need to extract percentiles. The quantile() method can be applied directly to a DataFrame to extract them.
df[condition][['price', 'volume']].quantile(.10)
price        0.970
volume    8174.655
Name: 0.1, dtype: float64
But there are cases where you need to extract more than that. For those cases it can be convenient to define custom functions to use with aggregation.
def percentil_10(x): return x.quantile(.10)
def percentil_90(x): return x.quantile(.90)
df[condition][['price','volume']].agg([percentil_10, 'median', percentil_90])
price | volume | |
---|---|---|
percentil_10 | 0.970 | 8174.655 |
median | 1.345 | 157175.090 |
percentil_90 | 1.750 | 1810981.615 |
df.groupby('year')[['price','volume']].agg([percentil_10, 'median', percentil_90])
price | volume | |||||
---|---|---|---|---|---|---|
percentil_10 | median | percentil_90 | percentil_10 | median | percentil_90 | |
year | ||||||
2015 | 0.96 | 1.300 | 1.90 | 2431.434 | 76146.82 | 1285267.958 |
2016 | 0.88 | 1.300 | 1.86 | 4146.935 | 109597.29 | 1351865.735 |
2017 | 0.98 | 1.490 | 2.07 | 5889.687 | 122915.75 | 1398304.817 |
2018 | 0.97 | 1.345 | 1.75 | 8174.655 | 157175.09 | 1810981.615 |
Summary¶
In this lecture you've learnt:
- How to install pandas
- About pandas
- What are Series and DataFrame data structures
- How to create a simple DataFrame
- How to load a DataFrame with an external data source
- How to access column Series
- How to access row Series (index)
- The differences between loc[] and iloc[]
- Different ways to start exploring the data general structure
- Different ways to access descriptive statistics
- How to look for missing data
- How to do data filtering using conditions
- How to do sorting
- How to do counting
- How to group information
- How to do aggregation
Facts:¶
- We've only imported pandas!
- Almost everything was about accessing and processing the data, and not creating it.
Topics left out maybe for other lectures:¶
- DataFrame operations:
- How to add a column to a DataFrame
- Operations between DataFrame columns
- apply()
- applymap()
- pipe()
- Merging DataFrames (merge)
- Concatenating DataFrames (concat)
- Appending DataFrames, Series or a simple row (append)
- Using loops with:
- iterrows()
- itertuples()
- groupby()
Next lecture¶
- Plotting with PANDAS (showing the importance of plots)
- EDA with PANDAS (using Seaborn if possible)