Key Word(s): pandas
CS109A Introduction to Data Science
Lab 4: Multiple Regression and Feature engineering¶
Harvard University
Fall 2021
Instructors: Pavlos Protopapas and Natesh Pillai
Lab Team: Marios Mattheakis, Hayden Joy, Chris Gumb, and Eleni Kaxiras
Authors: Eleni Kaxiras, Rahul Dave, David Sondak, Will Claybaugh, and Pavlos Protopapas
## RUN THIS CELL TO GET THE RIGHT FORMATTING
import requests
from IPython.core.display import HTML
styles = requests.get("https://raw.githubusercontent.com/Harvard-IACS/2018-CS109A/master/content/styles/cs109.css").text
HTML(styles)
Learning Objectives¶
After this lab, you should be able to
- Implement multiple regression models with sklearn.
- Work with categorical variables, including transforming them.
- Incorporate pipelines into your workflow.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# from sklearn import preprocessing
from sklearn.preprocessing import PolynomialFeatures, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from pandas.api.types import CategoricalDtype
from sklearn.compose import make_column_transformer, TransformedTargetRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.linear_model import Ridge
from sklearn.impute import SimpleImputer
from pandas.plotting import scatter_matrix
import seaborn as sns
%matplotlib inline
1 - Exploring the Football data¶
Introduction¶
Our goal is to fit models that predict the players' market value (what the player could earn when hired by a new team).¶
There are all sorts of questions we could answer; for example, is there a relationship between a player's popularity and his market value? We could also make interesting observations about players in the top 6 teams.
The data were scraped by Shubham Maurya from a variety of sources, including transfermrkt.com and Fantasy Premier League (FPL). They record various facts about players in the English Premier League.
Data description¶
- name: Name of the player
- club: Club of the player
- age: Age of the player
- position: The usual position on the pitch
- position_cat: 1 for attackers, 2 for midfielders, 3 for defenders, 4 for goalkeepers
- page_views: Average daily Wikipedia page views from September 1, 2016 to May 1, 2017
- fpl_points: FPL points accumulated over the previous season
- region: 1 for England, 2 for EU, 3 for Americas, 4 for Rest of World
- nationality: Player's nationality
- new_signing: Whether a new signing for 2017/18 (till 20th July)
- new_foreign: Whether a new signing from a different league, for 2017/18 (till 20th July)
- club_id: A numerical version of the club feature

Our response variable:
- market_value: As on transfermrkt.com on July 20th, 2017
Import the data¶
league_df = pd.read_csv("league_data.csv")
league_df.head()
league_df.shape
league_df.isnull().sum()
We have not yet talked about handling missing values, so for now we will simply drop the rows that contain them.¶
league_df = league_df.dropna()
league_df.isnull().sum()
response = 'market_value'
y = league_df[response]
league_df.describe(include="all")
- The people who hired us to predict on these data want to know whether being in a big club affects a player's market value. So we need to create a new binary categorical variable named big_club, with values 0 or 1 designating whether a club belongs to the Top 6 clubs:
big_clubs = ['Arsenal', 'Chelsea', 'Liverpool', 'Manchester+City', 'Manchester+United', 'Tottenham']
- They also want to look at players in age groups and not just by age. Put the age feature in bins according to the values below, and name the variable age_cat. pandas has the .cut() method, which breaks a variable into bins with labels:
age_bins = [___]
age_labels = [___]
league_df['age_cat'] = pd.cut(x=league_df['age'], bins=age_bins, labels=age_labels)
# 1. your code here
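One possible approach for step 1 (a sketch; a list comprehension or np.where would work equally well):
big_clubs = ['Arsenal', 'Chelsea', 'Liverpool', 'Manchester+City', 'Manchester+United', 'Tottenham']
# 1 if the player's club is one of the Top 6 clubs, 0 otherwise
league_df['big_club'] = league_df['club'].isin(big_clubs).astype(int)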
# check
list(league_df[['club', 'big_club']].groupby(['big_club']).apply(np.unique))
Applying functions to pandas DataFrames and Series¶
A simpler but less generic way to do the previous exercise would be
league_df['big_club2'] = league_df.apply(lambda row: 1 if row['club'] in big_clubs else 0, axis=1)
If the function is simple enough, there is an even more direct way to create the new column (feature), e.g.:
df['new_column'] = df['column']**2
# 2. your code here
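One possible approach for step 2 (a sketch; the bin edges and labels below are placeholders, since the intended values are not reproduced here — substitute the values from the exercise prompt):
# NOTE: illustrative bin edges and labels only; replace with the values given in the lab
age_bins = [15, 20, 25, 30, 35, 40]
age_labels = [1, 2, 3, 4, 5]
league_df['age_cat'] = pd.cut(x=league_df['age'], bins=age_bins, labels=age_labels)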
# check
list(league_df[['age_cat', 'age', ]].sort_values(by='age_cat').groupby(['age_cat']).apply(np.unique))
Looking at data types more closely¶
league_df.dtypes
# let's see what features we want to use in the model
categorical_cols = ['position_cat', 'new_signing', 'big_club', 'age_cat', 'region'] # non-ordinal
numerical_cols = ['age', 'page_views', 'fpl_points']
ordinal_cols = [] # we do not have any
league_df.head()
# cast categorical variables as pandas type `category`
cat_type = CategoricalDtype(ordered=False)
for var in categorical_cols:
league_df[var] = league_df[var].astype(cat_type)
league_df[categorical_cols+numerical_cols].dtypes
# Shape of things
league_df.age.values.reshape(-1,1).shape
Stratified train/test split¶
We want to split before we do any EDA since, ideally, we do not want our test set to influence our design decisions. We also want to make sure that the training and test data have appropriate representation of each region; it would be bad for the training data to entirely miss a region. This is especially important because some regions are rather rare.
The function returns the train and test subsets:
sklearn.model_selection.train_test_split(*arrays, test_size=None, train_size=None, random_state=None, shuffle=True, stratify=None)
https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html
Use the train_test_split function and its stratify argument to split the data so that each region is represented in the same proportion in the training and test sets.
Note: This will not work if the dataset contains missing data.
# your code here
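A minimal sketch, stratifying on region (the test_size and random_state values are arbitrary choices):
# split features and response together, keeping the region proportions the same in both sets
train_data, test_data, y_train, y_test = train_test_split(
    league_df, y, test_size=0.2, random_state=109, stratify=league_df['region'])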
# check
train_data.shape, test_data.shape, y_train.shape, y_test.shape
Now that we won't be peeking at the test set, let's explore and look for patterns! We'll practice a number of useful pandas and numpy functions along the way.
We notice that our dataset contains columns with different data types, and we need to apply specific preprocessing to each of them. Categorical variables that are ordinal need to be coded as integers, while the rest need to be one-hot-encoded. We can do this sequentially by hand or, better, use sklearn's pipeline structure. Our pipeline could also conveniently include any standardization/normalization of numerical values; for now we will leave them as they are.
train_data.head()
sns.pairplot(train_data[['age', 'page_views', 'market_value']], \
kind='reg', diag_kind='hist');
train_data.columns
train_data[['club','club_id']].\
groupby(['club_id']).agg({'club' : np.unique,
})
train_data.groupby('position').agg({
'market_value': np.mean,
'page_views': np.median,
'fpl_points': np.max
})
2 - Transform categorical variables¶
categorical_cols, numerical_cols
X_train = train_data[categorical_cols+numerical_cols].copy()
X_test = test_data[categorical_cols+numerical_cols].copy()
X_train.shape, X_test.shape, y_train.shape, y_test.shape
Using sklearn OneHotEncoder()
¶
By default, OneHotEncoder keeps all of the one-hot columns it creates. Its drop parameter offers a fine-grained drop mechanism: it can drop a column only for binary features, drop the first category of every feature, or drop a specific category per feature.
drop : {'first', 'if_binary'} or array-like of shape (n_features,), default=None
It also has a mechanism for handling the presence of unknown categories in the test set.
handle_unknown : {'error', 'ignore'}, default='error'
oh = OneHotEncoder(drop='if_binary', sparse=False, handle_unknown='error')
oh_train = oh.fit_transform(train_data[categorical_cols])
oh_train[:10]
list(zip(categorical_cols, oh.categories_))
oh_train.shape, train_data[categorical_cols].shape
oh_test = oh.transform(test_data[categorical_cols])
oh_test.shape, test_data[categorical_cols].shape
# remember these are "views" of the dataframe
# the dataframe remains unchanged
train_data[categorical_cols].head(5)
train_data[numerical_cols].values.shape, oh_train.shape
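If we wanted to assemble a design matrix by hand from these pieces, one option (a sketch, not the route we take below) is to stack the dense one-hot block next to the numerical columns:
# concatenate the one-hot matrix with the numerical features, column-wise
design_train = np.hstack([oh_train, train_data[numerical_cols].values])
design_test = np.hstack([oh_test, test_data[numerical_cols].values])
design_train.shape, design_test.shape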
Using pandas get_dummies()
¶
By default, get_dummies keeps all $k$ dummies for a feature with $k$ categorical levels. It can be made to remove the first level (drop_first=True), so that we are left with $k-1$ dummies.
dummies_train = pd.get_dummies(train_data[categorical_cols]) #drop_first=True
dummies_train.head()
# transform the test set
dummies_test = pd.get_dummies(test_data[categorical_cols])
Note: unlike OneHotEncoder, get_dummies knows nothing about the training categories; if the test dataset has a category that does not exist in the training set (or is missing one), the resulting dummy columns will not match and the model will fail downstream.
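One common workaround (a sketch; here it is a no-op because the train and test sets share the same categories) is to re-index the test dummies to the training columns, filling any missing ones with 0:
# align the test dummy columns to the training dummy columns
dummies_test = dummies_test.reindex(columns=dummies_train.columns, fill_value=0)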
pd.set_option('display.max_columns', None)
# create the design matrix for the train set
design_train_df = pd.concat([train_data[numerical_cols], dummies_train], axis=1)
design_train_df.head()
# for the test set
design_test_df = pd.concat([test_data[numerical_cols], dummies_test], axis=1)
design_train_df.dtypes
# the dataframe remains unchanged
train_data[categorical_cols].head(5)
list(zip(categorical_cols, oh.categories_))
Now, let's run the model using our design matrices¶
#create linear model
regression = LinearRegression()
#fit linear model
regression.fit(design_train_df, y_train)
y_pred = regression.predict(design_test_df)
r2_train = regression.score(design_train_df, y_train)
r2_test = regression.score(design_test_df, y_test)
print(f'R^2 train = {r2_train:.5}')
print(f'R^2 test = {r2_test:.5}')
3 - Using Transformation Pipelines¶
There may be many transformations that need to be executed sequentially in order to construct the design matrix. As we saw, it is possible to handcraft the design matrix ourselves by transforming individual columns, but it is more efficient and less error-prone to create an sklearn pipeline to do this for us. sklearn can work directly with numpy arrays or pandas DataFrames.
When using the latter, sklearn.compose.ColumnTransformer is useful, as it applies transformers to columns of an array or pandas DataFrame. This estimator allows different columns or column subsets of the input to be transformed separately, and the features generated by each transformer are concatenated to form the design matrix.
Making a pipeline¶
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
# a schematic pipeline skeleton; we build the real one below
my_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),        # fill in missing values (we will use this later)
    ('std_scaler', StandardScaler()),                      # optional scaling of numerical features
    ('selector', ColumnTransformer(transformers=[...])),   # column-wise transformations, e.g. one-hot encoding
    ('regressor', LinearRegression())                      # the actual regressor model
])
# transform categoricals
categorical_encoder = OneHotEncoder(drop='if_binary', handle_unknown='error') #handle_unknown='ignore'
# transform numericals
numerical_pipe = Pipeline([
('imputer', SimpleImputer(strategy='mean')), # for later
#('stdscaler', StandardScaler()) # for later
])
# bring all transformations together
preprocessor = ColumnTransformer([
('cat', categorical_encoder, categorical_cols),
('num', numerical_pipe, numerical_cols)
])
# add a regressor
lr = LinearRegression()
model = Pipeline([
('preprocessor', preprocessor),
('regressor', lr)
])
model.fit(X_train, y_train)
ohe = (model.named_steps['preprocessor'].named_transformers_['cat'])
feature_names = ohe.get_feature_names(input_features=categorical_cols)
feature_names = np.r_[feature_names, numerical_cols]
feature_names = list(feature_names)
feature_names
print(f'LR train R^2: {model.score(X_train, y_train):.3f}')
print(f'LR test R^2: {model.score(X_test, y_test):.3f}')
# grab the linear regressor
linear_regressor = model.named_steps['regressor']
linear_regressor.coef_.shape
pd.DataFrame(zip(feature_names, linear_regressor.coef_), columns=['feature', 'coeff'])
A different way to construct the pipeline¶
preprocessor = make_column_transformer(
(OneHotEncoder(drop='if_binary', handle_unknown='error'), categorical_cols),
#(StandardScaler(), numerical_columns),
(SimpleImputer(strategy='mean'), numerical_cols),
remainder='passthrough'
)
model = make_pipeline(
preprocessor,
LinearRegression()
)
model.fit(X_train, y_train)
feature_names = (model.named_steps['columntransformer']
.named_transformers_['onehotencoder']
.get_feature_names(input_features=categorical_cols))
feature_names = np.concatenate(
[feature_names, numerical_cols])
coefs = pd.DataFrame(
model.named_steps['linearregression'].coef_,
columns=['Coefficients'], index=feature_names
)
coefs
print(f'LR train R^2: {model.score(X_train, y_train):.3f}')
print(f'LR test R^2: {model.score(X_test, y_test):.3f}')
4 - Feature Engineering¶
Let's focus on introducing new features to see if our model performs better. After talking to our client for four hours and giving it some thought, we concluded that the mean predicted market value should be:
$$\hat{y} = \beta_0 + \beta_1\cdot \text{fpl\_points} + \beta_2\cdot\text{age} + \beta_3\cdot\text{age}^2 + \beta_4\cdot \text{new\_signing} + \beta_5\cdot \text{big\_club} + \beta_6\cdot \text{position\_cat} \\ + \beta_7\cdot \text{age\_cat} + \beta_8\cdot \text{page\_views}\times \text{fpl\_points}$$

We're including a 2nd-degree polynomial in age because we expect pay to increase as a player gains experience, but then decrease as they continue aging. We also include an interaction term between page_views and fpl_points.
- Build a design matrix function and fit this model to the training data. How good is the overall model?
- Interpret the regression model. What is the meaning of the coefficient for:
- age and age$^2$
- big_club
- What should a player do in order to improve their market value? How many additional page views would a player need to increase their market value by 10?
# load a fresh train and test set.
train_data = pd.read_csv("train_data.csv")
test_data = pd.read_csv("test_data.csv")
train_data.head(2)
# your code here
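A possible sketch of the design matrix function, assuming the fresh CSVs already contain the engineered big_club and age_cat columns (recreate them as above if not); the helper name make_design and the interaction column name are illustrative, while age_sq matches the name used in the cells below.
def make_design(df):
    # numerical terms, the quadratic age term, and the interaction term
    X = pd.DataFrame({
        'fpl_points': df['fpl_points'],
        'age': df['age'],
        'age_sq': df['age']**2,
        'new_signing': df['new_signing'],
        'big_club': df['big_club'],
        'page_views_x_fpl': df['page_views'] * df['fpl_points'],
    }, index=df.index)
    # one-hot encode the multi-level categoricals, dropping one level each (we keep the intercept)
    X = pd.concat([X,
                   pd.get_dummies(df['position_cat'], prefix='position_cat', drop_first=True),
                   pd.get_dummies(df['age_cat'], prefix='age_cat', drop_first=True)],
                  axis=1)
    return X

X_train, y_train = make_design(train_data), train_data['market_value']
X_test, y_test = make_design(test_data), test_data['market_value']
# guard against category mismatches between the two sets
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

model = LinearRegression().fit(X_train, y_train)
coefs = pd.DataFrame(model.coef_, columns=['Coefficients'], index=X_train.columns)
coefs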
# check
print(f'LR train R^2: {model.score(X_train, y_train):.3f}')
print(f'LR test R^2: {model.score(X_test, y_test):.3f}')
Conceptual questions¶
- The model is reasonably good. We're capturing about 76% of the variation in market values, and the test set confirms that we're not overfitting too badly.
- Look at the coefficients; the exact values depend on your split.
- Linear regression on non-experimental data can't determine causation, so we can't prove that a given relationship runs in the direction we might think. For instance, doing whatever it takes to get more page views probably doesn't meaningfully increase market value; it's likely the causation runs in the other direction and great players get more views. Even so, we can use page views to help us tell who is a great player and thus likely to be paid well.
agecoef = float(coefs.loc['age'].values)
age2coef = float(coefs.loc['age_sq'].values)
agecoef, age2coef
x_vals = np.linspace(-100,100,1000)
y_vals = agecoef*x_vals +age2coef*x_vals**2
plt.plot(x_vals, y_vals)
plt.title("Effect of Age on Player Market value")
plt.xlabel("Age")
plt.ylabel("Contribution to Predicted Market Value")
plt.show()
Conceptual questions¶
- If our model does not have a constant, we must include all four dummy variable columns. If we drop one, we're not modeling any effect of being in that category, and effectively assuming the dropped category's effect is 0.
- Being in position 2 (instead of position 1) has an impact between -1.54 and +2.38 on a player's market value. Since we're using an intercept, the dropped category becomes the baseline and the effect of any dummy variable is the effect of being in that category instead of the baseline category.
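A tiny toy example (hypothetical numbers) illustrating the baseline interpretation: with all $k$ dummies and no intercept, each coefficient is that category's mean; with $k-1$ dummies plus an intercept, the intercept is the baseline category's mean and each coefficient is a difference from that baseline.
toy = pd.DataFrame({'position_cat': ['1', '1', '2', '2', '3', '3'],
                    'market_value': [10., 12., 20., 22., 5., 7.]})
# all k dummies, no intercept: coefficients are the category means (11, 21, 6)
X_full = pd.get_dummies(toy['position_cat'])
m_full = LinearRegression(fit_intercept=False).fit(X_full, toy['market_value'])
print(dict(zip(X_full.columns, m_full.coef_)))
# k-1 dummies plus an intercept: the intercept is the baseline mean (11),
# and each coefficient is the difference from the baseline (10, -5)
X_drop = pd.get_dummies(toy['position_cat'], drop_first=True)
m_drop = LinearRegression().fit(X_drop, toy['market_value'])
print(m_drop.intercept_, dict(zip(X_drop.columns, m_drop.coef_)))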