# Import necessary libraries
%matplotlib inline
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures


# Read the file "Advertising.csv" as a dataframe, using the first CSV column
# as the row index
df = pd.read_csv("Advertising.csv", index_col=0)

# Take a quick look at the dataframe
df.head()


# Get all the columns except 'sales' as the predictors
X = df.drop(['sales'], axis=1)

# Select 'sales' as the response variable
y = df['sales']


# Initialize an ordinary least-squares linear regression model.
# NOTE: the `normalize` argument was deprecated in scikit-learn 1.0 and
# removed in 1.2, so passing it raises a TypeError on current releases.
# For plain least squares the coefficients are reported on the original
# feature scale with or without it, so the values below are unaffected.
lreg = LinearRegression()

# Fit the model on the entire data
lreg.fit(X, y)


# Pair every predictor with its fitted coefficient. The names are taken from
# X itself (not from a positional slice of df.columns) so the labels stay
# aligned with the fitted features regardless of where 'sales' sits in the CSV.
coef_dict = dict(zip(X.columns, lreg.coef_))

# Sort the pairs by coefficient value (ascending) and split them back into
# two parallel tuples for plotting
predictors, coefficients = zip(
    *sorted(coef_dict.items(), key=lambda item: item[1])
)


# Helper code to visualize the coefficients of all predictors
fig, ax = plt.subplots()
ax.barh(predictors, coefficients, align='center', color="#336600", alpha=0.7)
ax.grid(linewidth=0.2)
ax.set_xlabel("Coefficient")
ax.set_ylabel("Predictors")
plt.show()


# Helper function to compute the t-statistic
def get_t(arr):
    """Return |mean| / standard deviation of ``arr``, computed along axis 0.

    ``arr`` must provide ``mean(axis=0)`` and ``std(axis=0)`` (for example a
    2-D numpy array with one row per sample). The result holds one ratio per
    column.
    """
    column_std = arr.std(axis=0)
    column_abs_mean = np.abs(arr.mean(axis=0))
    return column_abs_mean / column_std


# Bootstrap the regression: refit the model on resampled copies of the data
# and collect the coefficients from every fit.
# The `___` tokens below are exercise blanks to be filled in.

# Initialize an empty list to store the coefficient values of each bootstrap
coef_dist = []

# Set the number of bootstraps
numboot = 1000

# Loop over all the bootstraps
for i in range(___):

    # Get a bootstrapped version of the dataframe: frac=1 with replace=True
    # draws as many rows as the original, sampling with replacement
    df_new = df.sample(frac=1,replace=True)

    # Get all the columns except 'sales' as the predictors
    X = df_new.drop(___,axis=1)

    # Select 'sales' as the response variable
    y = df_new[___]

    # Initialize a linear regression model with normalize=True
    # NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed
    # in 1.2 - confirm the scikit-learn version this exercise runs against
    lreg = LinearRegression(normalize=___)

    # Fit the model on the bootstrapped data
    lreg.fit(___, ___)

    # Append the coefficients of all predictors to the list
    coef_dist.append(lreg.coef_)

# Convert the list to a numpy array (one entry per bootstrap)
coef_dist = np.array(coef_dist)


# Use the helper function get_t to find the t-values
# (`___` is an exercise blank to be filled in)
tt = get_t(___)

# Number of rows in the original dataframe
n = df.shape[0]


# Get the t-value associated with each predictor.
# df.columns[:-1] takes every column except the last one as the predictor
# names - this assumes 'sales' is the last column of df (TODO confirm).
tt_dict = dict(zip(df.columns[:-1], tt))

# Sort the (predictor, t-value) pairs by t-value in ascending order and
# split them into two parallel tuples for plotting
predictors, tvalues = list(zip(*sorted(tt_dict.items(),key=lambda x:x[1])))


# Helper code below to visualise the t-values as a horizontal bar chart
fig, ax = plt.subplots()
ax.barh(predictors,tvalues, align='center',color="#336600",alpha=0.7)
ax.grid(linewidth=0.2)
ax.set_xlabel("T-test values")
ax.set_ylabel("Predictors")
plt.show();


### edTest(test_pval) ###

# From the t-values, compute the p-values using the scipy.stats
# t-distribution, with n-1 degrees of freedom
pval = stats.t.sf(tt, n-1)*2

# Here we use sf, i.e. the 'survival function', which is 1 - CDF of the
# t-distribution.
# We also multiply by two because it is a two-tailed test.
# Please refer to the lecture notes for more information.

# A smaller p-value means stronger evidence, so instead we work with the
# 'confidence', which is 1-p: larger values then mean more significant
# predictors (`___` is an exercise blank to be filled in)
conf = ___


# Get the 'confidence' values associated with each predictor.
# df.columns[:-1] takes every column except the last one as the predictor
# names - this assumes 'sales' is the last column of df (TODO confirm).
conf_dict = dict(zip(df.columns[:-1], conf))

# Sort the (predictor, confidence) pairs by confidence in ascending order
predictors, confs = list(zip(*sorted(conf_dict.items(),key=lambda x:x[1])))


# Helper code below to visualise the confidence values, with a dashed
# vertical reference line drawn at 0.95
fig, ax = plt.subplots()
ax.barh(predictors,confs, align='center',color="#336600",alpha=0.7)
ax.grid(linewidth=0.2)
ax.axvline(x=0.95,linewidth=3,linestyle='--', color = 'black',alpha=0.8,label = '0.95')
ax.set_xlabel("$1-p$ value")
ax.set_ylabel("Predictors")
ax.legend()
plt.show();

# Title:

# Description:

# Data Description:

# Instructions:

# Hints: