Lecture 19: SVM 2

Data Science 1: CS 109A/STAT 121A/AC 209A/E 109A
Instructors: Pavlos Protopapas, Kevin Rader, Rahul Dave

Harvard University
Fall 2017


Import libraries

In [1]:
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sp
import seaborn as sns
import statsmodels as sm
from scipy import stats
from sklearn import ensemble
from sklearn import svm
from sklearn import tree
from sklearn.decomposition import PCA
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeClassifier
sns.set(style="ticks")
%matplotlib inline

%matplotlib inline

Read in the data

In [2]:
# Read the comma-separated bone-density data and preview the first rows
data_bone = pd.read_csv('../data/bone.csv', sep=',')
data_bone.head()
Out[2]:
rat group treatment bone density lowjump highjump
0 1 Control 1 611 0 0
1 2 Control 1 621 0 0
2 3 Control 1 614 0 0
3 4 Control 1 593 0 0
4 5 Control 1 593 0 0
In [3]:
# Bone density measurements for each experimental group.
# A single .loc[row_mask, column] lookup replaces chained df[col][mask] indexing.
controlgroup = data_bone.loc[data_bone['group'] == 'Control', 'bone density']
trt1group = data_bone.loc[data_bone['group'] == 'Lowjump', 'bone density']
trt2group = data_bone.loc[data_bone['group'] == 'Highjump', 'bone density']

# Two-sample t-test comparing the control group with the low-jump group.
# `stats` is imported explicitly (from scipy import stats) rather than reached
# through `sp.stats`, which depends on another package having loaded the submodule.
stats.ttest_ind(controlgroup, trt1group)
Out[3]:
Ttest_indResult(statistic=-1.0760571105207466, pvalue=0.29611370541618665)
In [4]:
# One-way ANOVA F-test across the control, low-jump and high-jump groups.
# Uses the explicitly imported `stats` submodule instead of relying on `sp.stats`.
stats.f_oneway(controlgroup, trt1group, trt2group)
Out[4]:
F_onewayResult(statistic=7.977836956953773, pvalue=0.0018951062682877964)
In [9]:
# Overall mean bone density, used as the cutoff for the binary indicator
ybar = data_bone['bone density'].mean()

# True where a rat's bone density exceeds the overall mean (computed once, reused below)
above_mean = data_bone['bone density'] > ybar

# 0/1 indicators (1 = above the overall mean) for each group
controlgroup_ind = 1 * above_mean[data_bone['group'] == 'Control']
trt1group_ind = 1 * above_mean[data_bone['group'] == 'Lowjump']
trt2group_ind = 1 * above_mean[data_bone['group'] == 'Highjump']

# Contingency table: group (rows) by above-mean indicator (columns)
mytable = pd.crosstab(data_bone['group'], above_mean)

# Chi-square test of independence between group and the above-mean indicator.
# The result holds the statistic, p-value, degrees of freedom and expected counts.
# NOTE(review): the recorded output shows expected counts below 5 in some cells,
# so the chi-square approximation may be rough here -- consider confirming with
# an exact or simulation-based test.
stats.chi2_contingency(mytable)
Out[9]:
(13.92857142857143, 0.0009450377215474957, 2, array([[4.66666667, 5.33333333],
        [4.66666667, 5.33333333],
        [4.66666667, 5.33333333]]))