Key Word(s): NN
Models¶
In [ ]:
#!pip install seaborn
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
In [2]:
df = pd.read_csv("local-olives-cleaned.csv")
df.head()
Out[2]:
In [3]:
acidlist=['palmitic', 'palmitoleic', 'stearic', 'oleic', 'linoleic', 'linolenic', 'arachidic', 'eicosenoic']
In [4]:
dfsouth=df[df.regionstring=='South']
dfsouth.head()
Out[4]:
Predicting via SVM¶
In [46]:
dfnew = df[['eicosenoic', 'region', 'regionstring']].copy()  # copy to avoid SettingWithCopyWarning
dfnew['linoarch'] = (0.969/1022.0)*df.linoleic + (0.245/105.0)*df.arachidic
dfnew.head()
Out[46]:
In [5]:
dfnosouth=df[df.regionstring!='South']
dfnosouth.head()
Out[5]:
In [7]:
plt.scatter(dfnosouth.linoleic, dfnosouth.arachidic, c=dfnosouth.region, s=50);
In [20]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC # "Support Vector Classifier"
def plot_svc_decision_function(clf, ax=None):
    """Plot the decision function for a 2D SVC"""
    if ax is None:
        ax = plt.gca()
    x = np.linspace(plt.xlim()[0], plt.xlim()[1], 30)
    y = np.linspace(plt.ylim()[0], plt.ylim()[1], 30)
    Y, X = np.meshgrid(y, x)
    P = np.zeros_like(X)
    # evaluate the decision function at every grid point
    for i, xi in enumerate(x):
        for j, yj in enumerate(y):
            P[i, j] = clf.decision_function([[xi, yj]])[0]
    # level 0 is the decision boundary; levels -1 and 1 are the margins
    return ax.contour(X, Y, P, colors='k',
                      levels=[-1, 0, 1], alpha=0.5,
                      linestyles=['--', '-', '--'])
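As an aside, the double loop above can be vectorized: decision_function accepts an (n_samples, 2) array, so the whole grid can be evaluated in one call. A minimal sketch of that variant (the name plot_svc_decision_function_fast is ours, not from the original notebook):
In [ ]:
def plot_svc_decision_function_fast(clf, ax=None):
    """Vectorized variant of plot_svc_decision_function."""
    if ax is None:
        ax = plt.gca()
    x = np.linspace(plt.xlim()[0], plt.xlim()[1], 30)
    y = np.linspace(plt.ylim()[0], plt.ylim()[1], 30)
    Y, X = np.meshgrid(y, x)
    # stack the grid into an (n_points, 2) array, then reshape the scores back
    P = clf.decision_function(np.c_[X.ravel(), Y.ravel()]).reshape(X.shape)
    return ax.contour(X, Y, P, colors='k',
                      levels=[-1, 0, 1], alpha=0.5,
                      linestyles=['--', '-', '--'])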
In [23]:
X = dfnosouth[['linoleic', 'arachidic']]
y = (dfnosouth.regionstring.values == 'Sardinia')*1
Xtrain, Xtest, ytrain, ytest = train_test_split(X.values, y)
clf = SVC(kernel="linear")
clf.fit(Xtrain, ytrain)
plt.scatter(Xtrain[:, 0], Xtrain[:, 1], c=ytrain, s=50, cmap='spring', alpha=0.3)
plot_svc_decision_function(clf, plt.gca())
plt.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1],
            s=200, facecolors='none')
plt.scatter(Xtest[:, 0], Xtest[:, 1], c=ytest, s=50, marker="s", cmap='spring', alpha=0.5);
In [24]:
clf.score(Xtest, ytest)
Out[24]:
In [57]:
confusion_matrix(ytest, clf.predict(Xtest))  # rows: true class, columns: predicted class
Out[57]:
Allowing for crossovers¶
In [28]:
from sklearn.model_selection import GridSearchCV
def cv_optimize_svm(X, y, n_folds=10, num_p=50):
    #clf = SVC()
    #parameters = {"C": np.logspace(-4, 3, num=num_p), "gamma": np.logspace(-4, 3, num=10)}
    clf = SVC(kernel="linear", probability=True)
    parameters = {"C": np.logspace(-4, 3, num=num_p)}
    gs = GridSearchCV(clf, param_grid=parameters, cv=n_folds)
    gs.fit(X, y)
    return gs

def get_optim_classifier_svm(indf, inacidlist, clon, clonval):
    subdf = indf[inacidlist]
    subdfstd = (subdf - subdf.mean())/subdf.std()  # standardize each feature
    X = subdfstd.values
    y = (indf[clon].values == clonval)*1
    Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=0.8)
    #Xtrain, Xtest, ytrain, ytest = X, X, y, y
    fitted = cv_optimize_svm(Xtrain, ytrain)
    return fitted, Xtrain, ytrain, Xtest, ytest
In [29]:
thesvcfit, Xtr, ytr, Xte, yte = get_optim_classifier_svm(dfnosouth, ['linoleic','arachidic'],'regionstring', "Sardinia")
#thesvcfit, Xtr, ytr, Xte, yte = get_optim_classifier_binary(dfsouthns, ['palmitic','palmitoleic'],'area', 3)
thesvcfit.best_estimator_, thesvcfit.best_params_, thesvcfit.best_score_
Out[29]:
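Before refitting and plotting, it can help to see how the cross-validated accuracy varies with C. A minimal sketch using GridSearchCV's cv_results_ attribute (assumes thesvcfit from the cell above is in scope):
In [ ]:
# Mean cross-validated accuracy as a function of C (log scale).
cvres = pd.DataFrame(thesvcfit.cv_results_)
plt.semilogx(cvres["param_C"].astype(float), cvres["mean_test_score"])
plt.xlabel("C")
plt.ylabel("mean CV accuracy");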
In [45]:
def plot_svm_new(clf, Xtr, ytr, Xte, yte):
    plt.scatter(Xtr[:, 0], Xtr[:, 1], c=ytr, s=50, cmap='spring', alpha=0.5)
    plt.scatter(Xte[:, 0], Xte[:, 1], marker='s', c=yte, s=50, cmap='spring', alpha=0.5)
    #plt.xlim(-1, 4)
    #plt.ylim(-1, 6)
    plot_svc_decision_function(clf, plt.gca())
    # highlight the support vectors with open circles
    plt.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1],
                s=100, facecolors='none', lw=2, alpha=0.4)
print(dict(kernel="linear", **thesvcfit.best_params_))
clsvc = SVC(kernel="linear", **thesvcfit.best_params_).fit(Xtr, ytr)
plot_svm_new(clsvc, Xtr, ytr, Xte, yte)
The best fit achieves a bigger margin by allowing some points to fall inside it, i.e., by penalizing margin violations less. If we instead use scikit-learn's default C=1, violations are penalized more heavily, so fewer crossovers are tolerated and the margin shrinks.
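To see the difference, we can refit with the default C=1 on the same standardized training data and plot it with the helper above. A minimal sketch (it assumes the earlier cells have been run, so Xtr, ytr, Xte, yte, clsvc, and plot_svm_new are in scope):
In [ ]:
# Refit with scikit-learn's default C=1 for comparison.
clsvc_default = SVC(kernel="linear", C=1).fit(Xtr, ytr)
plot_svm_new(clsvc_default, Xtr, ytr, Xte, yte)
# A larger C penalizes margin violations more, typically leaving fewer support vectors.
print("tuned C:", thesvcfit.best_params_["C"],
      "-> support vectors:", clsvc.support_vectors_.shape[0])
print("default C=1 -> support vectors:", clsvc_default.support_vectors_.shape[0])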