In [1]:
import pandas as pd
import numpy as np

Preprocessing

In [2]:
dat=pd.read_csv("/Users/wainer/Dropbox/cursos/ml/abalone.csv",header=None)
In [3]:
dat.head(n=6)
Out[3]:
0 1 2 3 4 5 6 7 8
0 M 0.455 0.365 0.095 0.5140 0.2245 0.1010 0.150 15
1 M 0.350 0.265 0.090 0.2255 0.0995 0.0485 0.070 7
2 F 0.530 0.420 0.135 0.6770 0.2565 0.1415 0.210 9
3 M 0.440 0.365 0.125 0.5160 0.2155 0.1140 0.155 10
4 I 0.330 0.255 0.080 0.2050 0.0895 0.0395 0.055 7
5 I 0.425 0.300 0.095 0.3515 0.1410 0.0775 0.120 8
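
Assuming this is the standard UCI abalone dataset, the unnamed columns are, in order: sex, length, diameter, height, whole weight, shucked weight, viscera weight, shell weight, and rings. The notebook keeps the numeric labels, so the renaming below is an illustrative sketch only:

# Hypothetical: attach the UCI column names to a copy of the data
# (assumption: the CSV follows the standard UCI column order).
named = dat.copy()
named.columns = ['sex', 'length', 'diameter', 'height', 'whole_weight',
                 'shucked_weight', 'viscera_weight', 'shell_weight', 'rings']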
In [4]:
zz=pd.get_dummies(dat)      # one-hot encode the categorical sex column (0_F, 0_I, 0_M)
y=np.where(zz[8]>13,1,0)    # binary target: 1 if more than 13 rings
zz=zz.drop([8],axis=1)      # drop the ring count from the features
zz.head(n=6)
Out[4]:
1 2 3 4 5 6 7 0_F 0_I 0_M
0 0.455 0.365 0.095 0.5140 0.2245 0.1010 0.150 0 0 1
1 0.350 0.265 0.090 0.2255 0.0995 0.0485 0.070 0 0 1
2 0.530 0.420 0.135 0.6770 0.2565 0.1415 0.210 1 0 0
3 0.440 0.365 0.125 0.5160 0.2155 0.1140 0.155 0 0 1
4 0.330 0.255 0.080 0.2050 0.0895 0.0395 0.055 0 1 0
5 0.425 0.300 0.095 0.3515 0.1410 0.0775 0.120 0 1 0
In [5]:
Xtrain=np.asarray(zz)[:3133,]   # first 3133 rows for training
Xtest=np.asarray(zz)[3133:,]    # remaining rows for testing
ytrain=np.asarray(y)[:3133]
ytest=np.asarray(y)[3133:]
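
The split above simply takes the first 3133 rows for training and the rest for testing. A hedged alternative is scikit-learn's train_test_split; note that it shuffles by default, so the accuracies below, which assume the fixed split, would change:

from sklearn.model_selection import train_test_split

# Sketch only: a shuffled 75/25 split instead of the fixed split above.
Xtr, Xte, ytr, yte = train_test_split(np.asarray(zz), y,
                                      test_size=0.25, random_state=1)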

Logistic regression without regularization

In [6]:
from sklearn.linear_model import LogisticRegression
In [7]:
model1=LogisticRegression(C=1000000,random_state=1).fit(Xtrain,ytrain)   # huge C: essentially no regularization
In [8]:
ymod1=model1.predict(Xtest)

Result

In [9]:
print('{:.3f}'.format(np.sum(ymod1==ytest)/len(ytest)))
0.897
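
The same test-set accuracy can also be obtained with scikit-learn's helpers; a minimal sketch:

from sklearn.metrics import accuracy_score

# Both lines should reproduce the 0.897 computed above.
print('{:.3f}'.format(accuracy_score(ytest, ymod1)))
print('{:.3f}'.format(model1.score(Xtest, ytest)))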

Logistic regression with regularization

In [10]:
model2=LogisticRegression(C=1,random_state=1).fit(Xtrain,ytrain)
ymod2=model2.predict(Xtest)
print('{:.3f}'.format(np.sum(ymod2==ytest)/len(ytest)))
0.892

With standardization

In [11]:
from sklearn.preprocessing import StandardScaler
scal=StandardScaler().fit(Xtrain)   # fit the scaler on the training data only
Xtrain_sc=scal.transform(Xtrain)
Xtest_sc=scal.transform(Xtest)      # reuse the training statistics on the test data
model3=LogisticRegression(C=1000000,random_state=1).fit(Xtrain_sc,ytrain)
ymod3=model3.predict(Xtest_sc)
print('{:.3f}'.format(np.sum(ymod3==ytest)/len(ytest)))
0.897
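
Scaling and fitting can also be combined into a single estimator; a minimal sketch with make_pipeline, equivalent to model3 above:

from sklearn.pipeline import make_pipeline

# Scale inside the estimator; should match model3's test accuracy.
pipe = make_pipeline(StandardScaler(),
                     LogisticRegression(C=1000000, random_state=1))
pipe.fit(Xtrain, ytrain)
print('{:.3f}'.format(pipe.score(Xtest, ytest)))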

PCA

In [12]:
from sklearn.decomposition import PCA
In [13]:
uu=PCA().fit(Xtrain)
In [14]:
print(np.round(uu.explained_variance_ratio_,2))
print(np.cumsum(uu.explained_variance_ratio_))
[ 0.51  0.33  0.14  0.    0.    0.    0.    0.    0.    0.  ]
[ 0.51358501  0.84686596  0.99099342  0.99491507  0.99785832  0.99888126
  0.99944619  0.99985374  1.          1.        ]
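
The first three components already explain about 99% of the variance, which motivates n_components=3 below. As a hedged alternative, PCA also accepts a fraction for n_components and keeps the smallest number of components reaching that cumulative variance:

# Sketch: pick components by a 99% variance threshold instead of a
# fixed count; given the cumulative ratios above this should keep 3.
ww99 = PCA(n_components=0.99, svd_solver='full').fit(Xtrain)
print(ww99.n_components_)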
In [15]:
ww=PCA(n_components=3).fit(Xtrain)
In [16]:
Xtrain_pca=ww.transform(Xtrain)
Xtest_pca=ww.transform(Xtest)

PCA without regularization

In [17]:
model4=LogisticRegression(C=1000000,random_state=1).fit(Xtrain_pca,ytrain)
ymod4=model4.predict(Xtest_pca)
print('{:.3f}'.format(np.sum(ymod4==ytest)/len(ytest)))
0.884

PCA with regularization

In [18]:
model5=LogisticRegression(C=1,random_state=1).fit(Xtrain_pca,ytrain)
ymod5=model5.predict(Xtest_pca)
print('{:.3f}'.format(np.sum(ymod5==ytest)/len(ytest)))
0.884
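
Note that the PCA above is fit on unscaled features, so the large-variance weight columns dominate the components. A hedged variant that standardizes before PCA; its accuracy would differ from the runs above:

from sklearn.pipeline import make_pipeline

# Sketch: standardize first so no single feature dominates the
# principal components; this is not the notebook's reported setup.
pipe_pca = make_pipeline(StandardScaler(), PCA(n_components=3),
                         LogisticRegression(C=1, random_state=1))
pipe_pca.fit(Xtrain, ytrain)
print('{:.3f}'.format(pipe_pca.score(Xtest, ytest)))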

Missing values - preprocessing

In [19]:
from sklearn.impute import SimpleImputer   # Imputer was removed in scikit-learn 0.22
In [21]:
a2=pd.read_csv("~/Dropbox/cursos/ml/abalone-missing.csv",header=None)
In [22]:
a3=pd.get_dummies(a2)                  # one-hot encode the categorical sex column
y=np.where(a3[8]>13,1,0)               # binary target: 1 if more than 13 rings
a3=a3.drop([8],axis=1)                 # drop the ring count from the features
a4=SimpleImputer().fit_transform(a3)   # mean imputation of the missing values
Xtrain=np.asarray(a4)[:3133,]
Xtest=np.asarray(a4)[3133:,]
ytrain=np.asarray(y)[:3133]
ytest=np.asarray(y)[3133:]
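
SimpleImputer defaults to mean imputation; a hedged sketch of the median strategy, sometimes preferred for skewed features (not the configuration used below):

# Sketch only: median instead of mean imputation.
a4_median = SimpleImputer(strategy='median').fit_transform(a3)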

Missing values without regularization

In [23]:
model6=LogisticRegression(C=1000000,random_state=1).fit(Xtrain,ytrain)
ymod6=model6.predict(Xtest)
print('{:.3f}'.format(np.sum(ymod6==ytest)/len(ytest)))
0.887