import pandas as pd
import numpy as np
# Load the abalone data set. The file has no header row, so pandas assigns
# integer column labels; column 8 is thresholded below to build the target.
# NOTE(review): this is an absolute, user-specific path, whereas the
# missing-values file further down is read through "~/Dropbox/..." -- confirm
# which form is intended before running on another machine.
dat = pd.read_csv("/Users/wainer/Dropbox/cursos/ml/abalone.csv", header=None)

# One-hot encode the non-numeric columns; numeric columns pass through as-is.
zz = pd.get_dummies(dat)

# Binary target: 1 when column 8 is greater than 13, otherwise 0.
y = np.where(zz[8] > 13, 1, 0)

# Remove the column the target was derived from so it is not used as a feature.
zz = zz.drop([8], axis=1)

# Force a float matrix: recent pandas versions emit boolean dummy columns, and
# mixing those with float columns would otherwise produce an object array.
features = np.asarray(zz, dtype=float)

# Fixed positional split: the first 3133 rows train, the remainder tests.
n_train = 3133
Xtrain = features[:n_train]
Xtest = features[n_train:]
ytrain = y[:n_train]
ytest = y[n_train:]
from sklearn.linear_model import LogisticRegression
# Baseline: logistic regression with a very large C (C is the inverse of the
# regularisation strength, so this is close to an unregularised fit) on the
# raw, unscaled features.
model1 = LogisticRegression(C=1000000, random_state=1).fit(Xtrain, ytrain)
ymod1 = model1.predict(Xtest)

# Result: fraction of test rows predicted correctly, shown with 3 decimals.
print('{:.3f}'.format(np.mean(ymod1 == ytest)))

# Same features with C=1, for comparison against the large-C fit above.
model2 = LogisticRegression(C=1, random_state=1).fit(Xtrain, ytrain)
ymod2 = model2.predict(Xtest)
print('{:.3f}'.format(np.mean(ymod2 == ytest)))
from sklearn.preprocessing import StandardScaler
# Standardise the features. The scaler is fitted on the training rows only
# and the same fitted transformation is then applied to both splits.
scal = StandardScaler()
scal.fit(Xtrain)
Xtrain_sc = scal.transform(Xtrain)
Xtest_sc = scal.transform(Xtest)

# Large-C logistic regression again, this time on the standardised features.
model3 = LogisticRegression(C=1000000, random_state=1)
model3.fit(Xtrain_sc, ytrain)
ymod3 = model3.predict(Xtest_sc)

# Test accuracy (correct predictions / number of test rows), 3 decimals.
print(format(np.sum(ymod3 == ytest) / len(ytest), '.3f'))
from sklearn.decomposition import PCA
# Fit a PCA with all components on the training features and print the
# explained-variance ratio per component (rounded) and its running total.
uu = PCA()
uu.fit(Xtrain)
print(uu.explained_variance_ratio_.round(2))
print(np.cumsum(uu.explained_variance_ratio_))

# Keep three components. The projection is fitted on the training rows only
# and then applied to both splits.
ww = PCA(n_components=3)
ww.fit(Xtrain)
Xtrain_pca = ww.transform(Xtrain)
Xtest_pca = ww.transform(Xtest)

# Logistic regression on the 3-component projection with a very large C.
model4 = LogisticRegression(C=1000000, random_state=1)
model4.fit(Xtrain_pca, ytrain)
ymod4 = model4.predict(Xtest_pca)
print(format(np.sum(ymod4 == ytest) / len(ytest), '.3f'))

# Same projection with C=1.
model5 = LogisticRegression(C=1, random_state=1)
model5.fit(Xtrain_pca, ytrain)
ymod5 = model5.predict(Xtest_pca)
print(format(np.sum(ymod5 == ytest) / len(ytest), '.3f'))
# scikit-learn 0.20 introduced sklearn.impute.SimpleImputer and 0.22 removed
# sklearn.preprocessing.Imputer. Try the current location first and fall back
# to the legacy class; either way the name `Imputer` stays bound.
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:
    from sklearn.preprocessing import Imputer

# Variant of the data set that contains missing values (no header row).
a2 = pd.read_csv("~/Dropbox/cursos/ml/abalone-missing.csv", header=None)

# Same preparation as for the complete data: one-hot encode, derive the
# binary target from column 8 (> 13), then drop that column.
a3 = pd.get_dummies(a2)
y = np.where(a3[8] > 13, 1, 0)
a3 = a3.drop([8], axis=1)

# Float matrix so boolean dummy columns do not force an object array.
a3_values = np.asarray(a3, dtype=float)

# Fixed positional split: the first 3133 rows train, the remainder tests.
n_train = 3133

# Fit the imputer (default strategy: column mean) on the training rows only,
# then fill every row with those statistics. Fitting on all rows would let
# test-set values influence the fill-in values used for training.
a4 = Imputer().fit(a3_values[:n_train]).transform(a3_values)

Xtrain = a4[:n_train]
Xtest = a4[n_train:]
ytrain = y[:n_train]
ytest = y[n_train:]

# Large-C logistic regression on the imputed features; print test accuracy.
model6 = LogisticRegression(C=1000000, random_state=1).fit(Xtrain, ytrain)
ymod6 = model6.predict(Xtest)
print('{:.3f}'.format(np.mean(ymod6 == ytest)))