import numpy as np
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
A function to read the data from a zip file (at a URL) and return the X data array and the y vector of class labels.
def le_dados(url):
    import zipfile
    import urllib.request
    import io
    def digito(fn):
        # the class label is the character right after the '/' in the file name
        return fn[fn.find('/') + 1]
    a = urllib.request.urlopen(url)
    # read the whole zip file into memory!!
    mem = io.BytesIO(a.read())
    zf = zipfile.ZipFile(mem)
    # skip the first entry of the archive (the directory itself)
    files = zf.namelist()[1:]
    n = len(files)
    # the first 4 tokens of each file are the image header; the rest are the pixels
    aux = zf.read(files[0]).split()[4:]
    x = np.empty((n, len(aux)), dtype=int)
    y = np.empty(n, dtype=int)
    for i, f in enumerate(files):
        y[i] = int(digito(f))
        x[i] = [int(v) for v in zf.read(f).split()[4:]]
    return (x, y)
X,Y=le_dados("http://www.ic.unicamp.br/~wainer/cursos/1s2014/train17.zip")
Y
X[0:10,10:30]
I don't know whether this is OK or not: there are only zeros there.
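As a quick sanity check (a sketch; it only reuses the X already loaded above), we can count the nonzero pixels per image to confirm that the rows are not all zeros:

# nonzero pixel counts for the first few images; zeros here would indicate a loading bug
print(np.count_nonzero(X[:10], axis=1))

Let us look at one of the entries. First, let us turn on inline display.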
%matplotlib inline
pyplot can show a 2D array as an image, but we must convert a row of X into a 2D array.
import matplotlib.pyplot as plt
plt.imshow(X[30,:].reshape((64,64)), cmap=plt.cm.gray)
It looks OK. An ugly "1", but a "1".
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X,Y)
Xtest,Ytest=le_dados("http://www.ic.unicamp.br/~wainer/cursos/1s2014/test17.zip")
accur=knn.score(Xtest,Ytest)
print(accur)
The test went well. The accuracy was 0.8125, that is, an error rate of 0.1875. I can now write the loop over all values of k.
for k in (1, 3, 5, 11, 17, 21):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X, Y)
    # report the error rate (1 - accuracy) for each k
    print("k=", k, 1 - knn.score(Xtest, Ytest))
Let us try the PCA now.
pca100=PCA(n_components=100)
X100=pca100.fit_transform(X)
Xtest100=pca100.transform(Xtest)
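As an aside, we can check how much of the total variance the 100 components retain; explained_variance_ratio_ is a standard attribute of a fitted sklearn PCA (a quick sketch):

# fraction of the total variance kept by the 100 components
print(pca100.explained_variance_ratio_.sum())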
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X100,Y)
print(1 - knn.score(Xtest100, Ytest))
Odd, the same number! Let us try the loop.
for k in (1, 3, 5, 11, 17, 21):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X100, Y)
    print("k=", k, 1 - knn.score(Xtest100, Ytest))
So the first value is the same but the others are not. It seems OK.
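To make sure the identical k=1 error is not just a rounding coincidence, one can compare the two prediction vectors directly (a sketch; it simply refits the two 1-NN models from above):

knn_raw = KNeighborsClassifier(n_neighbors=1).fit(X, Y)
knn_pca = KNeighborsClassifier(n_neighbors=1).fit(X100, Y)
# number of test points where the two classifiers disagree
print(np.sum(knn_raw.predict(Xtest) != knn_pca.predict(Xtest100)))

Now the other PCA, with 40 components.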
pca40 = PCA(n_components=40)
X40=pca40.fit_transform(X)
Xtest40=pca40.transform(Xtest)
for k in (1, 3, 5, 11, 17, 21):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X40, Y)
    print("k=", k, 1 - knn.score(Xtest40, Ytest))
Well, at least it is different from the PCA with 100 components.
The best alternatives were:
* no PCA and k=1, error rate of 0.1875
* PCA of 100 and k=1, same error rate
* PCA of 40 and k=3, same error rate
I am somewhat puzzled that the error rate was the same in all three cases.
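One plausible explanation: if the discarded components carry little variance, the PCA projection almost preserves Euclidean distances, so each test point keeps the same nearest neighbor. A rough check of that idea (a sketch, reusing the X and X100 from above on a small subsample):

from sklearn.metrics import pairwise_distances
d_raw = pairwise_distances(X[:50])
d_pca = pairwise_distances(X100[:50])
# maximum distortion of pairwise distances, relative to the largest distance
print(np.abs(d_raw - d_pca).max() / d_raw.max())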
Now the same experiments for the second data set (train49/test49).
X,Y=le_dados("http://www.ic.unicamp.br/~wainer/cursos/1s2014/train49.zip")
Xtest,Ytest=le_dados("http://www.ic.unicamp.br/~wainer/cursos/1s2014/test49.zip")
for k in (1, 3, 5, 11, 17, 21):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X, Y)
    print("k=", k, 1 - knn.score(Xtest, Ytest))
pca100 = PCA(n_components=100)
X100=pca100.fit_transform(X)
Xtest100=pca100.transform(Xtest)
for k in (1, 3, 5, 11, 17, 21):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X100, Y)
    print("k=", k, 1 - knn.score(Xtest100, Ytest))
pca40 = PCA(n_components=40)
X40=pca40.fit_transform(X)
Xtest40=pca40.transform(Xtest)
for k in (1, 3, 5, 11, 17, 21):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X40, Y)
    print("k=", k, 1 - knn.score(Xtest40, Ytest))
The best result here: PCA of 40 and k=1, with an error rate of 0.12.