import numpy as np
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
A function to read the data from a zip file (at a URL) and return the X data array and the y vector of class labels.
def le_dados(url):
    import zipfile
    import urllib.request
    import io
    def digito(fn):
        # the class label is the character right after the '/' in the file name
        return fn[fn.find('/') + 1]
    a = urllib.request.urlopen(url)
    # read the whole zip file into memory!!
    mem = io.BytesIO(a.read())
    zf = zipfile.ZipFile(mem)
    # skip the first entry of the archive (the directory itself)
    files = zf.namelist()[1:]
    n = len(files)
    # the first 4 tokens of each file are the image header; the rest are the pixels
    aux = zf.read(files[0]).split()[4:]
    x = np.empty((n, len(aux)), dtype=int)
    y = np.empty(n, dtype=int)
    for i, f in enumerate(files):
        y[i] = int(digito(f))
        x[i] = [int(v) for v in zf.read(f).split()[4:]]
    return (x, y)
X,Y=le_dados("http://www.ic.unicamp.br/~wainer/cursos/1s2014/train17.zip")
Y
X[0:10,10:30]
I don't know whether this is OK or not: there are only zeros there.
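As a quick sanity check (a sketch; it only reuses the X already loaded above), we can count the nonzero pixels per image to confirm that the rows are not all zeros:

# nonzero pixel counts for the first few images; zeros here would indicate a loading bug
print(np.count_nonzero(X[:10], axis=1))

Let us look at one of the entries. First, let us turn on inline display.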
%matplotlib inline
pyplot can show a 2D array as an image, but we must convert a row of X into a 2D array.
import matplotlib.pyplot as plt
plt.imshow(X[30,:].reshape((64,64)), cmap=plt.cm.gray)
It looks OK. An ugly "1", but a "1".
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X,Y)
Xtest,Ytest=le_dados("http://www.ic.unicamp.br/~wainer/cursos/1s2014/test17.zip")
accur=knn.score(Xtest,Ytest)
print(accur)
The test went well. The accuracy was 0.8125, that is, an error rate of 0.1875. I can now write the loop over all values of k.
for k in (1, 3, 5, 11, 17, 21):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X, Y)
    # report the error rate (1 - accuracy) for each k
    print("k=", k, 1 - knn.score(Xtest, Ytest))
Let us try the PCA now.
pca100=PCA(n_components=100)
X100=pca100.fit_transform(X)
Xtest100=pca100.transform(Xtest)
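As an aside, we can check how much of the total variance the 100 components retain; explained_variance_ratio_ is a standard attribute of a fitted sklearn PCA (a quick sketch):

# fraction of the total variance kept by the 100 components
print(pca100.explained_variance_ratio_.sum())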
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X100,Y)
print(1 - knn.score(Xtest100, Ytest))
Odd, the same number! Let us try the loop.
for k in (1, 3, 5, 11, 17, 21):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X100, Y)
    print("k=", k, 1 - knn.score(Xtest100, Ytest))
So the first value is the same but the others are not. It seems OK.
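To make sure the identical k=1 error is not just a rounding coincidence, one can compare the two prediction vectors directly (a sketch; it simply refits the two 1-NN models from above):

knn_raw = KNeighborsClassifier(n_neighbors=1).fit(X, Y)
knn_pca = KNeighborsClassifier(n_neighbors=1).fit(X100, Y)
# number of test points where the two classifiers disagree
print(np.sum(knn_raw.predict(Xtest) != knn_pca.predict(Xtest100)))

Now the other PCA, with 40 components.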
pca40 = PCA(n_components=40)
X40=pca40.fit_transform(X)
Xtest40=pca40.transform(Xtest)
for k in (1, 3, 5, 11, 17, 21):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X40, Y)
    print("k=", k, 1 - knn.score(Xtest40, Ytest))
Well, at least it is different from the PCA with 100 components.
The best alternatives were:
* no PCA and k=1, error rate of 0.1875
* PCA of 100 and k=1, same error rate
* PCA of 40 and k=3, same error rate
I am somewhat puzzled that the error rate was the same in all three cases.
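One plausible explanation: if the discarded components carry little variance, the PCA projection almost preserves Euclidean distances, so each test point keeps the same nearest neighbor. A rough check of that idea (a sketch, reusing the X and X100 from above on a small subsample):

from sklearn.metrics import pairwise_distances
d_raw = pairwise_distances(X[:50])
d_pca = pairwise_distances(X100[:50])
# maximum distortion of pairwise distances, relative to the largest distance
print(np.abs(d_raw - d_pca).max() / d_raw.max())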
Now the same experiments for the second data set (train49/test49).
X,Y=le_dados("http://www.ic.unicamp.br/~wainer/cursos/1s2014/train49.zip")
Xtest,Ytest=le_dados("http://www.ic.unicamp.br/~wainer/cursos/1s2014/test49.zip")
for k in (1, 3, 5, 11, 17, 21):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X, Y)
    print("k=", k, 1 - knn.score(Xtest, Ytest))
pca100 = PCA(n_components=100)
X100=pca100.fit_transform(X)
Xtest100=pca100.transform(Xtest)
for k in (1, 3, 5, 11, 17, 21):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X100, Y)
    print("k=", k, 1 - knn.score(Xtest100, Ytest))
pca40 = PCA(n_components=40)
X40=pca40.fit_transform(X)
Xtest40=pca40.transform(Xtest)
for k in (1, 3, 5, 11, 17, 21):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X40, Y)
    print("k=", k, 1 - knn.score(Xtest40, Ytest))
The best result here: PCA of 40 and k=1, with an error rate of 0.12.