Skip to content

Commit e4a14bb

Browse files
committed
knn slow version
1 parent 25e644a commit e4a14bb

File tree

2 files changed

+135
-0
lines changed

2 files changed

+135
-0
lines changed

knn/distance_test.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
#encoding=utf-8
2+
3+
import pandas as pd
4+
import numpy as np
5+
import time
6+
7+
if __name__ == '__main__':
    # Compare two ways of computing the Euclidean distance between two
    # 20-dimensional vectors: an explicit sqrt(sum(square(diff))) and
    # np.linalg.norm(diff).
    #
    # Each timed interval covers ONLY the computation.  Printing happens
    # after the clock is stopped, because writing to stdout is far slower
    # than either computation and would otherwise dominate the measurement.
    # Single-argument print(...) calls behave the same on Python 2 and 3.
    vec_1 = np.array([0] * 20)
    vec_2 = np.array([2] * 20)

    # Variant 1: spelled-out formula.
    time_1 = time.time()
    manual_distance = np.sqrt(np.sum(np.square(vec_1 - vec_2)))
    time_2 = time.time()

    print(manual_distance)
    print(time_2 - time_1)

    # Variant 2: library norm of the difference vector.
    time_3 = time.time()
    norm_distance = np.linalg.norm(vec_1 - vec_2)
    time_4 = time.time()

    print(norm_distance)
    print(time_4 - time_3)

knn/knn.py

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
#encoding=utf-8
2+
3+
import pandas as pd
4+
import numpy as np
5+
import cv2
6+
import random
7+
import time
8+
9+
from sklearn.cross_validation import train_test_split
10+
from sklearn.metrics import accuracy_score
11+
12+
13+
# Extract HOG features from the images using OpenCV.
def get_hog_features(trainset):
    """Compute one HOG descriptor per image and return them as a 2-D array.

    Each element of ``trainset`` is reshaped to 28x28 and cast to ``uint8``
    before being passed to the OpenCV HOG descriptor, which is constructed
    from the file ``'../hog.xml'`` (resolved against the current working
    directory).

    Returns:
        An array reshaped to ``(-1, 324)``, i.e. one row of 324 values per
        image.  NOTE(review): the 324 presumably matches the descriptor
        length implied by ``hog.xml`` -- confirm if that file changes.
    """
    descriptor = cv2.HOGDescriptor('../hog.xml')

    per_image = [
        descriptor.compute(np.reshape(pixels, (28, 28)).astype(np.uint8))
        for pixels in trainset
    ]

    return np.reshape(np.array(per_image), (-1, 324))
31+
32+
def Predict(testset, trainset, train_labels, n_neighbors=None):
    """Classify every test vector by brute-force k-nearest-neighbours voting.

    For each vector in ``testset`` the Euclidean distance to every training
    vector is computed, the closest neighbours are kept, and the label that
    occurs most often among them is predicted.

    Args:
        testset: sequence of feature vectors to classify.
        trainset: sequence of training feature vectors; ``trainset[i]``
            belongs to ``train_labels[i]``.
        train_labels: sequence of labels.  Any hashable, mutually orderable
            values are accepted (not only the integers 0..9).
        n_neighbors: number of neighbours to consult.  Defaults to the
            module-level ``k`` when omitted.

    Returns:
        ``np.array`` with one predicted label per test vector.  When several
        labels share the highest vote count, the smallest label wins.

    Raises:
        ValueError: if the neighbour count is below 1 or there are no
            training labels (a vote would be meaningless).
    """
    neighbour_limit = k if n_neighbors is None else n_neighbors
    if neighbour_limit < 1:
        raise ValueError('the number of neighbours must be at least 1')
    if len(train_labels) == 0:
        raise ValueError('cannot predict without any training samples')

    train_vectors = np.asarray(trainset)
    predict = []

    for count, test_vec in enumerate(np.asarray(testset)):
        # Progress indicator: this brute-force search is slow on large sets.
        print(count)

        # Parallel lists describing the nearest neighbours found so far.
        neighbour_dists = []
        neighbour_labels = []

        for i, label in enumerate(train_labels):
            dist = np.linalg.norm(train_vectors[i] - test_vec)

            # Until enough neighbours are collected, accept every sample.
            if len(neighbour_dists) < neighbour_limit:
                neighbour_dists.append(dist)
                neighbour_labels.append(label)
                continue

            # Otherwise evict the current farthest neighbour, but only when
            # the new sample is strictly closer.  index() returns the first
            # slot holding the maximum, so equal-distance ties are resolved
            # in favour of the earliest slot.
            worst = max(neighbour_dists)
            if dist < worst:
                slot = neighbour_dists.index(worst)
                neighbour_dists[slot] = dist
                neighbour_labels[slot] = label

        # Majority vote over the kept neighbours.
        votes = {}
        for label in neighbour_labels:
            votes[label] = votes.get(label, 0) + 1

        top = max(votes.values())
        predict.append(min(label for label in votes if votes[label] == top))

    return np.array(predict)


# Default number of neighbours consulted by Predict.
k = 10
77+
78+
if __name__ == '__main__':
    # End-to-end run: load the CSV, extract HOG features, split the data,
    # classify the held-out part with Predict and report the accuracy.
    #
    # Every print below is a single-argument call so the script behaves the
    # same on Python 2 and Python 3.  The doubled spaces in the timing
    # messages reproduce what the earlier comma-separated print statements
    # emitted.
    print('Start read data')

    time_1 = time.time()

    # header=0: the first CSV row holds column names.
    raw_data = pd.read_csv('../data/train.csv', header=0)
    data = raw_data.values

    # Column 0 is the label; all remaining columns are the image pixels.
    imgs = data[:, 1:]
    labels = data[:, 0]

    features = get_hog_features(imgs)

    # Use roughly 2/3 of the data for training and 1/3 for testing.
    train_features, test_features, train_labels, test_labels = train_test_split(
        features, labels, test_size=0.33, random_state=23323)

    time_2 = time.time()
    print('read data cost  %s  second \n' % (time_2 - time_1))

    print('Start training')
    print('knn do not need to train')
    time_3 = time.time()
    print('training cost  %s  second \n' % (time_3 - time_2))

    print('Start predicting')
    test_predict = Predict(test_features, train_features, train_labels)
    time_4 = time.time()
    print('predicting cost  %s  second \n' % (time_4 - time_3))

    score = accuracy_score(test_labels, test_predict)
    print('The accuracy score is  %s' % score)

0 commit comments

Comments
 (0)