1
+ #encoding=utf-8
2
+
3
+ import pandas as pd
4
+ import numpy as np
5
+ import cv2
6
+ import random
7
+ import time
8
+
9
+ from sklearn .cross_validation import train_test_split
10
+ from sklearn .metrics import accuracy_score
11
+
12
+
13
def get_hog_features(trainset):
    """Extract a HOG feature vector from every image using OpenCV.

    Each element of ``trainset`` is reshaped to a 28x28 image and cast to
    ``uint8`` before being handed to a ``cv2.HOGDescriptor`` built from
    ``'../hog.xml'``.

    Returns:
        A NumPy array reshaped to ``(-1, 324)``, i.e. rows of 324 values.
    """
    descriptor = cv2.HOGDescriptor('../hog.xml')

    def _describe(flat_pixels):
        # The descriptor is fed a 2-D uint8 picture rather than the flat row.
        picture = np.reshape(flat_pixels, (28, 28)).astype(np.uint8)
        return descriptor.compute(picture)

    stacked = np.array([_describe(flat_pixels) for flat_pixels in trainset])
    return np.reshape(stacked, (-1, 324))
31
+
32
def Predict(testset, trainset, train_labels, n_neighbors=None):
    """Classify each test vector by a k-nearest-neighbour majority vote.

    For every vector in ``testset`` the Euclidean distance to every row of
    ``trainset`` is computed, the closest ``n_neighbors`` rows are selected,
    and the label that occurs most often among them is predicted.

    Args:
        testset: Iterable of feature vectors to classify.
        trainset: 2-D array-like of training feature vectors, one per row.
        train_labels: Non-negative integer class labels, one per training row.
        n_neighbors: Number of neighbours that vote. ``None`` (the default)
            uses the module-level ``k``.

    Returns:
        A NumPy array with one predicted label per test vector.

    Raises:
        ValueError: If there are no training samples or ``n_neighbors`` is
            smaller than 1.

    Tie rules: training rows at exactly the same distance are ranked by
    their position in ``trainset`` (earlier first); when several labels
    collect the same number of votes, the smallest label wins.
    """
    if n_neighbors is None:
        # Keep honouring the module-level setting used by existing callers.
        n_neighbors = k
    if n_neighbors < 1:
        raise ValueError('n_neighbors must be at least 1')

    train_matrix = np.asarray(trainset)
    label_array = np.asarray(train_labels)
    if len(label_array) == 0:
        raise ValueError('Predict needs at least one labelled training sample')

    predict = []
    for count, test_vec in enumerate(testset):
        # Progress indicator: one line per classified sample.
        print(count)

        # Distance from this test vector to every training row at once.
        distances = np.linalg.norm(train_matrix - np.asarray(test_vec), axis=1)

        # A stable sort keeps equidistant rows in their original order, which
        # makes the neighbour selection deterministic.
        nearest = np.argsort(distances, kind='mergesort')[:n_neighbors]

        # minlength=10 keeps the ten-digit vote table of the original, while
        # larger labels simply extend it instead of raising IndexError.
        votes = np.bincount(label_array[nearest], minlength=10)

        # argmax returns the first maximum, i.e. the smallest winning label.
        predict.append(int(np.argmax(votes)))

    return np.array(predict)
75
+
76
# Number of nearest neighbours that Predict consults for each vote.
k = 10


def main():
    """Run the HOG + kNN experiment on ``../data/train.csv`` and print timings.

    The CSV is read with its first row as the header; column 0 is used as the
    label and the remaining columns as the flattened image pixels. HOG
    features are extracted, the data is split into train and test parts, the
    test part is classified with ``Predict`` and the accuracy is printed.

    Every message is emitted through a single-argument ``print(...)`` call so
    the output text is the same under Python 2 and Python 3.
    """
    print('Start read data')

    time_1 = time.time()

    raw_data = pd.read_csv('../data/train.csv', header=0)
    data = raw_data.values

    imgs = data[:, 1:]
    labels = data[:, 0]

    features = get_hog_features(imgs)

    # Use about 2/3 of the data for training and 1/3 for testing.
    train_features, test_features, train_labels, test_labels = train_test_split(
        features, labels, test_size=0.33, random_state=23323)

    time_2 = time.time()
    print('read data cost  %s  second \n ' % (time_2 - time_1))

    print('Start training')
    print('knn do not need to train')
    # kNN has no fitting step; this timestamp only keeps the report uniform.
    time_3 = time.time()
    print('training cost  %s  second \n ' % (time_3 - time_2))

    print('Start predicting')
    test_predict = Predict(test_features, train_features, train_labels)
    time_4 = time.time()
    print('predicting cost  %s  second \n ' % (time_4 - time_3))

    score = accuracy_score(test_labels, test_predict)
    print('The accruacy socre is  %s' % score)


if __name__ == '__main__':
    main()
0 commit comments