1
+ # -*- coding: utf-8 -*-
2
+ # @Author: lock
3
+ # @Date: 2017-12-23 19:24:54
4
+ # @Last Modified by: lock
5
+ # @Last Modified time: 2017-12-23 19:41:34
6
+ import math
7
+ import numpy as np
8
+ from matplotlib import pyplot
9
+ from collections import Counter
10
+ import warnings
11
+ # K最近邻算法
12
+ # 两个分组时k值取3,3个分组时k值取5...
13
+
14
+ # k-Nearest Neighbor算法
15
def k_nearest_neighbors(data, predict, k=3):
    """Classify ``predict`` by majority vote among its k nearest samples.

    Args:
        data: Mapping of group label -> sequence of feature vectors. Every
            feature vector must have the same length as ``predict``.
        predict: Feature vector of the sample to classify.
        k: Number of nearest neighbours allowed to vote (must be >= 1).

    Returns:
        A ``(group, confidence)`` tuple. ``group`` is the label that occurs
        most often among the nearest neighbours; ``confidence`` is the
        fraction of the votes actually cast that went to that label, e.g.
        (red, red, red) -> 1.0 while (red, red, black) -> 0.666...

    Raises:
        ValueError: If ``k`` is smaller than 1 or ``data`` holds no samples.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")

    # Warn when k does not exceed the number of groups.
    if len(data) >= k:
        warnings.warn("k is too small")

    # Distance from the query point to every training sample.  The query is
    # converted once; np.linalg.norm gives the Euclidean distance.
    target = np.array(predict)
    distances = [
        [np.linalg.norm(np.array(features) - target), group]
        for group in data
        for features in data[group]
    ]
    if not distances:
        raise ValueError("data contains no training samples")

    # Labels of the (at most) k closest samples, nearest first.
    top_nearest = [group for _, group in sorted(distances)[:k]]

    # Count the votes once; most_common(1) yields [(label, votes)].
    group_res, votes = Counter(top_nearest).most_common(1)[0]

    # Divide by the votes actually cast: when fewer than k samples exist,
    # dividing by k would under-report a unanimous vote.
    confidence = votes * 1.0 / len(top_nearest)
    return group_res, confidence
36
+
37
if __name__ == '__main__':
    # Demo: two labelled groups whose labels double as matplotlib colours,
    # plus one unlabelled sample to classify.
    dataset = {'black': [[1, 2], [2, 3], [3, 1]], 'red': [[6, 5], [7, 7], [8, 6]]}
    new_features = [3.5, 5.2]  # the sample whose group we want to determine

    # Draw every training sample in the colour of its group.
    for label, points in dataset.items():
        for x, y in points:
            pyplot.scatter(x, y, s=50, color=label)

    # Rule of thumb from the author: k=3 for two groups, k=5 for three groups.
    which_group, confidence = k_nearest_neighbors(dataset, new_features, k=3)
    print(which_group, confidence)

    # Draw the classified sample larger (s is the marker size) in the colour
    # of the predicted group, then display the figure.
    pyplot.scatter(new_features[0], new_features[1], s=300, color=which_group)
    pyplot.show()
0 commit comments