Skip to content

Commit f634ff2

Browse files
committed
K最近邻算法
1 parent bc66af2 commit f634ff2

File tree

3 files changed

+70
-1
lines changed

3 files changed

+70
-1
lines changed

README.md

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -499,11 +499,27 @@ while (p2.next is not None and p2.next.next is not None):
499499
print '快慢指针方式,单链表中间节点为:%s,索引为:%s,只遍历一次链表' % (p1.data,step)
500500
```
501501

502+
### K最近邻算法
503+
```
504+
这个算法比svm简单很多
505+
只需使用初中所学的两点距离公式(欧几里得距离公式),计算绿点到各组中每个点的距离,看绿点和哪组更接近。
506+
K代表取距离当前待分类点最近的k个点:如果这k个点中红点占多数,我们就认为绿点应该划分为红组;反之,则划分为黑组。
507+
k值与分类数成正相关,现在是2个分组,那么k值取3,假设是3个分组,那么k值就要取5
508+
参考说明:https://zh.wikipedia.org/wiki/最近鄰居法
509+
依赖:
510+
pip install numpy
511+
pip install matplotlib
512+
513+
下图中标注较大的红点在计算之后被分配到红组
514+
执行:python knn.py
515+
```
516+
![](https://github.com/LockGit/Py/blob/master/img/knn.png)
517+
502518

503519
### 支持向量机 svm.py
504520
```
505521
迟早会忘记的svm
506-
分类算法,寻找一个最优超平面
522+
属分类算法,目标是寻找一个最优超平面,比knn算法复杂
507523
demo为线性可分离数据
508524
509525
参考1:https://zh.wikipedia.org/zh-hans/支持向量机

img/knn.png

113 KB
Loading

knn.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
# -*- coding: utf-8 -*-
2+
# @Author: lock
3+
# @Date: 2017-12-23 19:24:54
4+
# @Last Modified by: lock
5+
# @Last Modified time: 2017-12-23 19:41:34
6+
import math
7+
import numpy as np
8+
from matplotlib import pyplot
9+
from collections import Counter
10+
import warnings
11+
# K最近邻算法
12+
# 两个分组时k值取3,3个分组时k值取5...
13+
14+
# k-Nearest Neighbor算法
15+
def k_nearest_neighbors(data, predict, k=3):
    """Classify ``predict`` by majority vote among its ``k`` nearest points.

    Args:
        data: Mapping of group label -> list of feature vectors, e.g.
            ``{'black': [[1, 2], ...], 'red': [[6, 5], ...]}``.
        predict: Feature vector of the sample to classify. It is subtracted
            from every vector in ``data``, so it is expected to have the
            same dimensionality.
        k: Number of nearest neighbours allowed to vote.

    Returns:
        A ``(group, confidence)`` tuple. ``group`` is the label that occurs
        most often among the selected neighbours; ``confidence`` is the
        fraction of those neighbours carrying that label, so
        ``(red, red, red)`` yields 1.0 while ``(red, red, black)`` yields
        about 0.67.

    Raises:
        IndexError: If no neighbour is selected (for example ``data`` holds
            no points, or ``k`` is 0).
    """
    # Warn when k does not exceed the number of groups.
    if len(data) >= k:
        warnings.warn("k is too small")

    # Convert the query point once instead of once per training point.
    target = np.asarray(predict)

    # Euclidean distance from the query point to every labelled point.
    distances = []
    for group in data:
        for features in data[group]:
            euclidean_distance = np.linalg.norm(np.asarray(features) - target)
            distances.append((euclidean_distance, group))

    # Pairs sort by distance first; equal distances fall back to comparing
    # the group labels.
    top_nearest = [group for _, group in sorted(distances)[:k]]

    # Count the votes once; most_common(1) gives [(label, count)].
    group_res, votes = Counter(top_nearest).most_common(1)[0]

    # Divide by the number of neighbours that actually voted: when the
    # dataset holds fewer than k points, dividing by k would understate a
    # unanimous vote. float() keeps true division on Python 2 as well.
    confidence = float(votes) / len(top_nearest)
    return group_res, confidence
36+
37+
if __name__ == '__main__':
    # Demo: two labelled groups; each label doubles as the plot colour.
    dataset = {'black': [[1, 2], [2, 3], [3, 1]], 'red': [[6, 5], [7, 7], [8, 6]]}
    # The sample whose group we want to determine.
    new_features = [3.5, 5.2]

    # Draw every known point in the colour of its group.
    for label, points in dataset.items():
        for x, y in points:
            pyplot.scatter(x, y, s=50, color=label)

    # k=3 is used here for two groups.
    which_group, confidence = k_nearest_neighbors(dataset, new_features, k=3)
    print(which_group, confidence)

    # Draw the classified sample as a larger marker (s is the marker size)
    # in the colour of the group it was assigned to.
    sample_x, sample_y = new_features
    pyplot.scatter(sample_x, sample_y, s=300, color=which_group)

    pyplot.show()

0 commit comments

Comments
 (0)