
k-means.py 7.0 kB

# -*- coding: utf-8 -*-
# ---
# jupyter:
#   jupytext_format_version: '1.2'
#   jupytext_formats: ipynb,py
#   kernelspec:
#     display_name: Python 3
#     language: python
#     name: python3
#   language_info:
#     codemirror_mode:
#       name: ipython
#       version: 3
#     file_extension: .py
#     mimetype: text/x-python
#     name: python
#     nbconvert_exporter: python
#     pygments_lexer: ipython3
#     version: 3.5.2
# ---
# # k-means demo
# +
# This line configures matplotlib to show figures embedded in the notebook,
# instead of opening a new window for each figure. More about that later.
# If you are using an old version of IPython, try using '%pylab inline' instead.
# %matplotlib inline

# import libraries
from numpy import *
import matplotlib.pyplot as plt
import pandas as pd

# Load dataset (column names listed for reference; the CSV header is used here)
names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class']
dataset = pd.read_csv("iris.csv", header=0, index_col=0)
dataset.head()
# -
# Encode the class labels: map the three species to 0, 1, 2
dataset.loc[dataset['class'] == 'Iris-setosa', 'class'] = 0
dataset.loc[dataset['class'] == 'Iris-versicolor', 'class'] = 1
dataset.loc[dataset['class'] == 'Iris-virginica', 'class'] = 2
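
# A quick optional check (minimal sketch): the three class names above should
# cover every row, so the encoded column should contain only 0, 1 and 2, with
# 50 samples each for the standard iris data.
print(dataset['class'].value_counts())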
def originalDatashow(dataSet):
    """Plot the raw sample points (first two features)."""
    num, dim = shape(dataSet)
    marksamples = ['ob']  # marker style for the samples
    for i in range(num):
        plt.plot(dataSet.iat[i, 0], dataSet.iat[i, 1], marksamples[0], markersize=5)
    plt.title('original dataset')
    plt.xlabel('sepal length')
    plt.ylabel('sepal width')
    plt.show()
# + {"scrolled": true}
# Extract the two features used for clustering
datamat = dataset.loc[:, ['sepal-length', 'sepal-width']]
# True class labels
labels = dataset.loc[:, ['class']]
# Show the raw data
originalDatashow(datamat)
# -
def randChosenCent(dataSet, k):
    """Initialise the cluster centres by picking k distinct samples at random."""
    # number of samples
    m = shape(dataSet)[0]
    # indices of the chosen centroids
    centroidsIndex = []
    # list of candidate sample indices
    dataIndex = list(range(m))
    for i in range(k):
        # draw a random position among the remaining candidates
        randIndex = random.randint(0, len(dataIndex))
        # record the sample index chosen as a centroid
        centroidsIndex.append(dataIndex[randIndex])
        # remove it so the same sample cannot be drawn twice
        del dataIndex[randIndex]
    # look up the chosen samples by index
    centroids = dataSet.iloc[centroidsIndex]
    return mat(centroids)
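
# A hedged alternative sketch: numpy's random.choice can draw k distinct row
# positions in a single call, which is equivalent to the index bookkeeping
# above. The helper name randChosenCentAlt exists only for this sketch.
def randChosenCentAlt(dataSet, k):
    """Pick k distinct samples as initial centroids via random.choice."""
    m = shape(dataSet)[0]
    return mat(dataSet.iloc[random.choice(m, k, replace=False)])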
# +
def distEclud(vecA, vecB):
    """Distance metric: Euclidean distance between two vectors."""
    return sqrt(sum(power(vecA - vecB, 2)))  # equivalent to la.norm(vecA - vecB)
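
# Sanity check for the distance helper on a 3-4-5 right triangle; the expected
# value is 5.0.
print('distEclud([0, 0], [3, 4]) =', distEclud(array([0., 0.]), array([3., 4.])))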
def kMeans(dataSet, k):
    # total number of samples
    m = shape(dataSet)[0]
    # per-sample assignment: stores [cluster index, squared distance] (m rows x 2 columns)
    clusterAssment = mat(zeros((m, 2)))
    # step 1: initialise the cluster centres from randomly chosen samples
    centroids = randChosenCent(dataSet, k)
    print('initial centroids =', centroids)
    # flag: True if any sample changed cluster between iterations, otherwise False
    clusterChanged = True
    # iteration counter
    iterTime = 0
    # stop iterating once no sample's assignment changes
    while clusterChanged:
        clusterChanged = False
        # step 2: assign every sample to the nearest cluster centre
        for i in range(m):
            # start with an infinite best distance
            minDist = inf
            # and an invalid best cluster index
            minIndex = -1
            # compare the i-th sample against all k centres
            for j in range(k):
                # distance from sample i to centre j
                distJI = distEclud(centroids[j, :], dataSet.values[i, :])
                # keep the closest centre seen so far
                if distJI < minDist:
                    minDist = distJI
                    minIndex = j
            # if this sample's assignment differs from the previous iteration, set clusterChanged
            if clusterAssment[i, 0] != minIndex:
                clusterChanged = True
            clusterAssment[i, :] = minIndex, minDist ** 2  # assign the sample to its nearest cluster
        iterTime += 1
        sse = sum(clusterAssment[:, 1])
        print('SSE after iteration %d: %f' % (iterTime, sse))
        # step 3: update the cluster centres
        for cent in range(k):  # after all samples are assigned, recompute each centre
            # collect all samples currently in this cluster
            ptsInClust = dataSet.iloc[nonzero(clusterAssment[:, 0].A == cent)[0]]
            # new centre = column-wise mean (axis=0) of those samples
            centroids[cent, :] = mean(ptsInClust, axis=0)
    return centroids, clusterAssment
# -
# Run k-means clustering
k = 3  # number of clusters chosen by the user
mycentroids, clusterAssment = kMeans(datamat, k)
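
# A minimal follow-up check, assuming clusterAssment keeps the
# [cluster index, squared distance] layout built in kMeans: print how many
# samples ended up in each cluster (the true iris split is 50/50/50).
for cent in range(k):
    print('cluster %d: %d samples' % (cent, sum(clusterAssment[:, 0].A == cent)))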
# +
def datashow(dataSet, k, centroids, clusterAssment):  # plot the clustering result in 2-D
    from matplotlib import pyplot as plt
    num, dim = shape(dataSet)  # number of samples, number of features
    if dim != 2:
        print('sorry, the dimension of your dataset is not 2!')
        return 1
    marksamples = ['or', 'ob', 'og', 'ok', '^r', '^b', '<g']  # marker styles for the samples
    if k > len(marksamples):
        print('sorry, your k is too large, please extend the marksamples list!')
        return 1
    # plot every sample
    for i in range(num):
        markindex = int(clusterAssment[i, 0])  # cluster index, converted from a matrix entry to int
        # the two features map to the x and y axes; marker style and size per cluster
        plt.plot(dataSet.iat[i, 0], dataSet.iat[i, 1], marksamples[markindex], markersize=6)
    # plot the cluster centres
    markcentroids = ['o', '*', '^']  # marker styles for the centres
    label = ['0', '1', '2']
    c = ['yellow', 'pink', 'red']
    for i in range(k):
        plt.plot(centroids[i, 0], centroids[i, 1], markcentroids[i], markersize=15, label=label[i], c=c[i])
    plt.legend(loc='upper left')
    plt.xlabel('sepal length')
    plt.ylabel('sepal width')
    plt.title('k-means cluster result')
    plt.show()


# Plot the true class labels for comparison
def trgartshow(dataSet, k, labels):
    from matplotlib import pyplot as plt
    num, dim = shape(dataSet)
    label = ['0', '1', '2']
    marksamples = ['ob', 'or', 'og', 'ok', '^r', '^b', '<g']
    # draw the grouped scatter plot point by point
    for i in range(num):
        plt.plot(dataSet.iat[i, 0], dataSet.iat[i, 1], marksamples[int(labels.iat[i, 0])], markersize=6)
    # plot one point per class again with a legend label (the iris classes come in blocks of 50)
    for i in range(0, num, 50):
        plt.plot(dataSet.iat[i, 0], dataSet.iat[i, 1], marksamples[int(labels.iat[i, 0])], markersize=6,
                 label=label[int(labels.iat[i, 0])])
    plt.legend(loc='upper left')
    # axis labels and title
    plt.xlabel('sepal length')
    plt.ylabel('sepal width')
    plt.title('iris true result')
    # show the figure
    plt.show()
# -
# Show the figures
datashow(datamat, k, mycentroids, clusterAssment)
trgartshow(datamat, 3, labels)
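
# +
# Optional cross-check (a sketch, assuming scikit-learn is installed): compare
# the assignments produced above with sklearn's KMeans, using the adjusted Rand
# index against the true labels. The two scores should be broadly similar.
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

skmodel = KMeans(n_clusters=k, n_init=10, random_state=0).fit(datamat.values)
true_labels = labels['class'].astype(int)
print('ARI, this script vs. true labels   :',
      adjusted_rand_score(true_labels, clusterAssment[:, 0].A1.astype(int)))
print('ARI, sklearn KMeans vs. true labels:',
      adjusted_rand_score(true_labels, skmodel.labels_))
# -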

Machine learning is increasingly applied to aircraft, robotics, and related fields; its goal is to use computers to achieve human-like intelligence and thereby make equipment intelligent and unmanned. This course aims to guide students through the basic concepts, typical methods, and techniques of machine learning, to spark interest in the field through concrete application cases, and to encourage students to analyse and solve the problems and challenges faced by aircraft and robots from the perspective of artificial intelligence. The main content covers Python programming fundamentals, machine learning models, the foundations and implementation of unsupervised learning, supervised learning, and deep learning, and how to apply machine learning to practical problems, thereby comprehensively improving students' overall competence.
