# -*- coding: utf-8 -*-
# ---
# jupyter:
#   jupytext_format_version: '1.2'
#   jupytext_formats: ipynb,py
#   kernelspec:
#     display_name: Python 3
#     language: python
#     name: python3
#   language_info:
#     codemirror_mode:
#       name: ipython
#       version: 3
#     file_extension: .py
#     mimetype: text/x-python
#     name: python
#     nbconvert_exporter: python
#     pygments_lexer: ipython3
#     version: 3.5.2
# ---

# # k-means demo

# +
# This line configures matplotlib to show figures embedded in the notebook,
# instead of opening a new window for each figure. More about that later.
# If you are using an old version of IPython, try using '%pylab inline' instead.
# %matplotlib inline

# Import libraries.
# NOTE: the star import is what brings `random` (numpy.random), `sum`, `sqrt`,
# `asmatrix`, ... into scope for the functions below.
from numpy import *
import matplotlib.pyplot as plt
import pandas as pd

# Load dataset
names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class']
dataset = pd.read_csv("iris.csv", header=0, index_col=0)
dataset.head()
# -

# Encode the three class names as the integers 0, 1 and 2.
# A single `map` replaces the former chained assignment
# (`dataset['class'][mask] = n`), which pandas may apply to a temporary copy.
# Labels that are not in the table are left unchanged.
class_codes = {'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2}
dataset['class'] = dataset['class'].map(
    lambda label: class_codes.get(label, label))


def originalDatashow(dataSet):
    """Plot the raw samples, using the first two columns as x and y.

    Parameters
    ----------
    dataSet : DataFrame or 2-D array-like
        Samples to draw; column 0 is the x coordinate, column 1 the y
        coordinate.
    """
    points = asarray(dataSet)
    # 'ob' = blue circle markers with no connecting line, so a single call
    # draws every sample.
    plt.plot(points[:, 0], points[:, 1], 'ob', markersize=5)
    plt.title('original dataset')
    plt.xlabel('sepal length')
    plt.ylabel('sepal width')
    plt.show()


# + {"scrolled": true}
# Feature columns used for clustering.
datamat = dataset.loc[:, ['sepal-length', 'sepal-width']]
# Ground-truth labels.
labels = dataset.loc[:, ['class']]
# Show the raw data.
originalDatashow(datamat)
# -


def randChosenCent(dataSet, k):
    """Pick ``k`` distinct samples at random as the initial cluster centres.

    Parameters
    ----------
    dataSet : DataFrame or 2-D array-like
        One sample per row.
    k : int
        Number of centres to draw.

    Returns
    -------
    numpy.matrix
        A ``k x n_features`` float matrix holding the chosen rows.

    Raises
    ------
    ValueError
        If ``k`` is larger than the number of samples.
    """
    points = asarray(dataSet, dtype=float)
    # `random` is numpy.random (from the star import); sampling without
    # replacement guarantees that no row index is picked twice.
    chosen = random.choice(points.shape[0], size=k, replace=False)
    # `asmatrix` instead of `mat`: the `mat` alias no longer exists in
    # NumPy 2.0.
    return asmatrix(points[chosen])


# +
def distEclud(vecA, vecB):
    """Return the Euclidean distance between two vectors.

    `sum`, `sqrt` and `power` are the NumPy versions brought in by the star
    import at the top of the file.
    """
    return sqrt(sum(power(vecA - vecB, 2)))  # la.norm(vecA-vecB)


def kMeans(dataSet, k):
    """Cluster the rows of ``dataSet`` into ``k`` groups with k-means.

    Each round assigns every sample to its nearest centre, prints the sum of
    squared distances, then moves every centre to the mean of its samples.
    Iteration stops as soon as a round leaves all assignments unchanged.

    Parameters
    ----------
    dataSet : DataFrame or 2-D array-like
        One sample per row.
    k : int
        Number of clusters; must be at least 1.

    Returns
    -------
    centroids : numpy.matrix
        ``k x n_features`` matrix of final cluster centres.
    clusterAssment : numpy.matrix
        ``m x 2`` matrix; column 0 is the cluster index of each sample,
        column 1 the squared distance to that cluster's centre.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 or larger than the number of samples.
    """
    if k < 1:
        raise ValueError('k must be at least 1')

    points = asarray(dataSet, dtype=float)
    # Number of samples.
    m = points.shape[0]

    # Step 1: initialise the centres from randomly chosen samples.
    centroids = randChosenCent(dataSet, k)
    print('最初的中心=', centroids)

    # Start from -1 (an impossible cluster index) so that the first pass is
    # always seen as a change, even if every sample lands in cluster 0.
    previous = full(m, -1)
    # Flag: True while the latest pass changed at least one assignment.
    clusterChanged = True
    # Number of passes performed so far.
    iterTime = 0
    while clusterChanged:
        # Step 2: assign every sample to its nearest centre.
        centres = asarray(centroids)
        # sq_dist[i, j] = squared distance from sample i to centre j.
        sq_dist = ((points[:, None, :] - centres[None, :, :]) ** 2).sum(axis=2)
        # argmin keeps the first minimum, i.e. ties go to the lower index.
        nearest = sq_dist.argmin(axis=1)
        min_sq_dist = sq_dist[arange(m), nearest]

        clusterChanged = not array_equal(nearest, previous)
        previous = nearest

        iterTime += 1
        sse = min_sq_dist.sum()
        print('the SSE of %dth iteration is %f' % (iterTime, sse))

        # Step 3: move each centre to the mean of the samples assigned to it.
        for cent in range(k):
            members = points[nearest == cent]
            # An empty cluster keeps its previous centre; averaging zero
            # rows would turn the centre into NaN.
            if len(members):
                centroids[cent, :] = members.mean(axis=0)

    clusterAssment = asmatrix(column_stack((nearest, min_sq_dist)))
    return centroids, clusterAssment
# -


# Run k-means clustering.
k = 3  # user-defined number of clusters
mycentroids, clusterAssment = kMeans(datamat, k)


# +
def datashow(dataSet, k, centroids, clusterAssment):
    """Plot a 2-D clustering result: samples per cluster, plus the centres.

    Parameters
    ----------
    dataSet : DataFrame or 2-D array-like
        Samples with exactly two feature columns.
    k : int
        Number of clusters.
    centroids : 2-D indexable (e.g. numpy.matrix)
        Cluster centres, read as ``centroids[i, 0]`` and ``centroids[i, 1]``.
    clusterAssment : 2-D array-like
        Column 0 holds the cluster index of each sample.

    Returns
    -------
    int or None
        1 if the data is not two-dimensional or ``k`` exceeds the number of
        available sample markers (a message is printed); otherwise None.
    """
    from matplotlib import pyplot as plt

    points = asarray(dataSet)
    num, dim = points.shape  # number of samples, number of features
    if dim != 2:
        print('sorry,the dimension of your dataset is not 2!')
        return 1

    # Marker/colour format string for the samples of each cluster.
    # NOTE(review): the source text of this list was damaged after '^b';
    # entries beyond the six below may have been lost - confirm against the
    # original notebook.
    marksamples = ['or', 'ob', 'og', 'ok', '^r', '^b']
    if k > len(marksamples):
        print('sorry,your k is too large,please add length of the marksample!')
        return 1

    # Draw the samples, one plot call per cluster that actually occurs.
    assigned = asarray(clusterAssment)[:, 0].astype(int)
    for cluster in unique(assigned):
        members = points[assigned == cluster]
        plt.plot(members[:, 0], members[:, 1], marksamples[cluster],
                 markersize=6)

    # Draw the cluster centres. Only three styles are defined, so they are
    # reused cyclically when k is larger than three.
    markcentroids = ['o', '*', '^']
    c = ['yellow', 'pink', 'red']
    for i in range(k):
        plt.plot(centroids[i, 0], centroids[i, 1],
                 markcentroids[i % len(markcentroids)], markersize=15,
                 label='%d' % i, c=c[i % len(c)])
    plt.legend(loc='upper left')
    plt.xlabel('sepal length')
    plt.ylabel('sepal width')
    plt.title('k-means cluster result')  # title
    plt.show()


# Plot the actual (ground-truth) classes.
# NOTE(review): the source of `trgartshow` is cut off in this copy of the
# file, in the middle of its `marksamples` list. The visible fragment is kept
# verbatim below, commented out so that the module still parses; restore the
# complete function from the original notebook.
#
# def trgartshow(dataSet, k, labels):
#     from matplotlib import pyplot as plt
#     num, dim = shape(dataSet)
#     label = ['0', '1', '2']
#     marksamples = ['ob', 'or', 'og', 'ok', '^r', '^b', '