算法实现一:K-means

匿名 (未验证) 提交于 2019-12-02 23:43:01

k-means基础实现

"""Basic k-means clustering on top of NumPy, with an optional 2-D plot.

Running the module as a script loads comma-separated samples from
``data.txt`` and clusters them into 5 groups.
"""
__author__ = 'Administrator'

from numpy import *
import time

import numpy as np

try:
    import matplotlib.pyplot as plt
except ImportError:
    # Plotting is optional: clustering must stay usable on machines without
    # matplotlib. showCluster() reports the missing dependency when called.
    plt = None


def euclDistance(vector1, vector2):
    """Return the Euclidean distance between two equally shaped vectors."""
    # asarray() makes the subtraction and squaring element-wise even when
    # the caller passes lists or numpy.matrix rows.
    diff = np.asarray(vector2, dtype=float) - np.asarray(vector1, dtype=float)
    return np.sqrt(np.sum(diff ** 2))


def initCentroids(dataSet, k):
    """Pick k random samples of dataSet as the initial centroids.

    Args:
        dataSet: 2-D array-like of shape (numSamples, dim).
        k: number of centroids to draw.

    Returns:
        A float ndarray of shape (k, dim) whose rows are copies of samples.

    Raises:
        ValueError: if dataSet is not 2-D or contains no samples.
    """
    data = np.asarray(dataSet, dtype=float)
    if data.ndim != 2 or data.shape[0] == 0:
        raise ValueError("dataSet must be a non-empty 2-D array of samples")
    numSamples = data.shape[0]
    # Draw distinct samples whenever possible: duplicate starting centroids
    # leave clusters empty. Only fall back to drawing with replacement when
    # more centroids than samples are requested.
    indices = np.random.choice(numSamples, size=k, replace=k > numSamples)
    return data[indices, :].copy()


def loaddata(name):
    """Read comma-separated numeric samples from the text file ``name``.

    Blank lines are skipped.

    Returns:
        A list with one list of floats per non-empty line.

    Raises:
        ValueError: if a field cannot be parsed as a float.
    """
    dataMat = []
    with open(name, 'r') as fe:
        for line in fe:
            line = line.strip()
            if not line:
                continue
            dataMat.append([float(field) for field in line.split(',')])
    return dataMat


def kmeans(dataSet, k):
    """Cluster the samples of dataSet into k groups with Lloyd's algorithm.

    Args:
        dataSet: 2-D array-like of shape (numSamples, dim); lists, ndarrays
            and numpy matrices are all accepted.
        k: number of clusters, at least 1.

    Returns:
        (centroids, clusterAssment) where centroids is a (k, dim) float
        ndarray and clusterAssment is a (numSamples, 2) numpy matrix whose
        first column is the cluster index of each sample and whose second
        column is the squared distance from the sample to its centroid.

    Raises:
        ValueError: if k < 1 or dataSet is empty / not 2-D.
    """
    if k < 1:
        raise ValueError("k must be at least 1")
    data = np.asarray(dataSet, dtype=float)

    ## step 1: init centroids (also validates the shape of the data)
    centroids = initCentroids(data, k)
    numSamples = data.shape[0]
    sampleIndex = np.arange(numSamples)

    # -1 matches no cluster, so the first pass always counts as a change.
    labels = np.full(numSamples, -1)
    sqErrors = np.zeros(numSamples)
    clusterChanged = True
    while clusterChanged:
        ## step 2: squared distance from every sample to every centroid
        diffs = data[:, np.newaxis, :] - centroids[np.newaxis, :, :]
        sqDist = np.sum(diffs ** 2, axis=2)
        # argmin keeps the first centroid on ties, like a strict '<' scan.
        nearest = np.argmin(sqDist, axis=1)

        ## step 3: update assignments; errors are refreshed on every pass so
        ## they never refer to an outdated centroid position
        clusterChanged = not np.array_equal(nearest, labels)
        labels = nearest
        sqErrors = sqDist[sampleIndex, nearest]

        ## step 4: update centroids
        for j in range(k):
            pointsInCluster = data[labels == j]
            # An empty cluster keeps its previous centroid instead of
            # becoming NaN from the mean of zero points.
            if len(pointsInCluster):
                centroids[j, :] = pointsInCluster.mean(axis=0)

    print('Congratulations, cluster complete!')
    clusterAssment = np.asmatrix(np.column_stack((labels, sqErrors)))
    return centroids, clusterAssment


def showCluster(dataSet, k, centroids, clusterAssment):
    """Plot clustered 2-D samples and their centroids.

    Args:
        dataSet: 2-D array-like of shape (numSamples, 2).
        k: number of clusters (at most 10 marker styles are available).
        centroids: (k, 2) array-like of centroid coordinates.
        clusterAssment: array-like whose first column holds the cluster
            index of each sample, as returned by kmeans().

    Returns:
        1 if the data is not 2-D or k exceeds the available markers
        (a message is printed); None after the figure has been shown.

    Raises:
        ImportError: if matplotlib is not installed.
    """
    data = np.asarray(dataSet, dtype=float)
    numSamples, dim = data.shape
    if dim != 2:
        print("Sorry! notice ,I can not draw because the dimension of your data is not 2!")
        return 1

    mark = ['or', 'ob', 'og', 'ok', '^r', '+r', 'sr', 'dr', '<r', 'pr']
    if k > len(mark):
        print("Sorry! Your k is too large! please contact Zouxy")
        return 1

    if plt is None:
        raise ImportError("matplotlib is required to draw clusters")

    # draw all samples
    labels = np.asarray(clusterAssment)[:, 0].astype(int)
    for i in range(numSamples):
        plt.plot(data[i, 0], data[i, 1], mark[labels[i]])

    mark = ['Dr', 'Db', 'Dg', 'Dk', '^b', '+b', 'sb', 'db', '<b', 'pb']
    # draw the centroids
    centers = np.asarray(centroids, dtype=float)
    for i in range(k):
        plt.plot(centers[i, 0], centers[i, 1], mark[i], markersize=12)

    plt.show()


if __name__ == '__main__':
    data = loaddata('data.txt')
    kmeans(data, 5)
文章来源: https://blog.csdn.net/weixin_41442514/article/details/92141144
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!