1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74
   |  import numpy as np import matplotlib.pyplot as plt
  def loadDataSet(fileName):
      """Load a tab-delimited numeric text file as a 2-D array.

      Args:
          fileName: Anything ``np.loadtxt`` accepts as its source (a path, an
              open file, or a file-like object).

      Returns:
          An ``ndarray`` with one row per line of the file and one column per
          tab-separated field.
      """
      # ndmin=2 stops loadtxt from squeezing a one-row or one-column file down
      # to 1-D, so callers can always unpack ``data.shape`` into (rows, cols).
      return np.loadtxt(fileName, delimiter='\t', ndmin=2)
  def distEclud(x, y):
      """Return the Euclidean distance between ``x`` and ``y``.

      Computed as the square root of the sum of the squared differences.
      """
      delta = x - y
      squared_total = np.sum(delta ** 2)
      return np.sqrt(squared_total)
  def randCent(dataSet, k):
      """Pick ``k`` rows of ``dataSet`` at random to serve as initial centroids.

      Args:
          dataSet: 2-D array with one sample per row.
          k: Number of centroids to draw.

      Returns:
          A new float ``ndarray`` of shape ``(k, n)`` whose rows are copies of
          randomly chosen rows of ``dataSet``.
      """
      m, n = dataSet.shape
      # Draw distinct rows whenever there are enough of them: two identical
      # starting centroids leave one cluster empty on the first assignment
      # pass. Only fall back to sampling with replacement when k exceeds the
      # number of rows, which keeps that case working as it did before.
      chosen = np.random.choice(m, size=k, replace=k > m)
      # np.array(...) yields a plain float ndarray copy (even for np.matrix
      # input), so later centroid updates never write into the data set.
      return np.array(dataSet[chosen, :], dtype=float)
 
  def KMeans(dataSet, k):
      """Cluster the rows of ``dataSet`` into ``k`` groups with k-means.

      Alternates between assigning every sample to its nearest centroid and
      moving each centroid to the mean of its samples, until no sample changes
      cluster.

      Args:
          dataSet: 2-D array with one sample per row.
          k: Number of clusters.

      Returns:
          A ``(centroids, clusterAssment)`` tuple. ``centroids`` holds one
          centroid per row. ``clusterAssment`` is an ``(m, 2)`` ``np.matrix``:
          column 0 is the cluster index of each sample, column 1 the squared
          distance from the sample to its centroid.
      """
      m = np.shape(dataSet)[0]
      # np.mat was removed in NumPy 2.0; np.asmatrix builds the same np.matrix
      # and exists in both old and new NumPy releases.
      clusterAssment = np.asmatrix(np.zeros((m, 2)))
      # Start every sample in the impossible cluster -1 so that the first pass
      # always registers a change, including for samples that land in cluster 0.
      clusterAssment[:, 0] = -1
      centroids = randCent(dataSet, k)

      clusterChange = True
      while clusterChange:
          clusterChange = False

          # Assignment step: attach each sample to its nearest centroid.
          for i in range(m):
              # np.inf instead of a fixed large number, so widely spread data
              # (distances above any hard-coded bound) is still assigned.
              minDist = np.inf
              minIndex = -1
              for j in range(k):
                  distance = distEclud(centroids[j, :], dataSet[i, :])
                  if distance < minDist:
                      minDist = distance
                      minIndex = j

              if clusterAssment[i, 0] != minIndex:
                  clusterChange = True
              # Store the distance on every pass, not only when the index
              # changes: centroids move between passes, so the distance of a
              # sample that stays in its cluster would otherwise go stale.
              clusterAssment[i, :] = minIndex, minDist ** 2

          # Update step: move each centroid to the mean of its samples.
          for j in range(k):
              pointsInCluster = dataSet[np.nonzero(clusterAssment[:, 0].A == j)[0]]
              # An empty cluster has no mean (np.mean would give NaN and poison
              # every later distance), so keep its previous centroid.
              if len(pointsInCluster) > 0:
                  centroids[j, :] = np.mean(pointsInCluster, axis=0)

      print("Congratulations,cluster complete!")
      return centroids, clusterAssment


  def showCluster(dataSet, k, centroids, clusterAssment):
      """Plot 2-D samples marked by cluster, together with the centroids.

      Args:
          dataSet: 2-D array with one sample per row; must have two columns.
          k: Number of clusters (at most the number of available markers).
          centroids: Array with one centroid per row.
          clusterAssment: Per-sample assignments; column 0 is the cluster index.

      Returns:
          ``1`` if nothing was plotted (data is not two-dimensional, or ``k``
          exceeds the number of marker styles); ``None`` otherwise.
      """
      m, n = dataSet.shape
      if n != 2:
          # Message reads: "the data is not two-dimensional".
          print("数据不是二维的")
          return 1

      pointMarks = ['or', 'ob', 'og', 'ok', '^r', '+r', 'sr', 'dr', '<r', 'pr']
      if k > len(pointMarks):
          # Message reads: "the value of k is too large".
          print("k值太大了")
          return 1

      # Draw every sample with the marker of the cluster it belongs to.
      for i in range(m):
          markIndex = int(clusterAssment[i, 0])
          plt.plot(dataSet[i, 0], dataSet[i, 1], pointMarks[markIndex])

      centroidMarks = ['Dr', 'Db', 'Dg', 'Dk', '^b', '+b', 'sb', 'db', '<b', 'pb']
      for i in range(k):
          plt.plot(centroids[i, 0], centroids[i, 1], centroidMarks[i])

      # Show the figure once, after all centroids are drawn. Calling show()
      # inside the loop would display a separate window per centroid.
      plt.show()


  # Script entry: cluster the samples in test.txt into four groups and plot them.
  dataSet = loadDataSet("test.txt")
  k = 4
  centroids, clusterAssment = KMeans(dataSet, k)
  showCluster(dataSet, k, centroids, clusterAssment)
 
  |