1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39
   |  def biKMeans(dataMat, k, distMeas=distEclud):     m = shape(dataMat)[0]     clusterAssment = mat(zeros((m, 2)))     centroid0 = mean(dataMat, axis=0).tolist()[0]     centList = [centroid0]     for j in range(m):         clusterAssment[j, 1] = distMeas(mat(centroid0), dataMat[j, :]) ** 2     while len(centList) < k:         for i in range(len(centList)):             ptsInCurrCluster = dataMat[nonzero(                 clusterAssment[:, 0].A == i)[0], :]             centroidMat, splitClustAss = kMeans(                 ptsInCurrCluster, 2, distMeas)             sseSplit = sum(splitClustAss[:, 1])             sseNotSplit = sum(                 clusterAssment[nonzero(clusterAssment[:, 0].A != i)[0],                                1])             print("sseSplit, and notSplit: ", sseSplit, sseNotSplit)             if (sseSplit + sseNotSplit) < lowestSSE:                 bestCentToSplit = i                 bestNewCents = centroidMat                 bestClustAss = splitClustAss.copy()                 lowestSSE = sseSplit + sseNotSplit
          bestClustAss[nonzero(bestClustAss[:, 0].A == 1)[0], 0] = len(             centList)         bestClustAss[nonzero(bestClustAss[:, 0].A == 0)[0],                      0] = bestCentToSplit         print('the bestCentToSplit is: ', bestCentToSplit)         print('the len of bestClustAss is: ', len(bestClustAss))
          centList[bestCentToSplit] = bestNewCents[0, :].tolist()[             0]         centList.append(             bestNewCents[1, :].tolist()[0])         clusterAssment[nonzero(clusterAssment[:, 0].A == bestCentToSplit)[                            0], :] = bestClustAss     return mat(centList), clusterAssment
 
  |