1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39
def biKMeans(dataMat, k, distMeas=distEclud):
    """Cluster the rows of ``dataMat`` by repeatedly bisecting one cluster.

    All rows start in a single cluster whose centroid is the column-wise
    mean of ``dataMat``.  While fewer than ``k`` centroids exist, every
    current cluster is tentatively split in two with ``kMeans`` and the
    split giving the lowest total error (error of the split cluster plus
    error of all untouched clusters) is kept.

    Args:
        dataMat: Data to cluster, one sample per row.  Presumably a NumPy
            matrix (the code relies on ``mean(..., axis=0).tolist()[0]``
            and 2-D row slicing) -- confirm against callers.
        k: Number of centroids to stop at.  If ``k <= 1`` no split is
            performed and the single mean centroid is returned.
        distMeas: Callable taking two row vectors and returning a
            distance; its square is stored as the per-row error.

    Returns:
        A tuple ``(centroids, clusterAssment)`` where ``centroids`` is
        ``mat(centList)`` (one centroid per row) and ``clusterAssment`` is
        an ``m x 2`` matrix holding, per input row, the assigned cluster
        index (column 0) and the squared error (column 1).

    Note:
        ``kMeans`` is treated as returning ``(centroidMat, assignment)``
        with the same two-column assignment layout and labels 0/1 --
        this is what the relabelling below assumes.
    """
    m = shape(dataMat)[0]
    # Column 0: cluster index (all zeros initially); column 1: squared error.
    clusterAssment = mat(zeros((m, 2)))
    centroid0 = mean(dataMat, axis=0).tolist()[0]
    centList = [centroid0]
    for j in range(m):
        clusterAssment[j, 1] = distMeas(mat(centroid0), dataMat[j, :]) ** 2

    while len(centList) < k:
        # Reset the best score for every bisection round.  Without this the
        # first comparison below reads an unassigned local and raises
        # UnboundLocalError.
        lowestSSE = float("inf")
        for i in range(len(centList)):
            # Rows currently assigned to cluster i.
            ptsInCurrCluster = dataMat[
                nonzero(clusterAssment[:, 0].A == i)[0], :]
            centroidMat, splitClustAss = kMeans(
                ptsInCurrCluster, 2, distMeas)
            # Error if cluster i is split, plus the error of everything else.
            sseSplit = sum(splitClustAss[:, 1])
            sseNotSplit = sum(
                clusterAssment[nonzero(clusterAssment[:, 0].A != i)[0], 1])
            print("sseSplit, and notSplit: ", sseSplit, sseNotSplit)
            if (sseSplit + sseNotSplit) < lowestSSE:
                bestCentToSplit = i
                bestNewCents = centroidMat
                bestClustAss = splitClustAss.copy()
                lowestSSE = sseSplit + sseNotSplit

        # Relabel the two halves: label 1 becomes a brand-new cluster index,
        # label 0 keeps the index of the cluster that was split.  The order
        # matters: relabelling 1 first avoids clobbering rows when
        # bestCentToSplit happens to be 1.
        bestClustAss[nonzero(bestClustAss[:, 0].A == 1)[0], 0] = len(
            centList)
        bestClustAss[nonzero(bestClustAss[:, 0].A == 0)[0], 0] = bestCentToSplit
        print('the bestCentToSplit is: ', bestCentToSplit)
        print('the len of bestClustAss is: ', len(bestClustAss))

        # Replace the split centroid with the first half and append the second.
        centList[bestCentToSplit] = bestNewCents[0, :].tolist()[0]
        centList.append(bestNewCents[1, :].tolist()[0])
        # Overwrite the assignments of the rows that belonged to the split
        # cluster with their new labels and errors.
        clusterAssment[nonzero(clusterAssment[:, 0].A == bestCentToSplit)[
            0], :] = bestClustAss

    return mat(centList), clusterAssment
|