def chooseBestFeatureToSplit(dataSet):
    """Return the index of the feature with the highest information gain.

    dataSet: list of examples; each example is a list of feature values
    with the class label in the last position.
    Returns -1 when no feature yields a positive information gain.
    Relies on the sibling helpers calcshannon() and splitDataSet().
    """
    numFeatures = len(dataSet[0]) - 1          # last column is the class label
    baseEntropy = calcshannon(dataSet)
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):
        # Collect the unique values this feature takes across the data set.
        featList = [example[i] for example in dataSet]
        uniqueVals = set(featList)
        newEntropy = 0.0
        for value in uniqueVals:
            # Entropy of each partition, weighted by partition size.
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / float(len(dataSet))
            newEntropy += prob * calcshannon(subDataSet)
        infoGain = baseEntropy - newEntropy
        if infoGain > bestInfoGain:
            # Track the best (largest) information gain seen so far.
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature
递归构建决策树:
1 2 3 4 5 6 7 8
def majority(classList):
    """Return the class label that occurs most often in classList.

    classList: non-empty list of (hashable) class labels.
    Ties are broken by whichever label sorts first in the descending
    count order produced by sorted().
    """
    classCount = {}
    for vote in classList:
        if vote not in classCount:
            classCount[vote] = 0
        classCount[vote] += 1
    # NOTE: the original used dict.iteritems(), which only exists in
    # Python 2; items() works on both Python 2 and 3.
    sortedcount = sorted(classCount.items(),
                         key=operator.itemgetter(1), reverse=True)
    return sortedcount[0][0]
创建树的函数代码
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
def createTree(dataSet, labels):
    """Recursively build an ID3 decision tree as nested dicts.

    dataSet: list of examples; class label in the last position.
    labels: feature names aligned with the feature column indices.
    Returns either a class label (leaf) or a nested dict of the form
    {featureLabel: {featureValue: subtree, ...}}.
    Relies on sibling helpers chooseBestFeatureToSplit(), majority()
    and splitDataSet().
    """
    classList = [example[-1] for example in dataSet]
    # Stop when every example shares the same class.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # Stop when no features remain: vote for the majority class.
    if len(dataSet[0]) == 1:
        return majority(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}
    # Build the reduced label list on a copy; the original used
    # `del labels[bestFeat]`, which destructively mutated the caller's
    # labels list between calls.
    subLabels = labels[:bestFeat] + labels[bestFeat + 1:]
    featValues = [example[bestFeat] for example in dataSet]
    for value in set(featValues):
        # Each recursive call gets its own copy of the labels.
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataSet, bestFeat, value), subLabels[:])
    return myTree