Decision Tree Algorithm

Principle

  • Split the dataset by choosing the best feature, then keep splitting each subset until every subset contains samples of a single class; the splitting feature can be chosen by computing information gain (the formulas are given before the code below).

Advantages

  • Low computational complexity; the output is easy to interpret; it can handle irrelevant features.

Disadvantages

  • Prone to overfitting.

Applicable data types:

  • Nominal and numeric data.
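Before the code, here are the two formulas that calEntropy and chooseBestFeatureToSplit below implement (standard definitions, stated here for reference):

H(D) = -Σ_k p_k · log2(p_k)
Gain(D, a) = H(D) - Σ_v (|D_v| / |D|) · H(D_v)

where p_k is the proportion of class k in dataset D, and D_v is the subset of D in which feature a takes value v.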
import math
# Compute the Shannon entropy of a dataset (the class label is the last column of each row)
def calEntropy(data_2d_array):
    classDict = {}
    for item in data_2d_array:
        classDiff = item[-1]
        classDict[classDiff] = classDict.get(classDiff,0) + 1
    # entropy formula: H = -sum(p * log2(p)) over all classes
    h = 0.0
    for key in classDict:
        prob = classDict[key] / len(data_2d_array) # probability of this class
        h -=  prob * math.log(prob,2) # accumulate the entropy
    return h
calEntropy([[1,1,'yes'],[1,0,'no'],[1,1,'yes'],[0,1,'no'],[0,1,'no']])
0.9709505944546686
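As a quick sanity check (toy inputs made up for illustration): a set with a single class has entropy 0, and an even two-class split has entropy 1.

calEntropy([[1, 'yes'], [1, 'yes']])   # 0.0  -- pure set
calEntropy([[1, 'yes'], [1, 'no']])    # 1.0  -- maximum entropy for two classes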
# Split the dataset: keep the rows where column `feature` equals `value`, and drop that column
def splitData(data_2d_array, feature, value):
    restData = []
    for diff in data_2d_array:
        if diff[feature] == value:
            restList = diff[:feature]
            restList.extend(diff[feature+1:])
            restData.append(restList)
    return restData
data_2d_array=[[1,1,'yes'],[1,0,'no'],[1,1,'yes'],[0,1,'no'],[0,1,'no']]
splitData(data_2d_array,0,1)
[[1, 'yes'], [0, 'no'], [1, 'yes']]
# Choose the feature whose split yields the largest information gain
def chooseBestFeatureToSplit(data_2d_array):
    bestInfoGain = 0.0
    bestFeatureIndex = -1
    numFeature = len(data_2d_array[0])-1
    beforeEntropy = calEntropy(data_2d_array)
    for i in range(numFeature):
        uniqValue = set([x[i] for x in data_2d_array])
        afterEntropy = 0
        for val in uniqValue:
            restData = splitData(data_2d_array,i,val)
            subEntropy = calEntropy(restData)
            power = len(restData) / len(data_2d_array) 
            afterEntropy += power*subEntropy # weighted entropy after the split (see Zhou Zhihua's ML textbook, the "watermelon book", p.35)
        InfoGain =  beforeEntropy - afterEntropy # the smaller afterEntropy is, the more ordered the subsets and the larger InfoGain is
        if InfoGain >= bestInfoGain:
            bestInfoGain = InfoGain
            bestFeatureIndex = i
    return bestFeatureIndex
chooseBestFeatureToSplit(data_2d_array)
0
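To see why index 0 is returned, the two information gains can be checked by hand with the helper functions above (values rounded to four decimals):

base = calEntropy(data_2d_array)
for i in range(2):
    afterEntropy = 0.0
    for val in set(row[i] for row in data_2d_array):
        sub = splitData(data_2d_array, i, val)
        afterEntropy += len(sub) / len(data_2d_array) * calEntropy(sub)
    print('feature %d gain: %.4f' % (i, base - afterEntropy))
# feature 0 gain: 0.4200
# feature 1 gain: 0.1710  -> feature 0 yields the larger gain and is chosen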
'''
Pseudocode for building the decision tree:
if all samples have the same class:
    return that class
if all features have been used up:
    return the majority vote
else:
    choose the best feature
    split the dataset on it
    create a branch node
        for each subset:
            branch node += recursively built subtree
    return the branch node
'''
# Majority vote: return the most frequent class label
def vote(classList):
    classCount = {}
    for i in classList:
        classCount[i] = classCount.get(i, 0) + 1
    # sort by count in descending order and return the most frequent class
    sortClass = sorted(classCount.items(), key=lambda x: x[1], reverse=True)
    return sortClass[0][0]

# Build the decision tree recursively
def createTree(data_2d_array,feature):
    classList = [i[-1] for i in data_2d_array]
    if classList.count(classList[0]) == len(data_2d_array):
        return classList[0]
    if len(data_2d_array[0]) == 1:
        return vote(classList)
    
    # choose the best feature to split on
    bestFeatureIndex = chooseBestFeatureToSplit(data_2d_array)
    bestFeature = feature[bestFeatureIndex]
    
    # create the branch node and remove the used feature label
    myTree = {feature[bestFeatureIndex]:{}}
    del(feature[bestFeatureIndex])
    # keep splitting each data subset recursively
    uniFeatureValueSet = set([i[bestFeatureIndex] for i in data_2d_array])
    for val in uniFeatureValueSet:
        subFeature = feature[:]
        subData = splitData(data_2d_array, bestFeatureIndex, val)
        myTree[bestFeature][val] = createTree(subData,subFeature)
    return myTree
data_2d_array=[[1,1,'yes'],[1,0,'no'],[1,1,'yes'],[0,1,'no'],[0,1,'no']]
createTree(data_2d_array,feature=['no surfacing','flippers'])
{'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}
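The nested dictionary is the tree itself: the single key at each level is a feature name, and each value of that feature maps either to a class label (a leaf) or to another dictionary (a subtree). A small illustration, storing the result in a hypothetical variable tree:

tree = createTree([[1,1,'yes'],[1,0,'no'],[1,1,'yes'],[0,1,'no'],[0,1,'no']],
                  ['no surfacing', 'flippers'])
tree['no surfacing'][0]                  # 'no'  -- this branch is a leaf
tree['no surfacing'][1]['flippers'][1]   # 'yes' -- follow two branches to a leaf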
# Draw tree nodes with matplotlib text annotations
import matplotlib.pyplot as plt
decisionNode = dict(boxstyle='sawtooth',fc='0.8')
leafNode = dict(boxstyle='round4',fc='.8')
arrow_args = dict(arrowstyle='<-')

def plotNode(nodeTxt, centerPt, parentPt, nodeType):
    createPlot.ax1.annotate(nodeTxt, xy = parentPt, \
            xycoords = 'axes fraction', xytext = centerPt, \
            textcoords = 'axes fraction', va = "center", \
            ha = "center", bbox = nodeType, arrowprops = arrow_args)
def createPlot():
    fig = plt.figure(facecolor='white')
#     fig.clf()
    createPlot.ax1 = plt.subplot(111, frameon = False) # frameon: whether to draw the axes frame
    plotNode('decisionNode', (0.5, 0.1), (0.1, 0.5), decisionNode)
    plotNode('leafNode', (0.8, 0.1), (0.3, 0.8), leafNode)
    plt.show()
createPlot()

[Figure output_5_0.png: demo plot of a decision node and a leaf node drawn with annotate]
# Get the number of leaf nodes and the depth of the tree
def getNumLeafs(myTree):
    firstNode = list(myTree.keys())[0]
    secondDict = myTree[firstNode]
    numLeafs = 0
    for key in secondDict:
        if type(secondDict[key]).__name__ == 'dict':
            numLeafs += getNumLeafs(secondDict[key])
        else:
            numLeafs += 1
    return numLeafs
def getTreeDepth(myTree):
    maxDepth = 0
    firstNode = list(myTree.keys())[0]
    secondDict = myTree[firstNode]
    
    # traverse every child, compute the depth of each branch, and keep the deepest one
    for key in secondDict:
        if type(secondDict[key]).__name__ == 'dict':
            # depth through this child = 1 (this node) + depth of the subtree
            thisDepth = 1 + getTreeDepth(secondDict[key])
        else:
            thisDepth = 1
        if thisDepth > maxDepth:
            maxDepth = thisDepth
    return maxDepth
'''My own earlier, incorrect attempt (it counts children instead of recursing):
def getTreeDepth(myTree):
    numDepth = 0
    firstNode = list(myTree.keys())[0]
    secondDict = myTree[firstNode]
    for key in secondDict:
        if type(secondDict[key]).__name__ == 'dict':
            numDepth += 1
        else:
            numDepth = 1
    return numDepth
'''
myTree = {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1:{'flippers': {0: 'no', 1: 'yes'}}}}}}
print('number of leafs is: %d'%getNumLeafs(myTree))
print('depth of tree is: %d' %getTreeDepth(myTree))
number of leafs is: 4
depth of tree is: 3
# Draw the whole tree with matplotlib text annotations
import matplotlib.pyplot as plt
decisionNode = dict(boxstyle='sawtooth',fc='0.8')
leafNode = dict(boxstyle='round4',fc='.8')
arrow_args = dict(arrowstyle='<-')

def plotNode(nodeTxt, centerPt, parentPt, nodeType):
    createPlotTree.ax1.annotate(nodeTxt, xy = parentPt, \
            xycoords = 'axes fraction', xytext = centerPt, \
            textcoords = 'axes fraction', va = "center", \
            ha = "center", bbox = nodeType, arrowprops = arrow_args)
    
# Get the number of leaf nodes and the depth of the tree
def getNumLeafs(myTree):
    firstNode = list(myTree.keys())[0]
    secondDict = myTree[firstNode]
    numLeafs = 0
    for key in secondDict:
        if type(secondDict[key]).__name__ == 'dict':
            numLeafs += getNumLeafs(secondDict[key])
        else:
            numLeafs += 1
    return numLeafs
def getTreeDepth(myTree):
    maxDepth = 0
    firstNode = list(myTree.keys())[0]
    secondDict = myTree[firstNode]
    
    # traverse every child, compute the depth of each branch, and keep the deepest one
    for key in secondDict:
        if type(secondDict[key]).__name__ == 'dict':
            # depth through this child = 1 (this node) + depth of the subtree
            thisDepth = 1 + getTreeDepth(secondDict[key])
        else:
            thisDepth = 1
        if thisDepth > maxDepth:
            maxDepth = thisDepth
    return maxDepth

# Write the branch value midway between a child node and its parent
def plotMidText(centerPt, parentPt, txtString):
    xMid = centerPt[0] + (parentPt[0] - centerPt[0])/2
    yMid = centerPt[1] + (parentPt[1] - centerPt[1])/2
    createPlotTree.ax1.text(xMid,yMid,txtString)

# Recursively plot a (sub)tree; each decision node is centred above its leaves
def plotTree(myTree, parentPt, nodeTxt):
    numLeafs = getNumLeafs(myTree)
    depth = getTreeDepth(myTree)
    firstNode = list(myTree.keys())[0]
    centerPt = (plotTree.xOff + (1.0 + float(numLeafs))/2.0/plotTree.totalW,plotTree.yOff)
    plotMidText(centerPt, parentPt,nodeTxt)
    plotNode(firstNode, centerPt, parentPt,decisionNode)
    secondDict = myTree[firstNode]
    plotTree.yOff = plotTree.yOff - 1.0 / plotTree.totalD
    for key in secondDict:
        if type(secondDict[key]).__name__ == 'dict':
            plotTree(secondDict[key], centerPt, str(key))
        else:
            plotTree.xOff = plotTree.xOff + 1.0 / plotTree.totalW
            plotNode(secondDict[key],(plotTree.xOff, plotTree.yOff),centerPt,leafNode)
            plotMidText((plotTree.xOff,plotTree.yOff),centerPt,str(key))
    plotTree.yOff = plotTree.yOff + 1.0 / plotTree.totalD
    
def createPlotTree(inTree):
    fig = plt.figure(facecolor='white')
    axprops = dict(xticks=[],yticks=[])
    createPlotTree.ax1 = plt.subplot(frameon=False,**axprops)
    plotTree.totalW = float(getNumLeafs(inTree))   # total width, in leaf units
    plotTree.totalD = float(getTreeDepth(inTree))  # total depth, in tree levels
    plotTree.xOff = -.5 / plotTree.totalW          # x position of the last plotted leaf
    plotTree.yOff = 1.0                            # current y position (top of the plot)
    plotTree(inTree,(.5,1.0),'')
    plt.show()
myTree = {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1:{'flippers': {0: 'no', 1: 'yes'}}}}}}
createPlotTree(myTree)
[Figure output_7_0.png: plot of myTree drawn by createPlotTree]
myTree = {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1:'yes'}}}}
featureName = ['no surfacing','flippers']
testVector = [1,1]
# Classify a test vector by walking down the tree
def classify(myTree, featureName, testVector):
    firstNode = list(myTree.keys())[0]
    
    # find the value in testVector that corresponds to the root feature
    indexOfFirstNode = featureName.index(firstNode) 
    valOfFirstNodeInTestVec = testVector[indexOfFirstNode]
    
    secondDict = myTree[firstNode]
    for val in secondDict: # iterate over the subtree's keys, i.e. the possible values of this feature
        if valOfFirstNodeInTestVec == val:
            if type(secondDict[val]).__name__ == 'dict':
                res = classify(secondDict[val],featureName, testVector)
            else:
                res = secondDict[val]
    return res
classify(myTree, featureName, testVector)
'yes'
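The other branches behave the same way (quick checks, not part of the original run):

classify(myTree, featureName, [1, 0])   # 'no' -- 'no surfacing' = 1, then 'flippers' = 0
classify(myTree, featureName, [0, 1])   # 'no' -- 'no surfacing' = 0 already reaches a leaf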
# Persist the decision tree with pickle
def dumpTree(myTree,filename):
    import pickle
    fw = open(filename,'wb') # open in binary mode
    pickle.dump(myTree,fw) # pickle serializes to a binary format
    fw.close()
def loadTree(filename):
    import pickle
    fr = open(filename,'rb')
    tree =  pickle.load(fr)
    fr.close()
    return tree
dumpTree(myTree,'classifierStorage.txt')
loadTree('classifierStorage.txt')
{'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}
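An equivalent variant using with statements (a minor rewrite of the functions above) closes the files automatically even if pickling fails:

import pickle

def dumpTree(myTree, filename):
    with open(filename, 'wb') as fw:   # binary mode, closed automatically
        pickle.dump(myTree, fw)

def loadTree(filename):
    with open(filename, 'rb') as fr:
        return pickle.load(fr)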

Predicting Contact Lens Type with a Decision Tree

# Build and plot a decision tree for the contact-lens dataset
def classifyLenses():
    fr = open('../../Reference Code/Ch03/lenses.txt')
    lenses = [line.strip().split('\t') for line in fr.readlines()]
    fr.close()
    feature = ['age', 'prescript', 'astigmatic', 'tearRate']
    lenseTree = createTree(lenses, feature)
    createPlotTree(lenseTree)
classifyLenses()
[Figure output_11_0.png: decision tree learned from the lenses dataset]