机器学习实战 第3章 —— 决策树

1、trees.py

import sys 
reload(sys) 
sys.setdefaultencoding("utf-8") 
import operator

from math import log
def calcshannonent(dataset):
    """Return the Shannon entropy of *dataset*.

    Each example in *dataset* is a list whose LAST element is the class
    label; the other elements (feature values) are ignored here.
    Returns 0.0 for an empty dataset.
    """
    numentries = len(dataset)
    if numentries == 0:
        # Bug fix: the original left ``shannonent`` uninitialised for an
        # empty dataset (NameError on return).
        return 0.0
    # Count how many examples carry each class label.
    labelcounts = {}
    for featvec in dataset:
        currentlabel = featvec[-1]
        labelcounts[currentlabel] = labelcounts.get(currentlabel, 0) + 1
    # H = -sum(p * log2(p)) over all labels.
    # Bug fix: ``shannonent`` is now initialised once, outside the counting
    # loop (the original re-assigned it on every iteration of that loop).
    shannonent = 0.0
    for count in labelcounts.values():
        prob = float(count) / numentries
        shannonent -= prob * log(prob, 2)
    return shannonent

def createdataset():
    """Return the toy (dataset, labels) pair used throughout this chapter.

    Each row is [no-surfacing?, flippers?, class-label].
    """
    labels = ['no surfacing', 'flippers']
    dataset = [
        [1, 1, 'yes'],
        [1, 1, 'yes'],
        [1, 0, 'no'],
        [0, 1, 'no'],
        [0, 1, 'no'],
    ]
    return dataset, labels

def splitdataset(dataset, axis, value):
    """Return the examples whose feature at index *axis* equals *value*,
    with that feature column removed from each returned example.
    """
    return [
        example[:axis] + example[axis + 1:]
        for example in dataset
        if example[axis] == value
    ]

def choosebestfeaturetosplit(dataset):
    """Return the index of the feature with the largest information gain,
    or -1 when no split improves on the dataset's base entropy.
    """
    baseentropy = calcshannonent(dataset)
    bestfeature = -1
    bestinfogain = 0.0
    numfeatures = len(dataset[0]) - 1  # last column is the class label
    for featindex in range(numfeatures):
        # Distinct values taken by this feature across the dataset.
        values = set(example[featindex] for example in dataset)
        # Weighted entropy of the partition induced by this feature.
        newentropy = 0.0
        for val in values:
            subset = splitdataset(dataset, featindex, val)
            weight = len(subset) / float(len(dataset))
            newentropy += weight * calcshannonent(subset)
        gain = baseentropy - newentropy
        if gain > bestinfogain:
            bestinfogain = gain
            bestfeature = featindex
    return bestfeature

def majoritycnt(classlist):
    """Return the most frequent class label in *classlist*.

    Used as the leaf label when no features remain to split on.
    """
    classcount = {}
    for vote in classlist:
        # Bug fix: the original tested ``vote not in classcount.keys``
        # (missing call parentheses), which raises TypeError — membership
        # cannot be tested against the unbound method object.
        classcount[vote] = classcount.get(vote, 0) + 1
    # ``items()`` instead of Python-2-only ``iteritems()`` (works on both).
    sortedclasscount = sorted(classcount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedclasscount[0][0]

def createtree(dataset, labels):
    """Recursively build an ID3 decision tree as nested dicts.

    The tree has the form {feature_label: {feature_value: subtree_or_leaf}}.
    NOTE: mutates *labels* (deletes the chosen feature's label) — callers
    that need the list afterwards should pass a copy.
    """
    classlist = [example[-1] for example in dataset]
    # Leaf: every remaining example has the same class.
    if classlist.count(classlist[0]) == len(classlist):
        return classlist[0]
    # Leaf: only the class column is left — fall back to majority vote.
    if len(dataset[0]) == 1:
        # Bug fix: the original returned the *function object*
        # ``majoritycnt`` instead of calling it on the class list.
        return majoritycnt(classlist)
    bestfeat = choosebestfeaturetosplit(dataset)
    bestfeaturelabel = labels[bestfeat]
    mytree = {bestfeaturelabel: {}}
    del labels[bestfeat]
    featvalues = [example[bestfeat] for example in dataset]
    for value in set(featvalues):
        # Copy so sibling branches see an unmodified label list.
        sublabels = labels[:]
        mytree[bestfeaturelabel][value] = createtree(
            splitdataset(dataset, bestfeat, value), sublabels)
    return mytree

if __name__ == '__main__':
    # Build the decision tree for the toy dataset and display it.
    mydat, labels = createdataset()
    mytree = createtree(mydat, labels)
    print(mytree)

最后编辑于
©著作权归作者所有，转载或内容合作请联系作者
【社區(qū)內(nèi)容提示】社區(qū)部分內(nèi)容疑似由AI輔助生成,瀏覽時(shí)請(qǐng)結(jié)合常識(shí)與多方信息審慎甄別。
平臺(tái)聲明:文章內(nèi)容(如有圖片或視頻亦包括在內(nèi))由作者上傳并發(fā)布,文章內(nèi)容僅代表作者本人觀點(diǎn),簡(jiǎn)書系信息發(fā)布平臺(tái),僅提供信息存儲(chǔ)服務(wù)。

相关阅读更多精彩内容

友情链接更多精彩内容