決策樹(decision tree)是一種基本的分類與回歸方法。學(xué)習(xí)時(shí),利用訓(xùn)練數(shù)據(jù)根據(jù)損失函數(shù)最小化的原則建立決策樹模型;預(yù)測(cè)時(shí),對(duì)新的數(shù)據(jù),利用決策樹模型進(jìn)行分類。包括特征選擇、決策樹的生成、決策樹的剪枝。
例題5.1
算法步驟:
輸入:訓(xùn)練數(shù)據(jù)集D和特征集A
輸出:特征A對(duì)訓(xùn)練數(shù)據(jù)集D的信息增益g(D,A)
步驟:
(1)計(jì)算數(shù)據(jù)集D的經(jīng)驗(yàn)熵H(D)

(2)計(jì)算特征A對(duì)數(shù)據(jù)集D的經(jīng)驗(yàn)條件熵H(D|A)

(3)計(jì)算信息增益

import numpy as np
from math import log
# 熵
def calc_ent(datasets):
data_length = len(datasets)
label_count = {}
for i in range(data_length):
label = datasets[i][-1]
if label not in label_count:
label_count[label] = 0
label_count[label] += 1
ent = -sum([(p / data_length) * log(p / data_length, 2) for p in label_count.values()])
return ent
# 經(jīng)驗(yàn)條件熵
def cond_ent(datasets, axis=0):
data_length = len(datasets)
feature_sets = {}
for i in range(data_length):
feature = datasets[i][axis]
if feature not in feature_sets:
feature_sets[feature] = []
feature_sets[feature].append(datasets[i])
cond_ent = sum([(len(p) / data_length) * calc_ent(p) for p in feature_sets.values()])
return cond_ent
# 信息增益
def info_gain(ent, cond_ent):
return ent - cond_ent
# 最大信息增益
def info_gain_train(datasets):
count = len(datasets[0]) - 1
ent = calc_ent(datasets)
best_feature = []
for c in range(count):
c_info_gain = info_gain(ent, cond_ent(datasets, axis=c))
best_feature.append((c, c_info_gain))
print('特征({}) - info_gain - {:.3f}'.format(labels[c], c_info_gain))
best_ = max(best_feature, key=lambda x: x[-1])
return '特征({})的信息增益最大,選擇為根節(jié)點(diǎn)特征'.format(labels[best_[0]])
def create_data():
datasets=[['青年', '否', '否', '一般', '否'],
['青年', '否', '否', '好', '否'],
['青年', '是', '否', '好', '是'],
['青年', '是', '是', '一般', '是'],
['青年', '否', '否', '一般', '否'],
['中年', '否', '否', '一般', '否'],
['中年', '否', '否', '好', '否'],
['中年', '是', '是', '好', '是'],
['中年', '否', '是', '非常好', '是'],
['中年', '否', '是', '非常好', '是'],
['老年', '否', '是', '非常好', '是'],
['老年', '否', '是', '好', '是'],
['老年', '是', '否', '好', '是'],
['老年', '是', '否', '非常好', '是'],
['老年', '否', '否', '一般', '否']]
labels=[u'年齡', u'有工作', u'有自己的房子', u'信貸情況', u'類別']
return datasets, labels
datasets, labels=create_data()
info_gain_train(np.array(datasets))
ID3算法
輸入:訓(xùn)練數(shù)據(jù)集D,特征集A,閾值eta
輸出:決策樹T
步驟:
(1)若D中所有實(shí)例屬于同一類Ck,則T為單結(jié)點(diǎn)樹,并將類Ck作為該結(jié)點(diǎn)的類標(biāo)記,返回T;
(2)若A=?,則T為單結(jié)點(diǎn)樹,并將D中實(shí)例最大的類Ck作為該結(jié)點(diǎn)的類標(biāo)記,返回T;
(3)否則,計(jì)算A中各特征對(duì)D的信息增益,選擇信息增益最大的特征Ag;
(4)如果Ag的信息增益小于閾值eta,則置T為單結(jié)點(diǎn)樹,并將D中實(shí)例數(shù)最大的類Ck作為該結(jié)點(diǎn)的類標(biāo)記,返回T;
(5)否則對(duì)Ag的每一可能值αi,依Ag=αi將D分割為若干非空子集Di,將Di中實(shí)例數(shù)最大的類作為標(biāo)記,構(gòu)建子結(jié)點(diǎn),由結(jié)點(diǎn)及其子結(jié)點(diǎn)構(gòu)成樹T,返回T;
(6)對(duì)第i個(gè)子結(jié)點(diǎn),以Di為訓(xùn)練集,以A-Ag為特征集,遞歸調(diào)用步驟1~5,得到子樹Ti,返回T。
import numpy as np
from math import log
import pandas as pd
class Node:
def __init__(self, root=True, label=None, feature_name=None, feature=None):
self.root = root
self.label = label
self.feature_name = feature_name
self.feature = feature
self.tree = {}
self.result = {
'label': self.label,
'feature': self.feature,
'tree': self.tree
}
def __repr__(self):
# 自定義返回值
return '{}'.format(self.result)
def add_node(self, val, node):
self.tree[val] = node
def predict(self, features):
if self.root is True:
return self.label
return self.tree[features[self.feature]].predict(features)
class DTree:
def __init__(self, epsilon=0.1):
self.epsilon = epsilon
self._tree = {}
# 熵
@staticmethod
def calc_ent(datasets):
data_length = len(datasets)
label_count = {}
for i in range(data_length):
label = datasets[i][-1]
if label not in label_count:
label_count[label] = 0
label_count[label] += 1
ent = -sum([(p / data_length) * log(p / data_length, 2) for p in label_count.values()])
return ent
# 經(jīng)驗(yàn)條件熵
def cond_ent(self, datasets, axis=0):
data_length = len(datasets)
feature_sets = {}
for i in range(data_length):
feature = datasets[i][axis]
if feature not in feature_sets:
feature_sets[feature] = []
feature_sets[feature].append(datasets[i])
cond_ent = sum([(len(p) / data_length) * self.calc_ent(p) for p in feature_sets.values()])
return cond_ent
# 信息增益
@staticmethod
def info_gain(ent, cond_ent):
return ent - cond_ent
# 最大信息增益
def info_gain_train(self, datasets):
count = len(datasets[0]) - 1
ent = self.calc_ent(datasets)
best_feature = []
for c in range(count):
c_info_gain = self.info_gain(ent, self.cond_ent(datasets, axis=c))
best_feature.append((c, c_info_gain))
best_ = max(best_feature, key=lambda x: x[-1])
return best_
def train(self, train_data):
_, y_train, features = train_data.iloc[:, :-1], train_data.iloc[:, -1], train_data.columns[:-1]
# 1,若D中實(shí)例屬于同一類Ck,則T為單節(jié)點(diǎn)樹,并將類Ck作為結(jié)點(diǎn)的類標(biāo)記,返回T
if len(y_train.value_counts()) == 1:
return Node(root=True, label=y_train.iloc[0])
# 2, 若A為空,則T為單節(jié)點(diǎn)樹,將D中實(shí)例樹最大的類Ck作為該節(jié)點(diǎn)的類標(biāo)記,返回T
if len(features) == 0:
return Node(root=True,
label=y_train.value_counts().sort_values(ascending=False).index[0])
# 3,計(jì)算最大信息增益 Ag為信息增益最大的特征
max_feature, max_info_gain = self.info_gain_train(np.array(train_data))
max_feature_name = features[max_feature]
# 4,Ag的信息增益小于閾值eta,則置T為單節(jié)點(diǎn)樹,并將D中是實(shí)例數(shù)最大的類Ck作為該節(jié)點(diǎn)的類標(biāo)記,返回T
if max_info_gain < self.epsilon:
return Node(
root=True,
label=y_train.value_counts().sort_values(ascending=False).index[0]
)
# 5,構(gòu)建Ag子集
node_tree = Node(
root=False, feature_name=max_feature_name, feature=max_feature
)
feature_list = train_data[max_feature_name].value_counts().index
for f in feature_list:
sub_train_df = train_data.loc[train_data[max_feature_name] == f].drop([max_feature_name], axis=1)
# 6 遞歸生成樹
sub_tree = self.train(sub_train_df)
node_tree.add_node(f, sub_tree)
return node_tree
def fit(self, train_data):
self._tree = self.train(train_data)
return self._tree
def predict(self, X_test):
return self._tree.predict(X_test)
def create_data():
datasets = [['青年', '否', '否', '一般', '否'],
['青年', '否', '否', '好', '否'],
['青年', '是', '否', '好', '是'],
['青年', '是', '是', '一般', '是'],
['青年', '否', '否', '一般', '否'],
['中年', '否', '否', '一般', '否'],
['中年', '否', '否', '好', '否'],
['中年', '是', '是', '好', '是'],
['中年', '否', '是', '非常好', '是'],
['中年', '否', '是', '非常好', '是'],
['老年', '否', '是', '非常好', '是'],
['老年', '否', '是', '好', '是'],
['老年', '是', '否', '好', '是'],
['老年', '是', '否', '非常好', '是'],
['老年', '否', '否', '一般', '否']]
labels = [u'年齡', u'有工作', u'有自己的房子', u'信貸情況', u'類別']
return datasets, labels
datasets, labels = create_data()
data_df = pd.DataFrame(datasets, columns=labels)
dt = DTree()
tree = dt.fit(data_df)
print(dt.predict(['老年', '否', '否', '一般']))