機器學習——sklearn實現決策樹

導入包和數據集

# 導入包
import numpy as np
import matplotlib.pyplot as plt

# 導入數據，生成相應的輸入數據和輸出數據集
from sklearn import datasets

# Load the bundled iris dataset.
iris = datasets.load_iris()
# Keep every row but only the columns from index 2 onward (the last two of the four iris features).
X = iris.data[:, 2:]
# Class labels, one per row of X.
y = iris.target

查看數據的相關信息

# List the fields available on the loaded dataset object.
iris.keys()   
# e.g. dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names', ...]) — exact keys vary by scikit-learn version

# Shape of the full sample matrix (all features, before column selection).
Z = iris.data
Z.shape

# Shape of the label vector.
y = iris.target
y.shape

# Human-readable names of the target classes.
iris.target_names

image.png

繪圖

# One scatter series per class label so each class gets its own colour.
for label in (0, 1, 2):
    members = y == label
    plt.scatter(X[members, 0], X[members, 1])
plt.show()

image.png

調用sklearn

# 導入決策樹包
from sklearn.tree import DecisionTreeClassifier

# Create the classifier with two arguments: the maximum tree depth and the split criterion (information entropy).
dt_clf = DecisionTreeClassifier(max_depth=2, criterion='entropy')
# Fit the model on the two selected features; fit() returns the fitted estimator.
dt_clf.fit(X, y)

# Draw the decision regions of a fitted two-feature classifier.
def plot_decision_boundary(model, axis):
    """Shade the regions that ``model`` assigns to each class.

    Parameters
    ----------
    model : object
        Fitted estimator whose ``predict`` accepts an array with two columns.
    axis : sequence of four numbers
        Plot window given as ``[x_min, x_max, y_min, y_max]``.

    The window is sampled at about 100 points per unit along each direction,
    the model is evaluated on every grid point, and the predictions are drawn
    as filled contours on the current matplotlib axes.
    """
    from matplotlib.colors import ListedColormap

    # At least two samples per direction are needed to form a contour grid.
    n_x0 = max(2, int((axis[1] - axis[0]) * 100))
    n_x1 = max(2, int((axis[3] - axis[2]) * 100))

    # meshgrid wants 1-D coordinate vectors; the former hard-coded reshapes
    # only matched one particular window and failed for any other.
    x0, x1 = np.meshgrid(
        np.linspace(axis[0], axis[1], n_x0),
        np.linspace(axis[2], axis[3], n_x1),
    )
    X_new = np.c_[x0.ravel(), x1.ravel()]

    y_predict = model.predict(X_new)
    z = y_predict.reshape(x0.shape)

    # Fixed pastel palette for the filled regions.
    custom_cmap = ListedColormap(['#EF9A9A', '#FFF59D', '#90CAF9'])

    # `linewidth` is not a contourf option (it was ignored), so it is dropped.
    plt.contourf(x0, x1, z, cmap=custom_cmap)

# Shade the decision regions first, then overlay the samples class by class.
plot_decision_boundary(dt_clf, axis=[0.5, 7.5, 0, 3])

for label in (0, 1, 2):
    members = y == label
    plt.scatter(X[members, 0], X[members, 1])
plt.show()

image.png

信息熵實現

熵來自于熱力學系統，熵越大，粒子運動越劇烈，不規則性越大；反之亦然。在信息論中，熵是不確定性的度量

信息熵越大，不確定性越大
信息熵越小，不確定性越小

計算公式： $H=-\sum_{i=1}^{k}p_i\log(p_i)$
二分類問題： $H(p)=-p\log_2 p-(1-p)\log_2(1-p)$

image.png

結果

當樣本的分類中只有兩個類別，且當它們的占比都為0.5時

占比為0.5時，不確定性最高，即熵最大
當x變大或者變小的時候，更加偏向于某個類，確定性都會提高，信息熵都會變小
當每個類別是等概率的時，不確定性最高

使用信息熵尋找最優劃分

def split(X, y, d, value):
    """Partition the samples on feature ``d`` at threshold ``value``.

    Rows whose feature ``d`` is ``<= value`` form the first group and rows
    where it is ``> value`` form the second.

    Returns ``(X_first, X_second, y_first, y_second)``.
    """
    column = X[:, d]
    at_or_below = column <= value
    above = column > value
    return X[at_or_below], X[above], y[at_or_below], y[above]

from collections import Counter
from math import log

def entropy(y):
    """Return the information entropy of the labels in ``y``.

    Each distinct label contributes ``-p * log(p)``, where ``p`` is its share
    of ``y`` and ``log`` is the natural logarithm. An empty ``y`` gives 0.0.
    """
    total = 0.0
    for count in Counter(y).values():
        share = count / len(y)
        total -= share * log(share)
    return total

def try_split(X, y):
    """Search every feature and threshold for the lowest-entropy split.

    For each feature, the midpoint between each pair of neighbouring distinct
    sorted values is tried as a threshold. A candidate is scored by the
    sample-weighted average entropy of its two children, so a child's
    impurity counts in proportion to the number of samples it holds.
    (The previous plain sum let a tiny child weigh as much as a large one.)

    Returns ``(best_entropy, best_d, best_v)``: the best score, the feature
    index and the threshold. If no feature has two distinct values the result
    is ``(inf, -1, -1)``.
    """
    best_entropy = float("inf")
    best_d, best_v = -1, -1
    n_samples = len(X)
    # Exhaustive search over every feature dimension.
    for d in range(X.shape[1]):
        ordered = X[np.argsort(X[:, d]), d]
        for lower, upper in zip(ordered[:-1], ordered[1:]):
            # Equal neighbours cannot be separated by a threshold.
            if lower != upper:
                v = (lower + upper) / 2
                _, _, y_l, y_r = split(X, y, d, v)
                e = (len(y_l) * entropy(y_l) + len(y_r) * entropy(y_r)) / n_samples
                # Strict comparison keeps the first of several equally good splits.
                if e < best_entropy:
                    best_entropy, best_d, best_v = e, d, v
    return best_entropy, best_d, best_v

image.png

基尼系數

計算公式： $G=1-\sum_{i=1}^{k}p_i^2$

在sklearn中默認使用的是基尼系數進行決策樹的劃分

from collections import Counter
from math import log

def split(X, y, d, value):
    """Partition the samples on feature ``d`` at threshold ``value``.

    Rows whose feature ``d`` is ``<= value`` form the first group and rows
    where it is ``> value`` form the second.

    Returns ``(X_first, X_second, y_first, y_second)``.
    """
    column = X[:, d]
    at_or_below = column <= value
    above = column > value
    return X[at_or_below], X[above], y[at_or_below], y[above]

def gini(y):
    # 變成字典形式
    counter = Counter(y)
    res = 1.0
    for num in counter.values():
        p = num / len(y)
        # 公式中用1-p**2， 不斷地遍歷
        res += -p ** 2
    return res

def try_split(X, y):
    """Search every feature and threshold for the lowest-Gini split.

    For each feature, the midpoint between each pair of neighbouring distinct
    sorted values is tried as a threshold. A candidate is scored by the
    sample-weighted average Gini impurity of its two children, so a child's
    impurity counts in proportion to the number of samples it holds.
    (The previous plain sum let a tiny child weigh as much as a large one.)

    Returns ``(best_g, best_d, best_v)``: the best score, the feature index
    and the threshold. If no feature has two distinct values the result is
    ``(inf, -1, -1)``.
    """
    best_g = float("inf")
    best_d, best_v = -1, -1
    n_samples = len(X)
    # Visit every feature of X.
    for d in range(X.shape[1]):
        # Feature values in ascending order.
        ordered = X[np.argsort(X[:, d]), d]
        # Walk over neighbouring pairs of sorted values.
        for lower, upper in zip(ordered[:-1], ordered[1:]):
            # Only distinct neighbours give a usable threshold: their midpoint.
            if lower != upper:
                v = (lower + upper) / 2
                _, _, y_l, y_r = split(X, y, d, v)
                g = (len(y_l) * gini(y_l) + len(y_r) * gini(y_r)) / n_samples
                # Strict comparison keeps the first of several equally good splits.
                if g < best_g:
                    best_g, best_d, best_v = g, d, v
    return best_g, best_d, best_v

image.png

CART和超參數

Classification And Regression Tree 是sklearn中的默認方式

預測:O(logm)
訓練：O(nmlogm)

剪枝的作用是：降低復雜度和防止過擬合

import numpy as np
import matplotlib.pyplot as plt

from sklearn import datasets
# Synthetic two-class "moons" data with a fixed seed.
X, y = datasets.make_moons(noise=0.15, random_state=666)

# X[y == label, 0] picks the first feature of the rows whose label matches; column 1 is the second feature.
for label in (0, 1):
    members = y == label
    plt.scatter(X[members, 0], X[members, 1])
plt.show()

image.png

from sklearn.tree import DecisionTreeClassifier

# Decision tree with all hyper-parameters left at their defaults, fitted on the moons data.
dt_clf = DecisionTreeClassifier()
dt_clf.fit(X, y)
# Draw the decision regions of a fitted two-feature classifier.
def plot_decision_boundary(model, axis):
    """Shade the regions that ``model`` assigns to each class.

    Parameters
    ----------
    model : object
        Fitted estimator whose ``predict`` accepts an array with two columns.
    axis : sequence of four numbers
        Plot window given as ``[x_min, x_max, y_min, y_max]``.

    The window is sampled at about 100 points per unit along each direction,
    the model is evaluated on every grid point, and the predictions are drawn
    as filled contours on the current matplotlib axes.
    """
    from matplotlib.colors import ListedColormap

    # At least two samples per direction are needed to form a contour grid.
    n_x0 = max(2, int((axis[1] - axis[0]) * 100))
    n_x1 = max(2, int((axis[3] - axis[2]) * 100))

    # meshgrid wants 1-D coordinate vectors; the former hard-coded reshapes
    # only matched one particular window and failed for any other.
    x0, x1 = np.meshgrid(
        np.linspace(axis[0], axis[1], n_x0),
        np.linspace(axis[2], axis[3], n_x1),
    )
    X_new = np.c_[x0.ravel(), x1.ravel()]

    y_predict = model.predict(X_new)
    z = y_predict.reshape(x0.shape)

    # Fixed pastel palette for the filled regions.
    custom_cmap = ListedColormap(['#EF9A9A', '#FFF59D', '#90CAF9'])

    # `linewidth` is not a contourf option (it was ignored), so it is dropped.
    plt.contourf(x0, x1, z, cmap=custom_cmap)

# Shade the decision regions first, then overlay both classes of samples.
plot_decision_boundary(dt_clf, axis=[-1.5, 2.5, -1.0, 1.5])

for label in (0, 1):
    members = y == label
    plt.scatter(X[members, 0], X[members, 1])
plt.show()

image.png

調節參數

# Refit with the tree depth capped at 2.
dt_clf = DecisionTreeClassifier(max_depth=2)
dt_clf.fit(X, y)

plot_decision_boundary(dt_clf, axis=[-1.5, 2.5, -1.0, 1.5])
for label in (0, 1):
    members = y == label
    plt.scatter(X[members, 0], X[members, 1])
plt.show()

image.png

決策樹應用于回歸問題：波士頓房價

import numpy as np
import matplotlib.pyplot as plt

from sklearn import datasets
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell needs an older release — confirm the installed version.
boston = datasets.load_boston()
X = boston.data
y = boston.target

from sklearn.model_selection import train_test_split
# Split the data into training and test sets with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=666)
from sklearn.tree import DecisionTreeRegressor

# Regression tree with default hyper-parameters.
dt_reg = DecisionTreeRegressor()
dt_reg.fit(X_train, y_train)

# The score on the held-out test set is not high.
dt_reg.score(X_test, y_test)   # 0.58

# A perfect score on the training set with a poor test score indicates overfitting.
dt_reg.score(X_train, y_train)  # 1

網格搜索

from sklearn.tree import DecisionTreeRegressor

dt_reg = DecisionTreeRegressor()
dt_reg.fit(X_train, y_train)

# Hyper-parameter grid for the search; mind the lower bound of each range.
para_grid = [
    {
        "max_depth": list(range(1, 10)),
        "max_leaf_nodes": list(range(2, 10)),
        "min_samples_split": list(range(2, 10)),
        "max_features": list(range(1, 10)),
    }
]

# GridSearchCV takes the estimator instance and the parameter grid.
from sklearn.model_selection import GridSearchCV
# Build the search object.
grid_search = GridSearchCV(dt_reg, para_grid)

%%time
# Run the search over the grid on the training data; %%time is an IPython cell magic that reports the cell's run time.
grid_search.fit(X_train, y_train)

image.png

色偷偷精品伊人,欧洲久久精品,欧美综合婷婷骚逼,国产AV主播,国产最新探花在线,九色在线视频一区,伊人大交九欧美,1769亚洲,黄色成人av

機器學習——sklearn實現決策樹

機器學習——sklearn實現決策樹

導入包和數據集

查看數據的相關信息

繪圖

調用sklearn

信息熵實現

結果

使用信息熵尋找最優劃分

基尼系數

CART和超參數

調節參數

決策樹應用于回歸問題：波士頓房價

網格搜索

相關閱讀更多精彩內容

友情鏈接更多精彩內容

色偷偷精品伊人,欧洲久久精品,欧美综合婷婷骚逼,国产AV主播,国产最新探花在线,九色在线视频一区,伊人大交九 欧美,1769亚洲,黄色成人av

機器學習——sklearn實現決策樹

導入包和數據集

查看數據的相關信息

繪圖

調用sklearn

信息熵實現

結果

使用信息熵尋找最優劃分

基尼系數

CART和超參數

調節參數

決策樹應用于回歸問題：波士頓房價

網格搜索

相關閱讀更多精彩內容

友情鏈接更多精彩內容

色偷偷精品伊人,欧洲久久精品,欧美综合婷婷骚逼,国产AV主播,国产最新探花在线,九色在线视频一区,伊人大交九欧美,1769亚洲,黄色成人av