# Reference: https://blog.csdn.net/y0367/article/details/51501780
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
# Load the training set; the Age column is read as 64-bit floats.
train = pd.read_csv("train.csv", dtype={"Age": np.float64})

# Preview the first ten rows, then report how many rows were loaded.
print(train.head(n=10))
print(train.shape[0])
# Integer codes assigned to the categorical columns.
_SEX_CODES = {"male": 0, "female": 1}
_EMBARKED_CODES = {"S": 0, "C": 1, "Q": 2}


def _encode_labels(column, codes):
    """Return *column* with every value that is a key of *codes* replaced by its code.

    Values that are not keys of *codes* (missing values, already-encoded
    integers, unexpected labels) pass through unchanged.
    """
    return column.map(lambda value: codes.get(value, value))


def harmonize_data(titanic):
    """Fill missing values and integer-encode the string columns of *titanic*.

    The frame is modified in place and the same object is returned.

    * ``Age`` and ``Fare``: missing entries are replaced by the column median.
    * ``Sex``: ``"male"`` becomes 0 and ``"female"`` becomes 1.
    * ``Embarked``: missing entries are treated as ``"S"``; then ``"S"``,
      ``"C"`` and ``"Q"`` become 0, 1 and 2.

    Args:
        titanic: DataFrame with ``Age``, ``Sex``, ``Embarked`` and ``Fare``
            columns.

    Returns:
        The same DataFrame, after the replacements above.
    """
    # Missing ages are replaced by the median age (not the mean).
    titanic["Age"] = titanic["Age"].fillna(titanic["Age"].median())

    # Each encoded column is rebuilt as a whole instead of writing integers
    # into a string column through a mask: pandas can then infer a numeric
    # dtype for the result, and the assignment does not depend on the string
    # column accepting integer values.
    titanic["Sex"] = _encode_labels(titanic["Sex"], _SEX_CODES)
    titanic["Embarked"] = _encode_labels(
        titanic["Embarked"].fillna("S"), _EMBARKED_CODES
    )

    titanic["Fare"] = titanic["Fare"].fillna(titanic["Fare"].median())
    return titanic
train_data = harmonize_data(train)

# Feature columns passed to the classifier.
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]

# One (min_samples_leaf, n_estimators, hold-out accuracy) triple per
# parameter combination tried.
results = []

# Candidate values for min_samples_leaf.
sample_leaf_options = list(range(1, 500, 3))
# Candidate values for n_estimators (number of trees).
n_estimators_options = list(range(1, 1000, 5))

# The first `train_rows` rows train the model and every remaining row is held
# out for scoring. Both halves are cut at the same index so no row is dropped
# between them, and the slices are taken once because they do not change
# inside the loops.
train_rows = 600
train_features = train_data[predictors].iloc[:train_rows]
train_labels = train_data["Survived"].iloc[:train_rows]
test_features = train_data[predictors].iloc[train_rows:]
# NOTE: the misspelled name is kept because it is a module-level variable
# that code outside this section may reference.
groud_truth = train_data["Survived"].iloc[train_rows:]

for leaf_size in sample_leaf_options:
    for n_estimators_size in n_estimators_options:
        alg = RandomForestClassifier(
            min_samples_leaf=leaf_size,
            n_estimators=n_estimators_size,
            random_state=50,
        )
        alg.fit(train_features, train_labels)
        predict = alg.predict(test_features)

        # Fraction of held-out rows whose prediction matches the true label.
        accuracy = (groud_truth == predict).mean()
        results.append((leaf_size, n_estimators_size, accuracy))
        print(accuracy)

# Report the combination with the highest hold-out accuracy.
print(max(results, key=lambda x: x[2]))