解決樣本不平衡問題
from imblearn.over_sampling import RandomOverSampler
機器學習實戰編碼技巧
一堆頭文件:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import RandomOverSampler # oversampling
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, RobustScaler # data preprocessing
# Model selection
from sklearn.model_selection import cross_val_score # cross-validation helper
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold, KFold, GridSearchCV
# Model evaluation metrics
from sklearn.metrics import f1_score, roc_auc_score,accuracy_score,confusion_matrix, precision_recall_curve, auc, roc_curve, recall_score, classification_report
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2; this import fails on newer releases.
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import cohen_kappa_score # agreement statistic; can also be used to gauge classification quality
# Hyperparameter tuning
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
# Machine-learning models
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, BaggingClassifier # ensemble learning
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
import missingno as msno # library for visualizing missing values
from scipy.stats import randint # random numbers
from catboost import CatBoostClassifier # machine-learning library
import xgboost as xgb
from xgboost import XGBClassifier, plot_importance
import lightgbm as lgb # decision-tree-based boosting algorithm
import pickle # model serialization
SVD截斷奇異值分解
# Compress the one-hot encoded Region_Code columns into a handful of latent
# components with truncated SVD.
from sklearn.decomposition import TruncatedSVD  # was used without being imported

# Single source of truth for the component count: it drives both the SVD and
# the generated column names, so the two cannot drift apart.
n_region_components = 5

a = pd.get_dummies(X['Region_Code'], prefix='Region_Code', drop_first=True)
svd = TruncatedSVD(n_components=n_region_components, n_iter=7, random_state=42)
svd.fit(a)
data1 = pd.DataFrame(svd.transform(a))
data1.columns = ['Region_Code_' + str(i) for i in range(n_region_components)]
對樣本進行下采樣:
# Under-sample the data set with a fixed seed so the result is reproducible.
from imblearn.under_sampling import RandomUnderSampler  # was used without being imported

rus = RandomUnderSampler(random_state=0)
# fit_resample returns the resampled features and labels; both are rebound.
X, y = rus.fit_resample(X, y)
- 召回率:樣本中的正例有多少被預測正確了
交叉驗證:
# 10-fold cross-validation of `model` on the training data.
# Import the names directly: the `model_selection` module itself is not bound
# at this point, so `model_selection.KFold` would raise NameError.
from sklearn.model_selection import KFold, cross_val_score

# Unshuffled, consecutive folds (random_state is unused when shuffle=False).
kf = KFold(n_splits=10, random_state=None, shuffle=False)
# One score per fold, using the estimator's default scorer.
scores = cross_val_score(model, X_train, y_train, cv=kf)
主成分分析:
# Principal component analysis: collapse the three Vehicle_Age dummy columns
# into a single component.
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler, RobustScaler

vehicle_age_columns = [
    'Vehicle_Age_1-2 Year',
    'Vehicle_Age_< 1 Year',
    'Vehicle_Age_> 2 Years',
]
X = raw_nonu[vehicle_age_columns]

# n_components is the number of principal components to keep; fit() returns
# the estimator itself, so construction and fitting can be chained.
pca = PCA(n_components=1).fit(X)
pca_Vehicile = pca.transform(X)
正態性檢驗
# Shapiro-Wilk normality test on the Vintage feature of the training set.
from scipy import stats  # `stats` was used without being imported

# Prints the (statistic, p-value) result. NOTE: SciPy documents that the
# p-value may be inaccurate for samples larger than 5000 observations.
print("Vintage",stats.shapiro(X_train.Vintage))
計算得分:
# Compute the standard classification scores for `predictions` against `y_test`.
from sklearn.metrics import precision_score  # was used without being imported

# Print every score: as bare expressions only the last one would be displayed
# in a notebook cell (and none at all in a script).
print('accuracy :', accuracy_score(y_test, predictions))
print('precision:', precision_score(y_test, predictions))
print('recall   :', recall_score(y_test, predictions))
print('f1       :', f1_score(y_test, predictions))
混淆矩陣:
# Plot the confusion matrix of a fitted classifier on the test data.
# plot_confusion_matrix was removed in scikit-learn 1.2;
# ConfusionMatrixDisplay.from_estimator (available since 1.0) replaces it
# and takes the same leading arguments.
from sklearn.metrics import ConfusionMatrixDisplay

ConfusionMatrixDisplay.from_estimator(tree_classifier, X_test, y_test)  # (classifier, features, labels)
數據標準化/歸一化:
# Feature scaling on the training frame.
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, RobustScaler

# Standardization of the numeric feature columns listed in `num_feat`:
# learn the scaling parameters first, then apply them in place.
ss = StandardScaler()
ss.fit(train[num_feat])
train[num_feat] = ss.transform(train[num_feat])

# Min-max normalization of the Annual_Premium column, same two-step pattern.
mm = MinMaxScaler()
mm.fit(train[['Annual_Premium']])
train[['Annual_Premium']] = mm.transform(train[['Annual_Premium']])
matlab化:
# IPython/Jupyter magic (not valid in a plain .py script): loads numpy and
# matplotlib names into the interactive namespace, MATLAB-style, so calls such
# as plot(...) or title(...) work unprefixed, and renders figures inline.
%pylab inline
畫ROC曲線:
%pylab inline
y_score = rf_load.predict_proba(x_test)[:,1]
fpr, tpr, _ = roc_curve(y_test, y_score)
title('Random Forest ROC curve: CC Fraud')
xlabel('FPR (Precision)')
ylabel('TPR (Recall)')
plot(fpr,tpr)
plot((0,1), ls='dashed',color='black')
plt.show()
print ('Area under curve (AUC): ', auc(fpr,tpr))
roc_auc_score(y_test, y_score)
plt.savefig("./ROC曲線")
劃分訓練集測試集:
# Split the labelled data into training and test partitions.
from sklearn.model_selection import train_test_split

train_target = train['Response']
# Remove the label column from the features. Splitting `train` as-is would
# leave 'Response' inside x_train/x_test and leak the target to the model.
train_features = train.drop(columns=['Response'])
# Fixed random_state keeps the split reproducible; default test size is used.
x_train, x_test, y_train, y_test = train_test_split(train_features, train_target, random_state=0)
決策樹:
# Decision-tree classifier with library-default hyperparameters (unfitted).
tree_classifier = DecisionTreeClassifier()
xgboost
# Gradient-boosted tree classifier trained on the full feature matrix.
# eval_metric belongs on the estimator: passing it to fit() was deprecated in
# XGBoost 1.6 and removed in 2.0, where it raises TypeError.
# NOTE(review): 'mlogloss' is the multi-class log loss; if the target is
# binary, 'logloss' is the matching metric - confirm against the data.
model_xgb = XGBClassifier(eval_metric='mlogloss')
model_xgb.fit(X, y)
隨機森林:
# Random forest tuned with a randomized hyperparameter search.
from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier

# Candidate values the search samples from.
random_search = dict(
    criterion=['entropy', 'gini'],
    max_depth=[2, 3, 4, 5, 6, 7, 10],
    min_samples_leaf=[4, 6, 8],
    min_samples_split=[5, 7, 10],
    n_estimators=[300],
)

clf = RandomForestClassifier()

# 10 sampled configurations, each scored with 4-fold cross-validation;
# n_jobs=-1 uses every available core, random_state fixes the sampling.
model = RandomizedSearchCV(
    estimator=clf,
    param_distributions=random_search,
    n_iter=10,
    cv=4,
    verbose=1,
    random_state=101,
    n_jobs=-1,
)
model.fit(x_train, y_train)

# Predictions on the test features from the fitted search object.
y_pred = model.predict(x_test)
KNN:
# k-nearest-neighbours classifier: 11 neighbours, Minkowski metric with p=2.
KNN = KNeighborsClassifier(
    n_neighbors=11,
    metric='minkowski',
    p=2,
)
# fit() returns the estimator, so training and prediction can be chained.
KNN_predictions = KNN.fit(X_train, y_train).predict(X_test)
# Trailing expression: displays the predictions when run as a notebook cell.
KNN_predictions
BaggingClassifier
# Bagging ensemble with library-default settings.
b_classifier = BaggingClassifier()
# fit() returns the estimator, so training and prediction can be chained.
b_predictions = b_classifier.fit(X_train, y_train).predict(X_test)
# Trailing expression: displays the predictions when run as a notebook cell.
b_predictions
構造一個等比數列:
# Five log-spaced values from 10**0 to 10**2, truncated to integers:
# 1, 3, 10, 31, 100. Truncation makes the ratio only approximately constant.
range_m = np.logspace(start=0, stop=2, num=5).astype(int)
獨熱編碼:
# One-hot encode a single variable. drop_first=True drops the first dummy
# level; the result holds only the dummy columns, not the original values.
var = pd.get_dummies(var, prefix='var', drop_first=True)

# Columns to one-hot encode.
categorical_vars = ['Gender', 'Vehicle_Age', 'Vehicle_Damage', 'Region_Code']
for var in categorical_vars:
    # Append the dummy columns, then remove the original column. The axis is
    # passed by keyword: the positional form was removed in pandas 2.0.
    data = pd.concat([data, pd.get_dummies(data[var], prefix=var)], axis=1)
    data = data.drop(columns=var)
保存模型:
# Persist the fitted model to disk and load it back.
import pickle

filename = 'rf_model.sav'

# Context managers guarantee the file handles are flushed and closed; the
# bare open(...) calls previously left them to the garbage collector.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

# Load the model back.
# NOTE: unpickling executes arbitrary code - only load files you trust.
with open(filename, 'rb') as model_file:
    rf_load = pickle.load(model_file)