機器學習實戰

解決樣本不平衡問題

from imblearn.over_sampling import RandomOverSampler

機器學習實戰編碼技巧

一堆頭文件:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns  

from imblearn.over_sampling import RandomOverSampler # over-sampling

from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, RobustScaler  # data preprocessing

# Model selection
from sklearn.model_selection import cross_val_score # cross-validation helper
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold, KFold, GridSearchCV

# Model evaluation metrics
from sklearn.metrics import f1_score, roc_auc_score,accuracy_score,confusion_matrix, precision_recall_curve, auc, roc_curve, recall_score, classification_report  
# NOTE(review): plot_confusion_matrix is reported as removed in scikit-learn 1.2 —
# confirm against the installed version; this import fails where it no longer exists.
from sklearn.metrics import plot_confusion_matrix 
from sklearn.metrics import cohen_kappa_score # agreement statistic; can also be used to measure classification quality

# Hyper-parameter tuning
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe 

# Machine-learning estimators
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, BaggingClassifier # ensemble learning
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm

import missingno as msno # library for visualising missing values

from scipy.stats import randint # random numbers

from catboost import CatBoostClassifier  # machine-learning library
import xgboost as xgb
from xgboost import XGBClassifier, plot_importance
import lightgbm as lgb # decision-tree-based boosting algorithm

import pickle # model serialisation

SVD截斷奇異值分解

# Compress the one-hot encoded Region_Code columns into a few dense components
# with truncated SVD.
# Local import: TruncatedSVD is not among the imports visible in this file.
from sklearn.decomposition import TruncatedSVD

n_components = 5  # one value drives both the SVD rank and the generated column names

a = pd.get_dummies(X['Region_Code'], prefix='Region_Code', drop_first=True)
svd = TruncatedSVD(n_components=n_components, n_iter=7, random_state=42)
svd.fit(a)
data1 = pd.DataFrame(svd.transform(a))
data1.columns = ['Region_Code_' + str(i) for i in range(n_components)]

對樣本進行下采樣:

# Random under-sampling of X / y.
# Local import: the imports visible in this file only bring in RandomOverSampler.
from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler(random_state=0)  # fixed seed so the resampling is reproducible
X, y = rus.fit_resample(X, y)
  • 召回率:樣本中的正例有多少被預測正確了

交叉驗證:

# 10-fold cross-validation of `model` on the training data.
# KFold and cross_val_score are already imported by name at the top of the file;
# the `model_selection` module itself is only imported further down, so the
# qualified form would raise NameError if this snippet ran first.
kf = KFold(n_splits=10, random_state=None, shuffle=False)  # consecutive, unshuffled folds
scores = cross_val_score(model, X_train, y_train, cv=kf)

主成分分析:

from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler, RobustScaler

# Collapse the three Vehicle_Age dummy columns into a single principal component.
vehicle_age_columns = ['Vehicle_Age_1-2 Year', 'Vehicle_Age_< 1 Year', 'Vehicle_Age_> 2 Years']
X = raw_nonu[vehicle_age_columns]
pca = PCA(n_components=1).fit(X)  # n_components = number of principal components kept
pca_Vehicile = pca.transform(X)

正態性檢驗

# Shapiro-Wilk normality test on the Vintage column of the training set.
# Local import: `stats` is not among the imports visible in this file
# (only scipy.stats.randint is imported).
from scipy import stats

# NOTE(review): the SciPy docs warn that the p-value may be inaccurate for
# samples larger than 5000 — confirm the size of X_train.
print("Vintage", stats.shapiro(X_train.Vintage))

計算得分:

# Report the standard classification scores for `predictions` against `y_test`.
# Local import: precision_score is missing from the metric imports visible in this file.
from sklearn.metrics import precision_score

# Printed explicitly: as bare expressions every value except (in a notebook) the
# last one would be computed and silently discarded.
print("accuracy :", accuracy_score(y_test, predictions))
print("precision:", precision_score(y_test, predictions))
print("recall   :", recall_score(y_test, predictions))
print("f1       :", f1_score(y_test, predictions))

混淆矩陣:

# plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2;
# ConfusionMatrixDisplay.from_estimator is its replacement and takes the same
# leading arguments.
from sklearn.metrics import ConfusionMatrixDisplay

ConfusionMatrixDisplay.from_estimator(tree_classifier, X_test, y_test)  # (fitted classifier, features, labels)

數據標準化/歸一化:

from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, RobustScaler
# Standardise the numeric feature columns listed in `num_feat`.
ss = StandardScaler().fit(train[num_feat])
train[num_feat] = ss.transform(train[num_feat])

# Min-max scale the Annual_Premium column.
mm = MinMaxScaler().fit(train[['Annual_Premium']])
train[['Annual_Premium']] = mm.transform(train[['Annual_Premium']])

matlab化:

# IPython/Jupyter magic, not plain Python: it only works inside a notebook or IPython shell.
# NOTE(review): presumably this is what makes unqualified plotting names such as
# title/xlabel/plot available — confirm against the IPython documentation.
%pylab inline

畫ROC曲線:

# Plot the ROC curve of the reloaded random-forest model on the test set.
# Uses the explicit `plt` namespace (imported at the top of the file) instead of
# the `%pylab inline` magic, so the snippet is valid Python outside a notebook.
# In Jupyter, run `%matplotlib inline` first if figures do not render inline.
y_score = rf_load.predict_proba(x_test)[:, 1]  # second probability column; presumably the positive class — verify rf_load.classes_
fpr, tpr, _ = roc_curve(y_test, y_score)

plt.title('Random Forest ROC curve: CC Fraud')
plt.xlabel('FPR (1 - Specificity)')  # FPR is the false-positive rate, not precision
plt.ylabel('TPR (Recall)')

plt.plot(fpr, tpr)
plt.plot((0, 1), (0, 1), ls='dashed', color='black')  # chance-level diagonal
# Save before show(): once show() returns the figure is released and savefig
# would write an empty image.
plt.savefig("./ROC曲線")
plt.show()

print('Area under curve (AUC): ', auc(fpr, tpr))
# Same quantity computed directly from labels and scores; printed so it is not discarded.
print('roc_auc_score: ', roc_auc_score(y_test, y_score))

劃分訓練集測試集:

# Split the labelled data into training and test partitions.
from sklearn.model_selection import train_test_split

train_target = train['Response']
# Remove the label from the feature matrix: passing `train` unchanged would leave
# the 'Response' column inside x_train/x_test and leak the target into the model.
train_features = train.drop(columns=['Response'])
x_train, x_test, y_train, y_test = train_test_split(train_features, train_target, random_state=0)

決策樹:
# Decision-tree classifier created with no arguments, i.e. the library's default settings.
tree_classifier = DecisionTreeClassifier()

xgboost

# Gradient-boosted tree classifier.
# eval_metric is configured on the estimator: passing it to fit() was deprecated
# in XGBoost 1.6 and is rejected by XGBoost 2.x.
model_xgb = XGBClassifier(eval_metric='mlogloss')
model_xgb.fit(X, y)

隨機森林:

# Random forest tuned with a randomized hyper-parameter search.
from sklearn.ensemble import RandomForestClassifier
from sklearn import model_selection

# Candidate values the search samples from.
random_search = {
    'criterion': ['entropy', 'gini'],
    'max_depth': [2, 3, 4, 5, 6, 7, 10],
    'min_samples_leaf': [4, 6, 8],
    'min_samples_split': [5, 7, 10],
    'n_estimators': [300],
}

clf = RandomForestClassifier()
# fit() returns the fitted search object, so construction and fitting can be chained.
model = RandomizedSearchCV(
    estimator=clf,
    param_distributions=random_search,
    n_iter=10,
    cv=4,
    verbose=1,
    random_state=101,
    n_jobs=-1,
).fit(x_train, y_train)

y_pred = model.predict(x_test)

KNN:

# k-nearest-neighbours classifier: 11 neighbours, Minkowski metric with p=2.
KNN = KNeighborsClassifier(n_neighbors=11, metric='minkowski', p=2).fit(X_train, y_train)

KNN_predictions = KNN.predict(X_test)
KNN_predictions  # bare expression: a notebook shows it as the cell output

BaggingClassifier

# Bagging ensemble created with no arguments (library defaults), fitted on the training split.
b_classifier = BaggingClassifier().fit(X_train, y_train)

b_predictions = b_classifier.predict(X_test)
b_predictions  # bare expression: a notebook shows it as the cell output

構造一個等比數列:

# Geometric sequence: 5 values evenly spaced on a log10 scale from 10**0 to 10**2,
# truncated to integers.
exponents = np.linspace(0, 2, num=5)
range_m = np.power(10.0, exponents).astype(int)

獨熱編碼:

# One-hot encode `var`. drop_first=True is passed explicitly, so the first dummy
# column is dropped; because the result is assigned back to `var`, the original
# values are replaced by the dummy columns.
var=pd.get_dummies(var, prefix = 'var', drop_first=True)

# Columns to one-hot encode.
categorical_vars = ['Gender', 'Vehicle_Age', 'Vehicle_Damage', 'Region_Code']

for var in categorical_vars:
    # `axis` is passed by keyword: the positional form was deprecated in pandas 1.3
    # and raises TypeError from pandas 2.0 for both pd.concat and DataFrame.drop.
    data = pd.concat([data, pd.get_dummies(data[var], prefix=var)], axis=1)
    data = data.drop(var, axis=1)  # remove the original, now-encoded column

保存模型:

# Persist the fitted model to disk and load it back.
import pickle

filename = 'rf_model.sav'
# Context managers guarantee the file handles are flushed and closed; the bare
# open(...) calls left them open until garbage collection.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

# Load the model.
# NOTE: pickle.load can execute arbitrary code — only load files you created or trust.
with open(filename, 'rb') as model_file:
    rf_load = pickle.load(model_file)
©著作權歸作者所有,轉載或內容合作請聯系作者
【社區內容提示】社區部分內容疑似由AI輔助生成,瀏覽時請結合常識與多方信息審慎甄別。
平臺聲明:文章內容(如有圖片或視頻亦包括在內)由作者上傳并發布,文章內容僅代表作者本人觀點,簡書系信息發布平臺,僅提供信息存儲服務。

相關(guān)閱讀更多精彩內(nèi)容

友情鏈接更多精彩內(nèi)容