可以參考
LightGBM原生/sk接口的常用參數(shù)
LightGBM使用
lightGBM調(diào)參
所有的參數(shù)含義,參考:http://lightgbm.apachecn.org/cn/latest/Parameters.html
調(diào)參過程:
-
num_leaves
LightGBM使用的是leaf-wise的算法,因此在調(diào)節(jié)樹的復(fù)雜程度時(shí),使用的是num_leaves而不是max_depth。
樣本分布非平衡數(shù)據(jù)集:可以
param['is_unbalance']='true';Bagging參數(shù):bagging_fraction+bagging_freq(必須同時(shí)設(shè)置)、feature_fraction。bagging_fraction可以使bagging的更快的運(yùn)行出結(jié)果,feature_fraction設(shè)置在每次迭代中使用特征的比例;
min_data_in_leaf、min_sum_hessian_in_leaf:調(diào)大它的值可以防止過擬合,它的值通常設(shè)置的比較大。
sklearn接口形式的LightGBM示例
這里主要以sklearn的使用形式來使用lightgbm算法,包含建模,訓(xùn)練,預(yù)測,網(wǎng)格參數(shù)優(yōu)化。
import lightgbm as lgb
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
# Load the iris dataset and split it into train/test sets.
print('Load data...')
iris = load_iris()
data = iris.data
target = iris.target
# Hold out 20% of the samples for evaluation.
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2)
# Alternatively, load your own data from files:
# df_train = pd.read_csv('../regression/regression.train', header=None, sep='\t')
# df_test = pd.read_csv('../regression/regression.test', header=None, sep='\t')
# y_train = df_train[0].values
# y_test = df_test[0].values
# X_train = df_train.drop(0, axis=1).values
# X_test = df_test.drop(0, axis=1).values
print('Start training...')
# Create and train the model (sklearn-style interface).
gbm = lgb.LGBMRegressor(objective='regression', num_leaves=31,
                        learning_rate=0.05, n_estimators=20)
# NOTE: the `early_stopping_rounds` keyword argument of fit() was removed
# in LightGBM 4.0 — pass an early-stopping callback instead. Training
# stops when the L1 metric on the eval set fails to improve for 5 rounds.
gbm.fit(X_train, y_train,
        eval_set=[(X_test, y_test)],
        eval_metric='l1',
        callbacks=[lgb.early_stopping(stopping_rounds=5)])
print('Start predicting...')
# Predict on the test set, using the best iteration found by early stopping.
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_)
# Evaluate the model with RMSE (square root of the MSE).
print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)
# feature importances
print('Feature importances:', list(gbm.feature_importances_))
# Grid search over learning rate and number of boosting rounds.
estimator = lgb.LGBMRegressor(num_leaves=31)
param_grid = {
    'learning_rate': [0.01, 0.1, 1],
    'n_estimators': [20, 40]
}
gbm = GridSearchCV(estimator, param_grid)
gbm.fit(X_train, y_train)
print('Best parameters found by grid search are:', gbm.best_params_)
原生形式使用lightgbm
# coding: utf-8
# pylint: disable = invalid-name, C0111
import json
import lightgbm as lgb
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
# Load the iris dataset and split it into train/test sets.
iris = load_iris()
data = iris.data
target = iris.target
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2)
# Alternatively, load your own data from files:
# print('Load data...')
# df_train = pd.read_csv('../regression/regression.train', header=None, sep='\t')
# df_test = pd.read_csv('../regression/regression.test', header=None, sep='\t')
#
# y_train = df_train[0].values
# y_test = df_test[0].values
# X_train = df_train.drop(0, axis=1).values
# X_test = df_test.drop(0, axis=1).values
# Wrap the arrays in LightGBM's native Dataset format; the eval set
# references the training set so bin boundaries are shared.
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
# Training parameters as a dict (native API).
params = {
    'task': 'train',
    'boosting_type': 'gbdt',        # boosting type
    'objective': 'regression',      # objective function
    # NOTE(review): 'auc' is a binary-classification metric and looks out of
    # place with a regression objective — kept as in the original; confirm.
    'metric': {'l2', 'auc'},        # evaluation metrics
    'num_leaves': 31,               # max leaves per tree
    'learning_rate': 0.05,          # learning rate
    'feature_fraction': 0.9,        # fraction of features used per tree
    'bagging_fraction': 0.8,        # fraction of rows sampled per tree
    'bagging_freq': 5,              # perform bagging every k iterations
    'verbose': 1                    # <0 fatal, =0 errors/warnings, >0 info
}
print('Start training...')
# NOTE: lgb.train() dropped the `early_stopping_rounds` keyword in
# LightGBM 4.0 — use the early_stopping callback instead. Training stops
# when the eval metric fails to improve for 5 consecutive rounds.
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=20,
                valid_sets=lgb_eval,
                callbacks=[lgb.early_stopping(stopping_rounds=5)])
print('Save model...')
# Persist the trained booster to a text file.
gbm.save_model('model.txt')
print('Start predicting...')
# Predict on the test set, using the best iteration found by early stopping.
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
# Evaluate the model with RMSE (square root of the MSE).
print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)