News Recommendation Project 05 - Ranking Models

Ranking Models

Through the recall step we have already reduced the scale of the problem: for each user we selected N articles as a candidate set, and on top of that candidate set we built features related to the user's click history, the user's own attribute features, the article's own attribute features, and user-article interaction features. The next step is to train a machine learning model on these features, predict on the test set to obtain each candidate's click probability for every user, and return the top-k articles with the highest predicted click probability as the final result.

For the ranking stage we chose three fairly representative ranking models:

  1. A LightGBM (LGB) ranking model
  2. A LightGBM (LGB) classification model
  3. A deep learning classification model, DIN

After obtaining the outputs of the ranking models, we also tried two classic model fusion methods:

  1. Weighted fusion of the output scores
  2. Stacking (feeding the models' outputs into a simple second-stage model for prediction)
import numpy as np
import pandas as pd
import pickle
from tqdm import tqdm
import gc, os
import time
from datetime import datetime
import lightgbm as lgb
from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.filterwarnings('ignore')

Loading the ranking features

data_path = './data_raw/'
save_path = './temp_results/'
offline = False
# When re-reading the data we found that click_article_id is a float, so cast it to int
trn_user_item_feats_df = pd.read_csv(save_path + 'trn_user_item_feats_df.csv')
trn_user_item_feats_df['click_article_id'] = trn_user_item_feats_df['click_article_id'].astype(int)

if offline:
    val_user_item_feats_df = pd.read_csv(save_path + 'val_user_item_feats_df.csv')
    val_user_item_feats_df['click_article_id'] = val_user_item_feats_df['click_article_id'].astype(int)
else:
    val_user_item_feats_df = None
    
tst_user_item_feats_df = pd.read_csv(save_path + 'tst_user_item_feats_df.csv')
tst_user_item_feats_df['click_article_id'] = tst_user_item_feats_df['click_article_id'].astype(int)

# 做特征的時(shí)候?yàn)榱朔奖悖o測(cè)試集也打上了一個(gè)無效的標(biāo)簽,這里直接刪掉就行
del tst_user_item_feats_df['label']

Generating the ranked submission

def submit(recall_df, topk=5, model_name=None):
    recall_df = recall_df.sort_values(by=['user_id', 'pred_score'])
    recall_df['rank'] = recall_df.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')
    
    # Make sure every user has at least topk (here 5) candidate articles
    tmp = recall_df.groupby('user_id').apply(lambda x: x['rank'].max())
    assert tmp.min() >= topk
    
    del recall_df['pred_score']
    submit = recall_df[recall_df['rank'] <= topk].set_index(['user_id', 'rank']).unstack(-1).reset_index()
    
    submit.columns = [int(col) if isinstance(col, int) else col for col in submit.columns.droplevel(0)]
    # Rename the columns to match the submission format
    submit = submit.rename(columns={'': 'user_id', 1: 'article_1', 2: 'article_2', 
                                                  3: 'article_3', 4: 'article_4', 5: 'article_5'})
    
    save_name = save_path + model_name + '_' + datetime.today().strftime('%m-%d') + '.csv'
    submit.to_csv(save_name, index=False, header=True)
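The unstack call above pivots the long (user_id, rank) pairs into one row per user with one column per rank. As a quick illustration, here is the same pivot on a hypothetical toy frame (the ids below are made up, not project data):

demo = pd.DataFrame({'user_id': [1, 1, 2, 2],
                     'click_article_id': [10, 11, 12, 13],
                     'rank': [1.0, 2.0, 1.0, 2.0]})
wide = demo.set_index(['user_id', 'rank']).unstack(-1).reset_index()
# wide now has MultiIndex columns such as ('user_id', '') and
# ('click_article_id', 1.0) / ('click_article_id', 2.0), one row per user
print(wide)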
# Normalize the ranking scores
def norm_sim(sim_df, weight=0.0):
    # print(sim_df.head())
    min_sim = sim_df.min()
    max_sim = sim_df.max()
    if max_sim == min_sim:
        sim_df = sim_df.apply(lambda sim: 1.0)
    else:
        sim_df = sim_df.apply(lambda sim: 1.0 * (sim - min_sim) / (max_sim - min_sim))

    sim_df = sim_df.apply(lambda sim: sim + weight)  # shift the scaled scores by weight
    return sim_df
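As a quick sanity check (illustrative only, not part of the original pipeline), norm_sim min-max scales a score Series into [0, 1] and then shifts it by weight:

demo_scores = pd.Series([0.2, 0.5, 0.9])
print(norm_sim(demo_scores).tolist())               # [0.0, 0.4285..., 1.0]
print(norm_sim(demo_scores, weight=1.0).tolist())   # the same values shifted up by 1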

LGB Ranking Model

# Keep copies so we can recover without re-reading the data if something fails midway
trn_user_item_feats_df_rank_model = trn_user_item_feats_df.copy()

if offline:
    val_user_item_feats_df_rank_model = val_user_item_feats_df.copy()
    
tst_user_item_feats_df_rank_model = tst_user_item_feats_df.copy()
# Define the feature columns
lgb_cols = ['sim0', 'time_diff0', 'word_diff0','sim_max', 'sim_min', 'sim_sum', 
            'sim_mean', 'score','click_size', 'time_diff_mean', 'active_level',
            'click_environment','click_deviceGroup', 'click_os', 'click_country', 
            'click_region','click_referrer_type', 'user_time_hob1', 'user_time_hob2',
            'words_hbo', 'category_id', 'created_at_ts','words_count']
# Group sizes (per user) for the ranking model
trn_user_item_feats_df_rank_model.sort_values(by=['user_id'], inplace=True)
g_train = trn_user_item_feats_df_rank_model.groupby(['user_id'], as_index=False).count()["label"].values

if offline:
    val_user_item_feats_df_rank_model.sort_values(by=['user_id'], inplace=True)
    g_val = val_user_item_feats_df_rank_model.groupby(['user_id'], as_index=False).count()["label"].values
# Define the ranking model
lgb_ranker = lgb.LGBMRanker(boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,
                            max_depth=-1, n_estimators=100, subsample=0.7, colsample_bytree=0.7, subsample_freq=1,
                            learning_rate=0.01, min_child_weight=50, random_state=2018, n_jobs= 16)  
# Train the ranking model
if offline:
    lgb_ranker.fit(trn_user_item_feats_df_rank_model[lgb_cols], trn_user_item_feats_df_rank_model['label'], group=g_train,
                eval_set=[(val_user_item_feats_df_rank_model[lgb_cols], val_user_item_feats_df_rank_model['label'])], 
                eval_group= [g_val], eval_at=[1, 2, 3, 4, 5], eval_metric=['ndcg', ], early_stopping_rounds=50, )
else:
    lgb_ranker.fit(trn_user_item_feats_df[lgb_cols], trn_user_item_feats_df['label'], group=g_train)
# Model prediction
tst_user_item_feats_df['pred_score'] = lgb_ranker.predict(tst_user_item_feats_df[lgb_cols], num_iteration=lgb_ranker.best_iteration_)

# Save a copy of these scores for later model fusion
tst_user_item_feats_df[['user_id', 'click_article_id', 'pred_score']].to_csv(save_path + 'lgb_ranker_score.csv', index=False)
# Re-rank the predictions and generate the submission
rank_results = tst_user_item_feats_df[['user_id', 'click_article_id', 'pred_score']]
rank_results['click_article_id'] = rank_results['click_article_id'].astype(int)
submit(rank_results, topk=5, model_name='lgb_ranker')
# 5-fold cross-validation, where the folds are split by user
# This part is independent of the single train/validation run above
def get_kfold_users(trn_df, n=5):
    user_ids = trn_df['user_id'].unique()
    user_set = [user_ids[i::n] for i in range(n)]
    return user_set

k_fold = 5
trn_df = trn_user_item_feats_df_rank_model
user_set = get_kfold_users(trn_df, n=k_fold)

score_list = []
score_df = trn_df[['user_id', 'click_article_id','label']]
sub_preds = np.zeros(tst_user_item_feats_df_rank_model.shape[0])

# 5-fold cross-validation; save the intermediate results for stacking
for n_fold, valid_user in enumerate(user_set):
    train_idx = trn_df[~trn_df['user_id'].isin(valid_user)].copy()
    valid_idx = trn_df[trn_df['user_id'].isin(valid_user)].copy()
    
    # Per-user group sizes for the training and validation folds
    train_idx.sort_values(by=['user_id'], inplace=True)
    g_train = train_idx.groupby(['user_id'], as_index=False).count()["label"].values
    
    valid_idx.sort_values(by=['user_id'], inplace=True)
    g_val = valid_idx.groupby(['user_id'], as_index=False).count()["label"].values
    
    # Define the model
    lgb_ranker = lgb.LGBMRanker(boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,
                            max_depth=-1, n_estimators=100, subsample=0.7, colsample_bytree=0.7, subsample_freq=1,
                            learning_rate=0.01, min_child_weight=50, random_state=2018, n_jobs= 16)  
    # Train the model
    lgb_ranker.fit(train_idx[lgb_cols], train_idx['label'], group=g_train,
                   eval_set=[(valid_idx[lgb_cols], valid_idx['label'])], eval_group= [g_val], 
                   eval_at=[1, 2, 3, 4, 5], eval_metric=['ndcg', ], early_stopping_rounds=50, )
    
    # Predict on the validation fold
    valid_idx['pred_score'] = lgb_ranker.predict(valid_idx[lgb_cols], num_iteration=lgb_ranker.best_iteration_)
    
    # Normalize the output scores
    valid_idx['pred_score'] = valid_idx[['pred_score']].transform(lambda x: norm_sim(x))
    
    valid_idx = valid_idx.sort_values(by=['user_id', 'pred_score'])
    valid_idx['pred_rank'] = valid_idx.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')
    
    # Collect this fold's validation predictions for later concatenation
    score_list.append(valid_idx[['user_id', 'click_article_id', 'pred_score', 'pred_rank']])
    
    # For online submission, accumulate each fold's test-set predictions and average them at the end
    if not offline:
        sub_preds += lgb_ranker.predict(tst_user_item_feats_df_rank_model[lgb_cols], num_iteration=lgb_ranker.best_iteration_)
    
score_df_ = pd.concat(score_list, axis=0)
score_df = score_df.merge(score_df_, how='left', on=['user_id', 'click_article_id'])
# Save the new features produced by cross-validation on the training set
score_df[['user_id', 'click_article_id', 'pred_score', 'pred_rank', 'label']].to_csv(save_path + 'trn_lgb_ranker_feats.csv', index=False)
    
# Average the test-set predictions over the folds, and save the predicted score and rank as features for later stacking; more features could be constructed here as well
tst_user_item_feats_df_rank_model['pred_score'] = sub_preds / k_fold
tst_user_item_feats_df_rank_model['pred_score'] = tst_user_item_feats_df_rank_model['pred_score'].transform(lambda x: norm_sim(x))
tst_user_item_feats_df_rank_model = tst_user_item_feats_df_rank_model.sort_values(by=['user_id', 'pred_score'])
tst_user_item_feats_df_rank_model['pred_rank'] = tst_user_item_feats_df_rank_model.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')

# Save the cross-validation features for the test set
tst_user_item_feats_df_rank_model[['user_id', 'click_article_id', 'pred_score', 'pred_rank']].to_csv(save_path + 'tst_lgb_ranker_feats.csv', index=False)
# Re-rank the predictions and generate the submission
# Single-model submission
rank_results = tst_user_item_feats_df_rank_model[['user_id', 'click_article_id', 'pred_score']]
rank_results['click_article_id'] = rank_results['click_article_id'].astype(int)
submit(rank_results, topk=5, model_name='lgb_ranker')

LGB Classification Model

# Define the model and its parameters
lgb_Classfication = lgb.LGBMClassifier(boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,
                            max_depth=-1, n_estimators=500, subsample=0.7, colsample_bytree=0.7, subsample_freq=1,
                            learning_rate=0.01, min_child_weight=50, random_state=2018, n_jobs= 16, verbose=10)  
# Model training
if offline:
    lgb_Classfication.fit(trn_user_item_feats_df_rank_model[lgb_cols], trn_user_item_feats_df_rank_model['label'],
                    eval_set=[(val_user_item_feats_df_rank_model[lgb_cols], val_user_item_feats_df_rank_model['label'])], 
                    eval_metric=['auc', ],early_stopping_rounds=50, )
else:
    lgb_Classfication.fit(trn_user_item_feats_df_rank_model[lgb_cols], trn_user_item_feats_df_rank_model['label'])
# Model prediction
tst_user_item_feats_df['pred_score'] = lgb_Classfication.predict_proba(tst_user_item_feats_df[lgb_cols])[:,1]

# Save a copy of these scores for later model fusion
tst_user_item_feats_df[['user_id', 'click_article_id', 'pred_score']].to_csv(save_path + 'lgb_cls_score.csv', index=False)
# Re-rank the predictions and generate the submission
rank_results = tst_user_item_feats_df[['user_id', 'click_article_id', 'pred_score']]
rank_results['click_article_id'] = rank_results['click_article_id'].astype(int)
submit(rank_results, topk=5, model_name='lgb_cls')
# 5-fold cross-validation, where the folds are split by user
# This part is independent of the single train/validation run above
def get_kfold_users(trn_df, n=5):
    user_ids = trn_df['user_id'].unique()
    user_set = [user_ids[i::n] for i in range(n)]
    return user_set

k_fold = 5
trn_df = trn_user_item_feats_df_rank_model
user_set = get_kfold_users(trn_df, n=k_fold)

score_list = []
score_df = trn_df[['user_id', 'click_article_id', 'label']]
sub_preds = np.zeros(tst_user_item_feats_df_rank_model.shape[0])

# 5-fold cross-validation; save the intermediate results for stacking
for n_fold, valid_user in enumerate(user_set):
    train_idx = trn_df[~trn_df['user_id'].isin(valid_user)].copy()
    valid_idx = trn_df[trn_df['user_id'].isin(valid_user)].copy()
    
    # Define the model and its parameters
    lgb_Classfication = lgb.LGBMClassifier(boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,
                            max_depth=-1, n_estimators=100, subsample=0.7, colsample_bytree=0.7, subsample_freq=1,
                            learning_rate=0.01, min_child_weight=50, random_state=2018, n_jobs= 16, verbose=10)  
    # Train the model
    lgb_Classfication.fit(train_idx[lgb_cols], train_idx['label'],eval_set=[(valid_idx[lgb_cols], valid_idx['label'])], 
                          eval_metric=['auc', ],early_stopping_rounds=50, )
    
    # Predict on the validation fold
    valid_idx['pred_score'] = lgb_Classfication.predict_proba(valid_idx[lgb_cols], 
                                                              num_iteration=lgb_Classfication.best_iteration_)[:,1]
    
    # No normalization needed here: the classifier already outputs probabilities
    # valid_idx['pred_score'] = valid_idx[['pred_score']].transform(lambda x: norm_sim(x))
    
    valid_idx = valid_idx.sort_values(by=['user_id', 'pred_score'])
    valid_idx['pred_rank'] = valid_idx.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')
    
    # Collect this fold's validation predictions for later concatenation
    score_list.append(valid_idx[['user_id', 'click_article_id', 'pred_score', 'pred_rank']])
    
    # For online submission, accumulate each fold's test-set predictions and average them at the end
    if not offline:
        sub_preds += lgb_Classfication.predict_proba(tst_user_item_feats_df_rank_model[lgb_cols], 
                                                     num_iteration=lgb_Classfication.best_iteration_)[:,1]
    
score_df_ = pd.concat(score_list, axis=0)
score_df = score_df.merge(score_df_, how='left', on=['user_id', 'click_article_id'])
# Save the new features produced by cross-validation on the training set
score_df[['user_id', 'click_article_id', 'pred_score', 'pred_rank', 'label']].to_csv(save_path + 'trn_lgb_cls_feats.csv', index=False)
    
# Average the test-set predictions over the folds, and save the predicted score and rank as features for later stacking; more features could be constructed here as well
tst_user_item_feats_df_rank_model['pred_score'] = sub_preds / k_fold
tst_user_item_feats_df_rank_model['pred_score'] = tst_user_item_feats_df_rank_model['pred_score'].transform(lambda x: norm_sim(x))
tst_user_item_feats_df_rank_model = tst_user_item_feats_df_rank_model.sort_values(by=['user_id', 'pred_score'])
tst_user_item_feats_df_rank_model['pred_rank'] = tst_user_item_feats_df_rank_model.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')

# Save the cross-validation features for the test set
tst_user_item_feats_df_rank_model[['user_id', 'click_article_id', 'pred_score', 'pred_rank']].to_csv(save_path + 'tst_lgb_cls_feats.csv', index=False)
# Re-rank the predictions and generate the submission
rank_results = tst_user_item_feats_df_rank_model[['user_id', 'click_article_id', 'pred_score']]
rank_results['click_article_id'] = rank_results['click_article_id'].astype(int)
submit(rank_results, topk=5, model_name='lgb_cls')

DIN Model

User historical click behavior lists

These are prepared as inputs for the DIN model below.

if offline:
    all_data = pd.read_csv('./data_raw/train_click_log.csv')
else:
    trn_data = pd.read_csv('./data_raw/train_click_log.csv')
    tst_data = pd.read_csv('./data_raw/testA_click_log.csv')
    all_data = pd.concat([trn_data, tst_data])

# Aggregate each user's clicked articles into a list
hist_click = all_data[['user_id', 'click_article_id']].groupby('user_id').agg(list).reset_index()
his_behavior_df = pd.DataFrame()
his_behavior_df['user_id'] = hist_click['user_id']
his_behavior_df['hist_click_article_id'] = hist_click['click_article_id']
trn_user_item_feats_df_din_model = trn_user_item_feats_df.copy()

if offline:
    val_user_item_feats_df_din_model = val_user_item_feats_df.copy()
else: 
    val_user_item_feats_df_din_model = None
    
tst_user_item_feats_df_din_model = tst_user_item_feats_df.copy()
trn_user_item_feats_df_din_model = trn_user_item_feats_df_din_model.merge(his_behavior_df, on='user_id')

if offline:
    val_user_item_feats_df_din_model = val_user_item_feats_df_din_model.merge(his_behavior_df, on='user_id')
else:
    val_user_item_feats_df_din_model = None

tst_user_item_feats_df_din_model = tst_user_item_feats_df_din_model.merge(his_behavior_df, on='user_id')

A Brief Introduction to DIN

Next we try the DIN model. DIN stands for Deep Interest Network; it was proposed by Alibaba in 2018 to address the inability of earlier deep models to express a user's diverse interests. It computes a representation vector of the user's interest by considering the relevance between the given candidate ad and the user's historical behaviors. Concretely, it introduces a local activation unit that attends to the relevant parts of the user's history via a soft search over past behaviors, and takes a weighted sum to obtain the representation of the user's interest with respect to the candidate ad. Behaviors that are more relevant to the candidate receive higher activation weights and dominate the interest representation, so the representation differs across candidate ads, which greatly improves the expressive power of the model. This makes DIN a good fit for our news recommendation task as well: we compute the user's interest in each article from the relevance between the current candidate article and the user's historically clicked articles. The model architecture is shown below:

(Figure: DIN model architecture)
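To make the weighted-sum pooling concrete, here is a minimal numpy sketch of the idea (np was imported at the top of this notebook). The dot-product scoring below is a deliberate simplification for illustration only; in the actual paper the activation weight comes from a small learned MLP (the local activation unit) over the behavior embedding, the candidate embedding, and their interaction:

rng = np.random.RandomState(0)
d, H = 8, 5                            # toy embedding dim and history length
candidate = rng.randn(d)               # embedding of the candidate article
history = rng.randn(H, d)              # embeddings of historically clicked articles

weights = history @ candidate          # relevance score of each historical click
# DIN deliberately skips softmax normalization, so the magnitude of the pooled
# vector can also reflect the intensity of the user's interest
user_interest = (weights[:, None] * history).sum(axis=0)   # shape (d,), varies with the candidate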

We simply use an off-the-shelf implementation of the model here; a detailed walkthrough of its internals will be given in the next round of the recommender-system study group. Below is how to use it concretely. The deepctr function prototype is as follows:

def DIN(dnn_feature_columns, history_feature_list, dnn_use_bn=False,
        dnn_hidden_units=(200, 80), dnn_activation='relu', att_hidden_size=(80, 40), att_activation="dice",
        att_weight_normalization=False, l2_reg_dnn=0, l2_reg_embedding=1e-6, dnn_dropout=0, seed=1024,
        task='binary'):

  • dnn_feature_columns: the feature columns, a list containing all features in the data
  • history_feature_list: the user history behavior columns, a list of features reflecting the user's historical behaviors
  • dnn_use_bn: whether to use BatchNormalization
  • dnn_hidden_units: the number of layers and units per layer of the fully connected network, a list or tuple
  • dnn_activation: the activation function of the fully connected network
  • att_hidden_size: the number of layers and units per layer of the attention network's MLP
  • att_activation: the activation function of the attention layers
  • att_weight_normalization: whether to normalize the attention scores
  • l2_reg_dnn: L2 regularization coefficient of the fully connected network
  • l2_reg_embedding: L2 regularization coefficient of the embedding vectors
  • dnn_dropout: dropout probability of the fully connected network's units
  • task: the task, either classification or regression

When calling the model we must pass in the feature columns and the history behavior columns, but before that the feature columns need some preprocessing, as follows:

  1. First, process the dataset to obtain the data. Since we predict whether the user clicks the current article from the user's past behavior, the feature columns are split into three parts: numeric features, categorical (sparse) features, and history behavior features. DIN handles each part differently (see the toy sketch after this list):
    1. Categorical features are the class-valued columns in our data, such as user_id. They first go through an embedding layer to obtain a low-dimensional dense representation. Embedding requires a vocabulary over each column's distinct values plus an embedding dimension, so when preparing data for deepctr's DIN we declare these columns with the SparseFeat function, whose arguments are the column name, the column's distinct values (to build the vocabulary), and the embedding dimension.
    2. History behavior columns, such as article id or article category, are also embedded first, but unlike the above, after obtaining each feature's embedding they additionally pass through an attention layer that computes the relevance between the user's historical behaviors and the current candidate article, yielding the user's embedding vector. This vector reflects the user's interest through the similarity between the candidate article and previously clicked articles, and it changes with each user's click history, dynamically modeling how the user's interest evolves. For each user this feature is a behavior sequence whose length differs across users (some users clicked many articles, some few), so the lengths must be unified: we first declare the column with SparseFeat and then wrap it with VarLenSparseFeat, which pads each user's sequence to the same length; its maxlen parameter specifies the maximum sequence length.
    3. Numeric feature columns only need DenseFeat with the column name and dimension.
  2. Once the feature columns are set up, we match the data to the columns to obtain the final model input.
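Before the real data-preparation function below, here is a small, self-contained sketch of the three declarations on made-up columns; the vocabulary sizes and dimensions are illustrative and not taken from this project's data:

from deepctr.feature_column import SparseFeat, DenseFeat, VarLenSparseFeat

toy_emb_dim, toy_max_len = 8, 10      # illustrative sizes
# Categorical feature: needs a vocabulary size and an embedding dimension
cat_col = SparseFeat('category_id', vocabulary_size=300, embedding_dim=toy_emb_dim)
# Numeric feature: just a name and a dimension
num_col = DenseFeat('words_count', 1)
# History behavior feature: a SparseFeat wrapped in VarLenSparseFeat, padded
# to maxlen; embedding_name lets it share the article embedding table
hist_col = VarLenSparseFeat(
    SparseFeat('hist_click_article_id', vocabulary_size=30000,
               embedding_dim=toy_emb_dim, embedding_name='click_article_id'),
    maxlen=toy_max_len)
toy_feature_columns = [cat_col, num_col, hist_col]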

The code below walks through this. The logic: first write a data-preparation function that follows the steps above to produce the inputs and feature columns, then build and train the DIN model, and finally predict on the test set.

# Import deepctr
from deepctr.models import DIN
from deepctr.feature_column import SparseFeat, VarLenSparseFeat, DenseFeat, get_feature_names
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras import backend as K
from tensorflow.keras.layers import *
from tensorflow.keras.models import *
from tensorflow.keras.callbacks import * 
import tensorflow as tf

import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "2"
# Data preparation function
def get_din_feats_columns(df, dense_fea, sparse_fea, behavior_fea, his_behavior_fea, emb_dim=32, max_len=100):
    """
    數(shù)據(jù)準(zhǔn)備函數(shù):
    df: 數(shù)據(jù)集
    dense_fea: 數(shù)值型特征列
    sparse_fea: 離散型特征列
    behavior_fea: 用戶的候選行為特征列
    his_behavior_fea: 用戶的歷史行為特征列
    embedding_dim: embedding的維度, 這里為了簡(jiǎn)單, 統(tǒng)一把離散型特征列采用一樣的隱向量維度
    max_len: 用戶序列的最大長(zhǎng)度
    """
    
    sparse_feature_columns = [SparseFeat(feat, vocabulary_size=df[feat].nunique() + 1, embedding_dim=emb_dim) for feat in sparse_fea]
    
    dense_feature_columns = [DenseFeat(feat, 1, ) for feat in dense_fea]
    
    var_feature_columns = [VarLenSparseFeat(SparseFeat(feat, vocabulary_size=df['click_article_id'].nunique() + 1,
                                    embedding_dim=emb_dim, embedding_name='click_article_id'), maxlen=max_len) for feat in his_behavior_fea]
    
    dnn_feature_columns = sparse_feature_columns + dense_feature_columns + var_feature_columns
    
    # Build x as a dict of input arrays
    x = {}
    for name in get_feature_names(dnn_feature_columns):
        if name in his_behavior_fea:
            # History behavior sequence: pad to a fixed length
            his_list = [l for l in df[name]]
            x[name] = pad_sequences(his_list, maxlen=max_len, padding='post')      # 2-D array
        else:
            x[name] = df[name].values
    
    return x, dnn_feature_columns
# Split the features into groups
sparse_fea = ['user_id', 'click_article_id', 'category_id', 'click_environment', 'click_deviceGroup', 
              'click_os', 'click_country', 'click_region', 'click_referrer_type', 'is_cat_hab']

behavior_fea = ['click_article_id']

hist_behavior_fea = ['hist_click_article_id']

dense_fea = ['sim0', 'time_diff0', 'word_diff0', 'sim_max', 'sim_min', 'sim_sum', 'sim_mean', 'score',
             'rank','click_size','time_diff_mean','active_level','user_time_hob1','user_time_hob2',
             'words_hbo','words_count']
# Min-max normalize the dense features; neural-network training generally needs normalized inputs.
# Note the scaler is re-fit on each split below, as in the original notebook; fitting once on the
# training set and reusing it would give more consistent scaling.
mm = MinMaxScaler()

# The two commented lines below are a special fix: if invalid values (e.g. inf) appear upstream,
# normalization will fail. Leave them commented out at first; if the code below errors,
# prefer fixing the source of the inf values rather than just replacing them here.
# trn_user_item_feats_df_din_model.replace([np.inf, -np.inf], 0, inplace=True)
# tst_user_item_feats_df_din_model.replace([np.inf, -np.inf], 0, inplace=True)

for feat in dense_fea:
    trn_user_item_feats_df_din_model[feat] = mm.fit_transform(trn_user_item_feats_df_din_model[[feat]])
    
    if val_user_item_feats_df_din_model is not None:
        val_user_item_feats_df_din_model[feat] = mm.fit_transform(val_user_item_feats_df_din_model[[feat]])
    
    tst_user_item_feats_df_din_model[feat] = mm.fit_transform(tst_user_item_feats_df_din_model[[feat]])
# Prepare the training data
x_trn, dnn_feature_columns = get_din_feats_columns(trn_user_item_feats_df_din_model, dense_fea, 
                                               sparse_fea, behavior_fea, hist_behavior_fea, max_len=50)
y_trn = trn_user_item_feats_df_din_model['label'].values

if offline:
    # Prepare the validation data
    x_val, dnn_feature_columns = get_din_feats_columns(val_user_item_feats_df_din_model, dense_fea, 
                                                   sparse_fea, behavior_fea, hist_behavior_fea, max_len=50)
    y_val = val_user_item_feats_df_din_model['label'].values
    
dense_fea = [x for x in dense_fea if x != 'label']
x_tst, dnn_feature_columns = get_din_feats_columns(tst_user_item_feats_df_din_model, dense_fea, 
                                               sparse_fea, behavior_fea, hist_behavior_fea, max_len=50)
# Build the model
model = DIN(dnn_feature_columns, behavior_fea)

# Inspect the model structure
model.summary()

# Compile the model
model.compile('adam', 'binary_crossentropy',metrics=['binary_crossentropy', tf.keras.metrics.AUC()])
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
==================================================================================================
user_id (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
click_article_id (InputLayer)   [(None, 1)]          0                                            
__________________________________________________________________________________________________
category_id (InputLayer)        [(None, 1)]          0                                            
__________________________________________________________________________________________________
click_environment (InputLayer)  [(None, 1)]          0                                            
__________________________________________________________________________________________________
click_deviceGroup (InputLayer)  [(None, 1)]          0                                            
__________________________________________________________________________________________________
click_os (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
click_country (InputLayer)      [(None, 1)]          0                                            
__________________________________________________________________________________________________
click_region (InputLayer)       [(None, 1)]          0                                            
__________________________________________________________________________________________________
click_referrer_type (InputLayer [(None, 1)]          0                                            
__________________________________________________________________________________________________
is_cat_hab (InputLayer)         [(None, 1)]          0                                            
__________________________________________________________________________________________________
sparse_emb_user_id (Embedding)  (None, 1, 32)        1600032     user_id[0][0]                    
__________________________________________________________________________________________________
sparse_seq_emb_hist_click_artic multiple             525664      click_article_id[0][0]           
                                                                 hist_click_article_id[0][0]      
                                                                 click_article_id[0][0]           
__________________________________________________________________________________________________
sparse_emb_category_id (Embeddi (None, 1, 32)        7776        category_id[0][0]                
__________________________________________________________________________________________________
sparse_emb_click_environment (E (None, 1, 32)        128         click_environment[0][0]          
__________________________________________________________________________________________________
sparse_emb_click_deviceGroup (E (None, 1, 32)        160         click_deviceGroup[0][0]          
__________________________________________________________________________________________________
sparse_emb_click_os (Embedding) (None, 1, 32)        288         click_os[0][0]                   
__________________________________________________________________________________________________
sparse_emb_click_country (Embed (None, 1, 32)        384         click_country[0][0]              
__________________________________________________________________________________________________
sparse_emb_click_region (Embedd (None, 1, 32)        928         click_region[0][0]               
__________________________________________________________________________________________________
sparse_emb_click_referrer_type  (None, 1, 32)        256         click_referrer_type[0][0]        
__________________________________________________________________________________________________
sparse_emb_is_cat_hab (Embeddin (None, 1, 32)        64          is_cat_hab[0][0]                 
__________________________________________________________________________________________________
no_mask (NoMask)                (None, 1, 32)        0           sparse_emb_user_id[0][0]         
                                                                 sparse_seq_emb_hist_click_article
                                                                 sparse_emb_category_id[0][0]     
                                                                 sparse_emb_click_environment[0][0
                                                                 sparse_emb_click_deviceGroup[0][0
                                                                 sparse_emb_click_os[0][0]        
                                                                 sparse_emb_click_country[0][0]   
                                                                 sparse_emb_click_region[0][0]    
                                                                 sparse_emb_click_referrer_type[0]
                                                                 sparse_emb_is_cat_hab[0][0]      
__________________________________________________________________________________________________
hist_click_article_id (InputLay [(None, 50)]         0                                            
__________________________________________________________________________________________________
concatenate (Concatenate)       (None, 1, 320)       0           no_mask[0][0]                    
                                                                 no_mask[1][0]                    
                                                                 no_mask[2][0]                    
                                                                 no_mask[3][0]                    
                                                                 no_mask[4][0]                    
                                                                 no_mask[5][0]                    
                                                                 no_mask[6][0]                    
                                                                 no_mask[7][0]                    
                                                                 no_mask[8][0]                    
                                                                 no_mask[9][0]                    
__________________________________________________________________________________________________
no_mask_1 (NoMask)              (None, 1, 320)       0           concatenate[0][0]                
__________________________________________________________________________________________________
attention_sequence_pooling_laye (None, 1, 32)        13961       sparse_seq_emb_hist_click_article
                                                                 sparse_seq_emb_hist_click_article
__________________________________________________________________________________________________
concatenate_1 (Concatenate)     (None, 1, 352)       0           no_mask_1[0][0]                  
                                                                 attention_sequence_pooling_layer[
__________________________________________________________________________________________________
sim0 (InputLayer)               [(None, 1)]          0                                            
__________________________________________________________________________________________________
time_diff0 (InputLayer)         [(None, 1)]          0                                            
__________________________________________________________________________________________________
word_diff0 (InputLayer)         [(None, 1)]          0                                            
__________________________________________________________________________________________________
sim_max (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
sim_min (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
sim_sum (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
sim_mean (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
score (InputLayer)              [(None, 1)]          0                                            
__________________________________________________________________________________________________
rank (InputLayer)               [(None, 1)]          0                                            
__________________________________________________________________________________________________
click_size (InputLayer)         [(None, 1)]          0                                            
__________________________________________________________________________________________________
time_diff_mean (InputLayer)     [(None, 1)]          0                                            
__________________________________________________________________________________________________
active_level (InputLayer)       [(None, 1)]          0                                            
__________________________________________________________________________________________________
user_time_hob1 (InputLayer)     [(None, 1)]          0                                            
__________________________________________________________________________________________________
user_time_hob2 (InputLayer)     [(None, 1)]          0                                            
__________________________________________________________________________________________________
words_hbo (InputLayer)          [(None, 1)]          0                                            
__________________________________________________________________________________________________
words_count (InputLayer)        [(None, 1)]          0                                            
__________________________________________________________________________________________________
flatten (Flatten)               (None, 352)          0           concatenate_1[0][0]              
__________________________________________________________________________________________________
no_mask_3 (NoMask)              (None, 1)            0           sim0[0][0]                       
                                                                 time_diff0[0][0]                 
                                                                 word_diff0[0][0]                 
                                                                 sim_max[0][0]                    
                                                                 sim_min[0][0]                    
                                                                 sim_sum[0][0]                    
                                                                 sim_mean[0][0]                   
                                                                 score[0][0]                      
                                                                 rank[0][0]                       
                                                                 click_size[0][0]                 
                                                                 time_diff_mean[0][0]             
                                                                 active_level[0][0]               
                                                                 user_time_hob1[0][0]             
                                                                 user_time_hob2[0][0]             
                                                                 words_hbo[0][0]                  
                                                                 words_count[0][0]                
__________________________________________________________________________________________________
no_mask_2 (NoMask)              (None, 352)          0           flatten[0][0]                    
__________________________________________________________________________________________________
concatenate_2 (Concatenate)     (None, 16)           0           no_mask_3[0][0]                  
                                                                 no_mask_3[1][0]                  
                                                                 no_mask_3[2][0]                  
                                                                 no_mask_3[3][0]                  
                                                                 no_mask_3[4][0]                  
                                                                 no_mask_3[5][0]                  
                                                                 no_mask_3[6][0]                  
                                                                 no_mask_3[7][0]                  
                                                                 no_mask_3[8][0]                  
                                                                 no_mask_3[9][0]                  
                                                                 no_mask_3[10][0]                 
                                                                 no_mask_3[11][0]                 
                                                                 no_mask_3[12][0]                 
                                                                 no_mask_3[13][0]                 
                                                                 no_mask_3[14][0]                 
                                                                 no_mask_3[15][0]                 
__________________________________________________________________________________________________
flatten_1 (Flatten)             (None, 352)          0           no_mask_2[0][0]                  
__________________________________________________________________________________________________
flatten_2 (Flatten)             (None, 16)           0           concatenate_2[0][0]              
__________________________________________________________________________________________________
no_mask_4 (NoMask)              multiple             0           flatten_1[0][0]                  
                                                                 flatten_2[0][0]                  
__________________________________________________________________________________________________
concatenate_3 (Concatenate)     (None, 368)          0           no_mask_4[0][0]                  
                                                                 no_mask_4[1][0]                  
__________________________________________________________________________________________________
dnn_1 (DNN)                     (None, 80)           89880       concatenate_3[0][0]              
__________________________________________________________________________________________________
dense (Dense)                   (None, 1)            80          dnn_1[0][0]                      
__________________________________________________________________________________________________
prediction_layer (PredictionLay (None, 1)            1           dense[0][0]                      
==================================================================================================
Total params: 2,239,602
Trainable params: 2,239,362
Non-trainable params: 240
__________________________________________________________________________________________________
# Model training
if offline:
    history = model.fit(x_trn, y_trn, verbose=1, epochs=10, validation_data=(x_val, y_val) , batch_size=256)
else:
    # Alternatively, validate on a self-sampled split using the line below:
    # history = model.fit(x_trn, y_trn, verbose=1, epochs=3, validation_split=0.3, batch_size=256)
    history = model.fit(x_trn, y_trn, verbose=1, epochs=2, batch_size=256)
Epoch 1/2
290964/290964 [==============================] - 55s 189us/sample - loss: 0.4209 - binary_crossentropy: 0.4206 - auc: 0.7842
Epoch 2/2
290964/290964 [==============================] - 52s 178us/sample - loss: 0.3630 - binary_crossentropy: 0.3618 - auc: 0.8478
# Model prediction
tst_user_item_feats_df_din_model['pred_score'] = model.predict(x_tst, verbose=1, batch_size=256)
tst_user_item_feats_df_din_model[['user_id', 'click_article_id', 'pred_score']].to_csv(save_path + 'din_rank_score.csv', index=False)
500000/500000 [==============================] - 20s 39us/sample
# Re-rank the predictions and generate the submission
rank_results = tst_user_item_feats_df_din_model[['user_id', 'click_article_id', 'pred_score']]
submit(rank_results, topk=5, model_name='din')
# 5-fold cross-validation, where the folds are split by user
# This part is independent of the single training run above
def get_kfold_users(trn_df, n=5):
    user_ids = trn_df['user_id'].unique()
    user_set = [user_ids[i::n] for i in range(n)]
    return user_set

k_fold = 5
trn_df = trn_user_item_feats_df_din_model
user_set = get_kfold_users(trn_df, n=k_fold)

score_list = []
score_df = trn_df[['user_id', 'click_article_id', 'label']]
sub_preds = np.zeros(tst_user_item_feats_df_rank_model.shape[0])

dense_fea = [x for x in dense_fea if x != 'label']
x_tst, dnn_feature_columns = get_din_feats_columns(tst_user_item_feats_df_din_model, dense_fea, 
                                                   sparse_fea, behavior_fea, hist_behavior_fea, max_len=50)

# 5-fold cross-validation; save the intermediate results for stacking
for n_fold, valid_user in enumerate(user_set):
    train_idx = trn_df[~trn_df['user_id'].isin(valid_user)].copy()
    valid_idx = trn_df[trn_df['user_id'].isin(valid_user)].copy()
    
    # Prepare the training data
    x_trn, dnn_feature_columns = get_din_feats_columns(train_idx, dense_fea, 
                                                       sparse_fea, behavior_fea, hist_behavior_fea, max_len=50)
    y_trn = train_idx['label'].values

    # Prepare the validation data
    x_val, dnn_feature_columns = get_din_feats_columns(valid_idx, dense_fea, 
                                                   sparse_fea, behavior_fea, hist_behavior_fea, max_len=50)
    y_val = valid_idx['label'].values
    
    # Note: the DIN model built above is reused across folds without re-initialization,
    # so each fold continues training the same network (kept as in the original notebook)
    history = model.fit(x_trn, y_trn, verbose=1, epochs=2, validation_data=(x_val, y_val), batch_size=256)
    
    # Predict on the validation fold
    valid_idx['pred_score'] = model.predict(x_val, verbose=1, batch_size=256)   
    
    valid_idx = valid_idx.sort_values(by=['user_id', 'pred_score'])
    valid_idx['pred_rank'] = valid_idx.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')
    
    # Collect this fold's validation predictions for later concatenation
    score_list.append(valid_idx[['user_id', 'click_article_id', 'pred_score', 'pred_rank']])
    
    # For online submission, accumulate each fold's test-set predictions and average them at the end
    if not offline:
        sub_preds += model.predict(x_tst, verbose=1, batch_size=256)[:, 0]   
    
score_df_ = pd.concat(score_list, axis=0)
score_df = score_df.merge(score_df_, how='left', on=['user_id', 'click_article_id'])
# Save the new features produced by cross-validation on the training set
score_df[['user_id', 'click_article_id', 'pred_score', 'pred_rank', 'label']].to_csv(save_path + 'trn_din_cls_feats.csv', index=False)
    
# Average the test-set predictions over the folds, and save the predicted score and rank as features for later stacking; more features could be constructed here as well
tst_user_item_feats_df_din_model['pred_score'] = sub_preds / k_fold
tst_user_item_feats_df_din_model['pred_score'] = tst_user_item_feats_df_din_model['pred_score'].transform(lambda x: norm_sim(x))
tst_user_item_feats_df_din_model = tst_user_item_feats_df_din_model.sort_values(by=['user_id', 'pred_score'])
tst_user_item_feats_df_din_model['pred_rank'] = tst_user_item_feats_df_din_model.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')

# Save the cross-validation features for the test set
tst_user_item_feats_df_din_model[['user_id', 'click_article_id', 'pred_score', 'pred_rank']].to_csv(save_path + 'tst_din_cls_feats.csv', index=False)

Model Fusion

Weighted Fusion

# Read the ranking score files saved by each model
lgb_ranker = pd.read_csv(save_path + 'lgb_ranker_score.csv')
lgb_cls = pd.read_csv(save_path + 'lgb_cls_score.csv')
din_ranker = pd.read_csv(save_path + 'din_rank_score.csv')

# The cross-validated test predictions could also be used for the weighted fusion here
rank_model = {'lgb_ranker': lgb_ranker, 
              'lgb_cls': lgb_cls, 
              'din_ranker': din_ranker}
def get_ensemble_predict_topk(rank_model, topk=5):
    # The classifier and DIN already output probabilities; min-max scale the ranker's scores first
    rank_model['lgb_ranker']['pred_score'] = rank_model['lgb_ranker']['pred_score'].transform(lambda x: norm_sim(x))

    # Equal-weight fusion: stack the three models' results and sum the scores per user-article pair
    final_recall = pd.concat([rank_model['lgb_cls'], rank_model['din_ranker'], rank_model['lgb_ranker']])
    final_recall = final_recall.groupby(['user_id', 'click_article_id'])['pred_score'].sum().reset_index()

    submit(final_recall, topk=topk, model_name='ensemble_fuse')
get_ensemble_predict_topk(rank_model)

Stacking

# Read the cross-validation result files generated by each model
# Training set
trn_lgb_ranker_feats = pd.read_csv(save_path + 'trn_lgb_ranker_feats.csv')
trn_lgb_cls_feats = pd.read_csv(save_path + 'trn_lgb_cls_feats.csv')
trn_din_cls_feats = pd.read_csv(save_path + 'trn_din_cls_feats.csv')

# Test set
tst_lgb_ranker_feats = pd.read_csv(save_path + 'tst_lgb_ranker_feats.csv')
tst_lgb_cls_feats = pd.read_csv(save_path + 'tst_lgb_cls_feats.csv')
tst_din_cls_feats = pd.read_csv(save_path + 'tst_din_cls_feats.csv')
# Concatenate the features output by the different models

finall_trn_ranker_feats = trn_lgb_ranker_feats[['user_id', 'click_article_id', 'label']]
finall_tst_ranker_feats = tst_lgb_ranker_feats[['user_id', 'click_article_id']]

for idx, trn_model in enumerate([trn_lgb_ranker_feats, trn_lgb_cls_feats, trn_din_cls_feats]):
    for feat in [ 'pred_score', 'pred_rank']:
        col_name = feat + '_' + str(idx)
        finall_trn_ranker_feats[col_name] = trn_model[feat]

for idx, tst_model in enumerate([tst_lgb_ranker_feats, tst_lgb_cls_feats, tst_din_cls_feats]):
    for feat in [ 'pred_score', 'pred_rank']:
        col_name = feat + '_' + str(idx)
        finall_tst_ranker_feats[col_name] = tst_model[feat]
# Fit a logistic regression on the cross-validation features and predict on the test set
# Note: during cross-validation you can construct more features related to the predicted values to enrich this simple model
from sklearn.linear_model import LogisticRegression

feat_cols = ['pred_score_0', 'pred_rank_0', 'pred_score_1', 'pred_rank_1', 'pred_score_2', 'pred_rank_2']

trn_x = finall_trn_ranker_feats[feat_cols]
trn_y = finall_trn_ranker_feats['label']

tst_x = finall_tst_ranker_feats[feat_cols]

# Define the model
lr = LogisticRegression()

# Train the model
lr.fit(trn_x, trn_y)

# Model prediction
finall_tst_ranker_feats['pred_score'] = lr.predict_proba(tst_x)[:, 1]
# Re-rank the predictions and generate the submission
rank_results = finall_tst_ranker_feats[['user_id', 'click_article_id', 'pred_score']]
submit(rank_results, topk=5, model_name='ensemble_stacking')

Summary

This chapter covered three ranking models: LightGBM's ranker, LightGBM's classifier, and the deep learning model DIN. We did not go into the theory behind these three models in detail; please explore it on your own, and feel free to share what you learn so we can all improve together. Finally, we applied two simple model fusion strategies: weighted fusion and Stacking.

About Datawhale: Datawhale is an open-source organization focused on data science and AI, bringing together outstanding learners from universities and well-known companies across many fields, and a team with an open-source and exploratory spirit. With the vision of "for the learner, growing together with learners", Datawhale encourages authenticity, openness, inclusiveness, mutual trust and help, the courage to try and fail, and the willingness to take responsibility. Datawhale applies the open-source philosophy to open-source content, open learning, and open solutions, empowering talent development. This data mining learning path is shared on Tianchi; follow Datawhale for details.
