Public-Welfare AI (公益AI) - NLP Leaderboard Scoring

The check-in deadline came up too suddenly, so here is the code with no preamble.

# Imports
import time
import math
import sys
# sys.path.append("/home/kesci/input")
import re
import pandas as pd
import numpy as np
import collections
import os
import random
from tqdm import tqdm
import torch
from torch import nn
import torchtext.vocab as Vocab
import torch.utils.data as Data
import torch.nn.functional as F
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
! pip install jieba -i https://pypi.douban.com/simple
! pip install -i https://pypi.tuna.tsinghua.edu.cn/simple gensim
import jieba
import gensim
from gensim.models import KeyedVectors
from gensim.test.utils import datapath

# Read the data
def read_data(path):
    # Returns a 2D list: axis 0 indexes rows (one per comment), axis 1 indexes
    # columns (0 = recommended-or-not label, 1 = comment text)
    with open(path, 'r', encoding='utf-8') as f:
        lines = f.read()
    comments = [line.split('\t') for line in lines.split('\n')] # split the str into a list
    return comments[:-1] # drop the last element, an empty string left by the trailing \n
    
cmts = read_data('/home/kesci/input/Comments9120/train_shuffle.txt')
# e.g. cmts[300][1] == '牛油果卷很清爽'
# len(cmts) == 16000
for i in range(len(cmts)):
    cmts[i].reverse()  # swap each pair to [text, label] order
# Now shaped like [['酸菜魚不錯(cuò)', '0'],
#                  ['輕食素食都是友善的飲食方式', '0'], ...


# Reshape cmts_handout to match the cmts format so the same functions can be shared
cmts_handout = read_data('/home/kesci/input/Comments9120/test_handout.txt')
for cmt_handout in cmts_handout:
    cmt_handout.append('0')  # placeholder label, so each entry becomes [text, label]

# Merge train and handout so the vocabulary also covers test-time tokens
cmts_merge = cmts + cmts_handout

# Approach 2: build word indices

def get_tokenized_9120(data):
    '''
    @params:
        data: list of samples, each element a [text string, 0/1 label] pair
    @return: list of tokenized texts, each element the word sequence after segmentation
    '''
    def tokenizer(text):
        return jieba.lcut(text) # for English you would instead use [tok.lower() for tok in text.split(' ')]
    
    return [tokenizer(review) for review, _ in data]
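
# As a quick sanity check, jieba can be called on one comment directly. The exact
# token boundaries depend on the jieba version and dictionary, so the segmentation
# in the comment below is indicative only.
# print(jieba.lcut('牛油果卷很清爽'))
# e.g. ['牛油果', '卷', '很', '清爽']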

def get_vocab_9120(data):
    '''
    @params:
        data: same as above
    @return: vocabulary over the dataset, a Vocab instance (freqs, stoi, itos)
    '''
    tokenized_data = get_tokenized_9120(data)
    counter = collections.Counter([tk for st in tokenized_data for tk in st])
    return Vocab.Vocab(counter, min_freq=1) # min_freq was originally 5; trying 1 here

vocab = get_vocab_9120(cmts_merge)
print('# words in vocab:', len(vocab)) 
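
# In the legacy torchtext Vocab, indices 0 and 1 are reserved for the '<unk>' and
# '<pad>' specials by default, worth keeping in mind since the padding below uses
# index 0. A quick round-trip check on the mapping:
print(vocab.itos[:5])             # entries 0 and 1 are '<unk>' and '<pad>'
print(vocab.stoi[vocab.itos[2]])  # round-trip, prints 2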

# Once the vocabulary and word indices are built, the dataset's texts can be converted from strings to word-index sequences for later use.
def preprocess_9120(data, vocab):
    '''
    @params:
        data: same as above, the raw data as read in
        vocab: vocabulary built on the training set
    @return:
        features: word-index sequences, an integer tensor of shape (n, max_l)
        labels: sentiment labels, a 0/1 integer tensor of shape (n,)
    '''
    max_l = 10  # truncate or zero-pad every comment to length 10; can be reduced later if needed

    def pad(x):
        # note: padding index 0 maps to '<unk>' in the legacy torchtext Vocab ('<pad>' is 1)
        return x[:max_l] if len(x) > max_l else x + [0] * (max_l - len(x))

    tokenized_data = get_tokenized_9120(data)
    features = torch.tensor([pad([vocab.stoi[word] for word in words]) for words in tokenized_data])
    labels = torch.tensor([int(score) for _, score in data]) # the 0/1 labels are str in the file; cast to int
    return features, labels
    
# features_tmp, labels_tmp = preprocess_9120(cmts, vocab)
# print(features_tmp)
# print(labels_tmp)
# # tensor([[ 243,    4,    0,  ...,    0,    0,    0],
# #         [2668, 1450,    9,  ..., 1899,    0,    0],
# #         [2302, 1048,   18,  ...,    0,    0,    0],
# #         ...,
# #         [ 295,   15,    4,  ...,    0,    0,    0],
# #         [  16,   67,   36,  ...,    0,    0,    0],
# #         [8946,  193,    7,  ...,    0,    0,    0]])
# # tensor([0, 0, 0,  ..., 0, 1, 0])

# Split into train and test sets
k = 100
test_data = cmts[:len(cmts)//k] # first 1/k is the test data
train_data = cmts[len(cmts)//k:] # remaining (k-1)/k is the train data

# # Swapped split
# train_data = cmts[:len(cmts)//k] # first 1/k as train data
# test_data = cmts[len(cmts)//k:] # remaining (k-1)/k as test data

# Build the data iterators
train_set = Data.TensorDataset(*preprocess_9120(train_data, vocab))
test_set = Data.TensorDataset(*preprocess_9120(test_data, vocab))

# The code above is equivalent to the commented code below
# train_features, train_labels = preprocess_9120(train_data, vocab)
# test_features, test_labels = preprocess_9120(test_data, vocab)
# train_set = Data.TensorDataset(train_features, train_labels)
# test_set = Data.TensorDataset(test_features, test_labels)

# len(train_set) == features.shape[0] == labels.shape[0]
# train_set[index] == (features[index], labels[index])

batch_size = 64
train_iter = Data.DataLoader(train_set, batch_size, shuffle=True)
test_iter = Data.DataLoader(test_set, batch_size)

for X, y in train_iter:
    print('X', X.shape, 'y', y.shape)
    break
print('#batches:', len(train_iter))

class BiRNN(nn.Module):
    def __init__(self, vocab, embed_size, num_hiddens, num_layers):
        '''
        @params:
            vocab: vocabulary built on the dataset, used to get the vocabulary size
            embed_size: embedding dimension
            num_hiddens: hidden-state dimension
            num_layers: number of hidden layers
        '''
        super(BiRNN, self).__init__()
        self.embedding = nn.Embedding(len(vocab), embed_size)
        
        # encoder-decoder framework
        # bidirectional=True gives a bidirectional recurrent network
        self.encoder = nn.LSTM(input_size=embed_size, 
                                hidden_size=num_hiddens, 
                                num_layers=num_layers,
                                bidirectional=True)
        self.decoder = nn.Linear(4*num_hiddens, 2) # hidden states at the initial and final time steps feed the fully connected layer
        
    def forward(self, inputs):
        '''
        @params:
            inputs: word-index sequences, an integer tensor of shape (batch_size, seq_len)
        @return:
            outs: sentiment prediction for the text, a tensor of shape (batch_size, 2)
        '''
        # the LSTM expects seq_len as the first dimension, so transpose the input
        embeddings = self.embedding(inputs.permute(1, 0)) # (seq_len, batch_size, d)
        # nn.LSTM returns the outputs plus the hidden state and memory cell, as outputs, (h, c)
        outputs, _ = self.encoder(embeddings) # (seq_len, batch_size, 2*h)
        encoding = torch.cat((outputs[0], outputs[-1]), -1) # (batch_size, 4*h)
        outs = self.decoder(encoding) # (batch_size, 2)
        return outs

embed_size, num_hiddens, num_layers = 300, 150, 2
net = BiRNN(vocab, embed_size, num_hiddens, num_layers)
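
# A dummy forward pass is a cheap wiring check before training: two directions
# times the two read-out time steps explain the 4*num_hiddens input the decoder
# expects, and the output should be (batch_size, 2).
with torch.no_grad():
    dummy = torch.randint(0, len(vocab), (3, 10))  # 3 fake comments, seq_len 10
    print(net(dummy).shape)  # expected: torch.Size([3, 2])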

# Load pretrained word vectors

# # The pretrained file is too large, so it was loaded locally
# model = KeyedVectors.load_word2vec_format(datapath(r'E:\Computer Science\9.deep learning\Project\NLP\pretrained_corpus\sgns.weibo.bigram-char'), binary=False)

# def load_pretrained_embedding(words, pretrained_vocab):
#     '''
#     @params:
#         words: the words needing vectors, given as the vocabulary's itos (index to string) list
#         pretrained_vocab: the pretrained word vectors
#     @return:
#         embed: the loaded word vectors
#     '''
#     embed = torch.zeros(len(words), pretrained_vocab.vector_size) # initialize to 0
#     oov_count = 0 # out of vocabulary
#     for i, word in enumerate(words):
#         try:
#             embed[i, :] = torch.from_numpy(pretrained_vocab[word])
#         except KeyError:
#             oov_count += 1
#     if oov_count > 0:
#         print("There are %d oov words." % oov_count)
#     return embed

# net.embedding.weight.data.copy_(load_pretrained_embedding(vocab.itos, model))
# net.embedding.weight.requires_grad = False # the weights are pretrained, so no need to update them

# # Save the variable with pickle
# import pickle
# with open('data.pkl', 'wb') as f:
#     pickle.dump(net.embedding.weight.data, f)

# Load the variable with pickle
import pickle
with open('/home/kesci/work/merge_train_test.pkl', 'rb') as f:
    pretrained_embed = pickle.load(f)  # renamed from read_data, which would shadow the function above
net.embedding.weight.data.copy_(pretrained_embed)
net.embedding.weight.requires_grad = False # the weights are pretrained, so no need to update them
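
# A quick check that the pickled matrix actually landed in the embedding table
# and that the table is frozen:
print(net.embedding.weight.shape)          # torch.Size([len(vocab), 300])
print(net.embedding.weight.requires_grad)  # False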

# Train the model
def evaluate_accuracy(data_iter, net, device=None):
    if device is None and isinstance(net, torch.nn.Module):
        # fall back to the device the model's parameters live on
        device = list(net.parameters())[0].device
    acc_sum, n = 0.0, 0
    with torch.no_grad():
        for X, y in data_iter:
            if isinstance(net, torch.nn.Module):
                net.eval()
                acc_sum += (net(X.to(device)).argmax(dim=1) == y.to(device)).float().sum().cpu().item()
                net.train()
            else:
                if('is_training' in net.__code__.co_varnames):
                    acc_sum += (net(X, is_training=False).argmax(dim=1) == y).float().sum().item() 
                else:
                    acc_sum += (net(X).argmax(dim=1) == y).float().sum().item() 
            n += y.shape[0]
    return acc_sum / n

def train(train_iter, test_iter, net, loss, optimizer, device, num_epochs):
    net = net.to(device)
    print("training on ", device)
    for epoch in range(num_epochs):
        # per-epoch stats; batch_count must reset here too, otherwise the reported
        # loss would be divided by the cumulative batch count across all epochs
        train_l_sum, train_acc_sum, n, batch_count, start = 0.0, 0.0, 0, 0, time.time()
        for X, y in train_iter:
            X = X.to(device)
            y = y.to(device)
            y_hat = net(X)
            l = loss(y_hat, y) 
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
            train_l_sum += l.cpu().item()
            train_acc_sum += (y_hat.argmax(dim=1) == y).sum().cpu().item()
            n += y.shape[0]
            batch_count += 1
        test_acc = evaluate_accuracy(test_iter, net)
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f, time %.1f sec'
              % (epoch + 1, train_l_sum / batch_count, train_acc_sum / n, test_acc, time.time() - start))


lr, num_epochs = 0.01, 15
optimizer = torch.optim.Adam(
    filter(
        lambda p: p.requires_grad,
        net.parameters()),
    lr=lr)
loss = nn.CrossEntropyLoss()

train(train_iter, test_iter, net, loss, optimizer, device, num_epochs)

# Evaluate the model
def predict_sentiment(net, vocab, sentences):
    '''
    @params:
        net: the trained model
        vocab: vocabulary built on this dataset, used to map each word sequence to an index sequence for the model
        sentences: the texts to analyze, each given as a word sequence
    @return: predicted positive-class probabilities, one per sentence
    '''
    predict_result = []
    for sentence in sentences:
        device = list(net.parameters())[0].device # read the device the model lives on
        sentence = torch.tensor([vocab.stoi[word] for word in sentence], device=device)
        t = net(sentence.view((1, -1)))
        hat = F.softmax(t, dim=1)[0][1] # probability of the positive class
        hat = hat.detach().cpu().numpy().astype(np.float16) # astype returns a new array, so assign it
        predict_result = np.append(predict_result, hat)
        # Alternative: hard 0/1 labels instead of probabilities
        # label = torch.argmax(net(sentence.view((1, -1))), dim=1)
        # return 'positive' if label.item() == 1 else 'negative'
    
    return predict_result

# predict_sentiment(net, vocab, [['飯', '哈哈香'],['飯', '香']])

# vocab_handout = get_vocab_9120(cmts_handout)
tokenized_data_handout = get_tokenized_9120(cmts_handout)
result = pd.DataFrame(predict_sentiment(net, vocab, tokenized_data_handout))

result.to_csv("./output_v299.csv")
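
# As written, to_csv emits pandas' default integer index and a literal 0 as the
# column header. If the grader expects specific column names, they can be supplied
# explicitly; the names below are assumptions, so check the competition's sample
# submission file first.
# result.to_csv("./output_v299.csv", header=['Prediction'], index_label='ID')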