#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author: xxx
import string
import sys
import csv
import json
import logging
import random
import pymysql
import os
import sys
# import jieba
# import jieba.posseg
# import jieba.analyse
# import json
import time
#from text_quality_classifier import para_sims_tagger_feature_exact
#from text_quality_classifier import layout_tagger_feature_extract
#from content_base_feature import content_base_fea_extract
#from custom_dict_feature import feature_custom_dict_v4_train
#from infor_entropy_fea import part_of_speech_extract_jieba
# Configure the root logger: severity threshold and record layout.
_LOG_FORMAT = '%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s'
logging.basicConfig(format=_LOG_FORMAT, level=logging.FATAL)
def connect_db():
    """Open and return a new pymysql connection to the ``om_article`` database.

    All connection settings (host, port, user, password, charset) are the
    fixed values listed below.
    """
    # NOTE(review): credentials are hard-coded in source; consider moving
    # them to configuration.
    settings = {
        'host': '10.xxx.83.15',
        'port': 5203,
        'user': 'om_article_r',
        'password': '0f59ee296',
        'database': 'om_article',
        'charset': 'utf8',
    }
    return pymysql.connect(**settings)
# Open the shared connection and cursor.
# NOTE: this runs at import time; `cur` is the cursor used by select_from_sql.
con = connect_db()
cur = con.cursor()
def select_from_sql(cmsid):
    """Look up one article's media id, title and content by its cmsid.

    The table queried is ``news_article_<suffix>``. The suffix is cut out of
    the cmsid itself: its first six characters when the cmsid starts with a
    digit, otherwise characters 3 through 8 (presumably a date stamp behind a
    three-character prefix -- verify against the real id format).

    Args:
        cmsid: Article identifier string.

    Returns:
        The ``(media_id, title, content)`` row, or ``("", "", "")`` when the
        cmsid is empty, the derived table suffix is not a plain identifier,
        no row matches, or the query fails.
    """
    not_found = ("", "", "")
    if not cmsid:
        # Nothing to derive a table name from (the original crashed here).
        return not_found

    suffix = cmsid[:6] if cmsid[0].isdigit() else cmsid[3:9]

    # A table name cannot be passed as a bound parameter, so it is whitelisted
    # instead: only ASCII letters, digits and underscore may reach the SQL text.
    allowed = set(string.ascii_letters + string.digits + "_")
    if not suffix or not set(suffix) <= allowed:
        return not_found

    # The cmsid value itself is bound by the driver ("%%s" becomes "%s").
    sql_str = ("select media_id,title,content "
               "from news_article_%s where cmsid=%%s limit 1" % suffix)
    logging.info("%s [cmsid=%s]", sql_str, cmsid)

    try:
        cur.execute(sql_str, (cmsid,))
        row = cur.fetchone()
    except Exception:
        # Best-effort lookup: report the failure but keep the caller going.
        logging.exception("query failed for cmsid %s", cmsid)
        return not_found

    # fetchone() yields None when nothing matched; callers unpack three values.
    return row if row is not None else not_found
def get_baicao_metas(art_file, out_file):
    """Fetch article fields for every cmsid in art_file and write them to out_file.

    art_file is read as UTF-8 text. Its first line is skipped as a header;
    each following line is stripped and split on tabs, with the label taken
    from column 0 and the cmsid from column 1.

    out_file is (re)created as UTF-8 text with the header
    ``cmsid, label, media_id, title, content`` (tab-separated) followed by one
    row per article that was found. Newlines, carriage returns and tabs are
    removed from title and content so each record stays on one line.
    Articles whose title comes back empty are logged and left out.

    Side effect: the module-level ``cur`` and ``con`` are closed when the
    function finishes (also on error), so it can only be run once per
    connection.

    Args:
        art_file: Path of the tab-separated input file.
        out_file: Path of the tab-separated output file to write.
    """
    # One translate() pass drops every character that would break the TSV row.
    strip_breaks = str.maketrans("", "", "\n\r\t")
    written = 0

    try:
        with open(out_file, "w", encoding='utf-8') as fw, \
                open(art_file, 'r', encoding='utf-8') as fr:
            fw.write("cmsid\tlabel\tmedia_id\ttitle\tcontent\n")

            next(fr, None)  # skip the input header line
            for line in fr:
                fields = line.strip().split('\t')
                if len(fields) < 2:
                    logging.warning("malformed line skipped: %r", line)
                    continue
                label, cmsid = fields[0], fields[1]

                try:
                    media_id, title, content = select_from_sql(cmsid)
                    title = title.translate(strip_breaks)
                    content = content.translate(strip_breaks)
                except Exception:
                    # e.g. a NULL title/content column; skip this article only.
                    logging.exception("failed to fetch cmsid %s", cmsid)
                    continue

                # Check before writing so that missing articles do not end up
                # in the output as rows with empty fields.
                if not title:
                    logging.warning("not found: %s", cmsid)
                    continue

                # str() guards against a non-string media_id column value.
                fw.write('\t'.join((cmsid, label, str(media_id), title, content)) + '\n')
                written += 1
                print(written)

                # Throttle hook between lookups (0.0 means no delay).
                time.sleep(0.0)
    finally:
        cur.close()
        con.close()
if __name__ == "__main__":
    # usage: ./request_baicao.py corpus[IN] corpus.new[OUT]
    # With no arguments the built-in evaluation paths below are used.
    if len(sys.argv) == 3:
        art_file, out_file = sys.argv[1], sys.argv[2]
    elif len(sys.argv) == 1:
        art_file = '../ft_local/titles_n_o_2.eval.a.1_30'
        out_file = '../ft_local/titles_n_o_2_out.eval.a.1_30'
    else:
        print("usage:")
        print("\t./request_baicao.py corpus[IN] corpus.new[OUT]")
        sys.exit(1)

    get_baicao_metas(art_file, out_file)
pymysql
©著作权归作者所有,转载或内容合作请联系作者
【社区内容提示】社区部分内容疑似由AI辅助生成,浏览时请结合常识与多方信息审慎甄别。
平台声明:文章内容(如有图片或视频亦包括在内)由作者上传并发布,文章内容仅代表作者本人观点,简书系信息发布平台,仅提供信息存储服务。
【社区内容提示】社区部分内容疑似由AI辅助生成,浏览时请结合常识与多方信息审慎甄别。
平台声明:文章内容(如有图片或视频亦包括在内)由作者上传并发布,文章内容仅代表作者本人观点,简书系信息发布平台,仅提供信息存储服务。
相关阅读更多精彩内容
- 上面可见已经安装成功 但是引入的时候会报错: 解决方法: 在uwsgi.ini中添加
- 每天都一堆的事情等着去研究等着去做。。。又是晚上11点了。。。再研究一下吧。。。 按照上一天说的进度今天研究下py...
- 今天在用pymysql插入数据时出现一下异常。 后来检查数据发现your-price列对应的数据为float,长度...
- Mysql驱动介绍 MySQL-python(弃用):也就是MySQLdb,是对C语言操作MySQL数据库的一个简...