第四周作業 爬取微博分組文本信息

import requests
import json
import re

# Request headers sent with every feed request.
# Long values are assembled with implicit string-literal concatenation so
# that no literal spans a physical line break (a quoted string that wraps
# onto the next line is a SyntaxError).
# NOTE(review): the cookie below looks like a logged-in session credential;
# presumably it should be loaded from the environment rather than committed.
headers = {
    # NOTE(review): there is no space between "15A372" and "Safari" here;
    # kept as found in the source -- confirm the intended value.
    'User-Agent': (
        'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) '
        'AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 '
        'Mobile/15A372Safari/604.1'
    ),
    'cookie': (
        'SINAGLOBAL=4840874402537.198.1560220210050; '
        'SCF=Al45B-3Yy8PRwpBxWrPVwC4KWFAgGR-K8wmHVOdZAvHlPBrN37LJmYK5XUy-LpOxJIO9sKiJ1DrRFkfWWiJOp9g.; '
        'login=609423641c81693ee710ee69b0d0e34c; _s_tentry=login.sina.com.cn; '
        'Apache=941276552050.3962.1560254570611; '
        'ULV=1560254570632:5:5:5:941276552050.3962.1560254570611:1560254071344; '
        # The next two fragments form one URL-encoded value; they are joined
        # without whitespace because the break fell in the middle of it.
        'webim_unReadCount=%7B%22time%22%3A1560254573029%2C%22dm_pub_total%22%3A'
        '4%2C%22chat_group_pc%22%3A0%2C%22allcountNum%22%3A9%2C%22msgbox%22%3A0%7D; '
        'login_sid_t=5a2760576b886a56beefe6dba6fefe88; cross_origin_proto=SSL; '
        'UOR=ent.china.com.cn,widget.weibo.com,login.sina.com.cn; '
        'SUB=_2A25x--jNDeRhGedJ6VoX9SjKzzyIHXVTcV0FrDV8PUJbmtAKLWfRkW9NViqjNjVIAYg5Zqc3Q5pOtTJOyjna0kGM; '
        'SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WhiYQ5eDzjWlO-mzTFlCvA55NHD95QpS0zRSo-cSoB7Ws4DqcjZgJH7IGLEBP8fqBtt; '
        'SUHB=0sqwkAiAGL641B; '
        # SRT is a single token that was wrapped over four lines; the
        # fragments are joined without whitespace.
        'SRT=D.QqHBJZPtINsgO!Mb4cYGSdM1ibS6dDbOT!kw5ebHNEYdPFidJmSpMERt4EP1RcsrA4kJP-'
        'SNTsVuObS9Vd9KTbSHAPbeiP9oRqiMiQBLWEM1O3bgUcmnTrWi*B.vAflW-P9Rc0lR-'
        'ykSDvnJqiQVbiRVPBtS!r3JZPQVqbgVdWiMZ4siOzu4DbmKPWfTPbMidPm5c9QUm0kUQ94McoDW!msi4uzMD!s4QBOJG9N4-'
        '0kRsi6AZWQVqMmVdigdcW1NDWHMroEAbvtSeECWGA7; '
        'SRF=1560254621; ALF=1591790621; SSOLoginState=1560254621'
    ),
}

# First page of the group feed; later pages append a max_id cursor to it.
url = 'https://m.weibo.cn/feed/group?gid=201109200354798847'

# Runs of ASCII letters, digits, whitespace and the listed punctuation.
# Presumably this is meant to drop the HTML tags and links embedded in a
# status text so that only the non-ASCII (Chinese) content is printed.
_ASCII_MARKUP_RE = re.compile(r'[a-zA-Z0-9\s<="_>:/.?%]+')


def get_info(url, page, max_page=20):
    """Print the cleaned text of each status in a Weibo group feed.

    Fetches ``url``, prints every status text with ASCII markup removed,
    then keeps following the ``next_cursor`` value returned by the API
    until ``max_page`` is reached or the API returns no further cursor.

    Args:
        url: Feed URL to start from. Any ``&max_id=...`` suffix is dropped
            when building the URLs of the following pages.
        page: Page number that ``url`` corresponds to (1 for a fresh crawl).
        max_page: Highest page number to fetch, inclusive. The page given
            by ``url`` is always fetched, even if ``page`` exceeds this.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        KeyError: If the response JSON lacks ``data`` or ``statuses``.
    """
    # Derive the paging base from the caller's URL instead of a hard-coded
    # group id, so the function works for any group feed.
    base_url = url.split('&max_id=', 1)[0]

    while True:
        # A timeout keeps a stalled connection from hanging the crawl.
        res = requests.get(url, headers=headers, timeout=10)
        res.raise_for_status()
        data = json.loads(res.text)['data']

        for status in data['statuses']:
            # Use the compiled pattern's sub(): the original passed re.S as
            # the 4th positional argument of re.sub, which is `count`, so
            # only the first 16 matches were removed.
            print(_ASCII_MARKUP_RE.sub('', status['text']))

        page += 1
        next_cursor = data.get('next_cursor')
        # Stop at the page limit, or when the API gives no further cursor.
        if page > max_page or not next_cursor:
            break
        url = f'{base_url}&max_id={next_cursor}'


if __name__ == '__main__':
    get_info(url, 1)
屏幕截圖.jpg
最后編輯于
©著作權歸作者所有,轉載或內容合作請聯系作者
【社區內容提示】社區部分內容疑似由AI輔助生成,瀏覽時請結合常識與多方信息審慎甄別。
平臺聲明:文章內容(如有圖片或視頻亦包括在內)由作者上傳并發布,文章內容僅代表作者本人觀點,簡書系信息發布平臺,僅提供信息存儲服務。

友情鏈接更多精彩內容