B站爬蟲代碼

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import re
import pymongo
import time
import datetime as dt
import random
import pandas as pd
import warnings
# Install a blanket "ignore" filter so no warning is shown for the rest of the run.
warnings.filterwarnings(action='ignore')
# Startup banner: print() joins the three pieces with single spaces.
print('導(dǎo)入成功!', '\n', '-' * 20, sep=' ')

def get_urls(n, uid=82366241):
    """Build the paginated video-list URLs of one Bilibili user space.

    Args:
        n: Number of listing pages to generate. Pages are numbered from 1;
            a value of 0 or less produces an empty list.
        uid: Id of the user space placed in the URL path. Defaults to the
            id this script was originally hard-coded for.

    Returns:
        A list of ``n`` URL strings in ascending page order.
    """
    template = 'https://space.bilibili.com/%s/video?tid=0&page=%i&keyword=&order=pubdate'
    return [template % (uid, page) for page in range(1, n + 1)]

def get_data(ui,d_h,d_c1,d_c2,table,timeout=10):
    """Scrape one video page plus its stat API record and store the result.

    Args:
        ui: URL of the video page. It must contain ``av<digits>``; the
            digits are used as the aid for the stat API request.
        d_h: Request headers (the caller passes a User-Agent dict).
        d_c1: Cookies sent with the video page request.
        d_c2: Cookies sent with the stat API request.
        table: Object whose ``insert_one`` method receives the assembled
            document (the caller passes a MongoDB collection).
        timeout: Seconds to wait on each HTTP request before giving up, so
            a single stalled connection cannot block the whole crawl.

    Returns:
        ``len()`` of the document dict after ``insert_one`` has returned.

    Raises:
        AttributeError, TypeError, KeyError: When an expected page element,
            attribute or regex match is missing.
        requests.exceptions.RequestException: On network failure or timeout.
    """
    page_resp = requests.get(url = ui,headers = d_h,cookies = d_c1,timeout = timeout)
    soup1 = BeautifulSoup(page_resp.text,'lxml')
    title = soup1.h1['title']
    # Named pub_time (not `time`) so the imported time module is not shadowed.
    pub_time = re.search(r'(\d*-\d*-\d* \d*:\d*:\d*)',soup1.find('div',class_="video-data").text).group(1)
    aid = re.search(r'av(\d*).*',ui).group(1)
    up = soup1.find('div',class_="name").a.text
    stat_resp = requests.get(url = 'https://api.bilibili.com/x/web-interface/archive/stat?aid=%s' %aid,
                             headers = d_h,cookies = d_c2,timeout = timeout)
    soup2 = BeautifulSoup(stat_resp.text,'lxml')
    name = soup1.find('div',class_="info open").text
    # The full match runs from "view up to ,"now_rank"; after splitting on
    # commas the last two pieces (the bare "now_rank" key and the pair right
    # before it) are discarded.
    # NOTE(review): confirm that dropping the pair before now_rank is intended.
    info = re.search(r'"view.*(\d*),"now_rank"',soup2.text).group().split(',')[:-2]
    date = str(dt.date.today())
    dic = {
        '標(biāo)題': title,
        '介紹': name,
        'aid': aid,
        '上線時間': pub_time,
        'up主': up,
        '采集時間': date,
    }
    # Each piece looks like "key":value; strip the quotes from the key and
    # keep the value as the raw string taken from the response text.
    for pair in info:
        fields = pair.split(':')
        dic[fields[0].replace('"','')] = fields[1]
    table.insert_one(dic)
    # NOTE(review): pymongo's insert_one is documented to add an `_id` key to
    # the passed dict, which would be included in this count — confirm.
    return len(dic)

def _parse_cookies(raw):
    """Convert a browser ``Cookie`` header string into a name -> value dict.

    Only the first ``=`` of each segment separates name from value, so values
    that themselves contain ``=`` are kept whole. Segments with an empty name
    are skipped.
    """
    jar = {}
    for piece in raw.split(';'):
        name, _, value = piece.strip().partition('=')
        if name:
            jar[name] = value
    return jar


if __name__ == "__main__":
    # Imported here because only the script entry point drives a browser.
    from selenium.webdriver.common.by import By

    # Step 1: open every listing page in Chrome and collect the link targets.
    urllst = get_urls(14)
    lilst = []
    brower = webdriver.Chrome()
    try:
        for u in urllst:
            brower.get(u)
            # Fixed pause before reading the page; presumably lets the
            # listing render — TODO confirm one second is enough.
            time.sleep(1)
            ul = brower.find_element(By.CLASS_NAME, 'list-list')
            for li in ul.find_elements(By.TAG_NAME, 'a'):
                href = li.get_attribute('href')
                if href:
                    lilst.append(href)
    finally:
        # Always shut the browser down, even when a page lookup fails.
        brower.quit()
    # Drop duplicate links (ordering is not preserved).
    lilst = list(set(lilst))

    # Step 2: request headers and the two cookie sets used by get_data.
    dic_h = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) HeadlessChrome/70.0.3521.2 Safari/537.36'}
    cookies1 = '''xxx'''  # replace with the real cookie string
    dic_c1 = _parse_cookies(cookies1)
    cookies2 = '''xxx'''  # replace with the real cookie string
    dic_c2 = _parse_cookies(cookies2)

    # Step 3: scrape every collected link into a per-day MongoDB collection.
    myclient = pymongo.MongoClient("mongodb://localhost:27017/")
    try:
        db = myclient['看電影了沒']
        datatable = db['視頻信息'+ str(dt.date.today())]
        starttime = time.time()  # recorded at crawl start; not read in this block
        errorlst = []
        datalst = []
        for ui in lilst:
            try:
                datalst.append(get_data(ui,dic_h,dic_c1,dic_c2,datatable))
                print('數(shù)據(jù)采集成功,總共采集%i條數(shù)據(jù)' % len(datalst))
            except Exception:
                # Best effort: remember the failing URL and keep going.
                # `Exception` (not a bare except) lets Ctrl-C stop the crawl.
                errorlst.append(ui)
                print('數(shù)據(jù)采集失敗,數(shù)據(jù)網(wǎng)址為:',ui)
            # Random 1-3 second pause between video requests.
            time.sleep(random.randint(1,3))
    finally:
        myclient.close()
最后編輯于
©著作權歸作者所有,轉載或內容合作請聯系作者
【社區內容提示】社區部分內容疑似由AI輔助生成,瀏覽時請結合常識與多方信息審慎甄別。
平臺聲明:文章內容(如有圖片或視頻亦包括在內)由作者上傳并發布,文章內容僅代表作者本人觀點,簡書系信息發布平臺,僅提供信息存儲服務。

相關(guān)閱讀更多精彩內(nèi)容

友情鏈接更多精彩內容