# Fetch pages with the urllib library, parse them with BeautifulSoup and re to
# extract the target fields, and finally save the processed data to an Excel
# workbook via xlwt.
from bs4 import BeautifulSoup
import urllib.request,urllib.error
import re,xlwt
def main():
    """Crawl the Douban Top250 list and save the rows to an .xls file."""
    baseurl = "https://movie.douban.com/top250?start="
    # saveData() reads the module-level `datalist`, so publish it globally.
    global datalist
    datalist = getData(baseurl)
    savepath = r"doubanTop250.xls"
    saveData(savepath)
# Fetch the page content of the given URL.
def askURL(url):
    """Fetch *url* through the configured proxy and return the body as UTF-8 text.

    Returns None when the request fails; the HTTP code and/or reason is printed.
    """
    # NOTE(review): placeholder proxy address — replace "代理IP:端口" with a real
    # proxy host:port before running, or every request will fail.
    proxy = {
        "http": "http://代理IP:端口",
        "https": "http://代理IP:端口",
    }
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36 Edg/93.0.961.52"
    }
    proxy_handler = urllib.request.ProxyHandler(proxy)
    opener = urllib.request.build_opener(proxy_handler)
    req = urllib.request.Request(url=url, headers=header)
    response = None  # bug fix: was unbound (UnboundLocalError) when open() raised
    try:
        response = opener.open(req).read().decode('utf-8')
    except urllib.error.URLError as e:
        if hasattr(e, "code"):  # bug fix: was misspelled `hasatter` (NameError)
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return response
# Parse the page: pre-compiled patterns for pulling the fields out of each
# movie's "info" HTML snippet.
findLink = re.compile(r'<a class="" href="(.*?)">')          # detail-page URL
findTitle = re.compile(r'<span class="title">(.*?)</span>')  # movie title
findBd = re.compile(r'<p class="">(.*?)</p>', re.S)          # overview (spans lines)
findRating = re.compile(r'<span class="rating_num" property="v:average">(.*?)</span>')
findNum = re.compile(r'<span>(\d*)人評價</span>')            # number of ratings
findInq = re.compile(r'<span class="inq">(.*?)</span>')      # one-line quote
def getData(baseurl):
    """Scrape all 10 pages of the Top250 list and return a list of row lists.

    Each row is [link, title, overview, rating, vote count, one-line quote],
    all plain strings ('' when the field is absent).
    """
    datalist = []
    for page in range(10):  # 10 pages, 25 movies per page
        url = baseurl + str(page * 25)
        html = askURL(url)
        soup = BeautifulSoup(html, features='html.parser')
        for item in soup.find_all('div', class_="info"):
            data = []
            item = str(item)
            link = re.findall(findLink, item)[0]
            data.append(link)
            title = re.findall(findTitle, item)[0]
            data.append(title)
            bd = re.findall(findBd, item)[0]
            bd = re.sub(r'<br(\s+)?/>(\s+)?', '', bd)
            bd = re.sub(r'/', '', bd)
            data.append(bd.strip())
            # bug fix: the original appended the whole findall() list (and an
            # empty list when the field was missing) instead of the matched
            # string, producing inconsistent row contents for the Excel writer.
            rating = re.findall(findRating, item)
            data.append(rating[0] if rating else '')
            num = re.findall(findNum, item)
            data.append(num[0] if num else '')
            inq = re.findall(findInq, item)  # not every movie has a quote
            data.append(inq[0] if inq else '')
            datalist.append(data)
    return datalist
# Save the scraped data to an Excel workbook.
def saveData(savepath):
    """Write the global `datalist` rows to an .xls workbook at *savepath*.

    Bug fixes: the function was defined as `savaData` while main() calls
    `saveData`, and its parameter was spelled `savapath` while the body saved
    to `savepath` (NameError). It also assumed exactly 250 rows, which raised
    IndexError whenever the scrape came back short.
    """
    print("save...")
    wb = xlwt.Workbook(encoding='utf-8', style_compression=0)
    ws = wb.add_sheet('doubanTop250', cell_overwrite_ok=True)
    col = ("電影詳情連接", "電影中文名", "概述", "評分", "評分?jǐn)?shù)", "相關(guān)信息")
    for c, heading in enumerate(col):
        ws.write(0, c, heading)
    # Iterate over what was actually scraped instead of a hard-coded 250 rows.
    for r, row in enumerate(datalist):
        for c in range(len(col)):
            ws.write(r + 1, c, row[c])
    wb.save(savepath)
# Script entry point: run the crawl, then report completion.
if __name__ == '__main__':
    main()
    print("爬取完畢!")