Crawling the Maoyan movie rankings with requests + regular expressions
Detailed usage notes are in the code comments.
Running the code generates a result.txt file in the same directory as the script.
Note: to make things clearer for the reader, the regular expression fragments are listed one-to-one against the fields they match.
1. Single-page crawling
import re
import json
import requests
from requests.exceptions import RequestException

def get_one_page(url):
    '''1. Fetch the HTML of a single page.'''
    try:
        # Maoyan now blocks crawlers, so the User-Agent header below is
        # required; without it the request gets rejected.
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'}
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None

def parse_one_page(html):
    '''2. Clean (match) the fetched HTML with a regular expression.'''
    # Ranking:              <dd>.*?board-index.*?>(\d+)</i>
    # Poster image:         .*?data-src="(.*?)"
    # Movie title:          .*?name"><a.*?>(.*?)</a>
    # Starring:             .*?star">(.*?)</p>
    # Release date:         .*?releasetime">(.*?)</p>
    # Score, integer part:  .*?integer">(.*?)</i>
    # Score, fraction part: .*?fraction">(.*?)</i>
    pattern = re.compile(r'<dd>.*?board-index.*?>(\d+)</i>'
                         r'.*?data-src="(.*?)"'
                         r'.*?name"><a.*?>(.*?)</a>'
                         r'.*?star">(.*?)</p>'
                         r'.*?releasetime">(.*?)</p>'
                         r'.*?integer">(.*?)</i>'
                         r'.*?fraction">(.*?)</i>', re.S)
    items = re.findall(pattern, html)
    # Tidy up the extracted data
    for item in items:
        # print(item)  # uncomment to inspect each raw tuple
        yield {
            '排名': item[0],
            '封面': item[1],
            '電影名稱': item[2],
            '主演': item[3].strip()[3:],  # drop the "主演:" prefix
            '上映時間': item[4][5:],       # drop the "上映時間:" prefix
            '評分': item[5] + item[6]
        }

def write_to_file(content):
    with open('result.txt', 'a', encoding='utf-8') as f:
        # content is a dict; json.dumps() serializes it to a string and a
        # newline is appended. The with statement closes the file automatically.
        f.write(json.dumps(content, ensure_ascii=False) + '\n')

def main():
    '''3. Main function.'''
    url = 'http://maoyan.com/board/4'
    html = get_one_page(url)
    if not html:
        return  # request failed or was blocked
    # Iterate over the parsed results
    for item in parse_one_page(html):
        # print(item)
        write_to_file(item)

if __name__ == '__main__':
    main()
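To check that the seven capture groups line up with the fields, the pattern can be run against a small hand-written snippet that mimics Maoyan's markup. The snippet and its values below are illustrative, not real page data, and the test reuses parse_one_page() from the script above:

import json

# Hypothetical, simplified <dd> block shaped the way the regex expects
sample_html = '''
<dd>
  <i class="board-index board-index-1">1</i>
  <img data-src="http://example.com/poster.jpg" alt="">
  <p class="name"><a href="/films/0" title="Some Film">Some Film</a></p>
  <p class="star">主演:Actor A,Actor B</p>
  <p class="releasetime">上映時間:1993-01-01</p>
  <p class="score"><i class="integer">9.</i><i class="fraction">5</i></p>
</dd>
'''

for item in parse_one_page(sample_html):
    # Each record lands in result.txt as one JSON line in exactly this shape
    print(json.dumps(item, ensure_ascii=False))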
2. Multi-page crawling
For multi-page crawling, we just need to build each page's URL from the site's paging pattern.
Observe the URL of each page:
Home page: http://maoyan.com/board/4?
Page 1: http://maoyan.com/board/4?offset=0
Page 2: http://maoyan.com/board/4?offset=10
Page 3: http://maoyan.com/board/4?offset=20
....
As you can see, each page simply sets an offset parameter. Using this pattern, we modify the url in our main() function; everything else stays the same.
def main(offset):
    '''3. Main function.'''
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)
    if not html:
        return  # request failed or was blocked
    # Iterate over the parsed results
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)

if __name__ == '__main__':
    for i in range(10):
        main(i * 10)
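Since the comment in get_one_page() notes that Maoyan blocks crawlers, it may help to pause briefly between page requests when looping like this. A minimal variant of the loop above (the one-second delay is an arbitrary choice, not part of the original code):

import time

if __name__ == '__main__':
    for i in range(10):
        main(i * 10)
        time.sleep(1)  # short pause between pages to lower the chance of being blocked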
3. Multi-process crawling
First import the process pool: from multiprocessing import Pool
Then modify the if __name__ == '__main__' block:
if __name__ == '__main__':
    pool = Pool()
    pool.map(main, [i * 10 for i in range(10)])
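One caveat: with a pool the ten pages are fetched concurrently, so records no longer arrive in result.txt in ranking order, and several processes append to the same file at once. One way around this, sketched below, is to have the workers return their records and let the parent process do all the writing; crawl_page() is a hypothetical variant of main(), not part of the original code. Since pool.map() returns results in input order, the pages come back as offset 0, 10, 20, ...

from multiprocessing import Pool

def crawl_page(offset):
    '''Hypothetical variant of main() that returns records instead of writing them.'''
    html = get_one_page('http://maoyan.com/board/4?offset=' + str(offset))
    # Generators cannot be pickled across processes, so materialize a list
    return list(parse_one_page(html)) if html else []

if __name__ == '__main__':
    with Pool() as pool:
        for page in pool.map(crawl_page, [i * 10 for i in range(10)]):
            for item in page:
                write_to_file(item)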
Complete code:
import re
import json
from multiprocessing import Pool
import requests
from requests.exceptions import RequestException

def get_one_page(url):
    '''1. Fetch the HTML of a single page.'''
    try:
        # Maoyan now blocks crawlers, so the User-Agent header below is
        # required; without it the request gets rejected.
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'}
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None

def parse_one_page(html):
    '''2. Clean (match) the fetched HTML with a regular expression.'''
    # Ranking:              <dd>.*?board-index.*?>(\d+)</i>
    # Poster image:         .*?data-src="(.*?)"
    # Movie title:          .*?name"><a.*?>(.*?)</a>
    # Starring:             .*?star">(.*?)</p>
    # Release date:         .*?releasetime">(.*?)</p>
    # Score, integer part:  .*?integer">(.*?)</i>
    # Score, fraction part: .*?fraction">(.*?)</i>
    pattern = re.compile(r'<dd>.*?board-index.*?>(\d+)</i>'
                         r'.*?data-src="(.*?)"'
                         r'.*?name"><a.*?>(.*?)</a>'
                         r'.*?star">(.*?)</p>'
                         r'.*?releasetime">(.*?)</p>'
                         r'.*?integer">(.*?)</i>'
                         r'.*?fraction">(.*?)</i>', re.S)
    items = re.findall(pattern, html)
    # Tidy up the extracted data
    for item in items:
        yield {
            '排名': item[0],
            '封面': item[1],
            '電影名稱': item[2],
            '主演': item[3].strip()[3:],  # drop the "主演:" prefix
            '上映時間': item[4][5:],       # drop the "上映時間:" prefix
            '評分': item[5] + item[6]
        }

def write_to_file(content):
    with open('result.txt', 'a', encoding='utf-8') as f:
        # content is a dict; json.dumps() serializes it to a string and a
        # newline is appended. The with statement closes the file automatically.
        f.write(json.dumps(content, ensure_ascii=False) + '\n')

def main(offset):
    '''3. Main function.'''
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)
    if not html:
        return  # request failed or was blocked
    # Iterate over the parsed results
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)

if __name__ == '__main__':
    pool = Pool()
    pool.map(main, [i * 10 for i in range(10)])

Result screenshot