前言
- 蛋肥學習了如何提升爬蟲速度,打算分別嘗試單線程爬蟲、多線程爬蟲、多進程爬蟲、多協(xié)程爬蟲來進行數(shù)據(jù)抓取,并對比其實際抓取速度。
準備
爬取時間:2021/03/10
系統(tǒng)環(huán)境:Windows 10
所用工具:Jupyter Notebook\Python 3.0
涉及的庫:requests\lxml\selenium\time\threading\queue\multiprocessing\gevent\sys
獲取網(wǎng)址信息
優(yōu)設(shè)導(dǎo)航
https://hao.uisdc.com/
import requests
from lxml import etree
def getinfo(xpath, url="https://hao.uisdc.com/"):
    """Fetch *url* and return the nodes matched by *xpath*.

    Parameters
    ----------
    xpath : str
        XPath expression evaluated against the fetched HTML.
    url : str, optional
        Page to scrape.  Defaults to the UISDC navigation site so existing
        single-argument callers keep working.

    Returns
    -------
    list
        Raw lxml xpath result (strings for ``@href`` / ``text()`` queries).
    """
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0"}
    # timeout guards against the request hanging forever
    r = requests.get(url, headers=headers, timeout=10)
    # fail loudly on HTTP errors instead of silently parsing an error page
    r.raise_for_status()
    html = etree.HTML(r.text)
    return html.xpath(xpath)
# All card links on the page (one URL per site entry).
link=getinfo('//div[@class="item"]/a/@href')
# Matching card titles; used later as screenshot file-name stems.
title=getinfo('//div[@class="item"]/a/h3/text()')
獲取網(wǎng)頁截圖
單線程爬蟲
from selenium import webdriver
import time
def getshot(url, name):
    """Screenshot every page in *url* sequentially with one Chrome instance.

    Parameters
    ----------
    url : sequence of str
        Pages to capture.
    name : sequence of str
        Screenshot file-name stem for each page, parallel to *url*.

    Prints the total elapsed time.  Screenshots go to the hard-coded
    desktop folder; pages that fail to load are skipped.
    """
    driver = webdriver.Chrome(executable_path=r"C:\Users\Archer\AppData\Local\Google\Chrome\Application\chromedriver")
    driver.maximize_window()
    start = time.time()
    try:
        # iterate the two parallel sequences directly instead of by index
        for page, stem in zip(url, name):
            try:
                driver.get(page)
                # crude wait for the page to finish rendering
                time.sleep(1)
                driver.save_screenshot(r"C:\Users\Archer\Desktop\網(wǎng)頁截圖\img" + stem + ".png")
            except Exception:
                # skip pages that fail to load instead of aborting the run
                continue
    finally:
        # always release the browser; previously it was never quit (leak)
        driver.quit()
    end = time.time()
    print("單線程爬蟲所用時間:", end - start)
getshot(link,title)
多線程爬蟲
參考資料
Python多線程
import threading
import time
import queue as Queue
from selenium import webdriver
start=time.time()
# Screenshot worker; the queue get() has a timeout so a drained queue
# cannot block a worker forever.
def getshot(name, url):
    """Pull one URL from the shared queue *url* and screenshot it.

    Parameters
    ----------
    name : str
        Worker label, prefixed onto the screenshot file name.
    url : queue.Queue
        Shared queue of page URLs.

    Raises
    ------
    queue.Empty
        When no URL arrives within 2 seconds — propagated deliberately;
        the calling thread uses it as the stop signal.
    """
    target = url.get(timeout=2)
    picname = name + " " + str(time.time())
    driver = webdriver.Chrome(executable_path=r"C:\Users\Archer\AppData\Local\Google\Chrome\Application\chromedriver")
    driver.maximize_window()
    try:
        driver.get(target)
        # crude wait for the page to finish rendering
        time.sleep(1)
        driver.save_screenshot(r"C:\Users\Archer\Desktop\網(wǎng)頁截圖\img" + picname + ".png")
    except Exception:
        print(name + "出錯")
    finally:
        # quit even on failure; previously the driver leaked when get() raised
        driver.quit()
class myThread(threading.Thread):
    """Worker thread: screenshots URLs from a shared queue until it drains."""

    def __init__(self, name, url):
        super().__init__()
        self.name = name  # worker label, also used in screenshot file names
        self.url = url    # shared queue.Queue of page URLs

    def run(self):
        # Keep consuming; any exception from getshot (notably queue.Empty
        # from the 2-second get timeout) ends the worker.
        active = True
        while active:
            try:
                getshot(self.name, self.url)
            except:
                active = False
threadlist = ["Thread-1", "Thread-2", "Thread-3", "Thread-4", "Thread-5"]
workQueue = Queue.Queue(200)
threads = []
# Fill the queue BEFORE starting workers: each worker gives up after a
# 2-second empty-queue timeout, so starting them first risks an early exit
# before any work arrives.  (The multiprocessing section below already
# fills the queue first; this makes the two sections consistent.)
for item in link:
    workQueue.put(item)
# Spawn the worker threads.
for tName in threadlist:
    thread = myThread(tName, workQueue)
    thread.start()
    threads.append(thread)
# Block until every worker has drained the queue and exited.
for t in threads:
    t.join()
end = time.time()
print("Queue多線程爬蟲所用時間:", end - start)
多進程爬蟲
參考資料
多進程在運行的時候只有一個子進程會運行,怎么解決
用python進行多進程編程時,只有主進程可以運行,子進程貌似沒有運行是什么原因
面試總結(jié),多進程和多線程的區(qū)別
# A single-core CPU cannot run processes in parallel, so check the
# machine's core count before trying multiprocessing.
from multiprocessing import cpu_count
print(cpu_count()) # this machine reports 8 cores
#Windows 以下代碼需寫成.py文件,然后用cmd啟動(蛋肥用的Anaconda Powershell Prompt)
from multiprocessing import Process,Queue
import time
from selenium import webdriver
start=time.time()
# Screenshot worker; the queue get() has a timeout so a drained queue
# cannot block a worker forever.
def getshot(name, url):
    """Pull one URL from the shared queue *url* and screenshot it.

    Parameters
    ----------
    name : str
        Worker label, prefixed onto the screenshot file name.
    url : multiprocessing.Queue
        Shared queue of page URLs.

    Raises
    ------
    queue.Empty
        When no URL arrives within 2 seconds — propagated deliberately;
        the calling process uses it as the stop signal.
    """
    target = url.get(timeout=2)
    picname = name + " " + str(time.time())
    driver = webdriver.Chrome(executable_path=r"C:\Users\Archer\AppData\Local\Google\Chrome\Application\chromedriver")
    driver.maximize_window()
    try:
        driver.get(target)
        # crude wait for the page to finish rendering
        time.sleep(1)
        driver.save_screenshot(r"C:\Users\Archer\Desktop\網(wǎng)頁截圖\img" + picname + ".png")
    except Exception:
        print(name + "出錯")
    finally:
        # quit even on failure; previously the driver leaked when get() raised
        driver.quit()
class myProcess(Process):
    """Worker process: screenshots URLs from a shared queue until it drains."""

    def __init__(self, name, url):
        super().__init__()
        self.name = name  # worker label (also becomes the Process name)
        self.url = url    # shared multiprocessing.Queue of page URLs

    def run(self):
        # Keep consuming; any exception from getshot (notably queue.Empty
        # from the 2-second get timeout) ends the worker.
        active = True
        while active:
            try:
                print(self.name)
                getshot(self.name, self.url)
            except:
                active = False
# The Windows "spawn" start method re-imports this module in every child
# process, so process creation must live under the __main__ guard or the
# children would recursively spawn more children.
if __name__=="__main__":
    worker_names = ["Process-1", "Process-2", "Process-3", "Process-4", "Process-5"]
    workQueue = Queue(200)
    # Load every collected link before any worker starts.
    for target in link:
        workQueue.put(target)
    # One worker process per name, all sharing the queue.
    processes = [myProcess(label, workQueue) for label in worker_names]
    for proc in processes:
        proc.start()
    # Block until every worker has drained the queue and exited.
    for proc in processes:
        proc.join()
    end = time.time()
    print("Queue多進程爬蟲所用時間:", end - start)
多協(xié)程爬蟲
參考資料
Python中g(shù)event模塊使用及出現(xiàn)MonkeyPatchWarning
Python的最大遞歸深度錯誤maximum recursion depth exceeded while calling a Python object
# monkey.patch_all() must run first — before the requests-based
# link-gathering code — so blocking stdlib calls become cooperative.
import gevent
from gevent import monkey
monkey.patch_all()
# Raise the recursion-depth cap (works around the
# "maximum recursion depth exceeded" error seen with gevent).
import sys
sys.setrecursionlimit(1000000)
from gevent.queue import Queue,Empty
import time
from selenium import webdriver
start=time.time()  # wall-clock start for timing the whole coroutine run
# Screenshot worker; the queue get() has a timeout so a drained queue
# cannot block a worker forever.
def getshot(index):
    """Worker greenlet *index*: drain the shared queue, screenshotting each URL.

    Returns when the queue is empty.  Note ``get(timeout=2)`` can still raise
    ``Empty`` after the ``empty()`` check passes (a sibling worker may win the
    race), so that is caught and treated as "done" instead of crashing the
    greenlet.
    """
    while not workQueue.empty():
        try:
            url = workQueue.get(timeout=2)
        except Empty:
            return  # a sibling worker took the last item
        picname = "Process-" + str(index) + str(time.time())
        driver = webdriver.Chrome(executable_path=r"C:\Users\Archer\AppData\Local\Google\Chrome\Application\chromedriver")
        driver.maximize_window()
        try:
            driver.get(url)
            # crude wait for the page to finish rendering
            time.sleep(1)
            driver.save_screenshot(r"C:\Users\Archer\Desktop\網(wǎng)頁截圖\img" + picname + ".png")
        except Exception:
            print("出錯")
        finally:
            # quit even on failure; previously the driver leaked on error
            driver.quit()
def boss():
    """Producer: push every collected link onto the shared work queue."""
    for target in link:
        workQueue.put_nowait(target)
if __name__=="__main__":
    workQueue = Queue(10000)
    # Run the producer to completion before any consumer starts.
    gevent.spawn(boss).join()
    # Five consumer greenlets share the queue.
    jobs = [gevent.spawn(getshot, worker_id) for worker_id in range(5)]
    gevent.joinall(jobs)
    end = time.time()
    print("Queue多協(xié)程爬蟲所用時間:", end - start)
爬取結(jié)果

爬取時長對比

部分網(wǎng)頁截圖
進一步學習
總結(jié)
- 可通過多線程、多進程、多協(xié)程的方式提升數(shù)據(jù)爬取的速度,但需合理選擇數(shù)量,一味地增加可能會適得其反。
