Python爬蟲實戰(zhàn)(6)單線程、多線程、多進程、多協(xié)程對比

前言

  • 蛋肥學習了如何提升爬蟲速度,打算分別嘗試單線程爬蟲、多線程爬蟲、多進程爬蟲、多協(xié)程爬蟲來進行數(shù)據(jù)抓取,并對比其實際抓取速度。

準備

爬取時間:2021/03/10
系統(tǒng)環(huán)境:Windows 10
所用工具:Jupyter Notebook\Python 3.0
涉及的庫:requests\lxml\selenium\time\threading\queue\multiprocessing\gevent\sys

獲取網(wǎng)址信息

優(yōu)設(shè)導(dǎo)航
https://hao.uisdc.com/

import requests
from lxml import etree

def getinfo(xpath):
    """Fetch https://hao.uisdc.com/ and return the nodes matching *xpath*.

    xpath -- an XPath expression evaluated against the parsed page
    Returns the list produced by lxml's ``xpath()``.
    Raises requests.HTTPError on a non-2xx response instead of silently
    parsing an error page into empty results.
    """
    url = "https://hao.uisdc.com/"
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0"}
    r = requests.get(url, headers=headers, timeout=10)
    r.raise_for_status()  # fail fast on 4xx/5xx rather than scraping an error page
    html = etree.HTML(r.text)
    return html.xpath(xpath)

# Collect the site URLs and the matching site titles from the nav page.
link=getinfo('//div[@class="item"]/a/@href')
title=getinfo('//div[@class="item"]/a/h3/text()')

獲取網(wǎng)頁截圖

單線程爬蟲

from selenium import webdriver
import time

def getshot(url,name):
    """Visit each URL in *url* sequentially and save a window screenshot.

    url  -- list of page URLs to capture
    name -- parallel list of file-name stems for the PNG files
    Pages that fail to load are skipped; the elapsed time is printed at the end.
    """
    driver = webdriver.Chrome(executable_path=r"C:\Users\Archer\AppData\Local\Google\Chrome\Application\chromedriver")
    driver.maximize_window()
    start = time.time()
    try:
        for page, stem in zip(url, name):
            try:
                driver.get(page)
                # crude wait for the page to finish rendering
                time.sleep(1)
                driver.save_screenshot(r"C:\Users\Archer\Desktop\網(wǎng)頁截圖\img" + stem + ".png")
            except Exception:
                # skip pages that fail to load instead of aborting the batch
                continue
    finally:
        driver.quit()  # always release the browser (was leaked before)
    end = time.time()
    print("單線程爬蟲所用時間:", end - start)

getshot(link,title)

多線程爬蟲

參考資料
Python多線程

import threading
import time
import queue as Queue
from selenium import webdriver

start=time.time()
# Screenshot worker: pull one URL from the queue (2 s timeout so an idle
# worker raises queue.Empty instead of blocking forever) and capture it.
def getshot(name,url):
    url = url.get(timeout=2)  # raises queue.Empty once the work runs out
    picname = name + " " + str(time.time())  # timestamp keeps file names unique
    driver = webdriver.Chrome(executable_path=r"C:\Users\Archer\AppData\Local\Google\Chrome\Application\chromedriver")
    driver.maximize_window()
    try:
        driver.get(url)
        # crude wait for the page to finish rendering
        time.sleep(1)
        driver.save_screenshot(r"C:\Users\Archer\Desktop\網(wǎng)頁截圖\img" + picname + ".png")
    except Exception:
        print(name + "出錯")
    finally:
        driver.quit()  # always release the browser, even on failure (was leaked on error)

class myThread(threading.Thread):
    """Worker thread that keeps calling getshot until the work queue drains."""
    def __init__(self, name, url):
        threading.Thread.__init__(self)
        self.name = name  # also becomes the Thread's display name
        self.url = url    # the shared work queue, despite the parameter name
    def run(self):
        while True:
            try:
                getshot(self.name, self.url)
            except Queue.Empty:
                # the 2-second get() timeout expired: queue drained, stop.
                # (Was a bare except, which silently swallowed real errors too.)
                break

threadlist=["Thread-1","Thread-2","Thread-3","Thread-4","Thread-5"]
workQueue=Queue.Queue(200)
threads=[]

# Create and start the worker threads. They begin polling immediately; the
# 2-second get() timeout keeps them alive while the queue is still empty.
for tName in threadlist:
    thread=myThread(tName,workQueue)
    thread.start()
    threads.append(thread)
# Fill the queue with the URLs collected earlier.
for i in range(len(link)):
    workQueue.put(link[i])
# Wait for every worker thread to finish.
for t in threads:
    t.join()

end=time.time()
print("Queue多線程爬蟲所用時間:",end-start)

多進程爬蟲

參考資料
多進程在運行的時候只有一個子進程會運行,怎么解決
用python進行多進程編程時,只有主進程可以運行,子進程貌似沒有運行是什么原因
面試總結(jié),多進程和多線程的區(qū)別

#If the CPU is single-core there can be no真multi-process parallelism, so
#check the machine's core count first.
from multiprocessing import cpu_count
print(cpu_count()) #the author's machine has 8 cores
#Windows 以下代碼需寫成.py文件,然后用cmd啟動(蛋肥用的Anaconda Powershell Prompt)
from multiprocessing import Process,Queue
import time
from selenium import webdriver

start=time.time()
# Screenshot worker: pull one URL from the queue (2 s timeout so an idle
# worker raises queue.Empty instead of blocking forever) and capture it.
def getshot(name,url):
    url = url.get(timeout=2)  # raises queue.Empty once the work runs out
    picname = name + " " + str(time.time())  # timestamp keeps file names unique
    driver = webdriver.Chrome(executable_path=r"C:\Users\Archer\AppData\Local\Google\Chrome\Application\chromedriver")
    driver.maximize_window()
    try:
        driver.get(url)
        # crude wait for the page to finish rendering
        time.sleep(1)
        driver.save_screenshot(r"C:\Users\Archer\Desktop\網(wǎng)頁截圖\img" + picname + ".png")
    except Exception:
        print(name + "出錯")
    finally:
        driver.quit()  # always release the browser, even on failure (was leaked on error)

class myProcess(Process):
    """Worker process that keeps calling getshot until the work queue drains."""
    def __init__(self,name,url):
        Process.__init__(self)
        self.name=name  # also becomes the Process's display name
        self.url=url    # the shared work queue, despite the parameter name
    def run(self):
        from queue import Empty  # multiprocessing.Queue.get raises queue.Empty on timeout
        while True:
            try:
                print(self.name)  # progress trace: which worker picked up a job
                getshot(self.name,self.url)
            except Empty:
                # the 2-second get() timeout expired: queue drained, stop.
                # (Was a bare except, which silently swallowed real errors too.)
                break
 
#The __main__ guard is required: on Windows, multiprocessing spawns children
#by re-importing this module, so unguarded spawning code would run again in
#every child and fork endlessly.
if __name__=="__main__": 
    processlist=["Process-1","Process-2","Process-3","Process-4","Process-5"]
    workQueue=Queue(200)
    processes=[]
    
    #Fill the queue with the URLs collected earlier.
    for i in range(len(link)):
        workQueue.put(link[i])
    #Create the worker processes.
    for pName in processlist:
        process=myProcess(pName,workQueue)
        processes.append(process)
    for t in processes:
        t.start()
    for t in processes:
        t.join()

    end=time.time()
    print("Queue多進程爬蟲所用時間:",end-start)

多協(xié)程爬蟲

參考資料
Python中g(shù)event模塊使用及出現(xiàn)MonkeyPatchWarning
Python的最大遞歸深度錯誤maximum recursion depth exceeded while calling a Python object

#monkey.patch_all() must run before everything else — in particular before the
#requests-based code above — so blocking calls are patched to cooperative ones.
import gevent
from gevent import monkey
monkey.patch_all()
#Raise the recursion-depth limit (works around "maximum recursion depth
#exceeded" errors seen with gevent's patched modules).
import sys  
sys.setrecursionlimit(1000000) 

from gevent.queue import Queue,Empty
import time
from selenium import webdriver

start=time.time()
# Screenshot worker: drain the shared queue, one browser page per URL.
def getshot(index):
    """Greenlet worker *index*: screenshot every URL left in workQueue."""
    while not workQueue.empty():
        try:
            # the timeout guards the race where another greenlet drained the
            # queue between the empty() check and this get(); was previously
            # uncaught, killing the greenlet with an Empty exception
            url = workQueue.get(timeout=2)
        except Empty:
            break
        picname = "Process-" + str(index) + str(time.time())
        driver = webdriver.Chrome(executable_path=r"C:\Users\Archer\AppData\Local\Google\Chrome\Application\chromedriver")
        driver.maximize_window()
        try:
            driver.get(url)
            # crude wait for the page to finish rendering
            time.sleep(1)
            driver.save_screenshot(r"C:\Users\Archer\Desktop\網(wǎng)頁截圖\img" + picname + ".png")
        except Exception:
            print("出錯")
        finally:
            driver.quit()  # always release the browser (was leaked on failure)

def boss():
    """Producer: push every collected link onto the shared work queue."""
    for page_url in link:
        workQueue.put_nowait(page_url)

if __name__=="__main__": 
    workQueue=Queue(10000)
    gevent.spawn(boss).join()
    jobs=[]
    for i in range(5):
        jobs.append(gevent.spawn(getshot,i))
    gevent.joinall(jobs)

    end=time.time()
    print("Queue多協(xié)程爬蟲所用時間:",end-start)

爬取結(jié)果

爬取時長對比

部分網(wǎng)頁截圖

進一步學習

進程、線程和協(xié)程之間的區(qū)別和聯(lián)系
面向?qū)ο蠡靖拍?/a>

總結(jié)

  • 可通過多線程、多進程、多協(xié)程的方式提升數(shù)據(jù)爬取的速度,但需合理選擇數(shù)量,一味地增加可能會適得其反。

相關(guān)閱讀更多精彩內(nèi)容

友情鏈接更多精彩內(nèi)容