總體思路

使用多線程爬蟲可以提高爬取和儲存的速度。雖然 Python 的線程受 GIL（全局解釋器鎖）限制，無法真正並行執行字節碼，但對於 IO 操作來說，線程在等待 IO 時會釋放 GIL，所以多線程是起作用的。
總體思路用生產者與消費者的模型來設計。

將要爬取的url放入urlQueue的隊列中（代碼中為 pageQueue，存放的是頁碼）
負責爬取網頁信息的工人（線程），從url隊列獲取url，進行請求，把爬取的網頁信息放入一個dataQueue的隊列中。
負責解析的工人，從dataQueue中獲取網頁信息，進行解析後，存儲。

# coding=utf8
import json
import time
import urllib
import urllib2
from Queue import Empty
from Queue import Queue
from threading import Thread

from lxml import etree

# Shutdown flags polled by the worker loops: ThreadCrawl.run() keeps going
# while CRAWL_EXIT is false, ThreadParse.run() while PARSE_EXIT is false.
# main() flips each one to True to let the corresponding workers finish.
CRAWL_EXIT = False
PARSE_EXIT = False

class ThreadCrawl(Thread):
    """Worker thread that downloads listing pages.

    Takes page numbers from ``pageQueue``, requests the matching
    qiushibaike text page and puts the raw response body on ``dataQueue``.
    The loop ends once the module-level ``CRAWL_EXIT`` flag becomes true.
    """

    # Seconds to wait for a page number before re-checking CRAWL_EXIT.
    QUEUE_POLL_SECONDS = 0.5
    # Seconds after which a stalled HTTP request is abandoned.
    REQUEST_TIMEOUT_SECONDS = 30

    def __init__(self, threadName, pageQueue, dataQueue):
        """Store the queues and the display name used in log output.

        Args:
            threadName: label printed when the thread starts or fails.
            pageQueue: queue of integer page numbers to download.
            dataQueue: queue that receives each downloaded page body.
        """
        super(ThreadCrawl, self).__init__()
        self.pageQueue = pageQueue
        self.dataQueue = dataQueue
        self.threadName = threadName
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'}

    def run(self):
        """Download pages until CRAWL_EXIT is set."""
        print('%s啟動(dòng)' % self.threadName)
        while not CRAWL_EXIT:
            try:
                # Wait with a timeout instead of blocking forever: a worker
                # parked on an empty queue would never notice CRAWL_EXIT and
                # the join() in the main thread would hang.
                page = self.pageQueue.get(True, self.QUEUE_POLL_SECONDS)
            except Empty:
                continue

            try:
                url = 'https://www.qiushibaike.com/text/page/%d/' % page
                request = urllib2.Request(url, headers=self.headers)
                response = urllib2.urlopen(
                    request, timeout=self.REQUEST_TIMEOUT_SECONDS).read()
            except Exception as exc:
                # Still best-effort (a failed page is skipped), but report
                # the failure instead of swallowing it silently.
                print('%s failed to fetch page %r: %r'
                      % (self.threadName, page, exc))
                continue

            self.dataQueue.put(response)


class ThreadParse(Thread):
    """Worker thread that parses downloaded pages and stores the posts.

    Takes page HTML from ``dataQueue``, extracts every post and appends one
    JSON object per line to ``fileName``. The loop ends once the
    module-level ``PARSE_EXIT`` flag becomes true.
    """

    # Seconds to wait for a page before re-checking PARSE_EXIT.
    QUEUE_POLL_SECONDS = 0.5

    def __init__(self, parseName, dataQueue, fileName):
        """Store the input queue, output file and display name.

        Args:
            parseName: label printed when the thread starts or fails.
            dataQueue: queue of downloaded page bodies.
            fileName: open file object (despite the name) that receives the
                JSON lines through its ``write`` method.
        """
        super(ThreadParse, self).__init__()
        self.dataQueue = dataQueue
        self.parseName = parseName
        self.fileName = fileName

    def run(self):
        """Parse pages until PARSE_EXIT is set."""
        print('%s啟動(dòng)' % self.parseName)
        while not PARSE_EXIT:
            try:
                # A short blocking wait replaces the non-blocking get, which
                # spun at full CPU whenever the queue was empty.
                html = self.dataQueue.get(True, self.QUEUE_POLL_SECONDS)
            except Empty:
                continue

            try:
                self.parse(html)
            except Exception as exc:
                # Keep the worker alive on a bad page, but make the failure
                # visible instead of swallowing it silently.
                print('%s failed to parse a page: %r' % (self.parseName, exc))

    def parse(self, html):
        """Extract posts from one page and append them to the output file.

        Each post becomes one line of JSON with the keys ``username`` and
        ``content``. ``username`` is ``None`` (JSON ``null``) when the post
        has no user node; posts without a content node are skipped.

        Args:
            html: source of one listing page.
        """
        text = etree.HTML(html)
        if text is None:
            # lxml yields no tree for an empty document; nothing to extract.
            return

        # Root node of every post: a div whose id contains "qiushi_tag".
        node_list = text.xpath('//div[contains(@id,"qiushi_tag")]')

        for node in node_list:
            # Post body: text of the first span under the content div.
            content_nodes = node.xpath('.//div[@class="content"]/span')
            if not content_nodes:
                # Skip just this post rather than abort the rest of the page.
                continue
            content = content_nodes[0].text

            # User name. Reset on every post so a missing name can neither
            # raise NameError nor reuse the previous post's author.
            name_nodes = node.xpath('./div[1]/a[2]/h2')
            if name_nodes:
                username = name_nodes[0].text
            else:
                print('沒有用戶')
                username = None

            items = {'username': username,
                     'content': content}
            self.fileName.write(json.dumps(items, ensure_ascii=False).encode('utf8') + '\n')


def main():
    """Crawl pages 1-10 with three crawl threads and three parse threads.

    Page numbers go into a queue read by the crawl threads, downloaded HTML
    goes into a second queue read by the parse threads, and parsed posts are
    appended to ``duanzi.json``.
    """
    global CRAWL_EXIT, PARSE_EXIT

    # Seconds between checks of a queue while waiting for it to drain.
    poll_seconds = 0.1

    # Queue of page numbers.
    pageQueue = Queue(10)
    for i in range(1, 11):
        pageQueue.put(i)

    # Queue of downloaded HTML sources.
    dataQueue = Queue()

    # Open the output file before starting any thread, so a failure to open
    # it cannot leave workers running; ``with`` closes it on every exit path.
    with open('duanzi.json', 'a') as fileName:
        crawlList = ['采集線程一號(hào)', '采集線程二號(hào)', '采集線程三號(hào)']

        # Start the three crawl threads.
        thread_carwl = []
        for tname in crawlList:
            thread = ThreadCrawl(tname, pageQueue, dataQueue)
            thread.start()
            thread_carwl.append(thread)

        praseList = ['解析線程一號(hào)', '解析線程二號(hào)', '解析線程三號(hào)']
        prase_thread = []
        for tname in praseList:
            thread = ThreadParse(tname, dataQueue, fileName)
            thread.start()
            prase_thread.append(thread)

        # Wait until every page number has been taken. Sleeping between
        # checks avoids a busy loop that competes with the workers for the
        # GIL.
        while not pageQueue.empty():
            time.sleep(poll_seconds)
        CRAWL_EXIT = True

        # Join the crawl threads BEFORE looking at dataQueue: they may still
        # be downloading their last pages, and an early "dataQueue is empty"
        # would stop the parsers before those pages arrive.
        for thread in thread_carwl:
            thread.join()
            print('采集完成')

        # All downloads are queued now; wait for the parsers to take them.
        while not dataQueue.empty():
            time.sleep(poll_seconds)
        PARSE_EXIT = True

        # Parsers finish the page they are working on before exiting, so the
        # file must stay open until they are joined.
        for thread in prase_thread:
            thread.join()
            print('寫入完成')


# Run the crawler only when this file is executed as a script.
if __name__ == '__main__':
    main()

色偷偷精品伊人,欧洲久久精品,欧美综合婷婷骚逼,国产AV主播,国产最新探花在线,九色在线视频一区,伊人大交九欧美,1769亚洲,黄色成人av

多線程爬蟲小練習(xí)

多線程爬蟲小練習(xí)

總體思路

相關(guān)閱讀更多精彩內(nèi)容

友情鏈接更多精彩內(nèi)容

色偷偷精品伊人,欧洲久久精品,欧美综合婷婷骚逼,国产AV主播,国产最新探花在线,九色在线视频一区,伊人大交九 欧美,1769亚洲,黄色成人av

多線程爬蟲小練習(xí)

總體思路

相關(guān)閱讀更多精彩內(nèi)容

友情鏈接更多精彩內(nèi)容

色偷偷精品伊人,欧洲久久精品,欧美综合婷婷骚逼,国产AV主播,国产最新探花在线,九色在线视频一区,伊人大交九欧美,1769亚洲,黄色成人av