總體思路
使用多線程爬蟲可以提高爬取和儲存的速度。雖然python中的線程受GIL限制、並非真正的並行(常說的「假線程」),但對於io操作來說,多線程是起作用的。
總體思路用生產者與消費者的模型來設計。
- 將要爬取的url放入urlQueue隊列中(下面的代碼中對應pageQueue,存放的是頁碼)
- 負責爬取網頁信息的工人(線程),從url隊列獲取url,進行請求,把爬取的網頁信息放入一個dataQueue的隊列中。
- 負責解析的工人,從dataQueue中獲取網頁信息,進行解析後,存儲。
# coding=utf8
import urllib
import urllib2
from lxml import etree
import json
from threading import Thread
from Queue import Queue
# Stop flag for the crawl workers: ThreadCrawl.run() loops while this is
# False; main() sets it to True to make those threads finish.
CRAWL_EXIT = False
# Stop flag for the parse workers: ThreadParse.run() loops while this is
# False; main() sets it to True to make those threads finish.
PARSE_EXIT = False
class ThreadCrawl(Thread):
    """Worker thread that downloads listing pages.

    Each worker takes page numbers from ``pageQueue``, requests the matching
    qiushibaike text-listing page and puts the raw response body on
    ``dataQueue``.  It keeps running until the module-level ``CRAWL_EXIT``
    flag becomes true.
    """

    def __init__(self, threadName, pageQueue, dataQueue):
        """Remember the queues and the display name of this worker.

        Args:
            threadName: Label printed when the thread starts.
            pageQueue: Queue of page numbers to download.
            dataQueue: Queue that receives the downloaded page bodies.
        """
        super(ThreadCrawl, self).__init__()
        self.pageQueue = pageQueue
        self.dataQueue = dataQueue
        self.threadName = threadName
        # Browser-like User-Agent sent with every request.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'}

    def run(self):
        """Download queued pages until ``CRAWL_EXIT`` is set."""
        # Local import: the file only imports ``Queue.Queue`` at top level.
        from Queue import Empty

        print('%s啟動(dòng)' % self.threadName)
        while not CRAWL_EXIT:
            try:
                # Wait with a timeout instead of blocking forever, so that a
                # worker idling on an empty queue still re-checks CRAWL_EXIT
                # and can be joined.
                page = self.pageQueue.get(True, 1)
            except Empty:
                continue

            try:
                url = 'https://www.qiushibaike.com/text/page/%d/' % page
                request = urllib2.Request(url, headers=self.headers)
                response = urllib2.urlopen(request)
                try:
                    body = response.read()
                finally:
                    # Release the connection even if read() fails.
                    response.close()
            except Exception as exc:
                # Best effort: one failed page must not kill the worker, but
                # the failure should be visible instead of silently dropped.
                print('%s: failed to fetch page %r: %r'
                      % (self.threadName, page, exc))
                continue

            self.dataQueue.put(body)
class ThreadParse(Thread):
    """Worker thread that parses downloaded pages and stores the items.

    Each worker takes HTML page bodies from ``dataQueue``, extracts the user
    name and text of every entry and appends one JSON line per entry to the
    output file.  It keeps running until the module-level ``PARSE_EXIT`` flag
    becomes true.
    """

    def __init__(self, parseName, dataQueue, fileName):
        """Remember the queue, the output file and the display name.

        Args:
            parseName: Label printed when the thread starts.
            dataQueue: Queue of downloaded HTML page bodies.
            fileName: Despite its name, an object with a ``write`` method
                (main() passes an open file) that receives the JSON lines.
        """
        super(ThreadParse, self).__init__()
        self.dataQueue = dataQueue
        self.parseName = parseName
        self.fileName = fileName

    def run(self):
        """Parse queued pages until ``PARSE_EXIT`` is set."""
        # Local import: the file only imports ``Queue.Queue`` at top level.
        from Queue import Empty

        print('%s啟動(dòng)' % self.parseName)
        while not PARSE_EXIT:
            try:
                # A short timed wait replaces the non-blocking busy loop: the
                # flag is still re-checked regularly, without spinning.
                html = self.dataQueue.get(True, 1)
            except Empty:
                continue

            try:
                self.parse(html)
            except Exception as exc:
                # Best effort: a page that cannot be parsed is reported and
                # skipped; the worker keeps serving the queue.
                print('%s: failed to parse a page: %r' % (self.parseName, exc))

    def parse(self, html):
        """Extract every entry of one page and append it as a JSON line.

        Args:
            html: Raw HTML of one listing page.
        """
        text = etree.HTML(html)
        if text is None:
            # Nothing parseable in this document.
            return

        # Root nodes of the entries: every <div> whose id contains
        # "qiushi_tag" holds the full information of one entry.
        node_list = text.xpath('//div[contains(@id,"qiushi_tag")]')
        for node in node_list:
            # Entry text: first <span> under the "content" div.
            spans = node.xpath('.//div[@class="content"]/span')
            if not spans:
                # Skip only this entry instead of aborting the whole page.
                continue
            content = spans[0].text

            # User name; reset for every entry so a missing name never
            # reuses the value of the previous entry.
            names = node.xpath('./div[1]/a[2]/h2')
            if names:
                username = names[0].text
            else:
                print('沒有用戶')
                username = None

            items = {'username': username,
                     'content': content}
            self.fileName.write(json.dumps(items, ensure_ascii=False).encode('utf8') + '\n')
def main():
    """Crawl pages 1-10 with three crawl workers and three parse workers.

    Page numbers are queued on ``pageQueue``; the ThreadCrawl workers turn
    them into HTML on ``dataQueue``; the ThreadParse workers append the
    extracted items to ``duanzi.json``.  The workers are stopped through the
    module-level ``CRAWL_EXIT`` / ``PARSE_EXIT`` flags.
    """
    import time  # local import: only main() needs it

    global CRAWL_EXIT, PARSE_EXIT

    # Queue of page numbers to crawl (pages 1..10).
    pageQueue = Queue(10)
    for i in range(1, 11):
        pageQueue.put(i)

    # Queue of downloaded HTML sources waiting to be parsed.
    dataQueue = Queue()

    crawl_threads = []
    parse_threads = []

    # Open the output before starting any worker: if this fails, no
    # non-daemon thread has been left running.
    fileName = open('duanzi.json', 'a')
    try:
        # Start the three crawl workers.
        for tname in ['采集線程一號(hào)', '采集線程二號(hào)', '采集線程三號(hào)']:
            thread = ThreadCrawl(tname, pageQueue, dataQueue)
            thread.start()
            crawl_threads.append(thread)

        # Start the three parse workers; they all write to the same file.
        for tname in ['解析線程一號(hào)', '解析線程二號(hào)', '解析線程三號(hào)']:
            thread = ThreadParse(tname, dataQueue, fileName)
            thread.start()
            parse_threads.append(thread)

        # Wait until every page number has been taken.  Sleep between checks
        # so the main thread does not compete with the workers for the CPU.
        while not pageQueue.empty():
            time.sleep(0.1)
        CRAWL_EXIT = True

        # A crawl worker may still be downloading its last page when the page
        # queue becomes empty.  Only after join() is every result on
        # dataQueue, so this must happen before the parsers are stopped.
        for thread in crawl_threads:
            thread.join()
        print('采集完成')

        # Wait until the parsers have taken every downloaded page.
        while not dataQueue.empty():
            time.sleep(0.1)
        PARSE_EXIT = True

        for thread in parse_threads:
            thread.join()
        print('寫入完成')
    finally:
        # Also reached on errors / Ctrl-C: stop the workers and wait for them
        # so the file is never closed while a parser may still write to it.
        CRAWL_EXIT = True
        PARSE_EXIT = True
        for thread in crawl_threads + parse_threads:
            thread.join()
        fileName.close()
# Start the crawl only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()