Youyuan (youyuan.com) distributed crawler case study:
# Clone the scrapy-redis source from GitHub
git clone https://github.com/rolando/scrapy-redis.git
# Reuse the official example project directly, renamed as our own project (the lazy option)
mv scrapy-redis/example-project ~/scrapy-youyuan
Modify settings.py
The scrapy-redis-related parts of the modified configuration are listed below; middleware, proxy, and similar settings are omitted here.
# -*- coding: utf-8 -*-
# Use the scrapy-redis scheduler
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Use the scrapy-redis duplicate filter
DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'
# Queue class used to order the URLs to be crawled.
# Default: priority ordering (Scrapy's default), implemented with a Redis
# sorted set; neither FIFO nor LIFO.
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.SpiderPriorityQueue'
# Optional: first-in-first-out (FIFO)
# SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.SpiderQueue'
# Optional: last-in-first-out (LIFO)
# SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.SpiderStack'
# Keep the queues scrapy-redis uses in Redis, so the crawl can be paused and
# resumed later (i.e., do not clear the Redis queues on shutdown)
SCHEDULER_PERSIST = True
# Only effective when using SpiderQueue or SpiderStack: maximum idle time
# (in seconds) before the spider closes
# SCHEDULER_IDLE_BEFORE_CLOSE = 10
# RedisPipeline writes each item into a Redis list keyed spider.name:items,
# for later distributed item processing.
# scrapy-redis already implements this; no extra code is needed.
ITEM_PIPELINES = {
    'example.pipelines.ExamplePipeline': 300,
    'scrapy_redis.pipelines.RedisPipeline': 400,
}
# Redis connection parameters
# REDIS_PASS is a connection-password setting I added myself (not used by default)
REDIS_HOST = '127.0.0.1'
REDIS_PORT = 6379
# REDIS_PASS = 'redisP@ssw0rd'
# Log level
LOG_LEVEL = 'DEBUG'
# By default RFPDupeFilter logs only the first duplicate request.
# Setting DUPEFILTER_DEBUG to True logs every duplicate request.
DUPEFILTER_DEBUG = True
# Override the default request headers; you can also write Downloader
# Middlewares to set proxies and the User-Agent
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Connection': 'keep-alive',
    'Accept-Encoding': 'gzip, deflate, sdch',
}
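Note that REDIS_PASS above is a custom setting, not one scrapy-redis reads on its own. A standard alternative is the REDIS_URL setting, which scrapy-redis does understand and which takes precedence over REDIS_HOST/REDIS_PORT; a minimal sketch, assuming the same local Redis with password authentication enabled (the password value here is illustrative):

# Single connection URL understood by scrapy-redis / redis-py.
# Format: redis://[:password]@host:port/db
# Special characters in the password must be percent-encoded.
REDIS_URL = 'redis://:mypassword@127.0.0.1:6379/0'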
Review pipelines.py
# -*- coding: utf-8 -*-
from datetime import datetime

class ExamplePipeline(object):
    def process_item(self, item, spider):
        # utcnow() returns the current UTC time
        item["crawled"] = datetime.utcnow()
        # name of the spider that produced this item
        item["spider"] = spider.name
        return item
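Because RedisPipeline serializes each item (as JSON by default) and pushes it onto the youyuan:items list, the downstream processing mentioned in settings.py can run anywhere with access to Redis. A minimal consumer sketch using the redis-py package; the storage step is left as a print and would be replaced with your own database writes:

# -*- coding: utf-8 -*-
import json

import redis

r = redis.Redis(host='127.0.0.1', port=6379)

while True:
    # blpop blocks until an item is available and returns (key, value)
    _, data = r.blpop('youyuan:items')
    item = json.loads(data)
    # replace with real storage logic (e.g. MySQL or MongoDB inserts)
    print(item.get('username'), item.get('source_url'))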
Modify items.py
Add the youyuanItem class with the fields we ultimately want to save; only a very simple version is shown here.
# -*- coding: utf-8 -*-
from scrapy.item import Item, Field

class youyuanItem(Item):
    # profile photo URL
    header_url = Field()
    # username
    username = Field()
    # inner monologue
    monologue = Field()
    # album picture URLs
    pic_urls = Field()
    # age
    age = Field()
    # source site: youyuan
    source = Field()
    # source URL of the profile page
    source_url = Field()
    # UTC crawl time
    crawled = Field()
    # spider name
    spider = Field()
Write spiders/youyuan.py
Add a youyuan.py file under the spiders directory and write our spider; after that the crawler can be run. A simple version is provided here:
# -*- coding:utf-8 -*-
import re

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from example.items import youyuanItem


class YouyuanSpider(CrawlSpider):
    name = 'youyuan'
    allowed_domains = ['youyuan.com']
    # a Youyuan search-result (list) page
    start_urls = ['http://www.youyuan.com/find/beijing/mm18-25/advance-0-0-0-0-0-0-0/p1/']
    # matches any search page; extracts links from the response
    list_page_lx = LinkExtractor(allow=(r'http://www.youyuan.com/find/.+'))
    # matches the Beijing / age 18-25 / female search pages
    page_lx = LinkExtractor(allow=(r'http://www.youyuan.com/find/beijing/mm18-25/advance-0-0-0-0-0-0-0/p\d+/'))
    # matches personal profile pages
    profile_page_lx = LinkExtractor(allow=(r'http://www.youyuan.com/\d+-profile/'))

    rules = (
        # follow links on find pages (used as stepping stones)
        Rule(list_page_lx, follow=True),
        # follow links on matched list pages (also stepping stones)
        Rule(page_lx, follow=True),
        # profile-page links become requests stored in Redis awaiting
        # scheduling; once a response arrives, parse_profile_page() handles
        # it, and these links are not followed any further
        Rule(profile_page_lx, callback='parse_profile_page', follow=False),
    )

    # parse a profile page and extract the data we want
    def parse_profile_page(self, response):
        item = youyuanItem()
        item['header_url'] = self.get_header_url(response)
        item['username'] = self.get_username(response)
        item['monologue'] = self.get_monologue(response)
        item['pic_urls'] = self.get_pic_urls(response)
        item['age'] = self.get_age(response)
        item['source'] = 'youyuan'
        item['source_url'] = response.url
        # print("Processed profile %s" % response.url)
        yield item

    # extract the profile photo URL
    def get_header_url(self, response):
        header = response.xpath("//dl[@class='personal_cen']/dt/img/@src").extract()
        header_url = header[0] if header else ""
        return header_url.strip()

    # extract the username
    def get_username(self, response):
        usernames = response.xpath("//dl[@class='personal_cen']/dd/div/strong/text()").extract()
        username = usernames[0] if usernames else "NULL"
        return username.strip()

    # extract the inner monologue
    def get_monologue(self, response):
        monologues = response.xpath("//ul[@class='requre']/li/p/text()").extract()
        monologue = monologues[0] if monologues else "NULL"
        return monologue.strip()

    # extract the album picture URLs
    def get_pic_urls(self, response):
        pic_urls = response.xpath("//li[@class='smallPhoto']/@data_url_full").extract()
        if not pic_urls:
            return "NULL"
        # join the URLs with "|"
        return '|'.join(pic_urls)

    # extract the age
    def get_age(self, response):
        age_texts = response.xpath("//dl[@class='personal_cen']/dd/p[@class='local']/text()").extract()
        age = age_texts[0] if age_texts else "0"
        age_words = age.split()
        if len(age_words) <= 2:
            return "0"
        # the third word carries the age; drop the trailing unit character
        age = age_words[2][:-1]
        # keep the value only if it starts with a digit
        if re.match(r'[0-9]', age):
            return age
        return "0"
Run the program:
- On the Master node, start Redis:
  redis-server
- On each Slave node, run the spider directly:
  scrapy crawl youyuan
- Multiple Slave nodes can start the spider in any order.
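While the crawl runs, progress can be checked from any node with redis-cli. A few illustrative commands, assuming the default scrapy-redis key names (with the priority queue configured above, youyuan:requests is a sorted set and youyuan:dupefilter a set):

redis-cli llen youyuan:items        # scraped items waiting in the list
redis-cli zcard youyuan:requests    # requests pending in the scheduler queue
redis-cli scard youyuan:dupefilter  # fingerprints of requests seen so far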

(Screenshot: day57_爬蟲-scrapy-Redis實(shí)戰(zhàn)-01.png)