Youyuan (youyuan.com) distributed crawler case study:
# Clone the scrapy-redis source from GitHub
git clone https://github.com/rolando/scrapy-redis.git
# Reuse the official example project directly, renamed as our own project (the lazy option)
mv scrapy-redis/example-project ~/scrapy-youyuan
Modify settings.py
The scrapy-redis-related parts of the modified configuration are listed below; middleware, proxy, and similar settings are omitted here.
# -*- coding: utf-8 -*-
# Use the scrapy-redis scheduler
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Use the scrapy-redis duplicate filter
DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'
# Queue class used to order the URLs to be crawled.
# Default: priority ordering (Scrapy's default), implemented with a Redis
# sorted set; neither FIFO nor LIFO.
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.SpiderPriorityQueue'
# Optional: first-in-first-out (FIFO)
# SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.SpiderQueue'
# Optional: last-in-first-out (LIFO)
# SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.SpiderStack'
# Keep the queues scrapy-redis uses in Redis, so the crawl can be paused and
# resumed later (i.e., do not clear the Redis queues on shutdown)
SCHEDULER_PERSIST = True
# Only effective when using SpiderQueue or SpiderStack: maximum idle time
# (in seconds) before the spider closes
# SCHEDULER_IDLE_BEFORE_CLOSE = 10
# RedisPipeline writes each item into a Redis list keyed spider.name:items,
# for later distributed item processing.
# scrapy-redis already implements this; no extra code is needed.
ITEM_PIPELINES = {
    'example.pipelines.ExamplePipeline': 300,
    'scrapy_redis.pipelines.RedisPipeline': 400,
}
# Redis connection parameters
# REDIS_PASS is a connection-password setting I added myself (not used by default)
REDIS_HOST = '127.0.0.1'
REDIS_PORT = 6379
# REDIS_PASS = 'redisP@ssw0rd'
# Log level
LOG_LEVEL = 'DEBUG'
# By default RFPDupeFilter logs only the first duplicate request.
# Setting DUPEFILTER_DEBUG to True logs every duplicate request.
DUPEFILTER_DEBUG = True
# Override the default request headers; you can also write Downloader
# Middlewares to set proxies and the User-Agent
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Connection': 'keep-alive',
    'Accept-Encoding': 'gzip, deflate, sdch',
}
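Note that REDIS_PASS above is a custom setting, not one scrapy-redis reads on its own. A standard alternative is the REDIS_URL setting, which scrapy-redis does understand and which takes precedence over REDIS_HOST/REDIS_PORT; a minimal sketch, assuming the same local Redis with password authentication enabled (the password value here is illustrative):

# Single connection URL understood by scrapy-redis / redis-py.
# Format: redis://[:password]@host:port/db
# Special characters in the password must be percent-encoded.
REDIS_URL = 'redis://:mypassword@127.0.0.1:6379/0'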
Review pipelines.py
# -*- coding: utf-8 -*-
from datetime import datetime

class ExamplePipeline(object):
    def process_item(self, item, spider):
        # utcnow() returns the current UTC time
        item["crawled"] = datetime.utcnow()
        # name of the spider that produced this item
        item["spider"] = spider.name
        return item
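Because RedisPipeline serializes each item (as JSON by default) and pushes it onto the youyuan:items list, the downstream processing mentioned in settings.py can run anywhere with access to Redis. A minimal consumer sketch using the redis-py package; the storage step is left as a print and would be replaced with your own database writes:

# -*- coding: utf-8 -*-
import json

import redis

r = redis.Redis(host='127.0.0.1', port=6379)

while True:
    # blpop blocks until an item is available and returns (key, value)
    _, data = r.blpop('youyuan:items')
    item = json.loads(data)
    # replace with real storage logic (e.g. MySQL or MongoDB inserts)
    print(item.get('username'), item.get('source_url'))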
Modify items.py
Add the youyuanItem class with the fields we ultimately want to save; only a very simple version is shown here.
# -*- coding: utf-8 -*-
from scrapy.item import Item, Field

class youyuanItem(Item):
    # profile photo URL
    header_url = Field()
    # username
    username = Field()
    # inner monologue
    monologue = Field()
    # album picture URLs
    pic_urls = Field()
    # age
    age = Field()
    # source site: youyuan
    source = Field()
    # source URL of the profile page
    source_url = Field()
    # UTC crawl time
    crawled = Field()
    # spider name
    spider = Field()
Write spiders/youyuan.py
Add a youyuan.py file under the spiders directory and write our spider; after that the crawler can be run. A simple version is provided here:
# -*- coding:utf-8 -*-
import re

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from example.items import youyuanItem


class YouyuanSpider(CrawlSpider):
    name = 'youyuan'
    allowed_domains = ['youyuan.com']
    # a Youyuan search-result (list) page
    start_urls = ['http://www.youyuan.com/find/beijing/mm18-25/advance-0-0-0-0-0-0-0/p1/']
    # matches any search page; extracts links from the response
    list_page_lx = LinkExtractor(allow=(r'http://www.youyuan.com/find/.+'))
    # matches the Beijing / age 18-25 / female search pages
    page_lx = LinkExtractor(allow=(r'http://www.youyuan.com/find/beijing/mm18-25/advance-0-0-0-0-0-0-0/p\d+/'))
    # matches personal profile pages
    profile_page_lx = LinkExtractor(allow=(r'http://www.youyuan.com/\d+-profile/'))

    rules = (
        # follow links on find pages (used as stepping stones)
        Rule(list_page_lx, follow=True),
        # follow links on matched list pages (also stepping stones)
        Rule(page_lx, follow=True),
        # profile-page links become requests stored in Redis awaiting
        # scheduling; once a response arrives, parse_profile_page() handles
        # it, and these links are not followed any further
        Rule(profile_page_lx, callback='parse_profile_page', follow=False),
    )

    # parse a profile page and extract the data we want
    def parse_profile_page(self, response):
        item = youyuanItem()
        item['header_url'] = self.get_header_url(response)
        item['username'] = self.get_username(response)
        item['monologue'] = self.get_monologue(response)
        item['pic_urls'] = self.get_pic_urls(response)
        item['age'] = self.get_age(response)
        item['source'] = 'youyuan'
        item['source_url'] = response.url
        # print("Processed profile %s" % response.url)
        yield item

    # extract the profile photo URL
    def get_header_url(self, response):
        header = response.xpath("//dl[@class='personal_cen']/dt/img/@src").extract()
        header_url = header[0] if header else ""
        return header_url.strip()

    # extract the username
    def get_username(self, response):
        usernames = response.xpath("//dl[@class='personal_cen']/dd/div/strong/text()").extract()
        username = usernames[0] if usernames else "NULL"
        return username.strip()

    # extract the inner monologue
    def get_monologue(self, response):
        monologues = response.xpath("//ul[@class='requre']/li/p/text()").extract()
        monologue = monologues[0] if monologues else "NULL"
        return monologue.strip()

    # extract the album picture URLs
    def get_pic_urls(self, response):
        pic_urls = response.xpath("//li[@class='smallPhoto']/@data_url_full").extract()
        if not pic_urls:
            return "NULL"
        # join the URLs with "|"
        return '|'.join(pic_urls)

    # extract the age
    def get_age(self, response):
        age_texts = response.xpath("//dl[@class='personal_cen']/dd/p[@class='local']/text()").extract()
        age = age_texts[0] if age_texts else "0"
        age_words = age.split()
        if len(age_words) <= 2:
            return "0"
        # the third word carries the age; drop the trailing unit character
        age = age_words[2][:-1]
        # keep the value only if it starts with a digit
        if re.match(r'[0-9]', age):
            return age
        return "0"
Run the program:
- On the Master node, start Redis:
  redis-server
- On each Slave node, run the spider directly:
  scrapy crawl youyuan
- Multiple Slave nodes can start the spider in any order.
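While the crawl runs, progress can be checked from any node with redis-cli. A few illustrative commands, assuming the default scrapy-redis key names (with the priority queue configured above, youyuan:requests is a sorted set and youyuan:dupefilter a set):

redis-cli llen youyuan:items        # scraped items waiting in the list
redis-cli zcard youyuan:requests    # requests pending in the scheduler queue
redis-cli scard youyuan:dupefilter  # fingerprints of requests seen so far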

(Screenshot: day57_爬蟲-scrapy-Redis實(shí)戰(zhàn)-01.png)