(2018-05-23.Python從Zero到One)7、(爬蟲)scrapy-Redis實戰__1.7.3有緣網分布式爬蟲項目2

有緣網分布式爬蟲案例:

修改 spiders/youyuan.py

在spiders目錄下增加youyuan.py文件編寫我們的爬蟲,使其具有分布式:

# -*- coding:utf-8 -*-

from scrapy.linkextractors import LinkExtractor
#from scrapy.spiders import CrawlSpider, Rule

# 1. 導入RedisCrawlSpider類,不使用CrawlSpider
from scrapy_redis.spiders import RedisCrawlSpider
from scrapy.spiders import Rule


from scrapy.dupefilters import RFPDupeFilter
from example.items import youyuanItem
import re

# 2. 修改父類 RedisCrawlSpider
# class YouyuanSpider(CrawlSpider):
class YouyuanSpider(RedisCrawlSpider):
    """Distributed spider that scrapes youyuan.com profile pages.

    The spider subclasses ``RedisCrawlSpider`` instead of ``CrawlSpider``:
    it declares a ``redis_key`` rather than ``start_urls``, and takes its
    allowed domains from the ``domain`` keyword argument at construction
    time instead of a class-level ``allowed_domains``.
    """

    name = 'youyuan'

    # Step 3: allowed_domains and start_urls are intentionally NOT defined as
    # class attributes; see __init__ for allowed_domains.

    # Step 4: name of the Redis key associated with this spider's start URLs.
    redis_key = 'youyuan:start_urls'

    # Any listing page under /find/.
    list_page_lx = LinkExtractor(allow=(r'http://www.youyuan.com/find/.+'))
    # Paginated listing pages for one specific search.
    page_lx = LinkExtractor(allow=(r'http://www.youyuan.com/find/beijing/mm18-25/advance-0-0-0-0-0-0-0/p\d+/'))
    # Individual profile pages.
    profile_page_lx = LinkExtractor(allow=(r'http://www.youyuan.com/\d+-profile/'))

    rules = (
        Rule(list_page_lx, follow=True),
        Rule(page_lx, follow=True),
        Rule(profile_page_lx, callback='parse_profile_page', follow=False),
    )

    def __init__(self, *args, **kwargs):
        """Step 5: build ``allowed_domains`` from the ``domain`` argument.

        Args:
            *args: Forwarded unchanged to the parent constructor.
            **kwargs: May contain ``domain``, a comma-separated string of
                domain names. It is removed before the remaining keyword
                arguments are forwarded to the parent constructor.
        """
        domain = kwargs.pop('domain', '')
        # Materialise a real list: on Python 3 a bare filter() object is
        # always truthy and can only be iterated once.
        self.allowed_domains = list(filter(None, domain.split(',')))
        super(YouyuanSpider, self).__init__(*args, **kwargs)

    def parse_profile_page(self, response):
        """Build one item from a profile page response.

        Args:
            response: Response for a page matched by ``profile_page_lx``.

        Yields:
            A ``youyuanItem`` populated from the page.
        """
        item = youyuanItem()
        item['header_url'] = self.get_header_url(response)
        item['username'] = self.get_username(response)
        item['monologue'] = self.get_monologue(response)
        item['pic_urls'] = self.get_pic_urls(response)
        item['age'] = self.get_age(response)
        item['source'] = 'youyuan'
        item['source_url'] = response.url

        yield item

    @staticmethod
    def _first_stripped(values, default):
        """Return the first element of ``values`` stripped, else ``default``."""
        return (values[0] if values else default).strip()

    def get_header_url(self, response):
        """Return the avatar image URL, or ``""`` when none is found."""
        header = response.xpath("//dl[@class='personal_cen']/dt/img/@src").extract()
        return self._first_stripped(header, "")

    def get_username(self, response):
        """Return the user name, or ``"NULL"`` when none is found."""
        usernames = response.xpath("//dl[@class='personal_cen']/dd/div/strong/text()").extract()
        return self._first_stripped(usernames, "NULL")

    def get_monologue(self, response):
        """Return the personal monologue text, or ``"NULL"`` when none is found."""
        monologues = response.xpath("//ul[@class='requre']/li/p/text()").extract()
        return self._first_stripped(monologues, "NULL")

    def get_pic_urls(self, response):
        """Return album photo URLs joined with ``|``, or ``"NULL"``.

        ``"NULL"`` is returned when fewer than two photo URLs are found.
        """
        data_url_full = response.xpath("//li[@class='smallPhoto']/@data_url_full").extract()
        # NOTE(review): a profile with exactly one photo also yields "NULL";
        # this threshold is kept as-is - confirm whether it is intended.
        if len(data_url_full) <= 1:
            return "NULL"
        return '|'.join(data_url_full)

    def get_age(self, response):
        """Return the age as a string, or ``"0"`` when it cannot be parsed.

        The age is taken from the third space-separated token of the first
        matching text node, with that token's last character dropped.
        """
        age_urls = response.xpath("//dl[@class='personal_cen']/dd/p[@class='local']/text()").extract()
        age = age_urls[0] if age_urls else "0"
        age_words = age.split(' ')
        if len(age_words) <= 2:
            return "0"
        # presumably the token is a number followed by a one-character unit
        # suffix - TODO confirm against a live page.
        age = age_words[2][:-1]
        # Only the first character is required to be a digit.
        if re.match(r'[0-9]', age):
            return age
        return "0"

分布式爬蟲執行方式:

6. 在Master端啟動(dòng)redis-server:

redis-server

7. 在Slave端分別啟動(dòng)爬蟲,不分先后:

scrapy runspider youyuan.py

8. 在Master端的redis-cli里push一個start_urls,例如:

lpush youyuan:start_urls http://www.youyuan.com/find/beijing/mm18-25/advance-0-0-0-0-0-0-0/p1/

9. 爬蟲啟動,查看redis數據庫數據。
©著作權歸作者所有,轉載或內容合作請聯繫作者
【社區內容提示】社區部分內容疑似由AI輔助生成,瀏覽時請結合常識與多方信息審慎甄別。
平臺聲明:文章內容(如有圖片或視頻亦包括在內)由作者上傳并發布,文章內容僅代表作者本人觀點,簡書系信息發布平臺,僅提供信息存儲服務。

相關閱讀更多精彩內容

友情鏈接更多精彩內容