day74 - Scrapy middleware and nested crawling

1 Nested crawling

Nested crawling means that while the spider is parsing one page it extracts the URLs of detail pages and hands them back to the engine, which then schedules those detail pages to be crawled as well. In this example we crawl the comic listing on the u17 (有妖氣) site, follow each comic to its detail page, and extract the information for every chapter.

import json
import re

import scrapy

# The item classes are defined in items.py; "u17" is assumed to be the
# project package name - adjust the import to match your project.
from u17.items import U17Item, U17DetailItem


class ComicSpider(scrapy.Spider):
    name = 'comic'
    allowed_domains = ['www.u17.com']
    start_urls = ['http://www.u17.com/']
    def get_headers(self):
        headers = {
            'Referer': 'http://www.u17.com/comic/ajax.php?mod=comic_list&act=comic_list_new_fun&a=get_comic_list',
            'User-Agent': "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
            'Host': 'www.u17.com',
            'Accept': 'application/json, text/javascript, */*;',  # Accept value copied from the browser's request headers
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,ja;q=0.4,zh-TW;q=0.2,mt;q=0.2',
            'Connection': 'keep-alive',
            'X-Requested-With': 'XMLHttpRequest',
        }
        return headers    
    def start_requests(self):
        headers = self.get_headers()
        url = 'http://www.u17.com/comic/ajax.php?mod=comic_list&act=comic_list_new_fun&a=get_comic_list'
        data = {'data[group_id]': 'no', 'data[theme_id]': 'no', 'data[is_vip]': 'no', 'data[accredit]': 'no',
                'data[color]': 'no', 'data[comic_type]': 'no', 'data[series_status]': '1', 'data[order]': '1',
                'data[page_num]': '1', 'data[read_mode]': 'no'
                }
        # one POST request per listing page of the AJAX comic list
        for page in range(200):
            data['data[page_num]'] = str(page)
            print(page)
            yield scrapy.FormRequest(url=url,
                                     headers=headers,
                                     method='POST',
                                     formdata=data,
                                     callback=self.parse,
                                     )
    def parse(self, response):
        result = json.loads(response.text)
        result_list = result['comic_list']
        # the category name sits inside a <font> tag in the "line2" HTML fragment
        pattern = re.compile('<font.*?>(.*?)<.*?', re.S)
        for item in result_list:
            u17_item = U17Item()
            u17_item['comic_id'] = item['comic_id']
            u17_item['name'] = item['name']
            u17_item['cover'] = item['cover']
            u17_item['category'] = re.findall(pattern, item['line2'])[0]
            yield u17_item    
            # nested crawl: follow the comic's detail page
            detail_url = 'http://www.u17.com/comic/%s.html' % item['comic_id']
            yield scrapy.Request(url=detail_url,
                                 headers=self.get_headers(),
                                 method='GET',
                                 callback=self.parse_detail,  # callback for the detail page
                                 )
    def parse_detail(self, response):
        chapter_list = response.selector.css('#chapter').xpath('.//a')
        # recover the comic id from the detail-page URL
        comic_id = response.url.split('/')[-1].split('.')[0]
        for chapter in chapter_list:
            detail_item = U17DetailItem()
            detail_item['comic_id'] = comic_id
            detail_item['title'] = chapter.xpath('./@title').extract_first()
            detail_item['link'] = chapter.xpath('./@href').extract_first()
            yield detail_item

The spider defines two kinds of requests and two callbacks that process the crawled data: one for the listing page and one for the detail pages. It can be run with the usual scrapy crawl comic command, or programmatically with a small script like the one below.
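A minimal sketch of running the spider from a script; it assumes you run it from inside the Scrapy project directory so that get_project_settings can find the project settings (the spider name "comic" comes from the class above):

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# load the project settings (pipelines, middlewares, DB credentials, ...)
process = CrawlerProcess(get_project_settings())
process.crawl('comic')   # the spider's "name" attribute
process.start()          # blocks until the crawl finishes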

import scrapy        
class U17Item(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    collection_name = 'u17'  # used by the pipelines to tell which kind of item this is
    comic_id = scrapy.Field()
    name = scrapy.Field()
    cover = scrapy.Field()
    category = scrapy.Field()


class U17DetailItem(scrapy.Item):
    collection_name = 'u17_detail'  # used by the pipelines to tell which kind of item this is
    comic_id = scrapy.Field()
    title = scrapy.Field()
    link = scrapy.Field()

items.py defines two item classes. collection_name is a plain class attribute (not a Field) used to tell the two types apart; it is never stored in the database, so the spider does not need to set it.
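A quick illustration of that point (the field values here are made-up examples, not real data): the attribute is readable on every instance, but dict(item) only contains the declared Fields.

from u17.items import U17Item  # assuming the project package is named "u17"

item = U17Item(comic_id='123', name='demo', cover='http://example.com/cover.jpg',
               category='demo')
print(item.collection_name)   # 'u17' - a plain class attribute, not a Field
print(dict(item))             # only comic_id, name, cover and category appear here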

import pymysql
import pymongo    
from scrapy import Request
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline            
class U17MysqlPipeline(object):
    def __init__(self, host, port, username, password, database):
        self.host = host
        self.port = port
        self.username = username
        self.password = password
        self.database = database    
    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            host=crawler.settings.get('MYSQL_HOST'),
            port=crawler.settings.get('MYSQL_PORT'),
            database=crawler.settings.get('MYSQL_DATABASE'),
            username=crawler.settings.get('MYSQL_USERNAME'),
            password=crawler.settings.get('MYSQL_PASSWORD'),
        )    
    def open_spider(self, spider):
        # recent pymysql versions no longer accept positional arguments, so pass everything by keyword
        self.db = pymysql.connect(host=self.host, user=self.username, password=self.password,
                                  database=self.database, charset='utf8', port=self.port)
        self.cursor = self.db.cursor()        
    def close_spider(self, spider):
        self.db.close()

    def process_item(self, item, spider):
        if item.collection_name == 'u17':
            sql = 'insert into manhua (comic_id, name, cover, category) values (%s,%s,%s,%s)'
            self.cursor.execute(sql, (item['comic_id'], item['name'], item['cover'], item['category']))
        else:
            sql = 'insert into detail_manhua (comic_id, title, link) values (%s,%s,%s)'
            self.cursor.execute(sql, (item['comic_id'], item['title'], item['link']))
        self.db.commit()
        return item                    
class U17MongoPipeline(object):
    def __init__(self, uri, database):
        self.uri = uri
        self.database = database    
    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            uri=crawler.settings.get('MONGO_URI'),
            database=crawler.settings.get('MONGO_DB'),
        )    
    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.uri)
        self.db = self.client[self.database]        
    def close_spider(self, spider):
        self.client.close()        
    def process_item(self, item, spider):
        # insert_one() replaces the old insert(), which was removed in PyMongo 4
        self.db[item.collection_name].insert_one(dict(item))
        return item            
class U17ImagePipeline(ImagesPipeline):
    # build the file name the image will be saved under
    def file_path(self, request, response=None, info=None):
        url = request.url
        file_name = url.split('/')[-1]
        return file_name
    # drop the item if its image failed to download
    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem('Image Downloaded Failed')
        return item    
    # wrap the cover URL in a Request so ImagesPipeline downloads it (only listing items have a cover)
    def get_media_requests(self, item, info):
        if item.collection_name == 'u17':
            yield Request(item['cover'])
        else:
            pass

pipelines.py defines three pipelines that process the items in different ways: one writes to MySQL, one writes to MongoDB, and one downloads the cover images.
if item.collection_name == 'u17': in the MySQL pipeline checks whether the item is a U17Item or a U17DetailItem; since collection_name is a class attribute rather than a Field, it is accessed with dot notation.
self.db[item.collection_name].insert_one(dict(item)) in the MongoDB pipeline decides which collection the item goes into; the value of collection_name must match the collection name you want to write to.
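For these pipelines to actually run, the settings they read in from_crawler have to exist in settings.py and the pipelines have to be enabled. A minimal sketch, assuming the project package is named u17, with placeholder credentials and arbitrary priorities:

ITEM_PIPELINES = {
    'u17.pipelines.U17ImagePipeline': 300,
    'u17.pipelines.U17MysqlPipeline': 400,
    'u17.pipelines.U17MongoPipeline': 500,
}
IMAGES_STORE = './images'        # required by ImagesPipeline

MYSQL_HOST = 'localhost'
MYSQL_PORT = 3306
MYSQL_DATABASE = 'u17'
MYSQL_USERNAME = 'root'
MYSQL_PASSWORD = 'your-password'

MONGO_URI = 'mongodb://localhost:27017'
MONGO_DB = 'u17'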

2 Middleware

This middleware example crawls JD (京東) search results. Before crawling, add or change the following settings in settings.py:

ROBOTSTXT_OBEY = False
DOWNLOADER_MIDDLEWARES = {
   'jd.middlewares.SeleniumMiddleware': 543,
}
FEED_EXPORT_ENCODING = 'utf-8'
KEYWORDS = ['鞋子', '帽子', '電腦']
SELENIUM_TIMEOUT = 10

KEYWORDS lists the search terms to crawl.

2.1 Spider requests

import scrapy
from urllib.parse import urlencode
from jd.items import JdItem    
class QiangSpider(scrapy.Spider):
    name = 'qiang'
    allowed_domains = ['search.jd.com']
    start_urls = 'https://search.jd.com/Search?'  # used here as a base URL string, not as Scrapy's usual list
    def start_requests(self):
        # one search per configured keyword (read from settings.KEYWORDS)
        for keyword in self.settings.get('KEYWORDS'):
            data = {'keyword': keyword, 'wq': keyword, 'enc': 'utf-8'}
            param_str = urlencode(data)
            url = self.start_urls + param_str
            # one request per result page of this keyword
            for page in range(1, 10):
                yield scrapy.Request(url=url, callback=self.parse,
                                     meta={'page': page},  # the middleware uses page == 1 to trigger the first browser.get
                                     dont_filter=True)
    def parse(self, response):
        gl_items = response.selector.xpath('//div[@id="J_goodsList"]//li[@class="gl-item"]')
        for gl_item in gl_items:
            jd_item = JdItem()
            img_src = gl_item.xpath('.//div[@class="p-img"]/a/img/@src').extract_first('')
            title = ''.join(gl_item.xpath('.//div[@class="p-name p-name-type-2"]//em//text()').extract())
            jd_item['title'] = title
            jd_item['img_src'] = img_src
            yield jd_item

callback=self.parse names the callback that processes the response; meta={'page': page} passes the page number along with the request, so the downloader middleware can read it from request.meta.
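A tiny illustration of that mechanism (the URL and page number are made-up values): whatever is put into meta rides along with the Request object and is later readable as request.meta in a downloader middleware or response.meta in a callback.

import scrapy

req = scrapy.Request('https://search.jd.com/Search?keyword=%E9%9E%8B%E5%AD%90&enc=utf-8',
                     meta={'page': 1}, dont_filter=True)
print(req.meta['page'])   # 1 - the same dict the SeleniumMiddleware reads below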

2.2 Downloader middleware

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from scrapy.http import HtmlResponse
from scrapy import signals
import time
class JdSpiderMiddleware(object):    
    @classmethod
    def from_crawler(cls, crawler):
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s    
    def process_spider_input(self, response, spider):
        return None    
    def process_spider_output(self, response, result, spider):
        for i in result:
            yield i    
    def process_spider_exception(self, response, exception, spider):
        pass   
    def process_start_requests(self, start_requests, spider):
        for r in start_requests:
            yield r    
    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)        
class SeleniumMiddleware(object):
    def __init__(self, timeout=None):
        self.timeout = timeout
        chrome_options = webdriver.ChromeOptions()
        self.browser = webdriver.Chrome(options=chrome_options)
        self.browser.set_window_size(1400, 700)
        self.wait = WebDriverWait(self.browser, self.timeout)
    def __del__(self):
        # close the browser window when the middleware object is destroyed
        self.browser.close()
    @classmethod
    def from_crawler(cls, crawler):
        return cls(timeout=crawler.settings.get('SELENIUM_TIMEOUT'))     
    def process_request(self, request, spider):
        page = request.meta.get('page', 1)
        if page == 1:
            # first page of a keyword: navigate the browser to the search URL
            self.browser.get(request.url)
        # scroll to the bottom, then back up in steps, so the lazily loaded goods render
        str_js = 'var scrollHeight = document.body.scrollHeight;window.scrollTo(0, scrollHeight);'
        self.browser.execute_script(str_js)
        for i in range(16, 0, -1):
            str_js = 'var scrollHeight = document.body.scrollHeight;window.scrollTo(0, (%d * scrollHeight) / 16);' % i
            time.sleep(2)
            self.browser.execute_script(str_js)
        html = self.browser.page_source
        # prepare to click "next page": scroll the pagination controls into view
        input = self.browser.find_element_by_css_selector('#J_bottomPage input.input-txt')
        str_js = 'var scrollHeight = document.body.scrollHeight;window.scrollTo(0, %d);' % (input.location['y'] - 50)
        self.browser.execute_script(str_js)
        time.sleep(1)
        # type the next page number
        input = self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#J_bottomPage input.input-txt')))
        input.clear()
        input.send_keys(page + 1)
        # click the button to jump to the next page
        submit = self.wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, '#J_bottomPage .btn.btn-default')))
        submit.click()
        # return the page source captured before the click as the response for this request
        return HtmlResponse(url=request.url, body=html, request=request,
                            encoding='utf-8', status=200)
    def process_response(self, request, response, spider):
        return response    
    def process_exception(self, request, exception, spider):
        pass    
    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)

2.3 Items

import scrapy    
class JdItem(scrapy.Item):
    title = scrapy.Field()
    img_src = scrapy.Field()

2.4 Pipeline

class JdPipeline(object):
    def process_item(self, item, spider):
        return item

When a Scrapy project needs a custom downloader middleware, the usual starting point is the generated class JdDownloaderMiddleware: rename it to your own class, here SeleniumMiddleware (keeping the Middleware suffix). Then define __init__ (start the browser and accept a timeout parameter that controls the explicit-wait time), define __del__ (close the browser once the SeleniumMiddleware instance is no longer needed), make small adjustments to the from_crawler classmethod, and, most importantly, implement process_request, which drives the page scrolling and selects which result page to grab. Finally, register the middleware in settings.py (the DOWNLOADER_MIDDLEWARES entry shown at the start of this section).
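The detail that makes this work is how Scrapy treats the return value of process_request: when it returns a Response object, Scrapy skips its own downloader entirely and passes that response to the spider's callback. A stripped-down sketch of the pattern, with the Selenium details replaced by a placeholder string:

from scrapy.http import HtmlResponse

class BrowserRenderMiddleware(object):
    def process_request(self, request, spider):
        # in the real middleware this would be self.browser.page_source
        html = '<html><body>rendered by the browser</body></html>'
        # returning a Response here short-circuits Scrapy's downloader
        return HtmlResponse(url=request.url, body=html, encoding='utf-8',
                            request=request, status=200)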

最后編輯于
?著作權(quán)歸作者所有,轉(zhuǎn)載或內(nèi)容合作請聯(lián)系作者
【社區(qū)內(nèi)容提示】社區(qū)部分內(nèi)容疑似由AI輔助生成,瀏覽時請結(jié)合常識與多方信息審慎甄別。
平臺聲明:文章內(nèi)容(如有圖片或視頻亦包括在內(nèi))由作者上傳并發(fā)布,文章內(nèi)容僅代表作者本人觀點(diǎn),簡書系信息發(fā)布平臺,僅提供信息存儲服務(wù)。

友情鏈接更多精彩內(nèi)容