源碼摘記如下：

-- coding: utf-8 --

import scrapy
import re
from scrapy.http import Request
from urllib import parse

class JobboleSpider(scrapy.Spider):
name = "jobbole"
allowed_domains = ["blog.jobbole.com/"]
start_urls = ['http://blog.jobbole.com/all-posts/']

def parse(self, response):
    """
    1.獲取文章列表中的文章url并交給scrapy進(jìn)行解析
    2.遞歸下一頁(yè)
    :param response:
    :return:
    """
    # 獲取文章列表中的文章url并交給scrapy進(jìn)行解析
    post_urls = response.css("#archive .floated-thumb .post-thumb a::attr(href)").extract()
    for post_url in post_urls:
        yield Request(url=parse.urljoin(response.url, post_url), callback=self.parse_detail, dont_filter=True)

    # 提取下一頁(yè)
    next_url = response.css(".next.page-numbers::attr(href)").extract_first("")
    if next_url:
        yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse, dont_filter=True)

pass

def parse_detail(self, response):
    # 通過(guò)css選擇器提取字段
    title = response.css(".entry-header h1::text").extract()[0]
    create_date = response.css("p.entry-meta-hide-on-mobile::text").extract()[0].strip().replace("·", "").strip()
    praise_nums = response.css(".vote-post-up h10::text").extract()[0]
    fav_nums = response.css(".bookmark-btn::text").extract()[0]
    match_re = re.match(".*?(\d+).*", fav_nums)
    if match_re:
        fav_nums = int(match_re.group(1))
    else:
        fav_nums = 0

    comment_nums = response.css("a[href='#article-comment'] span::text").extract()[0]
    match_re = re.match(".*?(\d+).*", comment_nums)
    if match_re:
        comment_nums = int(match_re.group(1))
    else:
        comment_nums = 0

    content = response.css("div.entry").extract()[0]

    tag_list = response.css("p.entry-meta-hide-on-mobile a::text").extract()
    tag_list = [element for element in tag_list if not element.strip().endswith("評(píng)論")]
    tags = ",".join(tag_list)

pass

色偷偷精品伊人,欧洲久久精品,欧美综合婷婷骚逼,国产AV主播,国产最新探花在线,九色在线视频一区,伊人大交九欧美,1769亚洲,黄色成人av

爬取“伯樂(lè)在線”所有文章

爬取“伯樂(lè)在線”所有文章

-- coding: utf-8 --

相關(guān)閱讀更多精彩內(nèi)容

友情鏈接更多精彩內(nèi)容

色偷偷精品伊人,欧洲久久精品,欧美综合婷婷骚逼,国产AV主播,国产最新探花在线,九色在线视频一区,伊人大交九 欧美,1769亚洲,黄色成人av

爬取“伯樂(lè)在線”所有文章

-- coding: utf-8 --

相關(guān)閱讀更多精彩內(nèi)容

友情鏈接更多精彩內(nèi)容

色偷偷精品伊人,欧洲久久精品,欧美综合婷婷骚逼,国产AV主播,国产最新探花在线,九色在线视频一区,伊人大交九欧美,1769亚洲,黄色成人av