COMMANDS_MODULE
示例: COMMANDS_MODULE = "dabo.commands"
可配置自定義命令

在項目的 settings.py 中配置 COMMANDS_MODULE 后,
即可在 cmd 中直接執(zhí)行自定義命令

cmd 執(zhí)行自定義命令時,
命令類的 run() 方法就是實際被執(zhí)行的函數(shù)
from scrapy.commands import ScrapyCommand
from scrapy.utils.project import get_project_settings
from scrapy.crawler import CrawlerProcess
#當(dāng)前類必須繼承ScrapyCommand,實際就是重寫內(nèi)部方法
class Command(ScrapyCommand):
requires_project = True
def syntax(self):
return '[options]'
def short_desc(self):
return 'Runs all of the spiders'
def run(self, args, opts):
"""
源碼入口
:param args:
:param opts:
:return:
"""
spider_list = self.crawler_process.spiders.list()
for name in spider_list:
self.crawler_process.crawl(name, **opts.__dict__)
self.crawler_process.start()
DUPEFILTER_CLASS
DUPEFILTER_CLASS = 'dabo.dup.UrlFilter'
重寫自定義去重操作,示例:
scrapy默認(rèn)使用 scrapy.dupefilter.RFPDupeFilter 進(jìn)行去重,相關(guān)配置有:
DUPEFILTER_CLASS = 'scrapy.dupefilter.RFPDupeFilter'
DUPEFILTER_DEBUG = False
JOBDIR = "保存訪問記錄的日志路徑,如:/root/" # 最終路徑為 /root/requests.seen
class RepeatUrl:
def __init__(self):
self.visited_url = set()
@classmethod
def from_settings(cls, settings):
"""
初始化時,調(diào)用
:param settings:
:return:
"""
return cls()
def request_seen(self, request):
"""
檢測當(dāng)前請求是否已經(jīng)被訪問過
:param request:
:return: True表示已經(jīng)訪問過;False表示未訪問過
"""
if request.url in self.visited_url:
return True
self.visited_url.add(request.url)
return False
def open(self):
"""
開始爬去請求時,調(diào)用
:return:
"""
print('open replication')
def close(self, reason):
"""
結(jié)束爬蟲爬取時,調(diào)用
:param reason:
:return:
"""
print('close replication')
def log(self, request, spider):
"""
記錄日志
:param request:
:param spider:
:return:
"""
print('repeat', request.url)