1.訪問動態(tài)網(wǎng)頁的方法
首先在 middleware 中間件的 process_request 函數(shù)中重寫該方法
from selenium import webdriver
from scrapy.http import HtmlResponse
def process_request(self, request, spider):
    """Downloader-middleware hook: render the requested page with headless Chrome.

    Returns an HtmlResponse built from the browser-rendered DOM so the spider
    sees JavaScript-generated content instead of the raw HTML.

    NOTE: settings.py must also add, under DOWNLOADER_MIDDLEWARES,
    "scrapy.downloadermiddlewares.useragent.UserAgentMiddleware": None
    """
    path = r'example_path'  # path to the chromedriver executable
    # Fixed: webdriver.ChromeOption() does not exist -> ChromeOptions()
    option = webdriver.ChromeOptions()
    option.add_argument('--headless')
    # Fixed typos: excutable_path -> executable_path, option= -> options=,
    # and the variable was misspelled 'deriver' while later code used 'driver'.
    driver = webdriver.Chrome(executable_path=path, options=option)
    # Render the URL Scrapy actually requested; the hard-coded "example.com"
    # placeholder defeated the middleware's purpose.
    driver.get(request.url)
    # Scroll the page *before* capturing page_source so lazily-loaded content
    # is present in the response body (the original scrolled after building it).
    js = "var q = document.documentElement.scrollTop = {}"
    for i in range(10):
        # Fixed: excute_script -> execute_script, formate -> format,
        # and a missing closing parenthesis.
        driver.execute_script(js.format(i * 200))
    url = driver.current_url
    body = driver.page_source
    driver.quit()  # release the browser so every request doesn't leak a process
    return HtmlResponse(url=url, body=body, encoding='utf-8', request=request)
2.ip代理的問題
首先需要在setting配置文件中先定義一個ip池
# Pool of proxy IPs (placeholders to be filled in); lives in settings.py.
# Each entry is a dict with a single 'ip' key, e.g. {'ip': 'host:port'}.
IPPOOL = [{'ip': ''} for _ in range(7)]
在 middleware 中間件中重寫 process_request 函數(shù)
# 需要重新定義ip
def __init__(self, ip=''):
    """Remember the proxy IP for this middleware instance (default: empty)."""
    self.ip = ip
# 重新寫process_request方法
def process_request(self, request, spider):
    """Attach a random proxy from settings.IPPOOL to the outgoing request.

    Fixed: the original ``def`` line was missing its trailing colon (a
    SyntaxError), and ``random`` was never imported.
    """
    import random  # local import so the snippet is self-contained

    # Pick one pool entry at random; each entry is a dict like {'ip': 'host:port'}.
    ip = random.choice(IPPOOL)['ip']
    # Scrapy's built-in HttpProxyMiddleware reads request.meta['proxy'].
    request.meta['proxy'] = 'https://' + ip
# 需要在setting中配置
# settings.py: enable the custom proxy middleware (priority 222) together
# with Scrapy's built-in HttpProxyMiddleware (priority 333), which consumes
# the request.meta['proxy'] value set above.
DOWNLOADER_MIDDLEWARES = {
    # 'daili.middleware.ExampleDownloaderMiddleware': 543,
    'daili.middlewares.ExampleDownloaderMiddleware': 222,
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 333,
}