scrapy yield發起網絡請求,網頁重定向到登錄頁面,導致request.url為登錄url
怎么解決
重新構建middlewares.py中的process_response()
class Spider1688DownloaderMiddleware(object):
    """Downloader middleware that re-issues requests bounced to a verification or login page.

    When a response body contains the verification marker, the originally
    requested shop id is recovered from the percent-encoded target embedded
    in ``response.url`` and the request is re-pointed at that shop's
    ``offerlist`` page.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and subscribe it to the ``spider_opened`` signal."""
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        """Return ``None`` so the request continues through the chain unchanged."""
        return None

    def process_response(self, request, response, spider):
        """Return the response, or the re-targeted request when a bounce is detected.

        Args:
            request: The request that produced ``response``; its URL is
                rewritten in place when a bounce is detected.
            response: The downloaded response.
            spider: The running spider (unused).

        Returns:
            ``request`` (with its URL rewritten) when the page is a
            verification/login bounce and the shop id could be recovered;
            otherwise ``response`` unchanged. ``None`` is never returned.
        """
        # NOTE(review): the marker literal was corrupted by an encoding error
        # in the source; restored to the simplified-Chinese word for
        # "verification" -- confirm against a live bounce page.
        if '验证' not in response.text:
            return response

        bounced_url = response.url
        target = None
        if 'offerlist' in bounced_url:
            mall_id = self._first_group(r'%3A%2F%2F(.*?)\.1688', bounced_url)
            if mall_id:
                target = 'https://{}.1688.com/page/offerlist.htm'.format(mall_id)
        elif 'login' in bounced_url and '%3A%2F%2F' in bounced_url:
            target = self._offerlist_url_from_login(bounced_url)

        if target is None:
            # Nothing could be rebuilt: hand the page on rather than returning
            # None or raising inside the middleware.
            return response

        # Request.replace(url=...) is the public alternative to the private
        # _set_url; the in-place rewrite is kept to preserve the request object.
        request._set_url(target)
        return request

    def spider_opened(self, spider):
        """Log that the spider has started (handler connected in ``from_crawler``)."""
        spider.logger.info('Spider opened: %s', spider.name)

    @staticmethod
    def _first_group(pattern, url):
        """Return group 1 of the first match of ``pattern`` in ``url``, else ``None``."""
        match = re.search(pattern, url)
        return match.group(1) if match else None

    def _offerlist_url_from_login(self, login_url):
        """Build the shop offerlist URL encoded inside a login redirect URL.

        Returns ``None`` when no shop id can be extracted.
        """
        pagenum = ''
        num = self._first_group(r'pageNum%3D(\d+)', login_url)
        if num:
            pagenum = '?pageNum={}'.format(num)

        mall_id = self._first_group(r'%3A%2F%2F(.*?)\.1688', login_url)
        if mall_id and 'login' in mall_id:
            # The first encoded target is itself a login host; the real shop
            # URL is nested one level deeper and therefore double-encoded.
            mall_id = self._first_group(r'%253A%252F%252F(.*?)\.1688', login_url)
        if not mall_id:
            return None
        # Use the whole id: indexing it with [0] would keep only one character.
        return 'https://{}.1688.com/page/offerlist.htm{}'.format(mall_id, pagenum)
通過response.text中的字段或者response.url中的特殊字符,判斷網頁是否被重定向
從response.url中獲取字符串,截取需要構建的url參數
使用request._set_url重新構建request中的訪問URL
spider中加上
class Al1688MallSpiderSpider(scrapy.Spider):
    """Spider whose per-spider settings enable ``Spider1688DownloaderMiddleware``."""

    name = 'al_1688_mall_spider'
    allowed_domains = ['re.1688.com']
    # start_urls = ['http://re.1688.com/']
    custom_settings = {
        'DOWNLOADER_MIDDLEWARES': {
            # Register the downloader middleware at order 120.
            'spider_1688.middlewares.Spider1688DownloaderMiddleware': 120,
        },
        # Maximum number of retries for a failed request. The previous key,
        # 'RETRY_HTTP_CODECS', is not a Scrapy setting and was ignored.
        'RETRY_TIMES': 20,
        # Do not send cookies with requests.
        "COOKIES_ENABLED": False,
    }