# File: bmw/spiders/bmw5.py
# Third-party crawling primitives, then the project-local item definition.
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

from bmw.items import BmwItem
class Bmw5Spider(CrawlSpider):
    """Crawl the autohome picture pages of car series 159 and emit image URLs.

    Starting from the series overview page, every link matching the rule
    below is followed and handed to :meth:`parse_page`, which yields one
    ``BmwItem`` per visited page.
    """

    name = 'bmw5'
    allowed_domains = ['car.autohome.com.cn']
    start_urls = ['https://car.autohome.com.cn/pic/series/159.html']

    # Examples of page URLs the rule below is meant to match:
    #   https://car.autohome.com.cn/pic/series/159-10.html#pvareaid=2042222
    #   https://car.autohome.com.cn/pic/series/159-51-p2.html
    rules = (
        Rule(
            LinkExtractor(allow="https://car.autohome.com.cn/pic/series/159.+"),
            callback='parse_page',
            follow=True,
        ),
    )

    def parse_page(self, response):
        """Extract the category title and image URLs from one picture page.

        Args:
            response: The downloaded page; its XPath selectors are queried
                for the ``uibox`` title text and the thumbnail ``<img>``
                sources inside ``uibox-con``.

        Yields:
            A single ``BmwItem`` whose ``title`` is the first matching text
            node (``None`` when nothing matches) and whose ``image_urls``
            is the list of absolute image URLs.
        """
        title = response.xpath("//div[@class='uibox']/div/text()").get()
        srcs = response.xpath(
            "//div[contains(@class,'uibox-con')]/ul/li//img/@src"
        ).getall()
        # Swap the thumbnail size token for the larger variant (presumably the
        # full-size rendition -- confirm against the site), then resolve each
        # src against the page URL.
        urls = [
            response.urljoin(src.replace('240x180_0_q95_c42', "1024x0_1_q95"))
            for src in srcs
        ]
        yield BmwItem(title=title, image_urls=urls)
# File: bmw/items.py
import scrapy
class BmwItem(scrapy.Item):
    """Container for one picture page: a title plus the images found on it."""

    # Category/page title; the pipeline uses it as the storage sub-folder name.
    title = scrapy.Field()
    # URLs to download -- the field name Scrapy's ImagesPipeline reads by default.
    image_urls = scrapy.Field()
    # Download results -- the field name Scrapy's ImagesPipeline fills by default.
    images = scrapy.Field()
# File: bmw/pipelines.py
import os
from urllib import request

from scrapy.pipelines.images import ImagesPipeline

from bmw import settings
class BmwPipelines(ImagesPipeline):
    """Image pipeline that files each download under a folder named by title.

    The stock ``ImagesPipeline`` path (``full/<name>``) is rewritten to
    ``<IMAGES_STORE>/<item title>/<name>``.
    """

    def get_media_requests(self, item, info):
        """Build the download requests for ``item`` and tag each with the item.

        Args:
            item: The scraped item whose images are about to be downloaded.
            info: Pipeline bookkeeping object supplied by Scrapy.

        Returns:
            The list of requests produced by the parent class, each carrying
            an ``item`` attribute so :meth:`file_path` can read the title.
        """
        # Materialise first so the returned value can still be consumed by
        # Scrapy even if the parent ever hands back a one-shot iterable.
        request_objs = list(super().get_media_requests(item, info))
        for request_obj in request_objs:
            request_obj.item = item
        return request_objs

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the storage path for the image fetched by ``request``.

        Args:
            request: The image request; expected to carry the ``item``
                attribute attached in :meth:`get_media_requests`.
            response: The download response, when available.
            info: Pipeline bookkeeping object supplied by Scrapy.
            item: The item being processed. Newer Scrapy releases pass it as
                a keyword argument; when absent, ``request.item`` is used.

        Returns:
            ``<IMAGES_STORE>/<title>/<image name>``, or the parent class's
            default path when the item has no title.
        """
        path = super().file_path(request, response, info)

        source_item = item if item is not None else request.item
        title = source_item.get('title')
        if not title:
            # No usable folder name (the title selector matched nothing):
            # keep Scrapy's default location instead of failing in join().
            return path

        title_path = os.path.join(settings.IMAGES_STORE, title)
        # exist_ok avoids the check-then-create race and also creates
        # IMAGES_STORE itself when it does not exist yet.
        os.makedirs(title_path, exist_ok=True)

        # Drop the default "full/" prefix so only the file name remains.
        image_name = path.replace("full/", "")
        return os.path.join(title_path, image_name)