Scraping news from 中國銀行保險報 (China Banking and Insurance News) into MongoDB serves as the running example. No theory here, just the programmatic workflow: with Scrapy, only a handful of places need changing per site to scrape a new source.

Part 1: Preparation
1. Create the project directory
In a terminal, run: scrapy startproject general_spider (the project name is arbitrary)

2. Open the project; the file names largely tell you what each file is for:

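scrapy startproject generates Scrapy's standard layout (reproduced here in place of the original screenshot):

general_spider/
    scrapy.cfg            # deploy/config entry point
    general_spider/       # the project's Python module
        __init__.py
        items.py          # item (field) definitions
        middlewares.py    # downloader / spider middlewares
        pipelines.py      # item pipelines (storage)
        settings.py       # project settings
        spiders/          # spider code lives here
            __init__.py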
Right-click the project directory and mark it as the root (e.g. "Mark Directory as Sources Root" in PyCharm) so imports inside the project resolve.

3. Edit the settings file, settings.py
Not much needs changing at first; the following is enough for local debugging, and more can be added later as features grow:
ROBOTSTXT_OBEY = False

ITEM_PIPELINES = {
    'general_spider.pipelines.PipelineDB': 300,
}

MONGO_URL = 'xxxxxx'

DOWNLOADER_MIDDLEWARES = {
    'general_spider.middlewares.MyUserAgentMiddleware': 600,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,  # disable the built-in UA middleware
    'general_spider.middlewares.ProxyMiddleware': 350,  # comment out if you have no proxy
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 400,  # comment out if you have no proxy
}

# Logging
LOG_LEVEL = 'WARNING'  # or DEBUG, INFO, ERROR, CRITICAL
LOG_FILE = './log.log'

# User-Agent pool; plenty more can be copied from the web
USER_AGENT = [
    'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Mobile Safari/537.36',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 10_2 like Mac OS X) AppleWebKit/602.3.12 (KHTML, like Gecko) Mobile/14C92 MicroMessenger/6.5.16 NetType/WIFI Language/zh_CN',
    'Mozilla/5.0 (Linux; U; Android 5.1.1; zh-cn; MI 4S Build/LMY47V) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/53.0.2785.146 Mobile Safari/537.36 XiaoMi/MiuiBrowser/9.1.3',
    'Mozilla/5.0 (Linux; U; Android 7.0; zh-CN; SM-G9550 Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/40.0.2214.89 UCBrowser/11.7.0.953 Mobile Safari/537.36',
    'Mozilla/5.0 (Linux; U; Android 6.0.1; zh-CN; SM-C7000 Build/MMB29M) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/40.0.2214.89 UCBrowser/11.6.2.948 Mobile Safari/537.36',
]
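To confirm the project actually picks these values up, Scrapy's built-in settings command can print any single setting from inside the project directory:

scrapy settings --get ROBOTSTXT_OBEY   # should print: False
scrapy settings --get LOG_LEVEL        # should print: WARNING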
4. Edit the middleware file, middlewares.py
import logging
import random
import time

from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware


# Keep this User-Agent-rotating class; the IP-rotating middleware follows it.
class MyUserAgentMiddleware(UserAgentMiddleware):
    '''Attach a random User-Agent to every request.'''

    def __init__(self, user_agent):
        self.user_agent = user_agent

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            user_agent=crawler.settings.get('USER_AGENT')
        )

    def process_request(self, request, spider):
        agent = random.choice(self.user_agent)
        request.headers['User-Agent'] = agent


class ProxyMiddleware():
    '''Rotate the outbound IP; the Abuyun proxy service is the example here.'''

    def __init__(self, settings):
        self.logger = logging.getLogger(__name__)
        self.settings = settings
        self.proxy_type = self.settings.get('GET_PROXY_TYPE', 'no')
        self.last_fetch_proxy_time = time.time()

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings)

    def process_request(self, request, spider):
        # send Connection: close so the proxy tunnel is not reused
        request.headers['Connection'] = 'close'
        proxyServer, proxyAuth = get_abuyun_pro()
        request.meta["proxy"] = proxyServer
        request.headers["Proxy-Authorization"] = proxyAuth
        request.headers["Proxy-Switch-Ip"] = 'yes'
5. Edit pipelines.py to add database writes (MongoDB here):
from pymongo import MongoClient

from general_spider import settings


class PipelineDB(object):
    def __init__(self, logger, client):
        self.logger = logger
        self.client = client
        self.db = self.client['pa_crawler']  # the database; collections are picked per table below

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            logger=crawler.spider.logger,
            client=MongoClient(settings.MONGO_URL, maxPoolSize=10)
        )

    def process_item(self, item, spider):
        table_name = 'xxxx'  # collection (table) name
        # skip the article if a document with the same title already exists
        if self.db[table_name].count_documents({'article_title': item["article_title"]}) == 0:
            # what gets stored is the already-formatted item
            self.db[table_name].insert_one(dict(item))
            self.logger.info(f'{table_name}-{item["article_title"]}: saved to mongo')
        else:
            self.logger.info(f'{table_name}-{item["article_title"]}: already in mongo, dropped')
        return item

    def close_spider(self, spider):
        self.client.close()
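One caveat: the count-then-insert check above can race when several items arrive concurrently. Not part of the original code, but as a hedged alternative you can let MongoDB enforce the dedup with a unique index; a sketch using the same collection and field names:

from pymongo.errors import DuplicateKeyError


def insert_unique(db, table_name, doc, logger):
    '''Insert doc, letting a unique index on article_title enforce dedup.'''
    # create_index is a no-op if an identical index already exists
    db[table_name].create_index('article_title', unique=True)
    try:
        db[table_name].insert_one(doc)
        logger.info(f'{table_name}-{doc["article_title"]}: saved to mongo')
    except DuplicateKeyError:
        logger.info(f'{table_name}-{doc["article_title"]}: already in mongo, dropped')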
6. Edit items.py with the fields to store
import scrapy


class BadouItem(scrapy.Item):
    # define the fields for your item here like:
    article_title = scrapy.Field()         # title
    article_content_html = scrapy.Field()  # raw HTML content
    article_content_raw = scrapy.Field()   # plain text, HTML tags stripped
    article_content = scrapy.Field()       # plain text with image URLs inlined in the article
    article_cover = scrapy.Field()         # cover image
    crawl_date = scrapy.Field()            # crawl time
    ref_url = scrapy.Field()               # article URL
    publish_time = scrapy.Field()          # publish time
    site = scrapy.Field()                  # source platform
    pictures = scrapy.Field()              # original article images
    _id = scrapy.Field()                   # article ID, derived from the URL
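A scrapy.Item behaves like a dict, which is what lets the pipeline call dict(item) before inserting; for example:

item = BadouItem(article_title='demo', site='中國銀行保險報')
print(dict(item))  # {'article_title': 'demo', 'site': '中國銀行保險報'}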
Part 2: Writing the spider
Create a .py file under spiders/.

import datetime

import scrapy

from general_spider.items import BadouItem
# parse_content() is the helper shown in the appendix; adjust the import
# path to wherever you place it in the project
from general_spider.utils import parse_content


class Baoxianbao(scrapy.Spider):
    '''
    中國銀行保險報 news
    '''
    name = 'baoxianbao'  # the spider's name, used by `scrapy crawl`
    start_urls = ['http://www.cbimc.cn/']  # site root; unused here since start_requests is overridden

    def start_requests(self):
        self.headers = {
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'Referer': 'http://www.cbimc.cn/node_7037.html',
            'Accept-Language': 'zh-CN,zh;q=0.9',
        }
        # one list page per category
        for category in ['07', '08', '10', '12', '16', '18', '29', '32', '33', '34', '35', '36', '37', '38', '39']:
            url = f'http://www.cbimc.cn/node_70{category}.html'
            # for POST endpoints use scrapy.FormRequest(...) instead
            yield scrapy.Request(url, headers=self.headers, callback=self.parse)

    def parse(self, response):
        # article links, titles, covers and publish times from the list page
        urls = response.xpath('//div[@class="list nav_left_a1"]//li[@class="list_item"]/a/@href').extract()
        titles = response.xpath('//div[@class="list nav_left_a1"]//li[@class="list_item"]//h1/text()').extract()
        article_covers = response.xpath('//div[@class="list nav_left_a1"]//li[@class="list_item"]//img/@src').extract()
        publish_times = response.xpath('//div[@class="list nav_left_a1"]//li[@class="list_item"]//span/text()').extract()
        for url, title, article_cover, publish_time in zip(urls, titles, article_covers, publish_times):
            item = BadouItem()
            item['publish_time'] = publish_time
            item['article_cover'] = article_cover
            item['article_title'] = title
            item['ref_url'] = url
            item['crawl_date'] = str(datetime.datetime.now()).split('.')[0]  # crawl time
            item['_id'] = url.split('_')[-1].split('.')[0]  # article ID taken from the URL
            # meta carries the half-filled item on to the detail-page callback
            yield scrapy.Request(url, headers=self.headers, callback=self.parse_content, meta={'item': item})

    def parse_content(self, response):
        item = response.meta['item']
        text = response.text
        content = scrapy.Selector(text=text).css('.detail-d').extract_first()
        item['article_content_html'] = content  # raw HTML content
        article_content_raw, pictures, article_content = parse_content(content)
        item['article_content_raw'] = article_content_raw  # plain text, tags stripped
        item['article_content'] = article_content  # plain text with image URLs inlined
        item['site'] = '中國銀行保險報'
        item['pictures'] = pictures  # article images
        yield item
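Before running the whole project, the selectors can be sanity-checked interactively with scrapy shell against one of the category pages, e.g.:

scrapy shell 'http://www.cbimc.cn/node_7037.html'
>>> response.xpath('//div[@class="list nav_left_a1"]//li[@class="list_item"]/a/@href').extract()[:3]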
Part 3: Writing run.py

# Method 1
from scrapy.cmdline import execute
execute(['scrapy', 'crawl', 'baoxianbao'])
# execute(['scrapy', 'crawl', 'second_spider_name'])  # copy the line for each extra spider

# Method 2 (pick one method; execute() calls sys.exit, so code after it never runs)
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('baoxianbao')
# process.crawl('second_spider_name')  # copy the line for each extra spider
process.start()
Run it: the crawl picked up exactly 100 items (a few old news articles were filtered out).

The scraped documents land in the badou collection in MongoDB.

Appendix: the article-parsing helper parse_content(), which extracts the plain text and the images from the HTML:
import re


def parse_content(content):
    '''Strip HTML tags, keeping the plain text and the image URLs.'''
    # plain text (an lxml alternative: etree.HTML(content).xpath('string(.)'))
    article_content_raw = re.sub('<.*?>', '', content).replace(' ', '\n')
    # image tags
    pictures_html = re.findall('<img.*?>', content)
    pictures = []
    # text with image URLs inlined where the <img> tags were
    for pic_html in pictures_html:
        pic = re.findall(r'img.*?(http.*?)\"', pic_html)[0]
        pictures.append(pic)
        content = content.replace(pic_html, pic)
    article_content = re.sub('<.*?>', '', content).replace(' ', '')
    return article_content_raw, pictures, article_content
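A quick check of the three return values (the sample HTML below is made up for illustration):

sample = '<p>中國銀行保險報訊</p><img src="http://www.cbimc.cn/pic/1.jpg"/>'
raw, pics, mixed = parse_content(sample)
print(raw)    # 中國銀行保險報訊  (tags stripped)
print(pics)   # ['http://www.cbimc.cn/pic/1.jpg']
print(mixed)  # 中國銀行保險報訊http://www.cbimc.cn/pic/1.jpg  (image URL inlined)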