參考:http://www.jtahstu.com/blog/scrapy_zhipin_spider.html
制作了多城市抓取

項目目錄.png
spider文件:BoosZhiPin_Spider.py
path:zhaopin/zhaopin/spiders/BoosZhiPin_Spider.py
import scrapy
from ..items import BoosZhiPinItem
import time
import json
from furl import furl
'''
用途:爬取BOSS直聘數(shù)據(jù)
參數(shù):地區(qū),職位信息
運行代碼:scrapy crawl BoosZhiPin
'''
class BoosZhiPin(scrapy.Spider):
    """Crawl BOSS Zhipin (zhipin.com) job listings for a set of cities.

    Run with: scrapy crawl BoosZhiPin

    Flow:
      1. The start URL returns a JSON list of all cities; ``parse`` resolves
         the configured city names into the site's numeric city codes.
      2. One search request per city is issued via ``request_city``.
      3. ``get_data`` extracts the job items on each result page and keeps
         requesting the next page until a page comes back with no results.
    """

    name = 'BoosZhiPin'  # crawler name used with `scrapy crawl`
    # With OffsiteMiddleware enabled, URLs whose domain is not in this list
    # are not followed.
    allowed_domains = ['www.zhipin.com']
    # Default start URL: the city-code lookup endpoint (JSON).
    start_urls = ['https://www.zhipin.com/wapi/zpCommon/data/city.json']
    city_name = ['烏魯木齊', '喀什']  # cities to scrape
    city_code_list = []  # numeric city codes resolved from city_name
    query = 'python'  # job keyword to search for
    F = furl('https://www.zhipin.com/job_detail/?')  # search-URL template
    # Browser-like User-Agent so the site serves normal pages.
    headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'}

    def parse(self, response):
        """Resolve the city codes, then request the first page of each city."""
        self.get_city_code(response)
        for city_code in self.city_code_list:
            yield self.request_city(city_code)

    def get_city_code(self, response):
        """Fill ``city_code_list`` with the codes of the cities in ``city_name``."""
        # response.text replaces the deprecated response.body_as_unicode().
        city_json = json.loads(response.text)
        for wanted_name in self.city_name:
            for area in city_json['zpData']['cityList']:  # top-level regions
                for city in area['subLevelModelList']:  # cities within a region
                    if city['name'] == wanted_name:
                        # Fix: the original used insert(index, ...) with the
                        # city's position inside its region, yielding an
                        # arbitrary order; append keeps the requested order.
                        self.city_code_list.append(str(city['code']))

    def request_city(self, city_code, page=0):
        """Build the request for one result page of one city.

        ``page`` is the page number *already fetched*; the request targets
        ``page + 1``.  The default of 0 therefore requests the first page.
        """
        page += 1
        url_data = {
            'city': city_code,
            'query': self.query,
            'page': page
        }
        # The page URL to crawl, built from the template.
        url = self.F.copy().add(url_data).url
        # meta carries city/page to the callback for pagination bookkeeping;
        # it is available there as response.meta.
        return scrapy.Request(
            url,
            callback=self.get_data,
            dont_filter=False,
            headers=self.headers,
            meta={'city_code': city_code, 'page': page},
        )

    def get_data(self, response):
        """Extract job items from one result page, then paginate."""
        job_list = response.css('div.job-list > ul > li')
        for job in job_list:
            item = BoosZhiPinItem()
            job_primary = job.css('div.job-primary')
            # default='' guards against a missing node, whose extract_first()
            # would otherwise return None and crash on .strip().
            item['pid'] = job.css(
                'div.info-primary > h3 > a::attr(data-jobid)').extract_first(default='').strip()
            item["positionName"] = job_primary.css(
                'div.info-primary > h3 > a::text').extract_first(default='').strip()
            item["salary"] = job_primary.css(
                'div.info-primary > h3 > a > span::text').extract_first(default='').strip()
            info_primary = job_primary.css(
                'div.info-primary > p::text').extract()
            item['city'] = info_primary[0].strip()
            item['workYear'] = info_primary[1].strip()
            item['education'] = info_primary[2].strip()
            item['companyShortName'] = job_primary.css(
                'div.info-company > div.company-text > h3 > a::text'
            ).extract_first(default='').strip()
            company_infos = job_primary.css(
                'div.info-company > div.company-text > p::text').extract()
            if len(company_infos) == 3:  # some postings only carry two entries
                item['industryField'] = company_infos[0].strip()
                item['financeStage'] = company_infos[1].strip()
                item['companySize'] = company_infos[2].strip()
            item['positionLables'] = job.css(
                'li > div.job-tags > span::text').extract()
            item['time'] = job.css('span.time::text').extract_first()
            item['updated_at'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            yield item
        city_code = response.meta['city_code']
        page = response.meta['page']
        if job_list:  # an empty page means we ran past the last page
            # NOTE(review): a blocking sleep stalls the whole reactor; the
            # DOWNLOAD_DELAY setting is the proper throttle.  Kept for parity
            # with the original (remove if you have enough proxy IPs).
            time.sleep(5)
            # Fix: pass the page just fetched; request_city itself advances to
            # page + 1.  The original passed page + 1 here *and* incremented
            # again inside request_city, skipping every other page.
            yield self.request_city(city_code, page=page)
items文件:items.py
path: zhaopin/zhaopin/items.py
class BoosZhiPinItem(scrapy.Item):
    """Container for one job posting scraped from BOSS Zhipin."""
    # define the fields for your item here like:
    # name = scrapy.Field()
    pid = scrapy.Field()               # job id (from the data-jobid attribute)
    positionName = scrapy.Field()      # job title
    positionLables = scrapy.Field()    # job tags (list of strings)
    workYear = scrapy.Field()          # required experience
    salary = scrapy.Field()            # salary range text
    city = scrapy.Field()              # city of the posting
    education = scrapy.Field()         # required education level
    companyShortName = scrapy.Field()  # company short name
    industryField = scrapy.Field()     # company industry
    financeStage = scrapy.Field()      # company financing stage
    companySize = scrapy.Field()       # company headcount range
    time = scrapy.Field()              # posting time text from the page
    updated_at = scrapy.Field()        # local timestamp when the item was scraped
settings文件:settings.py
path:zhaopin/zhaopin/settings.py
# Scrapy project settings for the zhaopin project.
BOT_NAME = 'zhaopin'
SPIDER_MODULES = ['zhaopin.spiders']
NEWSPIDER_MODULE = 'zhaopin.spiders'
# The site's robots.txt is deliberately ignored so the spider can crawl it.
ROBOTSTXT_OBEY = False
# Uncomment the block below when persisting items to MongoDB:
#ITEM_PIPELINES = {
#    'zhaopin.pipelines.ZhaopinPipeline': 300,
#}
# MONGO_HOST = "127.0.0.1"             # MongoDB host IP
# MONGO_PORT = 27017                   # MongoDB port
# MONGO_DB = "scrapy_mongo"            # database name
# MONGO_COLL = "scrapy_collection"     # collection name
好了,現(xiàn)在可以運行了;注意終端的當前目錄必須是項目根目錄。

運行.png
scrapy crawl BoosZhiPin

mongo.png

終端.png
(想把數(shù)據(jù)存到mongo里面的話看這塊)
把 settings.py 文件中關于 mongo 的配置取消注釋,再在 pipelines.py 中添加以下代碼。
一定要注意 scrapy 1.6 和 scrapy 1.7 不一樣:1.7 取消了 scrapy.conf 這個包,換成了
from scrapy.utils.project import get_project_settings
scrapy1.7
from . import settings
import pymongo
from scrapy.utils.project import get_project_settings
class ZhaopinPipeline(object):
    """Item pipeline that stores every scraped item in MongoDB.

    Connection parameters (MONGO_HOST, MONGO_PORT, MONGO_DB, MONGO_COLL) are
    read from the project settings via get_project_settings (Scrapy >= 1.7).
    """

    def __init__(self):
        # Renamed local: the original `settings` shadowed the module-level
        # `from . import settings` import.
        project_settings = get_project_settings()
        # Connect to MongoDB; keep the client so it can be closed later.
        self.client = pymongo.MongoClient(
            host=project_settings.get('MONGO_HOST'),
            port=project_settings.get('MONGO_PORT'))
        self.db = self.client[project_settings.get('MONGO_DB')]    # database handle
        self.coll = self.db[project_settings.get('MONGO_COLL')]    # collection handle
        # If the database requires authentication:
        # self.db.authenticate(settings['MONGO_USER'], settings['MONGO_PSW'])

    def process_item(self, item, spider):
        """Insert one item into the collection and return it unchanged."""
        post_item = dict(item)  # convert the item to a plain dict
        # insert_one replaces Collection.insert, which was deprecated in
        # PyMongo 3 and removed in PyMongo 4.
        self.coll.insert_one(post_item)
        return item  # returning the item lets later pipelines / logging see it

    def close_spider(self, spider):
        """Release the MongoDB connection when the spider finishes."""
        self.client.close()
scrapy1.6
from . import settings
import pymongo
from scrapy.conf import settings
class ZhaopinPipeline(object):
    """Item pipeline that stores every scraped item in MongoDB.

    Connection parameters (MONGO_HOST, MONGO_PORT, MONGO_DB, MONGO_COLL) are
    read from the global `settings` (Scrapy 1.6 style, scrapy.conf).
    """

    def __init__(self):
        # Connect to MongoDB; keep the client so it can be closed later.
        self.client = pymongo.MongoClient(
            host=settings['MONGO_HOST'], port=settings['MONGO_PORT'])
        self.db = self.client[settings['MONGO_DB']]    # database handle
        self.coll = self.db[settings['MONGO_COLL']]    # collection handle
        # If the database requires authentication:
        # self.db.authenticate(settings['MONGO_USER'], settings['MONGO_PSW'])

    def process_item(self, item, spider):
        """Insert one item into the collection and return it unchanged."""
        post_item = dict(item)  # convert the item to a plain dict
        # insert_one replaces Collection.insert, which was deprecated in
        # PyMongo 3 and removed in PyMongo 4.
        self.coll.insert_one(post_item)
        return item  # returning the item lets later pipelines / logging see it

    def close_spider(self, spider):
        """Release the MongoDB connection when the spider finishes."""
        self.client.close()