scrapy BOSS直聘(多城市抓取)

參考:http://www.jtahstu.com/blog/scrapy_zhipin_spider.html
在原文單城市爬蟲的基礎上,改造為支持多城市抓取的版本

項目目錄.png

spider文件:BoosZhiPin_Spider.py
path:zhaopin/zhaopin/spiders/BoosZhiPin_Spider.py

import scrapy
from ..items import BoosZhiPinItem
import time
import json
from furl import furl

'''
用途:爬取BOSS直聘數(shù)據(jù)
參數(shù):地區(qū),職位信息
運行代碼:scrapy crawl BoosZhiPin
'''


class BoosZhiPin(scrapy.Spider):
    """Spider that scrapes BOSS Zhipin job listings for several cities.

    Flow: fetch the city-code JSON (start_urls), translate the configured
    city names into numeric codes, then crawl the paginated job-search
    results for each city.  Run with: ``scrapy crawl BoosZhiPin``.
    """

    name = 'BoosZhiPin'  # spider name used by `scrapy crawl`
    allowed_domains = ['www.zhipin.com']  # OffsiteMiddleware drops requests to other domains
    # First request: JSON document mapping city names to numeric codes.
    start_urls = ['https://www.zhipin.com/wapi/zpCommon/data/city.json']
    city_name = ['烏魯木齊', '喀什']  # cities to scrape
    city_code_list = []  # numeric codes resolved from city_name
    query = 'python'  # job keyword to search for
    F = furl('https://www.zhipin.com/job_detail/?')  # URL template

    # Browser-like User-Agent so the site does not reject the requests.
    headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'}

    def parse(self, response):
        """Resolve city codes from the JSON response, then queue one
        search request per requested city."""
        self.get_city_code(response)
        for code in self.city_code_list:
            yield self.request_city(code)

    def get_city_code(self, response):
        """Fill ``city_code_list`` with the codes of the cities in ``city_name``.

        BUGFIX: the original used ``insert(index, ...)`` where ``index`` was
        the city's position inside its province, which scrambles the list
        order; a plain append keeps codes in lookup order.
        """
        city_data = json.loads(response.text)  # body_as_unicode() is deprecated
        for wanted in self.city_name:
            for province in city_data['zpData']['cityList']:  # iterate provinces
                for city in province['subLevelModelList']:  # iterate cities within
                    if city['name'] == wanted:
                        self.city_code_list.append(str(city['code']))

    def request_city(self, city_code, page=0):
        """Build the request for one result page of one city.

        ``page`` is the page *already fetched*; the method increments it,
        so the first call (page=0) requests page 1.
        """
        page += 1
        url_data = {
            'city': city_code,
            'query': self.query,
            'page': page,
        }
        # URL of the search-result page to crawl
        url = self.F.copy().add(url_data).url
        req = scrapy.Request(url, callback=self.get_data,
                             dont_filter=False, headers=self.headers)
        # meta carries the pagination state into the callback (response.meta)
        req.meta['city_code'] = city_code
        req.meta['page'] = page
        return req

    def get_data(self, response):
        """Parse one search-result page into items, then request the next
        page while results keep coming."""
        job_list = response.css('div.job-list > ul > li')
        for job in job_list:
            item = BoosZhiPinItem()
            job_primary = job.css('div.job-primary')
            # default='' keeps one malformed listing from killing the crawl
            item['pid'] = job.css(
                'div.info-primary > h3 > a::attr(data-jobid)').extract_first(default='').strip()
            item["positionName"] = job_primary.css(
                'div.info-primary > h3 > a::text').extract_first(default='').strip()
            item["salary"] = job_primary.css(
                'div.info-primary > h3 > a > span::text').extract_first(default='').strip()
            info_primary = job_primary.css(
                'div.info-primary > p::text').extract()
            item['city'] = info_primary[0].strip()
            item['workYear'] = info_primary[1].strip()
            item['education'] = info_primary[2].strip()
            item['companyShortName'] = job_primary.css(
                'div.info-company > div.company-text > h3 > a::text'
            ).extract_first(default='').strip()
            company_infos = job_primary.css(
                'div.info-company > div.company-text > p::text').extract()
            if len(company_infos) == 3:  # some postings omit one field
                item['industryField'] = company_infos[0].strip()
                item['financeStage'] = company_infos[1].strip()
                item['companySize'] = company_infos[2].strip()
            item['positionLables'] = job.css(
                'li > div.job-tags > span::text').extract()
            item['time'] = job.css('span.time::text').extract_first()
            item['updated_at'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            yield item

        city_code = response.meta['city_code']
        page = response.meta['page']
        if job_list:  # stop paginating once a page comes back empty
            time.sleep(5)  # crude rate limit; remove when rotating many IPs
            # BUGFIX: request_city() itself advances the page, so passing
            # page + 1 here skipped every other page (1, 3, 5, ...).
            yield self.request_city(city_code, page=page)

items文件:items.py
path: zhaopin/zhaopin/items.py

class BoosZhiPinItem(scrapy.Item):
    """Container for one scraped BOSS Zhipin job posting."""

    # position details
    pid = scrapy.Field()              # job id from data-jobid
    positionName = scrapy.Field()     # job title
    positionLables = scrapy.Field()   # tag list shown on the card
    salary = scrapy.Field()           # advertised salary range
    workYear = scrapy.Field()         # required experience
    education = scrapy.Field()        # required education level
    city = scrapy.Field()             # city of the posting

    # company details
    companyShortName = scrapy.Field()
    industryField = scrapy.Field()
    financeStage = scrapy.Field()
    companySize = scrapy.Field()

    # timestamps
    time = scrapy.Field()             # posting time from the page
    updated_at = scrapy.Field()       # local time the item was scraped

settings文件:settings.py
path:zhaopin/zhaopin/settings.py

BOT_NAME = 'zhaopin'
SPIDER_MODULES = ['zhaopin.spiders']
NEWSPIDER_MODULE = 'zhaopin.spiders'
ROBOTSTXT_OBEY = False
#如果有mongo
#ITEM_PIPELINES = {
#   'zhaopin.pipelines.ZhaopinPipeline': 300,
#}
# MONGO_HOST = "127.0.0.1"  # 主機IP
# MONGO_PORT = 27017  # 端口號
# MONGO_DB = "scrapy_mongo"  # 庫名
# MONGO_COLL = "scrapy_collection"  # collection名

好了現(xiàn)在可以運行了,記得終端的目錄是項目根目錄


運行.png
scrapy crawl BoosZhiPin
mongo.png

終端.png

(想把數(shù)據(jù)存到mongo里面的話看這塊)
把 settings.py 文件中關于 mongo 的配置取消注釋,再在 pipelines.py 中添加以下代碼
一定要注意scrapy1.6和scrapy1.7不一樣,1.7取消了scrapy.conf這個包
換成了

from scrapy.utils.project import get_project_settings

scrapy1.7

from . import settings
import pymongo
from scrapy.utils.project import get_project_settings


class ZhaopinPipeline(object):
    """Item pipeline that stores every scraped item in MongoDB.

    Scrapy >= 1.7 variant: settings come from ``get_project_settings()``
    because the old ``scrapy.conf`` module was removed.
    """

    def __init__(self):
        settings = get_project_settings()
        # Connect using the MONGO_* keys from settings.py.
        client = pymongo.MongoClient(host=settings.get('MONGO_HOST'),
                                     port=settings.get('MONGO_PORT'))
        self.db = client[settings.get('MONGO_DB')]       # database handle
        self.coll = self.db[settings.get('MONGO_COLL')]  # collection handle
        # If the database requires authentication:
        # self.db.authenticate(settings['MONGO_USER'], settings['MONGO_PSW'])

    def process_item(self, item, spider):
        """Insert the item into the collection and pass it on unchanged."""
        postItem = dict(item)  # convert the Item to a plain dict
        # BUGFIX: Collection.insert() is deprecated in pymongo 3.x;
        # insert_one() is the supported replacement.
        self.coll.insert_one(postItem)
        return item  # returning keeps the item visible to later pipelines / logs

scrapy1.6

from . import settings
import pymongo
from scrapy.conf import settings


class ZhaopinPipeline(object):
    """Item pipeline that stores every scraped item in MongoDB.

    Scrapy 1.6 variant: reads configuration from the module-level
    ``settings`` imported via ``scrapy.conf``.
    """

    def __init__(self):
        # Connect using the MONGO_* keys from settings.py.
        client = pymongo.MongoClient(host=settings['MONGO_HOST'], port=settings['MONGO_PORT'])
        self.db = client[settings['MONGO_DB']]       # database handle
        self.coll = self.db[settings['MONGO_COLL']]  # collection handle
        # If the database requires authentication:
        # self.db.authenticate(settings['MONGO_USER'], settings['MONGO_PSW'])

    def process_item(self, item, spider):
        """Insert the item into the collection and pass it on unchanged."""
        postItem = dict(item)  # convert the Item to a plain dict
        # BUGFIX: Collection.insert() is deprecated in pymongo 3.x;
        # insert_one() is the supported replacement.
        self.coll.insert_one(postItem)
        return item  # returning keeps the item visible to later pipelines / logs
最后編輯于
?著作權(quán)歸作者所有,轉(zhuǎn)載或內(nèi)容合作請聯(lián)系作者
【社區(qū)內(nèi)容提示】社區(qū)部分內(nèi)容疑似由AI輔助生成,瀏覽時請結(jié)合常識與多方信息審慎甄別。
平臺聲明:文章內(nèi)容(如有圖片或視頻亦包括在內(nèi))由作者上傳并發(fā)布,文章內(nèi)容僅代表作者本人觀點,簡書系信息發(fā)布平臺,僅提供信息存儲服務。

相關閱讀更多精彩內(nèi)容

友情鏈接更多精彩內(nèi)容