拉鉤招聘信息爬蟲
- 分析:難點是其cookie會一直改變并且具有時效性,并且我們在職位的網站查看源代碼是查找不到想要的職位數據的,要進行抓包分析,找出真正的原始網址。
- 需求:保存的數據為csv文件
直接上代碼:
# -*- coding: utf-8 -*-
import csv
import io
import re

import requests
"""
"""
"""
需求1:獲取以下信息
'city': 城市
'companyFullName': 公司名
'companySize': 公司規模
'education': 學歷
'positionName': 職位名稱
'salary': 薪資
'workYear': 工作時間
需求2:以逗號(,)分割信息內容,寫入文件。要求文件名為 `拉鉤職位信息.csv`。
例如:
上海,上海沸橙信息科技有限公司,150-500人,本科,python,8k-12k,不限
"""
# 構造請求頭
class LGSpider:
    """Crawl job postings from lagou.com and save them as CSV rows.

    The listing page is requested only to obtain a fresh cookie jar; the
    job data itself is fetched from the ``positionAjax.json`` endpoint
    with a POST form that carries the keyword and the page number.
    """

    # Keys read from every job record, in output column order.
    FIELDS = (
        'city',
        'companyFullName',
        'companySize',
        'education',
        'positionName',
        'salary',
        'workYear',
    )

    def __init__(self, timeout=10):
        """Set up the URLs and request headers.

        :param timeout: seconds to wait for each HTTP request before
            ``requests`` gives up; without it a stalled connection would
            block the crawl indefinitely.
        """
        self.timeout = timeout
        self.url = 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput='
        self.url_real = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'
        }
        self.headers_real = {
            'Host': 'www.lagou.com',
            'Origin': 'https://www.lagou.com',
            'Referer': 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'
        }

    def get_cookie(self):
        """Request the listing page and return the cookies it sets.

        :return: the cookie jar of the listing-page response
        """
        response = requests.get(
            url=self.url,
            headers=self.headers,
            allow_redirects=False,
            timeout=self.timeout,
        )
        return response.cookies

    def data(self, first='false', kd='python', pn='1'):
        """Build the POST form for the job-data endpoint.

        :param first: value sent as the ``first`` form field
        :param kd: search keyword, i.e. the position to crawl
        :param pn: value sent as the ``pn`` form field (the page to fetch)
        :return: dict with the keys ``first``, ``pn`` and ``kd``
        """
        return {
            'first': first,
            'pn': pn,
            'kd': kd,
        }

    def post_lg(self, real_data):
        """POST the form to the job-data endpoint and return the parsed JSON.

        A new cookie jar is fetched for every call via ``get_cookie``.

        :param real_data: form dict as produced by ``data``
        :return: the decoded JSON body of the response
        """
        print('*' * 50 + 'data:' + '*' * 50, real_data)
        response = requests.post(
            url=self.url_real,
            headers=self.headers_real,
            data=real_data,
            cookies=self.get_cookie(),
            timeout=self.timeout,
        )
        return response.json()

    @staticmethod
    def _format_row(values):
        """Return ``values`` as one CSV record without a line terminator.

        The ``csv`` module quotes fields that contain commas or quotes, so
        such values no longer shift or corrupt the columns.
        """
        buffer = io.StringIO()
        csv.writer(buffer, lineterminator='').writerow(values)
        return buffer.getvalue()

    def parsel_lg(self, real_data, position):
        """Fetch one result page, print each job and append it to the CSV.

        :param real_data: request form, built in ``main``
        :param position: keyword used to name the output file
        :raises KeyError: if the response lacks the expected
            ``content -> positionResult -> result`` structure or a job
            record lacks one of ``FIELDS``
        """
        response_json = self.post_lg(real_data)
        jobs = response_json['content']['positionResult']['result']
        for job in jobs:
            # Every field, including workYear, is written unchanged; the
            # requirements' sample row shows workYear as '不限', which the
            # former "drop the last character" step turned into '不'.
            row = self._format_row([job[key] for key in self.FIELDS])
            print(row)
            self.save_lg_data(row, position)

    def write_head(self, position):
        """Create (or truncate) the CSV file and write the header line.

        :param position: keyword used to name the output file
        """
        with open(f'{position}拉鉤.csv', mode='w', encoding='utf-8', newline='') as fp:
            fp.write(','.join(self.FIELDS) + '\n')

    def save_lg_data(self, lg_data_str_done, position):
        """Append one already formatted CSV line to the output file.

        :param lg_data_str_done: the record text, without trailing newline
        :param position: keyword used to name the output file
        """
        with open(f'{position}拉鉤.csv', mode='a', encoding='utf-8', newline='') as fp:
            fp.write(lg_data_str_done + '\n')

    def main(self, page=5, kd='python'):
        """Write the CSV header, then crawl ``page`` result pages for ``kd``.

        :param page: number of result pages to crawl
        :param kd: search keyword, i.e. the position to crawl
        """
        self.write_head(kd)
        # Start at 1 to match the default ``pn='1'`` of ``data``; the old
        # ``range(page)`` requested page 0 and never reached the last page.
        for page_number in range(1, page + 1):
            real_data = self.data(pn=str(page_number), kd=kd)
            position = real_data.get('kd')
            self.parsel_lg(real_data, position)
            print(f"----正在爬取第{page_number}頁的{position}職位的招聘信息-----")
if __name__ == '__main__':
    # Ask for the job keyword and the number of result pages, then crawl.
    # int() raises ValueError if the page count is not a whole number.
    spider = LGSpider()
    kd = input("你想爬取的職位:")
    pn = int(input("爬取的頁數:"))
    spider.main(kd=kd, page=pn)
GitHub項目地址:https://github.com/Key-lei/lagouSpider