Python在數據獲取和數據分析方面,有很大的優勢和便利。用Python來進行Web數據挖掘已經是每個優秀數據分析師的必備技能,但對於沒有接受過系統技術培訓的Python自學者而言,想要獨立進行Web數據爬取與數據分析必然有難度,打下堅實的基礎毋庸置疑是每個初學者最需要重視的事情。但如果想要在學習過程中嘗試利用Python做簡單的數據爬取也是比較簡單的。
主要代碼:
import os
import random
import time
from collections import Counter

import matplotlib
import numpy as np
import xlrd
import xlwt
from matplotlib import pyplot as plt
from matplotlib.font_manager import FontProperties  # font library
from requests_html import HTMLSession
from xlutils.copy import copy
# Pool of browser User-Agent strings; one is picked at random per request.
# Every entry is unique so that random.choice() draws each agent with equal
# probability.
USER_AGENT_LIST = [
    'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3451.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:57.0) Gecko/20100101 Firefox/57.0',
    'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.71 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.2999.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.70 Safari/537.36',
    'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.4; en-US; rv:1.9.2.2) Gecko/20100316 Firefox/3.6.2',
    'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.155 Safari/537.36 OPR/31.0.1889.174',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.1.4322; MS-RTC LM 8; InfoPath.2; Tablet PC 2.0)',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36 OPR/55.0.2994.61',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.814.0 Safari/535.1',
    'Mozilla/5.0 (Macintosh; U; PPC Mac OS X; ja-jp) AppleWebKit/418.9.1 (KHTML, like Gecko) Safari/419.3',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0; Touch; MASMJS)',
    'Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.21 (KHTML, like Gecko) Chrome/19.0.1041.0 Safari/535.21',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4093.3 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko; compatible; Swurl) Chrome/77.0.3865.120 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4086.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:75.0) Gecko/20100101 Firefox/75.0',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) coc_coc_browser/91.0.146 Chrome/85.0.4183.146 Safari/537.36',
    'Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/537.36 (KHTML, like Gecko) Safari/537.36 VivoBrowser/8.4.72.0 Chrome/62.0.3202.84',
    'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.101 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36 Edg/87.0.664.60',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.16; rv:83.0) Gecko/20100101 Firefox/83.0',
    'Mozilla/5.0 (X11; CrOS x86_64 13505.63.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:68.0) Gecko/20100101 Firefox/68.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.101 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36 OPR/72.0.3815.400',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.101 Safari/537.36',
]
# Module-level requests-html session used by TencentSpider for its API requests.
session = HTMLSession()
class TencentSpider(object):
    """Crawl job postings from the Tencent careers query API.

    Each posting is appended as one row to ``tencent_data/source.xls`` and,
    once crawling stops, four summary charts (line, pie, scatter, bar) are
    drawn from the collected work locations and job categories.
    """

    def __init__(self, max_pages=19):
        """Set up the crawl state.

        :param max_pages: highest page index to request; crawling stops after
            this page (or earlier if the API returns no postings).
        """
        # Endpoint that every page request is sent to.
        self.start_url = 'https://careers.tencent.com/tencentcareer/api/post/Query'
        # Index of the page requested next (sent as ``pageIndex``).
        self.page_num = 1
        # Last page to request.
        self.max_pages = max_pages
        # Loop flag; set to False to stop requesting pages.
        self.is_running = True
        # Work location of every collected posting, in collection order.
        self.addr_list = []
        # Job category of every collected posting, in collection order.
        self.category_list = []

    def parse_start_url(self):
        """Request result pages until the crawl stops, then draw the charts.

        :return: None
        """
        while self.is_running:
            params = {
                # Current time as a 13-digit millisecond timestamp.
                'timestamp': str(int(time.time() * 1000)),
                'countryId': '',
                'cityId': '',
                'bgIds': '',
                'productId': '',
                'categoryId': '',
                'parentCategoryId': '',
                'attrId': '',
                'keyword': '',
                'pageIndex': str(self.page_num),
                'pageSize': '10',
                'language': 'zh-cn',
                'area': 'cn',
            }
            headers = {
                'user-agent': random.choice(USER_AGENT_LIST),
            }
            # A timeout keeps a stalled connection from hanging the crawl.
            response = session.get(
                url=self.start_url, headers=headers, params=params, timeout=10
            ).json()
            self.parse_response_json(response)
            self.page_num += 1
            if self.page_num > self.max_pages:
                self.is_running = False
        # Crawling finished: build the summary charts.
        self.crate_img_four_func()

    @staticmethod
    def _tally(addr_list, category_list):
        """Count postings per location, per category, and in one merged counter.

        The merged counter is filled location-then-category for each posting,
        so its key order follows first appearance in the crawl.

        :param addr_list: work location of each posting
        :param category_list: job category of each posting (same order)
        :return: ``(addr_counts, cate_counts, combined)`` as ``Counter`` objects
        """
        addr_counts = Counter()
        cate_counts = Counter()
        combined = Counter()
        for addr, cate in zip(addr_list, category_list):
            addr_counts[addr] += 1
            cate_counts[cate] += 1
            combined[addr] += 1
            combined[cate] += 1
        return addr_counts, cate_counts, combined

    def crate_img_four_func(self):
        """Draw, save and show the four summary charts.

        Nothing is drawn when no postings were collected.

        :return: None
        """
        if not self.addr_list or not self.category_list:
            return
        addr_dict, cate_dict, data = self._tally(self.addr_list, self.category_list)
        self._draw_line_chart(addr_dict, cate_dict)
        self._draw_pie_chart(addr_dict)
        self._draw_scatter_chart(data)
        self._draw_bar_chart(data)

    def _draw_line_chart(self, addr_dict, cate_dict):
        """Chart 1: location counts against category counts as a line plot."""
        # These two settings let matplotlib render the Chinese labels.
        plt.rcParams['font.sans-serif'] = ['SimHei']
        plt.rcParams['axes.unicode_minus'] = False
        # The two series differ in length; keep at most five points and trim
        # both to the same length so plot() receives matching x/y sizes.
        x_axis_data = list(addr_dict.values())[:5]
        y_axis_data = list(cate_dict.values())[:5]
        shared = min(len(x_axis_data), len(y_axis_data))
        x_axis_data = x_axis_data[:shared]
        y_axis_data = y_axis_data[:shared]
        fig = plt.figure()
        # x carries the location counts and y the category counts, matching
        # the axis labels below. The colour comes from ``color=`` only.
        plt.plot(x_axis_data, y_axis_data, 'o-', color='#4169E1', alpha=0.8, linewidth=1, label='數(shù)量')
        # Without legend() the ``label`` passed to plot() is never displayed.
        plt.legend(loc="upper right")
        plt.xlabel('地點(diǎn)數(shù)量')
        plt.ylabel('工作屬性數(shù)量')
        plt.savefig('根據(jù)崗位地址和崗位屬性二者數(shù)量生成折線圖.png')
        plt.show()
        plt.close(fig)

    def _draw_pie_chart(self, addr_dict):
        """Chart 2: share of postings per work location as a pie chart."""
        addr_dict_key = list(addr_dict.keys())
        addr_dict_value = list(addr_dict.values())
        plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']
        plt.rcParams['axes.unicode_minus'] = False
        fig = plt.figure()
        plt.pie(addr_dict_value, labels=addr_dict_key, autopct='%1.1f%%')
        plt.title('崗位地址和崗位屬性百分比分布')
        plt.savefig('崗位地址和崗位屬性百分比分布-餅圖')
        plt.show()
        plt.close(fig)

    def _draw_scatter_chart(self, data):
        """Chart 3: merged location/category counts as a scatter plot."""
        plt.rcParams['font.sans-serif'] = ['SimHei']
        plt.rcParams['axes.unicode_minus'] = False
        production = list(data.keys())
        tem = list(data.values())
        colors = np.random.rand(len(tem))  # one random colour value per point
        fig = plt.figure()
        plt.scatter(tem, production, s=200, c=colors)  # marker size 200
        plt.xlabel('數(shù)量')
        plt.ylabel('名稱(chēng)')
        plt.savefig('崗位地址和崗位屬性散點(diǎn)圖')
        plt.show()
        plt.close(fig)

    def _draw_bar_chart(self, data):
        """Chart 4: merged location/category counts as a bar chart."""
        plt.rcParams['font.sans-serif'] = ['SimHei']
        plt.rcParams['axes.unicode_minus'] = False
        name_list = list(data.keys())
        num_list = list(data.values())
        width = 0.5  # bar width
        index = np.arange(len(name_list))
        fig = plt.figure()
        plt.bar(index, num_list, width, color='steelblue', tick_label=name_list, label='崗位數(shù)量')
        # Print each bar's value just above it.
        for a, b in zip(index, num_list):
            plt.text(a, b, '%.2f' % b, ha='center', va='bottom', fontsize=7)
        plt.xticks(rotation=270)
        plt.title('崗位數(shù)量和崗位屬性數(shù)量柱狀圖')
        plt.ylabel('次')
        plt.legend()
        plt.savefig('崗位數(shù)量和崗位屬性數(shù)量柱狀圖-柱狀圖', bbox_inches='tight')
        plt.show()
        plt.close(fig)

    def parse_response_json(self, response):
        """Extract every posting from one decoded API response.

        Each posting's location and category are recorded for the charts and
        the posting is appended to the Excel file. When the response carries
        no postings, the crawl loop is stopped instead.

        :param response: decoded JSON body; postings are read from
            ``response['Data']['Posts']``
        :return: None
        """
        json_data = (response.get('Data') or {}).get('Posts')
        if not json_data:
            # No postings on this page: stop requesting further pages.
            self.is_running = False
            return
        for data in json_data:
            # Work location (also kept for the charts).
            LocationName = data['LocationName']
            self.addr_list.append(LocationName)
            # Job category (also kept for the charts).
            CategoryName = data['CategoryName']
            self.category_list.append(CategoryName)
            # Posting title.
            RecruitPostName = data['RecruitPostName']
            # Responsibilities text.
            Responsibility = data['Responsibility']
            # Last update time.
            LastUpdateTime = data['LastUpdateTime']
            # URL of the posting.
            PostURL = data['PostURL']
            # The key must equal the sheet name created in save_excel().
            data_dict = {
                '崗位詳情': [RecruitPostName, LocationName, CategoryName, Responsibility, LastUpdateTime, PostURL]
            }
            self.save_excel(data_dict)
            # Progress output.
            print(f"第{self.page_num}頁(yè)--崗位{RecruitPostName}----采集完成----logging!!!")

    def save_excel(self, data_dict):
        """Append one row per matching sheet to ``tencent_data/source.xls``.

        The directory and the workbook (with its header row) are created on
        first use.

        :param data_dict: maps a sheet name to the list of cell values that
            form the new row for that sheet
        :return: None
        """
        out_dir = os.path.join(os.getcwd(), 'tencent_data')
        os.makedirs(out_dir, exist_ok=True)
        os_path = os.path.join(out_dir, 'source.xls')
        if not os.path.exists(os_path):
            # First call: create the workbook and write the header row.
            workbook = xlwt.Workbook(encoding='utf-8')
            worksheet1 = workbook.add_sheet("崗位詳情", cell_overwrite_ok=True)
            excel_data_1 = ('崗位名稱(chēng)', '工作地點(diǎn)', '工作屬性', '崗位職責(zé)', '發(fā)布時(shí)間', '崗位地址')
            for col, title in enumerate(excel_data_1):
                worksheet1.col(col).width = 2560 * 3
                # row, column, content
                worksheet1.write(0, col, title)
            workbook.save(os_path)
        # formatting_info=True makes the xlutils copy keep the column widths
        # written with the header.
        workbook = xlrd.open_workbook(os_path, formatting_info=True)
        # Convert the read-only xlrd workbook into a writable xlwt one, once,
        # so rows for several sheets all end up in the same saved file.
        new_workbook = copy(workbook)
        for sheet_index, sheet_name in enumerate(workbook.sheet_names()):
            row_values = data_dict.get(sheet_name)
            if row_values is None:
                continue
            # The existing row count is the index of the first free row.
            rows_old = workbook.sheet_by_index(sheet_index).nrows
            new_worksheet = new_workbook.get_sheet(sheet_index)
            for col, value in enumerate(row_values):
                new_worksheet.write(rows_old, col, value)
        new_workbook.save(os_path)

    def run(self):
        """Start the crawl.

        :return: None
        """
        self.parse_start_url()
if __name__ == '__main__':
    # Script entry point: build the spider, then start crawling through its
    # public run() method.
    spider = TencentSpider()
    spider.run()
【請求參數解析】:
- timestamp: 1639124304945 【13位的時間戳,Python實現方法:int(time.time() * 1000)】
- countryId: 1 【表示頁面國家/地區編號】
- cityId: 1 【表示頁面城市編號】
- bgIds: 29294 【表示頁面事業編號】
- productId: 【未知】
- categoryId: 40001001,40001002,40001003,40001004,40001005,40001006 【表示頁面職業類別編號】
- parentCategoryId: 【表示父分類的編號,如技術、產品、設計等】
- attrId: 1 【表示招聘類型,社招之類,可以用於控制,靈活搜索查詢類別】
- keyword: python 【表示查詢關鍵字,可以用於控制,靈活搜索查詢職位】
- pageIndex: 1 【表示當前的頁碼,可以控制頁碼】
- pageSize: 10 【表示每頁數量,每頁10條數據(固定不變)】
- language: zh-cn 【表示當前使用的語言(固定不變)】
- area: cn 【表示當前地區(固定不變)】