- pyquery:語法規則類似于jQuery,可以對HTML進行解析
pq = pyquery(html文檔)
pq('css選擇器')
items():獲取到多個標簽時,使用items()將pyquery轉換為一個生成器
然后使用 for in 循環(huán)
filter('css選擇器'):過濾
text():獲取標簽的文本內容
attr('屬性名'):獲取屬性值
from pyquery import PyQuery
import requests
def tencentjob(full_url):
    '''
    Crawl the job list starting at ``full_url`` and follow the "next page"
    link until there are no more pages.

    Pages are walked in a loop rather than by recursion, so the number of
    pages is not limited by the interpreter's recursion depth.

    :param full_url: absolute URL of the first list page to fetch.
    :return: None. Jobs are printed by ``parse_page_data`` as they are parsed.
    '''
    page_url = full_url
    while page_url:
        html = load_data(page_url)
        if html is None:
            # load_data returns None for a non-200 response; there is
            # nothing to parse, so stop instead of feeding None to the parser.
            break

        next_href = parse_page_data(html)
        # A missing link (None/empty) or the 'javascript:;' placeholder both
        # mean there is no further page to follow.
        if not next_href or next_href == 'javascript:;':
            break

        page_url = 'https://hr.tencent.com/' + next_href
def load_data(url, timeout=10):
    '''
    Send a GET request and return the page source.

    :param url: address to request.
    :param timeout: seconds to wait for the server before ``requests``
        raises a timeout error; without it a stalled connection would
        block the crawl indefinitely.
    :return: the response body as text when the status code is 200,
        otherwise None.
    '''
    req_header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
    }
    response = requests.get(url, headers=req_header, timeout=timeout)
    if response.status_code == 200:
        return response.text
    # Any other status is reported to the caller as "no page".
    return None
def parse_page_data(html):
    '''
    Parse one job-list page, print every job found on it, and return the
    link to the next page.

    For each job row the detail page is fetched with ``load_data`` and its
    description is extracted with ``parse_detail_data``.

    :param html: source of the list page.
    :return: the ``href`` attribute of the ``a#next`` element, or None when
        the page has no such element.
    '''
    html_pq = PyQuery(html)

    # Job rows carry the class "even" or "odd". All even rows are handled
    # first, then all odd rows (not document order).
    rows = html_pq('tr').filter('.even') + html_pq('tr').filter('.odd')

    for tr in rows.items():
        # Evaluate each selector once per row and reuse the result.
        link = tr('td.l.square a')
        cells = tr('td')

        jobinfo = {
            'title': link.text(),
            # eq(n) picks the cell at index n (0-based).
            'type': cells.eq(1).text(),
            'number': cells.eq(2).text(),
            'address': cells.eq(3).text(),
            'time': cells.eq(4).text(),
        }

        # attr() returns None when the row has no link; concatenating that
        # onto the base URL would raise TypeError, so guard it.
        href = link.attr('href')
        detail_html = None
        if href:
            detail_html = load_data('https://hr.tencent.com/' + href)

        # load_data returns None on a non-200 response; fall back to an
        # empty description rather than parsing None.
        if detail_html:
            jobinfo['content'] = parse_detail_data(detail_html)
        else:
            jobinfo['content'] = ''

        print(jobinfo)

    # Address of the next page, taken from the anchor with id "next".
    return html_pq('a').filter('#next').attr('href')
def parse_detail_data(html):
    '''
    Extract the description text from a job detail page.

    :param html: source of the detail page.
    :return: the text of every ``ul.squareli li`` element, joined by commas.
    '''
    # items() yields one PyQuery object per matched <li>.
    detail_items = PyQuery(html)('ul.squareli li').items()
    return ','.join(item.text() for item in detail_items)
if __name__ == '__main__':
    # The list page takes the starting offset in its "start" query
    # parameter; begin the crawl at offset 0.
    list_page_prefix = 'https://hr.tencent.com/position.php?&start='
    offset = 0
    full_url = list_page_prefix + str(offset)
    tencentjob(full_url)