1.輸入python,點擊查詢,檢查,找接口數據



2.分析結(jié)果:
此接口的url ="https://fe-api.zhaopin.com/c/i/sou"
要爬取3頁的內容: `for i in range(1,4):`
lastUrlQuery = {"p": i,"pageSize":"60","jl":"489","kw":"python","kt":"3"}
尋找params參數,用于拼接
```
# Query-string parameters for one result page; `i` is the 1-based page
# number supplied by the surrounding `for i in range(1, 4)` loop.
params = {
    "start": (i - 1) * 60,  # offset of the first result on page i
    "pageSize": "60",
    "cityId": "489",
    "workExperience": "-1",
    "education": "-1",
    "companyType": "-1",
    "employmentType": "-1",
    "jobWelfareTag": "-1",
    "kw": "python",
    "kt": "3",
}
```
3.數據提取
```
import json
import re
import time

from utilz import util
def getInfo():
    """Crawl the first three pages of "python" job search results.

    For each page the search API is queried through ``util.get``; for every
    job in the JSON response the summary fields are printed and the job's
    detail page is passed to ``getDetail``. Pages whose request fails
    (``code`` other than 1) are skipped.
    """
    search_url = "https://fe-api.zhaopin.com/c/i/sou"
    for i in range(1, 4):
        lastUrlQuery = {"p": i, "pageSize": "60", "jl": "489", "kw": "python", "kt": "3"}
        params = {
            "start": (i - 1) * 60,  # offset of the first result on page i
            "pageSize": "60",
            "cityId": "489",
            "workExperience": "-1",
            "education": "-1",
            "companyType": "-1",
            "employmentType": "-1",
            "jobWelfareTag": "-1",
            "kw": "python",
            "kt": "3",
            # requests treats a non-string iterable value as a list of values,
            # so a raw dict would be sent as its bare keys; serialize it.
            "lastUrlQuery": json.dumps(lastUrlQuery),
        }
        r = util.get(url=search_url, parmas=params)
        if r["code"] != 1:
            continue
        jsondata = json.loads(r["msg"].decode())
        result = jsondata["data"]["results"]
        for item in result:
            jobName = item["jobName"]
            salary = item["salary"]
            eduname = item["eduLevel"]["name"]
            workExpname = item["workingExp"]["name"]
            city = item["city"]["display"]
            companyName = item["company"]["name"]
            # Kept in its own variable: reusing the search URL's name here
            # would make the next page request hit a job detail page.
            detail_url = item["positionURL"]
            # Pause between detail requests.
            time.sleep(2)
            print(detail_url, jobName, salary, eduname, workExpname, city, companyName)
            getDetail(detail_url)
# def getDetail(url,jobName,salary,eduname,workExpname,city,companyName):
def getDetail(url):
    """Fetch a job detail page and extract its responsibilities section.

    The page is downloaded with ``util.get``, newlines, carriage returns and
    tabs are removed, and the text between the job-responsibilities heading
    and the work-address label is captured.

    Args:
        url: Address of the job detail page.

    Returns:
        The first captured section as a string (it may still contain HTML
        markup), or None when the request fails or the pattern does not
        match.
    """
    r = util.get(url)
    if r["code"] != 1:
        return None
    # Flatten the page so the non-greedy pattern can span what were
    # originally several lines.
    body = r["msg"].decode().replace('\n', '').replace('\r', '').replace('\t', '')
    res = re.findall('職位描述.*?崗位職責(.*?)工作地址:', body)
    if res:
        return res[0]
    return None
if __name__ == '__main__':
    # Crawl three pages of listings; the page loop lives inside getInfo().
    getInfo()
```
代碼補充,自己封裝的util如下
```
import datetime
import time
# https://www.cnblogs.com/tianyiliang/p/8270509.html
import uuid
import requests
import re
def get(url, parmas=None, head=None, cookie=None, pro=None, verfiy=None, timeout=10):
    """Send a one-off GET request and report the outcome as a dict.

    Convenience wrapper for sites without many restrictions, meant to keep
    calling code tidy. A new session is created and closed on every call,
    so it must not be used where a login has to be kept between requests.
    Proxies and the timeout are passed to the request itself rather than
    stored on the session.

    Args:
        url: Address to request.
        parmas: Optional query-string parameters, assigned to the session.
        head: Optional headers; when given they replace the session's
            default headers.
        cookie: Optional cookies, assigned to the session.
        pro: Optional proxies mapping passed to the request.
        verfiy: Optional value for the session's ``verify`` setting.
        timeout: Timeout in seconds for the request (default 10).

    Returns:
        A dict with ``"code"`` (1 when the request completed without
        raising, otherwise 0) and ``"msg"`` (the raw response body as bytes,
        or ``""`` on failure). The HTTP status is not inspected, so an error
        response still yields code 1.
    """
    ret = {"code": 0, "msg": ""}
    # The context manager closes the session on every path.
    with requests.session() as s:
        try:
            if parmas is not None:
                s.params = parmas
            if head is not None:
                s.headers = head
            if cookie is not None:
                s.cookies = cookie
            if verfiy is not None:
                s.verify = verfiy
            # proxies=None is the same as not passing proxies at all.
            r = s.get(url=url, proxies=pro, timeout=timeout)
            ret["code"] = 1
            ret["msg"] = r.content
        except Exception as e:
            # Best-effort helper: report the failure and return code 0.
            print(e)
    return ret
def post(url, data, parmas=None, headers=None, cookie=None, pro=None, verfiy=None, timeout=10):
    """Send a one-off POST request and report the outcome as a dict.

    Convenience wrapper for sites without many restrictions, meant to keep
    calling code tidy. A new session is created and closed on every call,
    so it must not be used where a login has to be kept between requests.
    Unlike ``get``, this function has two required arguments: ``url`` and
    ``data``.

    Args:
        url: Address to request.
        data: Request body passed to the POST call.
        parmas: Optional query-string parameters, assigned to the session.
        headers: Optional headers; when given they replace the session's
            default headers.
        cookie: Optional cookies, assigned to the session.
        pro: Optional proxies mapping passed to the request.
        verfiy: Optional value for the session's ``verify`` setting.
        timeout: Timeout in seconds for the request (default 10).

    Returns:
        A dict with ``"code"`` (1 when the request completed without
        raising, otherwise 0), ``"msg"`` (the raw response body as bytes, or
        ``""`` on failure) and ``"cookie"`` (the response's cookies, or None
        on failure). The HTTP status is not inspected, so an error response
        still yields code 1.
    """
    ret = {"code": 0, "msg": "", "cookie": None}
    # The context manager closes the session on every path.
    with requests.session() as s:
        try:
            if parmas is not None:
                s.params = parmas
            if headers is not None:
                s.headers = headers
            if cookie is not None:
                s.cookies = cookie
            if verfiy is not None:
                s.verify = verfiy
            # proxies=None is the same as not passing proxies at all.
            r = s.post(url=url, data=data, proxies=pro, timeout=timeout)
            ret["code"] = 1
            ret["msg"] = r.content
            ret["cookie"] = r.cookies
        except Exception as e:
            # Best-effort helper: report the failure and return code 0.
            print(e)
    return ret
def getNoHtml(body):
    """Return *body* with every ``<...>`` tag removed.

    A tag is a ``<``, one or more characters other than ``>``, and a closing
    ``>``; a tag may therefore span several lines. Text between tags is kept
    unchanged.
    """
    # The negated class already matches newlines, so no flag is needed.
    return re.sub(r'<[^>]+>', '', body)
def getUUID():
    """Generate a random UUID.

    Returns:
        The string form of a new ``uuid.uuid4()`` value.
    """
    return str(uuid.uuid4())
```