新人初次尝试:访问的次数多了会被点评反爬,需要浏览器滑动验证,暂时还没有学会怎么破解。

20191017092225724.png
import requests
import re
import csv
import time
# Accumulates one record per listing entry across all marse_page() calls;
# main() reads it afterwards to build the CSV rows.
mts = []
# Headers sent with every request: a desktop browser User-Agent plus a
# hard-coded session cookie copied from a browser session.
_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
    'cookie': 'navCtgScroll=100; navCtgScroll=200; _lxsdk_cuid=16d7bde3e45c8-0b491cbf188485-67e1b3f-1fa400-16d7bde3e46c8; _lxsdk=16d7bde3e45c8-0b491cbf188485-67e1b3f-1fa400-16d7bde3e46c8; _hc.v="\"ab6667ff-ff89-4c88-9924-2865edbe01ee.1569741222\""; s_ViewType=10; mpmerchant_portal_shopid=18189287; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; cy=24; cye=shijiazhuang; _lxsdk_s=16dd2e5facb-327-0e0-88a%7C%7C190',
}
# A shop id captured from the listing is appended to this prefix.
_SHOP_URL_PREFIX = 'http://www.dianping.com/shop/'
# Seconds to wait on a request before giving up instead of hanging forever.
_REQUEST_TIMEOUT = 10


def _fetch_text(url):
    """Download *url* with the shared headers and return its body decoded as UTF-8."""
    response = requests.get(url, headers=_HEADERS, timeout=_REQUEST_TIMEOUT)
    return response.content.decode('utf-8')


def _collect_deals(li, page_url):
    """Return ``(deal title, sold-count matches)`` tuples for one listing entry."""
    from urllib.parse import urljoin

    deals = []
    for promo in re.findall(r'<div class="svr-info">(.*?)</div>', li, re.DOTALL):
        # NOTE(review): the label was restored to '团购:' because the text in
        # this file had been corrupted; confirm against the live page markup.
        titles = re.findall(r'>团购:</span>(.*?)\n', promo, re.DOTALL)
        # NOTE(review): the href capture was reconstructed from a truncated
        # pattern; confirm against the live page markup.
        links = re.findall(r'<a target="_blank" href="(.*?)"', promo, re.DOTALL)
        for link, title in zip(links, titles):
            # urljoin resolves absolute, protocol-relative and relative hrefs
            # against the listing page URL.
            deal_html = _fetch_text(urljoin(page_url, link))
            sold = re.findall(r'<span>已售(.*?)<', deal_html, re.DOTALL)
            deals.append((title, sold))
    return deals


def _parse_shop(t):
    """Extract the shop fields from a shop detail page; every value is a list of regex matches."""
    return {
        'name': re.findall(r'<h1 class="shop-name">(.*?) <a', t, re.DOTALL),
        'title': re.findall(r'<span title="(.*?)"', t, re.DOTALL),
        'reviewCount': re.findall(r'<span id="reviewCount" class="item">(.*?)<', t, re.DOTALL),
        'avg': re.findall(r'<span id="avgPriceTitle".*?>(.*?)</', t, re.DOTALL),
        'score': re.findall(r'<span id="comment_score">.*?"item">(.*?)</.*?"item">(.*?)</.*?"item">(.*?)</', t, re.DOTALL),
        'address': re.findall(r'itemprop="street-address" title="(.*?)">', t, re.DOTALL),
        'xy': re.findall(r'shopGlat: "(.*?)", shopGlng:"(.*?)",', t, re.DOTALL),
    }


def marse_page(url, delay=0):
    """Scrape one search-result page and append its entries to the global ``mts``.

    For each ``<li class="">`` entry on the page, the deal pages linked from
    its promotion blocks are fetched first, then the detail page of every
    shop id found in the entry. One record ``{'mt': [shop dicts], 'cx':
    [(deal title, sold matches), ...]}`` is appended to ``mts`` per entry,
    even when no shop id was found (``'mt'`` is then an empty list).

    Args:
        url: URL of the search-result page to scrape.
        delay: Seconds to sleep after each shop detail request. Defaults to
            0, i.e. no throttling.

    Raises:
        requests.exceptions.RequestException: If a request fails or exceeds
            the timeout.
        UnicodeDecodeError: If a response body is not valid UTF-8.
    """
    text = _fetch_text(url)
    for li in re.findall(r'li class=""(.*?)</li>', text, re.DOTALL):
        shop_ids = re.findall(r'<div class="pic" >.*?data-shopid="(.*?)".*?', li, re.DOTALL)
        deals = _collect_deals(li, url)

        shops = []
        for shop_id in shop_ids:
            durl = _SHOP_URL_PREFIX + shop_id
            shop = _parse_shop(_fetch_text(durl))
            print(durl)
            time.sleep(delay)
            print(shop)
            shops.append(shop)

        mts.append({'mt': shops, 'cx': deals})
def main():
    """Scrape search-result pages 1-10 and write the collected shops to a CSV file.

    Each page is passed to ``marse_page``, which fills the global ``mts``.
    One CSV row is written per ``mts`` record, using the first shop of the
    record together with the record's deal list. Records without any shop
    are skipped. Finally the raw ``mts`` list is printed.
    """
    # Pages 1 through 10 inclusive; range() excludes its stop value.
    for i in range(1, 11):
        url = 'http://www.dianping.com/search/keyword/24/0_%E9%BD%BF%E7%A7%91/p{}'.format(i)
        print(url)
        marse_page(url)

    rows = []
    for record in mts:
        shops = record['mt']
        if not shops:
            # No shop id was found for this entry, so there is nothing to index.
            continue
        shop = shops[0]
        rows.append((
            shop['name'],
            shop['title'],
            shop['reviewCount'],
            shop['avg'],
            shop['score'],
            shop['address'],
            shop['xy'],
            record['cx'],
        ))

    # NOTE(review): the header labels and the file name were restored after
    # the text in this file had been corrupted; confirm the intended values.
    header = ['医院名', '星级', '评论数', '人均', '评分', '地址', '经纬度', '团单']
    # An explicit encoding keeps the output independent of the platform
    # default; the BOM written by utf-8-sig lets spreadsheet tools detect UTF-8.
    with open('美团.csv', 'w', newline='', encoding='utf-8-sig') as fp:
        writer = csv.writer(fp)
        writer.writerow(header)
        writer.writerows(rows)
    print(mts)
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()