python新人嘗試爬取大眾點評齒科信息:獲取評分、經緯度、團單銷量等信息

新人初次嘗試。訪問的次數多了會被點評反爬,需要瀏覽器滑動驗證,暫時還沒有學會怎么破解。

20191017092225724.png
import csv
import re
import time
from urllib.parse import urljoin

import requests
# Module-level accumulator: marse_page() appends one {'mt': [...], 'cx': [...]}
# dict per listing entry, and main() reads it back to build the CSV rows.
mts = []
def _fetch_text(url, headers, timeout):
    """Download *url* with the given headers and return the body decoded as UTF-8."""
    response = requests.get(url, headers=headers, timeout=timeout)
    return response.content.decode('utf-8')


def marse_page(url, delay=0, timeout=10):
    """Scrape one search-result page and append its entries to ``mts``.

    For every ``<li>`` fragment of the listing page this collects:

    * the group-deal titles together with the "sold" figures scraped from
      each deal page, as a list of ``(title, sold_matches)`` tuples, and
    * one detail dict per shop id found in the fragment, holding the raw
      ``re.findall`` results for name, star title, review count, average
      price, scores, address and coordinates.

    One ``{'mt': [detail dicts], 'cx': [deal tuples]}`` dict is appended to
    the module-level ``mts`` list per fragment. Nothing is returned.

    Args:
        url: URL of the search-result (listing) page.
        delay: Seconds to sleep after each shop detail request. The default
            of 0 keeps the previous no-wait behaviour.
        timeout: Per-request timeout in seconds, so a stalled connection
            cannot hang the crawl forever.

    Raises:
        requests.RequestException: If any HTTP request fails or times out.
        UnicodeDecodeError: If a response body is not valid UTF-8.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
        'cookie':'navCtgScroll=100; navCtgScroll=200; _lxsdk_cuid=16d7bde3e45c8-0b491cbf188485-67e1b3f-1fa400-16d7bde3e46c8; _lxsdk=16d7bde3e45c8-0b491cbf188485-67e1b3f-1fa400-16d7bde3e46c8; _hc.v="\"ab6667ff-ff89-4c88-9924-2865edbe01ee.1569741222\""; s_ViewType=10; mpmerchant_portal_shopid=18189287; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; cy=24; cye=shijiazhuang; _lxsdk_s=16dd2e5facb-327-0e0-88a%7C%7C190'
    }
    surl = 'http://www.dianping.com/shop/'

    # Fetch the listing page and cut it into one HTML fragment per <li>.
    text = _fetch_text(url, headers, timeout)
    lis = re.findall(r'li class=""(.*?)</li>', text, re.DOTALL)

    # Walk the shop entries of the listing.
    for li in lis:
        # Shop ids, used below to build the detail-page URLs.
        urls = re.findall(r'<div class="pic" >.*?data-shopid="(.*?)".*?', li, re.DOTALL)
        # Promotion ("svr-info") sections of this entry.
        cxl = re.findall(r'<div class="svr-info">(.*?)</div>', li, re.DOTALL)
        # Collected (deal title, sold matches) tuples for this entry.
        listcx = []
        for cxs in cxl:
            # Deal titles. Both the ASCII and the full-width colon are
            # accepted because the exact character in the markup is unknown.
            cxss = re.findall(r'>团购[::]</span>(.*?)\n', cxs, re.DOTALL)
            # Deal links, visited below to read the sold count.
            # NOTE(review): the href capture is reconstructed; confirm it
            # against the live markup.
            cxurl = re.findall(r'<a target="_blank" href="(.*?)"', cxs, re.DOTALL)
            for scxurl, c in zip(cxurl, cxss):
                # urljoin copes with absolute, protocol-relative and relative
                # hrefs, so no hard-coded prefix is needed.
                scxurl = urljoin(url, scxurl)
                cxre = _fetch_text(scxurl, headers, timeout)
                yishou = re.findall(r'<span>已售(.*?)<', cxre, re.DOTALL)
                listcx.append((c, yishou))

        # Shop details: star title, name, scores, address and coordinates.
        mt1 = []
        for ur in urls:
            durl = surl + ur
            t = _fetch_text(durl, headers, timeout)
            name = re.findall(r'<h1 class="shop-name">(.*?) <a', t, re.DOTALL)
            title = re.findall(r'<span title="(.*?)"', t, re.DOTALL)
            reviewCount = re.findall(r'<span id="reviewCount" class="item">(.*?)<', t, re.DOTALL)
            avg = re.findall(r'<span id="avgPriceTitle".*?>(.*?)</', t, re.DOTALL)
            score = re.findall(r'<span id="comment_score">.*?"item">(.*?)</.*?"item">(.*?)</.*?"item">(.*?)</', t, re.DOTALL)
            address = re.findall(r'itemprop="street-address" title="(.*?)">', t, re.DOTALL)
            xy = re.findall(r'shopGlat: "(.*?)", shopGlng:"(.*?)",', t, re.DOTALL)
            print(durl)
            # Optional pause between detail requests.
            time.sleep(delay)

            mt2 = {
                'name': name,
                'title': title,
                'reviewCount': reviewCount,
                'avg': avg,
                'score': score,
                'address': address,
                'xy': xy,
            }
            print(mt2)
            mt1.append(mt2)

        mts.append({
            'mt': mt1,
            'cx': listcx,
        })
def main():
    """Crawl listing pages 1-10 and export the collected shops to a CSV file.

    Calls ``marse_page`` once per listing page, then turns every entry of
    the module-level ``mts`` list into one CSV row built from the first
    shop detail dict plus the entry's deal list. Values are written exactly
    as stored in ``mts``. Finally the raw ``mts`` list is printed.
    """
    # range() excludes its end bound, so 11 is needed to include page 10.
    for i in range(1, 11):
        url = 'http://www.dianping.com/search/keyword/24/0_%E9%BD%BF%E7%A7%91/p{}'.format(i)
        print(url)
        marse_page(url)

    lll = []
    for xx in mts:
        # An entry whose fragment had no shop id carries no detail dict;
        # skip it instead of failing on the [0] lookup.
        if not xx['mt']:
            continue
        shop = xx['mt'][0]
        lll.append((
            shop['name'],
            shop['title'],
            shop['reviewCount'],
            shop['avg'],
            shop['score'],
            shop['address'],
            shop['xy'],
            xx['cx'],
        ))

    # Column labels, in the same order as the row tuples above.
    tou = ['医院名', '星级', '评论数', '人均', '评分', '地址', '经纬度', '团单']
    # An explicit encoding keeps the non-ASCII output independent of the
    # platform default; the BOM lets spreadsheet tools detect UTF-8.
    with open('美团.csv', 'w', newline='', encoding='utf-8-sig') as fp:
        writer = csv.writer(fp)
        writer.writerow(tou)
        writer.writerows(lll)
    print(mts)
# Run the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
©著作權歸作者所有,轉載或內容合作請聯系作者
【社區內容提示】社區部分內容疑似由AI輔助生成,瀏覽時請結合常識與多方信息審慎甄別。
平臺聲明:文章內容(如有圖片或視頻亦包括在內)由作者上傳并發布,文章內容僅代表作者本人觀點,簡書系信息發布平臺,僅提供信息存儲服務。

相關閱讀更多精彩內容

友情鏈接更多精彩內容