import time

import pandas as pd
import requests
from lxml import etree
from pandas import DataFrame, Series
# HTTP headers sent with every request made by this script.
# The Cookie value is a masked placeholder in this file.
headers = {
    'Cookie':'************************************************',
    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36',
    'Connection':'keep-alive'
}

# Accumulates one dict per scraped shop; turned into a DataFrame at the end of the run.
info_list = []
def get_url(url, pages=30):
    """Crawl every category linked from *url*, page by page.

    Fetches *url*, reads the ``href`` of each link under the element with
    id ``classfy``, and for each of those links calls ``get_href`` on the
    paginated addresses ``<href>p1`` ... ``<href>p<pages>``.

    Args:
        url: Address of the page that contains the category links.
        pages: Number of result pages to request per category
            (defaults to 30).
    """
    # A timeout keeps a stalled connection from blocking the crawl forever.
    res = requests.get(url, headers=headers, timeout=30)
    selector = etree.HTML(res.text)
    hrefs = selector.xpath('//*[@id="classfy"]/a/@href')
    for href in hrefs:
        print(href)
        for page in range(1, pages + 1):
            new_href = href + 'p' + str(page)
            get_href(new_href)
def _first(values, default=" "):
    """Return the first item of an XPath result list, or *default* if it is empty."""
    return values[0] if values else default


def get_href(new_href):
    """Scrape one listing page and append a record per shop to ``info_list``.

    Requests *new_href*, selects every ``li`` under the element with id
    ``shop-all-list``, and extracts ten fields from each item with relative
    XPath expressions. A field whose XPath matches nothing is stored as a
    single space, so one incomplete listing does not abort the crawl.

    Args:
        new_href: Address of the listing page to scrape.
    """
    # A timeout keeps a stalled connection from blocking the crawl forever.
    html = requests.get(new_href, headers=headers, timeout=30)
    selector_2 = etree.HTML(html.text)
    htmls = selector_2.xpath('//*[@id="shop-all-list"]/ul/li')
    # Example absolute path of a shop name, for orientation:
    # //*[@id="shop-all-list"]/ul/li[1]/div[2]/div[1]/a[1]/h4
    for html_3 in htmls:
        # The keys below are also used as the column names when the
        # DataFrame is built, so they must stay exactly as written.
        info = {
            '店名': _first(html_3.xpath('div[2]/div[1]/a[1]/h4/text()')),
            '星級': _first(html_3.xpath('div[2]/div[2]/span/@title')),
            '評論數(shù)': _first(html_3.xpath('div[2]/div[2]/a[1]/b/text()')),
            '均價': _first(html_3.xpath('div[2]/div[2]/a[2]/b/text()')),
            '類型': _first(html_3.xpath('div[2]/div[3]/a[1]/span/text()')),
            '商區(qū)': _first(html_3.xpath('div[2]/div[3]/a[2]/span/text()')),
            '地址': _first(html_3.xpath('div[2]/div[3]/span/text()')),
            '口味': _first(html_3.xpath('div[2]/span/span[1]/b/text()')),
            '環(huán)境': _first(html_3.xpath('div[2]/span/span[2]/b/text()')),
            '服務': _first(html_3.xpath('div[2]/span/span[3]/b/text()')),
        }
        info_list.append(info)
    # NOTE(review): the original indentation was lost; the pause is assumed
    # to apply once per page rather than once per shop - confirm.
    time.sleep(3)
if __name__ == '__main__':
    # Entry point: crawl the listing, tabulate the collected records, and
    # write them out as CSV.
    url = 'http://www.dianping.com/shenzhen/ch10'
    get_url(url)
    # Column names must match the keys of the dicts stored in info_list.
    data = pd.DataFrame(info_list,columns=['店名', "星級", "評論數(shù)", "均價", "類型", "商區(qū)","地址", "口味", "環(huán)境", "服務"])
    print(data)
    # mode='a+' appends to an existing file and header=True writes the header
    # row on every run, so repeated runs add a header line before each batch.
    data.to_csv(r'C:\Users\Administrator\Desktop\大眾點評.csv', header=True, index=False, mode='a+', encoding='gb18030')