#!/usr/bin/env python
# encoding: utf-8
"""
Author: ISeeMoon
Python: 3.6
Software: PyCharm
File: Lj_async.py
Time: 2018/5/6 15:26
"""
import requests
from lxml import etree
import asyncio
import aiohttp
import pandas
import re
import math
import time
# Interactive setup: ask the user for a city and a total-price range (in 萬(wàn)),
# then build the Lianjia second-hand-listings base URL for that city.
loction_info = ''' 1→杭州
2→武漢
3→北京
按ENTER確認(rèn):'''
loction_select = input(loction_info)
# Menu choice -> Lianjia city subdomain.
loction_dic = {'1': 'hz', '2': 'wh', '3': 'bj'}
city_url = f'https://{loction_dic[loction_select]}.lianjia.com/ershoufang/'
down = input('請(qǐng)輸入價(jià)格下限(萬(wàn)):')
up = input('請(qǐng)輸入價(jià)格上限(萬(wàn)):')
# Work list of (lower, upper) price intervals still to be crawled/split.
inter_list = [(int(down), int(up))]
def half_inter(inter):
    """Replace *inter* in the global inter_list with its two halves.

    Used to shrink a price interval whose listing count exceeds what
    Lianjia will paginate; mutates inter_list in place.
    """
    lower, upper = inter
    delta = int((upper - lower) / 2)
    inter_list.remove(inter)
    print('已經(jīng)縮小價(jià)格區(qū)間', inter)
    inter_list.append((lower, lower + delta))
    inter_list.append((lower + delta, upper))
# Cache: (lower, upper) price interval -> number of matching listings.
pagenum = {}
def get_num(inter):
    """Request the listing count for one price interval and cache it.

    Builds the filtered search URL (bp = price floor, ep = price ceiling,
    both in 萬(wàn)), scrapes the total count from the result page, stores it
    in the global ``pagenum`` dict and returns it as an int.
    """
    url = city_url + 'bp{}ep{}/'.format(inter[0], inter[1])
    r = requests.get(url).text
    # BUG FIX: the XPath must start with '//'; the original
    # "http://h2[...]" was a web-scrape auto-link artifact and matched nothing.
    num = int(etree.HTML(r).xpath("//h2[@class='total fl']/span/text()")[0].strip())
    pagenum[(inter[0], inter[1])] = num
    return num
# Total listing count for the whole user-supplied price range.
totalnum = get_num(inter_list[0])
# Keep bisecting price intervals until every interval yields <= 3000
# listings (the site only paginates the first 100 pages x 30 results).
while True:
    # BUG FIX: the original iterated inter_list while half_inter() mutated
    # it, and called get_num() (a network request) up to three times per
    # interval per pass. Query each interval exactly once per pass, over a
    # snapshot of the list.
    counts = {inter: get_num(inter) for inter in list(inter_list)}
    oversized = [inter for inter, n in counts.items() if n > 3000]
    if not oversized:
        break
    for inter in oversized:
        half_inter(inter)
print('價(jià)格區(qū)間縮小完畢!')
# Bookkeeping lists shared with the async crawler below.
url_lst = []            # every page URL to crawl
url_lst_failed = []     # URLs that returned a non-200 status
url_lst_successed = []  # URLs fetched successfully
url_lst_duplicated = [] # records whose href was already collected
# One URL per (interval, page) pair; listings are shown 30 per page.
for inter in inter_list:
    totalpage = math.ceil(pagenum[inter] / 30)
    for page in range(1, totalpage + 1):
        url_lst.append(city_url + 'pg{}bp{}ep{}/'.format(page, inter[0], inter[1]))
print('url列表獲取完畢!')
# Collected house records (one dict per listing card).
info_lst = []
async def get_info(url):
    """Fetch one Lianjia listing page and parse every house card on it.

    Side effects: appends *url* to url_lst_successed / url_lst_failed,
    appends parsed records to the global info_lst, and routes records
    whose href was already seen into url_lst_duplicated.
    Any field that cannot be extracted is stored as '/'.
    """
    def _field(extract):
        # The original wrapped every single extraction in a bare
        # try/except falling back to '/'; keep that contract.
        try:
            return extract()
        except Exception:
            return '/'

    async with aiohttp.ClientSession() as session:
        async with session.get(url, timeout=5) as resp:
            if resp.status != 200:
                url_lst_failed.append(url)
                # Don't try to parse an error page.
                return
            url_lst_successed.append(url)
            r = await resp.text()
            # BUG FIX: the XPath must start with '//'; the original
            # "http://ul[...]" was a web-scrape auto-link artifact.
            nodelist = etree.HTML(r).xpath("//ul[@class='sellListContent']/li")
            print('開(kāi)始抓取{}'.format(resp.url))
            index = 1
            for node in nodelist:
                # Hoist the two pipe/slash-delimited strings that many
                # fields slice, instead of re-querying per field.
                try:
                    house_parts = node.xpath(".//div[@class='houseInfo']")[0].xpath('string(.)').replace(' ', '').split('|')
                except IndexError:
                    house_parts = []
                try:
                    follow_parts = node.xpath(".//div[@class='followInfo']/text()")[0].replace(' ', '').split('/')
                except IndexError:
                    follow_parts = []
                position = node.xpath(".//div[@class='positionInfo']/text()")
                info_dic = {
                    'title': _field(lambda: node.xpath(".//div[@class='title']/a/text()")[0]),
                    'href': _field(lambda: node.xpath(".//div[@class='title']/a/@href")[0]),
                    'xiaoqu': _field(lambda: house_parts[0]),
                    'huxing': _field(lambda: house_parts[1]),
                    'area': _field(lambda: house_parts[2]),
                    'chaoxiang': _field(lambda: house_parts[3]),
                    'zhuangxiu': _field(lambda: house_parts[4]),
                    'dianti': _field(lambda: house_parts[5]),
                    # NOTE: like the original, these two store re.findall's
                    # *list* result, not a single string.
                    'louceng': _field(lambda: re.findall(r'\((.*)\)', position[0])),
                    'nianxian': _field(lambda: re.findall(r'\)(.*?)年', position[0])),
                    'guanzhu': _field(lambda: ''.join(re.findall('[0-9]', follow_parts[0]))),
                    'daikan': _field(lambda: ''.join(re.findall('[0-9]', follow_parts[1]))),
                    'fabu': _field(lambda: follow_parts[2]),
                    'totalprice': _field(lambda: node.xpath(".//div[@class='totalPrice']/span/text()")[0]),
                    'unitprice': _field(lambda: node.xpath(".//div[@class='unitPrice']/span/text()")[0].replace('單價(jià)', '')),
                }
                # De-duplicate across pages by the listing's href.
                if any(info_dic['href'] in dic.values() for dic in info_lst):
                    url_lst_duplicated.append(info_dic)
                else:
                    info_lst.append(info_dic)
                print('第{}條: {}→房屋信息抓取完畢!'.format(index, info_dic['title']))
                index += 1
start = time.time()
# First crawl pass over every page URL. (Some requests may never be issued
# or may fail; the retry loop below sweeps those up.)
tasks = [asyncio.ensure_future(get_info(url)) for url in url_lst]
loop = asyncio.get_event_loop()
loop.run_until_complete(asyncio.wait(tasks))
# BUG FIX: the original condition
#     if url not in url_lst_successed or url_lst_failed:
# parses as `(url not in successed) or bool(failed)`, i.e. it is true for
# EVERY url as soon as a single request has failed. The intent (matching
# the recomputation inside the loop below) is: retry every url that has
# not yet succeeded, which also covers failed ones.
url_lst_unrequested = [url for url in url_lst if url not in url_lst_successed]
# Keep re-issuing the missing requests until every url has succeeded.
while url_lst_unrequested:
    tasks_unrequested = [asyncio.ensure_future(get_info(url)) for url in url_lst_unrequested]
    loop.run_until_complete(asyncio.wait(tasks_unrequested))
    url_lst_unrequested = [url for url in url_lst if url not in url_lst_successed]
end = time.time()
# BUG FIX: '\(' / '\)' in a plain (non-regex) string printed literal
# backslashes in the summary message.
print('當(dāng)前價(jià)格區(qū)間段內(nèi)共有{}套二手房源(包含{}條重復(fù)房源),實(shí)際獲得{}條房源信息。'.format(totalnum, len(url_lst_duplicated), len(info_lst)))
print('總共耗時(shí){}秒'.format(end - start))
df = pandas.DataFrame(info_lst)
df.to_csv(r"C:\test\ljwh.csv", encoding='gbk')
##################同步爬取##########################
# info_lst = []
#
# start1 = time.time()
# for url in url_lst:
# resp = requests.get(url)
# nodelist = etree.HTML(resp.text).xpath("http://ul[@class='sellListContent']/li")
# info_dic = {}
# index = 1
# print('開(kāi)始抓取{}'.format(resp.url))
# print('開(kāi)始抓取{}'.format(resp.url))
# print('開(kāi)始抓取{}'.format(resp.url))
# for node in nodelist:
# try:
# info_dic['title'] = node.xpath(".//div[@class='title']/a/text()")[0]
# except:
# info_dic['title'] = '/'
# try:
# info_dic['href'] = node.xpath(".//div[@class='title']/a/@href")[0]
# except:
# info_dic['href'] = '/'
# try:
# info_dic['xiaoqu'] = node.xpath(".//div[@class='houseInfo']")[0].xpath('string(.)').replace(' ', '').split('|')[
# 0]
# except:
# info_dic['xiaoqu'] = '/'
# try:
# info_dic['huxing'] = node.xpath(".//div[@class='houseInfo']")[0].xpath('string(.)').replace(' ', '').split('|')[
# 1]
# except:
# info_dic['huxing'] = '/'
# try:
# info_dic['area'] = node.xpath(".//div[@class='houseInfo']")[0].xpath('string(.)').replace(' ', '').split('|')[2]
# except:
# info_dic['area'] = '/'
# try:
# info_dic['chaoxiang'] = \
# node.xpath(".//div[@class='houseInfo']")[0].xpath('string(.)').replace(' ', '').split('|')[3]
# except:
# info_dic['chaoxiang'] = '/'
# try:
# info_dic['zhuangxiu'] = \
# node.xpath(".//div[@class='houseInfo']")[0].xpath('string(.)').replace(' ', '').split('|')[4]
# except:
# info_dic['zhuangxiu'] = '/'
# try:
# info_dic['dianti'] = node.xpath(".//div[@class='houseInfo']")[0].xpath('string(.)').replace(' ', '').split('|')[
# 5]
# except:
# info_dic['dianti'] = '/'
# try:
# info_dic['louceng'] = re.findall('\((.*)\)', node.xpath(".//div[@class='positionInfo']/text()")[0])
# except:
# info_dic['louceng'] = '/'
# try:
# info_dic['nianxian'] = re.findall('\)(.*?)年', node.xpath(".//div[@class='positionInfo']/text()")[0])
# except:
# info_dic['nianxian'] = '/'
# try:
# info_dic['guanzhu'] = ''.join(
# re.findall('[0-9]', node.xpath(".//div[@class='followInfo']/text()")[0].replace(' ', '').split('/')[0]))
# except:
# info_dic['guanzhu'] = '/'
# try:
# info_dic['daikan'] = ''.join(
# re.findall('[0-9]', node.xpath(".//div[@class='followInfo']/text()")[0].replace(' ', '').split('/')[1]))
# except:
# info_dic['daikan'] = '/'
# try:
# info_dic['fabu'] = node.xpath(".//div[@class='followInfo']/text()")[0].replace(' ', '').split('/')[2]
# except:
# info_dic['fabu'] = '/'
# try:
# info_dic['totalprice'] = node.xpath(".//div[@class='totalPrice']/span/text()")[0]
# except:
# info_dic['totalprice'] = '/'
# try:
# info_dic['unitprice'] = node.xpath(".//div[@class='unitPrice']/span/text()")[0].replace('單價(jià)', '')
# except:
# info_dic['unitprice'] = '/'
# if True in [info_dic['href'] in dic.values() for dic in info_lst]:
# url_lst_duplicated.append(info_dic)
# else:
# info_lst.append(info_dic)
# print('第{}條: {}→房屋信息抓取完畢!'.format(index, info_dic['title']))
# index += 1
# info_dic = {}
# end = time.time()
# print('實(shí)際獲得{}條房源信息。'.format(len(info_lst)))
# print('總共耗時(shí){}秒'.format(end-start))
# Source article title: 鏈家爬蟲(chóng)代碼_asyncio (Lianjia crawler, asyncio version).
# The remaining lines here were web-page boilerplate (copyright notice, AI-content
# disclaimers, "related reading" snippets) accidentally captured when the code was
# scraped; they were not valid Python and have been removed so the file can run.