鏈家爬蟲(chóng)代碼_asyncio

#!/usr/bin/env python
# encoding: utf-8
"""
Author: ISeeMoon
Python: 3.6
Software: PyCharm
File: Lj_async.py
Time: 2018/5/6 15:26
"""

import requests
from lxml import etree
import asyncio
import aiohttp
import pandas
import re
import math
import time


# --- interactive configuration (runs at script start, reads stdin) ---
# NOTE(review): "loction" is a typo for "location"; kept as-is because the
# names are referenced throughout the rest of the script.
loction_info = '''    1→杭州
    2→武漢
    3→北京
    按ENTER確認(rèn):'''
loction_select = input(loction_info)  # city menu choice: '1' | '2' | '3'
# menu choice -> lianjia.com city subdomain
loction_dic = {'1':'hz',
               '2':'wh',
               '3':'bj'}
# base URL of the chosen city's second-hand-housing ("ershoufang") listings;
# raises KeyError if the user enters anything other than 1/2/3
city_url = 'https://{}.lianjia.com/ershoufang/'.format(loction_dic[loction_select])
down = input('請(qǐng)輸入價(jià)格下限(萬(wàn)):')  # lower price bound (unit: 10k CNY)
up = input('請(qǐng)輸入價(jià)格上限(萬(wàn)):')  # upper price bound (unit: 10k CNY)

# work list of (low, high) price intervals still to be narrowed;
# raises ValueError if the bounds are not integers
inter_list = [(int(down),int(up))]

def half_inter(inter, intervals=None):
    """Split the price interval *inter* into two halves in the work list.

    Removes *inter* from *intervals* and appends the two halves
    ``(lower, lower + delta)`` and ``(lower + delta, upper)`` where
    ``delta = (upper - lower) // 2``.

    Args:
        inter: ``(lower, upper)`` price interval to split.
        intervals: list to mutate; defaults to the module-level
            ``inter_list`` (backward-compatible with the original
            global-mutating behavior).

    Raises:
        ValueError: if *inter* is not present in *intervals*.
    """
    if intervals is None:
        intervals = inter_list  # legacy behavior: mutate the global work list
    lower, upper = inter
    delta = int((upper - lower) / 2)
    intervals.remove(inter)
    print('已經(jīng)縮小價(jià)格區(qū)間',inter)
    intervals.append((lower, lower + delta))
    intervals.append((lower + delta, upper))

# cache: (low, high) price interval -> listing count reported by lianjia
pagenum = {}

def get_num(inter):
    """Fetch the total listing count for one price interval.

    Builds the ``bp{low}ep{high}`` filter URL under ``city_url``, scrapes
    the total count from the results-page header, caches it in ``pagenum``
    and returns it.

    Args:
        inter: ``(lower, upper)`` price interval (unit: 10k CNY).

    Returns:
        int: number of listings lianjia reports for this interval.
    """
    url = city_url + 'bp{}ep{}/'.format(inter[0], inter[1])
    r = requests.get(url).text
    # BUG FIX: the xpath had been mangled to "http://h2[...]" (a scrape
    # artifact) — a rooted XPath must start with "//".
    num = int(etree.HTML(r).xpath("//h2[@class='total fl']/span/text()")[0].strip())
    pagenum[(inter[0], inter[1])] = num
    return num

# total number of listings in the user's full requested price range
totalnum = get_num(inter_list[0])

# Lianjia only exposes ~100 pages (~3000 listings) per query, so keep
# splitting intervals until every interval holds <= 3000 listings.
judge = True
while judge:
    # BUG FIX: one HTTP request per interval per pass — the original fetched
    # each interval twice (once in a list comprehension, once in the loop).
    counts = {i: get_num(i) for i in inter_list}
    judge = any(n > 3000 for n in counts.values())
    # BUG FIX: iterate over a snapshot — half_inter() removes from and
    # appends to inter_list, and the original iterated the live list while
    # mutating it (skips elements).
    for i in list(inter_list):
        if counts[i] > 3000:
            half_inter(i)
print('價(jià)格區(qū)間縮小完畢!')

# crawl bookkeeping lists
url_lst = []             # every listing-page url to fetch
url_lst_failed = []      # urls that answered with a non-200 status
url_lst_successed = []   # urls that answered with 200
url_lst_duplicated = []  # parsed rows whose href was already collected

# One url per result page per price interval: lianjia shows 30 listings
# per page, filtered by 'pg{page}bp{low}ep{high}'.
for interval in inter_list:
    last_page = math.ceil(pagenum[interval] / 30)
    url_lst.extend(
        city_url + 'pg{}bp{}ep{}/'.format(page, interval[0], interval[1])
        for page in range(1, last_page + 1)
    )
print('url列表獲取完畢!')

# all parsed listings (one dict per listing)
info_lst = []

# (result key, segment index) in the '|'-separated houseInfo text blob
_HOUSEINFO_FIELDS = [('xiaoqu', 0), ('huxing', 1), ('area', 2),
                     ('chaoxiang', 3), ('zhuangxiu', 4), ('dianti', 5)]


def _first_or_default(values, default='/'):
    """Return ``values[0]``, or *default* when the xpath matched nothing."""
    try:
        return values[0]
    except IndexError:
        return default


def _parse_node(node):
    """Extract one listing's fields from a result ``<li>`` node.

    Every field falls back to '/' when the expected element or text
    segment is missing, mirroring the original per-field try/except
    behavior.

    Returns:
        dict: the listing's scraped fields.
    """
    info_dic = {}
    info_dic['title'] = _first_or_default(node.xpath(".//div[@class='title']/a/text()"))
    info_dic['href'] = _first_or_default(node.xpath(".//div[@class='title']/a/@href"))

    # houseInfo is a single '|'-separated text blob; split it once instead
    # of re-running the same xpath for all six fields (original did 6x).
    house = node.xpath(".//div[@class='houseInfo']")
    parts = house[0].xpath('string(.)').replace(' ', '').split('|') if house else []
    for key, idx in _HOUSEINFO_FIELDS:
        try:
            info_dic[key] = parts[idx]
        except IndexError:
            info_dic[key] = '/'

    position = node.xpath(".//div[@class='positionInfo']/text()")
    try:
        # NOTE: re.findall returns a *list* — kept as in the original code
        info_dic['louceng'] = re.findall(r'\((.*)\)', position[0])
    except IndexError:
        info_dic['louceng'] = '/'
    try:
        info_dic['nianxian'] = re.findall(r'\)(.*?)年', position[0])
    except IndexError:
        info_dic['nianxian'] = '/'

    follow = node.xpath(".//div[@class='followInfo']/text()")
    fparts = follow[0].replace(' ', '').split('/') if follow else []
    try:
        info_dic['guanzhu'] = ''.join(re.findall('[0-9]', fparts[0]))
    except IndexError:
        info_dic['guanzhu'] = '/'
    try:
        info_dic['daikan'] = ''.join(re.findall('[0-9]', fparts[1]))
    except IndexError:
        info_dic['daikan'] = '/'
    try:
        info_dic['fabu'] = fparts[2]
    except IndexError:
        info_dic['fabu'] = '/'

    info_dic['totalprice'] = _first_or_default(node.xpath(".//div[@class='totalPrice']/span/text()"))
    try:
        info_dic['unitprice'] = node.xpath(".//div[@class='unitPrice']/span/text()")[0].replace('單價(jià)', '')
    except IndexError:
        info_dic['unitprice'] = '/'
    return info_dic


async def get_info(url):
    """Fetch one listing page and collect its listings.

    Side effects: records *url* in ``url_lst_successed`` or
    ``url_lst_failed`` by HTTP status, appends parsed listings to
    ``info_lst`` and duplicates (matched by href) to
    ``url_lst_duplicated``.
    """
    async with aiohttp.ClientSession() as session:
        async with session.get(url, timeout=5) as resp:
            if resp.status != 200:
                url_lst_failed.append(url)
            else:
                url_lst_successed.append(url)
            r = await resp.text()
            # BUG FIX: xpath had been mangled to "http://ul[...]" (scrape
            # artifact) — a rooted XPath must start with "//".
            nodelist = etree.HTML(r).xpath("//ul[@class='sellListContent']/li")
            # BUG FIX: the original printed this line three times (copy/paste)
            print('開(kāi)始抓取{}'.format(resp.url))
            for index, node in enumerate(nodelist, start=1):
                info_dic = _parse_node(node)
                # de-duplicate by listing href across already-collected rows
                if any(info_dic['href'] in dic.values() for dic in info_lst):
                    url_lst_duplicated.append(info_dic)
                else:
                    info_lst.append(info_dic)
                print('第{}條:    {}→房屋信息抓取完畢!'.format(index, info_dic['title']))

start = time.time()

# First crawl pass over every listing-page url.
# (Original author's note: some urls were never requested on the first
# pass — the retry loop below catches them.)
tasks = [asyncio.ensure_future(get_info(url)) for url in url_lst]
loop = asyncio.get_event_loop()
if tasks:  # asyncio.wait() raises ValueError on an empty task set
    loop.run_until_complete(asyncio.wait(tasks))

# Retry pass until every url has been requested.
# BUG FIX: the original condition `url not in url_lst_successed or
# url_lst_failed` parsed as `(url not in url_lst_successed) or
# bool(url_lst_failed)`, so a single failed url marked EVERY url as
# un-requested.  A url is pending only when it appears in neither list.
url_lst_unrequested = [url for url in url_lst
                       if url not in url_lst_successed and url not in url_lst_failed]
while len(url_lst_unrequested) > 0:
    tasks_unrequested = [asyncio.ensure_future(get_info(url)) for url in url_lst_unrequested]
    loop.run_until_complete(asyncio.wait(tasks_unrequested))
    # keep retrying (including previously failed urls) until all succeed
    url_lst_unrequested = [url for url in url_lst if url not in url_lst_successed]
end = time.time()
# BUG FIX: removed the stray backslashes — '\(' is an invalid escape
# sequence and printed literal backslashes in the summary line.
print('當(dāng)前價(jià)格區(qū)間段內(nèi)共有{}套二手房源(包含{}條重復(fù)房源),實(shí)際獲得{}條房源信息。'.format(totalnum,len(url_lst_duplicated),len(info_lst)))
print('總共耗時(shí){}秒'.format(end-start))

# NOTE(review): output path and 'gbk' encoding are hard-coded; the path
# must exist or to_csv raises OSError.
df = pandas.DataFrame(info_lst)
df.to_csv(r"C:\test\ljwh.csv",encoding='gbk')

##################同步爬取##########################
# info_lst = []
#
# start1 = time.time()
# for url in url_lst:
#     resp = requests.get(url)
#     nodelist = etree.HTML(resp.text).xpath("//ul[@class='sellListContent']/li")
#     info_dic = {}
#     index = 1
#     print('開(kāi)始抓取{}'.format(resp.url))
#     print('開(kāi)始抓取{}'.format(resp.url))
#     print('開(kāi)始抓取{}'.format(resp.url))
#     for node in nodelist:
#         try:
#             info_dic['title'] = node.xpath(".//div[@class='title']/a/text()")[0]
#         except:
#             info_dic['title'] = '/'
#         try:
#             info_dic['href'] = node.xpath(".//div[@class='title']/a/@href")[0]
#         except:
#             info_dic['href'] = '/'
#         try:
#             info_dic['xiaoqu'] = node.xpath(".//div[@class='houseInfo']")[0].xpath('string(.)').replace(' ', '').split('|')[
#                 0]
#         except:
#             info_dic['xiaoqu'] = '/'
#         try:
#             info_dic['huxing'] = node.xpath(".//div[@class='houseInfo']")[0].xpath('string(.)').replace(' ', '').split('|')[
#                 1]
#         except:
#             info_dic['huxing'] = '/'
#         try:
#             info_dic['area'] = node.xpath(".//div[@class='houseInfo']")[0].xpath('string(.)').replace(' ', '').split('|')[2]
#         except:
#             info_dic['area'] = '/'
#         try:
#             info_dic['chaoxiang'] = \
#             node.xpath(".//div[@class='houseInfo']")[0].xpath('string(.)').replace(' ', '').split('|')[3]
#         except:
#             info_dic['chaoxiang'] = '/'
#         try:
#             info_dic['zhuangxiu'] = \
#             node.xpath(".//div[@class='houseInfo']")[0].xpath('string(.)').replace(' ', '').split('|')[4]
#         except:
#             info_dic['zhuangxiu'] = '/'
#         try:
#             info_dic['dianti'] = node.xpath(".//div[@class='houseInfo']")[0].xpath('string(.)').replace(' ', '').split('|')[
#                 5]
#         except:
#             info_dic['dianti'] = '/'
#         try:
#             info_dic['louceng'] = re.findall('\((.*)\)', node.xpath(".//div[@class='positionInfo']/text()")[0])
#         except:
#             info_dic['louceng'] = '/'
#         try:
#             info_dic['nianxian'] = re.findall('\)(.*?)年', node.xpath(".//div[@class='positionInfo']/text()")[0])
#         except:
#             info_dic['nianxian'] = '/'
#         try:
#             info_dic['guanzhu'] = ''.join(
#                 re.findall('[0-9]', node.xpath(".//div[@class='followInfo']/text()")[0].replace(' ', '').split('/')[0]))
#         except:
#             info_dic['guanzhu'] = '/'
#         try:
#             info_dic['daikan'] = ''.join(
#                 re.findall('[0-9]', node.xpath(".//div[@class='followInfo']/text()")[0].replace(' ', '').split('/')[1]))
#         except:
#             info_dic['daikan'] = '/'
#         try:
#             info_dic['fabu'] = node.xpath(".//div[@class='followInfo']/text()")[0].replace(' ', '').split('/')[2]
#         except:
#             info_dic['fabu'] = '/'
#         try:
#             info_dic['totalprice'] = node.xpath(".//div[@class='totalPrice']/span/text()")[0]
#         except:
#             info_dic['totalprice'] = '/'
#         try:
#             info_dic['unitprice'] = node.xpath(".//div[@class='unitPrice']/span/text()")[0].replace('單價(jià)', '')
#         except:
#             info_dic['unitprice'] = '/'
#         if True in [info_dic['href'] in dic.values() for dic in info_lst]:
#             url_lst_duplicated.append(info_dic)
#         else:
#             info_lst.append(info_dic)
#         print('第{}條:    {}→房屋信息抓取完畢!'.format(index, info_dic['title']))
#         index += 1
#         info_dic = {}
# end = time.time()
# print('實(shí)際獲得{}條房源信息。'.format(len(info_lst)))
# print('總共耗時(shí){}秒'.format(end-start))
?著作權(quán)歸作者所有,轉(zhuǎn)載或內(nèi)容合作請(qǐng)聯(lián)系作者
【社區(qū)內(nèi)容提示】社區(qū)部分內(nèi)容疑似由AI輔助生成,瀏覽時(shí)請(qǐng)結(jié)合常識(shí)與多方信息審慎甄別。
平臺(tái)聲明:文章內(nèi)容(如有圖片或視頻亦包括在內(nèi))由作者上傳并發(fā)布,文章內(nèi)容僅代表作者本人觀點(diǎn),簡(jiǎn)書(shū)系信息發(fā)布平臺(tái),僅提供信息存儲(chǔ)服務(wù)。

相關(guān)閱讀更多精彩內(nèi)容

友情鏈接更多精彩內(nèi)容