鏈家爬蟲(chóng)代碼_asyncio

#!/usr/bin/env python
# encoding: utf-8
"""
Author: ISeeMoon
Python: 3.6
Software: PyCharm
File: Lj_async.py
Time: 2018/5/6 15:26
"""

import requests
from lxml import etree
import asyncio
import aiohttp
import pandas
import re
import math
import time


# --- interactive configuration (runs at script start, reads stdin) ---
# NOTE(review): "loction" is a typo for "location"; kept as-is because the
# names are referenced throughout the rest of the script.
loction_info = '''    1→杭州
    2→武漢
    3→北京
    按ENTER確認(rèn):'''
loction_select = input(loction_info)  # city menu choice: '1' | '2' | '3'
# menu choice -> lianjia.com city subdomain
loction_dic = {'1':'hz',
               '2':'wh',
               '3':'bj'}
# base URL of the chosen city's second-hand-housing ("ershoufang") listings;
# raises KeyError if the user enters anything other than 1/2/3
city_url = 'https://{}.lianjia.com/ershoufang/'.format(loction_dic[loction_select])
down = input('請(qǐng)輸入價(jià)格下限(萬(wàn)):')  # lower price bound (unit: 10k CNY)
up = input('請(qǐng)輸入價(jià)格上限(萬(wàn)):')  # upper price bound (unit: 10k CNY)

# work list of (low, high) price intervals still to be narrowed;
# raises ValueError if the bounds are not integers
inter_list = [(int(down),int(up))]

def half_inter(inter, intervals=None):
    """Split the price interval *inter* into two halves in the work list.

    Removes *inter* from *intervals* and appends the two halves
    ``(lower, lower + delta)`` and ``(lower + delta, upper)`` where
    ``delta = (upper - lower) // 2``.

    Args:
        inter: ``(lower, upper)`` price interval to split.
        intervals: list to mutate; defaults to the module-level
            ``inter_list`` (backward-compatible with the original
            global-mutating behavior).

    Raises:
        ValueError: if *inter* is not present in *intervals*.
    """
    if intervals is None:
        intervals = inter_list  # legacy behavior: mutate the global work list
    lower, upper = inter
    delta = int((upper - lower) / 2)
    intervals.remove(inter)
    print('已經(jīng)縮小價(jià)格區(qū)間',inter)
    intervals.append((lower, lower + delta))
    intervals.append((lower + delta, upper))

# cache: (low, high) price interval -> listing count reported by lianjia
pagenum = {}

def get_num(inter):
    """Fetch the total listing count for one price interval.

    Builds the ``bp{low}ep{high}`` filter URL under ``city_url``, scrapes
    the total count from the results-page header, caches it in ``pagenum``
    and returns it.

    Args:
        inter: ``(lower, upper)`` price interval (unit: 10k CNY).

    Returns:
        int: number of listings lianjia reports for this interval.
    """
    url = city_url + 'bp{}ep{}/'.format(inter[0], inter[1])
    r = requests.get(url).text
    # BUG FIX: the xpath had been mangled to "http://h2[...]" (a scrape
    # artifact) — a rooted XPath must start with "//".
    num = int(etree.HTML(r).xpath("//h2[@class='total fl']/span/text()")[0].strip())
    pagenum[(inter[0], inter[1])] = num
    return num

# total number of listings in the user's full requested price range
totalnum = get_num(inter_list[0])

# Lianjia only exposes ~100 pages (~3000 listings) per query, so keep
# splitting intervals until every interval holds <= 3000 listings.
judge = True
while judge:
    # BUG FIX: one HTTP request per interval per pass — the original fetched
    # each interval twice (once in a list comprehension, once in the loop).
    counts = {i: get_num(i) for i in inter_list}
    judge = any(n > 3000 for n in counts.values())
    # BUG FIX: iterate over a snapshot — half_inter() removes from and
    # appends to inter_list, and the original iterated the live list while
    # mutating it (skips elements).
    for i in list(inter_list):
        if counts[i] > 3000:
            half_inter(i)
print('價(jià)格區(qū)間縮小完畢!')

# crawl bookkeeping lists
url_lst = []             # every listing-page url to fetch
url_lst_failed = []      # urls that answered with a non-200 status
url_lst_successed = []   # urls that answered with 200
url_lst_duplicated = []  # parsed rows whose href was already collected

# One url per result page per price interval: lianjia shows 30 listings
# per page, filtered by 'pg{page}bp{low}ep{high}'.
for interval in inter_list:
    last_page = math.ceil(pagenum[interval] / 30)
    url_lst.extend(
        city_url + 'pg{}bp{}ep{}/'.format(page, interval[0], interval[1])
        for page in range(1, last_page + 1)
    )
print('url列表獲取完畢!')

# all parsed listings (one dict per listing)
info_lst = []

# (result key, segment index) in the '|'-separated houseInfo text blob
_HOUSEINFO_FIELDS = [('xiaoqu', 0), ('huxing', 1), ('area', 2),
                     ('chaoxiang', 3), ('zhuangxiu', 4), ('dianti', 5)]


def _first_or_default(values, default='/'):
    """Return ``values[0]``, or *default* when the xpath matched nothing."""
    try:
        return values[0]
    except IndexError:
        return default


def _parse_node(node):
    """Extract one listing's fields from a result ``<li>`` node.

    Every field falls back to '/' when the expected element or text
    segment is missing, mirroring the original per-field try/except
    behavior.

    Returns:
        dict: the listing's scraped fields.
    """
    info_dic = {}
    info_dic['title'] = _first_or_default(node.xpath(".//div[@class='title']/a/text()"))
    info_dic['href'] = _first_or_default(node.xpath(".//div[@class='title']/a/@href"))

    # houseInfo is a single '|'-separated text blob; split it once instead
    # of re-running the same xpath for all six fields (original did 6x).
    house = node.xpath(".//div[@class='houseInfo']")
    parts = house[0].xpath('string(.)').replace(' ', '').split('|') if house else []
    for key, idx in _HOUSEINFO_FIELDS:
        try:
            info_dic[key] = parts[idx]
        except IndexError:
            info_dic[key] = '/'

    position = node.xpath(".//div[@class='positionInfo']/text()")
    try:
        # NOTE: re.findall returns a *list* — kept as in the original code
        info_dic['louceng'] = re.findall(r'\((.*)\)', position[0])
    except IndexError:
        info_dic['louceng'] = '/'
    try:
        info_dic['nianxian'] = re.findall(r'\)(.*?)年', position[0])
    except IndexError:
        info_dic['nianxian'] = '/'

    follow = node.xpath(".//div[@class='followInfo']/text()")
    fparts = follow[0].replace(' ', '').split('/') if follow else []
    try:
        info_dic['guanzhu'] = ''.join(re.findall('[0-9]', fparts[0]))
    except IndexError:
        info_dic['guanzhu'] = '/'
    try:
        info_dic['daikan'] = ''.join(re.findall('[0-9]', fparts[1]))
    except IndexError:
        info_dic['daikan'] = '/'
    try:
        info_dic['fabu'] = fparts[2]
    except IndexError:
        info_dic['fabu'] = '/'

    info_dic['totalprice'] = _first_or_default(node.xpath(".//div[@class='totalPrice']/span/text()"))
    try:
        info_dic['unitprice'] = node.xpath(".//div[@class='unitPrice']/span/text()")[0].replace('單價(jià)', '')
    except IndexError:
        info_dic['unitprice'] = '/'
    return info_dic


async def get_info(url):
    """Fetch one listing page and collect its listings.

    Side effects: records *url* in ``url_lst_successed`` or
    ``url_lst_failed`` by HTTP status, appends parsed listings to
    ``info_lst`` and duplicates (matched by href) to
    ``url_lst_duplicated``.
    """
    async with aiohttp.ClientSession() as session:
        async with session.get(url, timeout=5) as resp:
            if resp.status != 200:
                url_lst_failed.append(url)
            else:
                url_lst_successed.append(url)
            r = await resp.text()
            # BUG FIX: xpath had been mangled to "http://ul[...]" (scrape
            # artifact) — a rooted XPath must start with "//".
            nodelist = etree.HTML(r).xpath("//ul[@class='sellListContent']/li")
            # BUG FIX: the original printed this line three times (copy/paste)
            print('開(kāi)始抓取{}'.format(resp.url))
            for index, node in enumerate(nodelist, start=1):
                info_dic = _parse_node(node)
                # de-duplicate by listing href across already-collected rows
                if any(info_dic['href'] in dic.values() for dic in info_lst):
                    url_lst_duplicated.append(info_dic)
                else:
                    info_lst.append(info_dic)
                print('第{}條:    {}→房屋信息抓取完畢!'.format(index, info_dic['title']))

start = time.time()

# First crawl pass over every listing-page url.
# (Original author's note: some urls were never requested on the first
# pass — the retry loop below catches them.)
tasks = [asyncio.ensure_future(get_info(url)) for url in url_lst]
loop = asyncio.get_event_loop()
if tasks:  # asyncio.wait() raises ValueError on an empty task set
    loop.run_until_complete(asyncio.wait(tasks))

# Retry pass until every url has been requested.
# BUG FIX: the original condition `url not in url_lst_successed or
# url_lst_failed` parsed as `(url not in url_lst_successed) or
# bool(url_lst_failed)`, so a single failed url marked EVERY url as
# un-requested.  A url is pending only when it appears in neither list.
url_lst_unrequested = [url for url in url_lst
                       if url not in url_lst_successed and url not in url_lst_failed]
while len(url_lst_unrequested) > 0:
    tasks_unrequested = [asyncio.ensure_future(get_info(url)) for url in url_lst_unrequested]
    loop.run_until_complete(asyncio.wait(tasks_unrequested))
    # keep retrying (including previously failed urls) until all succeed
    url_lst_unrequested = [url for url in url_lst if url not in url_lst_successed]
end = time.time()
# BUG FIX: removed the stray backslashes — '\(' is an invalid escape
# sequence and printed literal backslashes in the summary line.
print('當(dāng)前價(jià)格區(qū)間段內(nèi)共有{}套二手房源(包含{}條重復(fù)房源),實(shí)際獲得{}條房源信息。'.format(totalnum,len(url_lst_duplicated),len(info_lst)))
print('總共耗時(shí){}秒'.format(end-start))

# NOTE(review): output path and 'gbk' encoding are hard-coded; the path
# must exist or to_csv raises OSError.
df = pandas.DataFrame(info_lst)
df.to_csv(r"C:\test\ljwh.csv",encoding='gbk')

##################同步爬取##########################
# info_lst = []
#
# start1 = time.time()
# for url in url_lst:
#     resp = requests.get(url)
#     nodelist = etree.HTML(resp.text).xpath("//ul[@class='sellListContent']/li")
#     info_dic = {}
#     index = 1
#     print('開(kāi)始抓取{}'.format(resp.url))
#     print('開(kāi)始抓取{}'.format(resp.url))
#     print('開(kāi)始抓取{}'.format(resp.url))
#     for node in nodelist:
#         try:
#             info_dic['title'] = node.xpath(".//div[@class='title']/a/text()")[0]
#         except:
#             info_dic['title'] = '/'
#         try:
#             info_dic['href'] = node.xpath(".//div[@class='title']/a/@href")[0]
#         except:
#             info_dic['href'] = '/'
#         try:
#             info_dic['xiaoqu'] = node.xpath(".//div[@class='houseInfo']")[0].xpath('string(.)').replace(' ', '').split('|')[
#                 0]
#         except:
#             info_dic['xiaoqu'] = '/'
#         try:
#             info_dic['huxing'] = node.xpath(".//div[@class='houseInfo']")[0].xpath('string(.)').replace(' ', '').split('|')[
#                 1]
#         except:
#             info_dic['huxing'] = '/'
#         try:
#             info_dic['area'] = node.xpath(".//div[@class='houseInfo']")[0].xpath('string(.)').replace(' ', '').split('|')[2]
#         except:
#             info_dic['area'] = '/'
#         try:
#             info_dic['chaoxiang'] = \
#             node.xpath(".//div[@class='houseInfo']")[0].xpath('string(.)').replace(' ', '').split('|')[3]
#         except:
#             info_dic['chaoxiang'] = '/'
#         try:
#             info_dic['zhuangxiu'] = \
#             node.xpath(".//div[@class='houseInfo']")[0].xpath('string(.)').replace(' ', '').split('|')[4]
#         except:
#             info_dic['zhuangxiu'] = '/'
#         try:
#             info_dic['dianti'] = node.xpath(".//div[@class='houseInfo']")[0].xpath('string(.)').replace(' ', '').split('|')[
#                 5]
#         except:
#             info_dic['dianti'] = '/'
#         try:
#             info_dic['louceng'] = re.findall('\((.*)\)', node.xpath(".//div[@class='positionInfo']/text()")[0])
#         except:
#             info_dic['louceng'] = '/'
#         try:
#             info_dic['nianxian'] = re.findall('\)(.*?)年', node.xpath(".//div[@class='positionInfo']/text()")[0])
#         except:
#             info_dic['nianxian'] = '/'
#         try:
#             info_dic['guanzhu'] = ''.join(
#                 re.findall('[0-9]', node.xpath(".//div[@class='followInfo']/text()")[0].replace(' ', '').split('/')[0]))
#         except:
#             info_dic['guanzhu'] = '/'
#         try:
#             info_dic['daikan'] = ''.join(
#                 re.findall('[0-9]', node.xpath(".//div[@class='followInfo']/text()")[0].replace(' ', '').split('/')[1]))
#         except:
#             info_dic['daikan'] = '/'
#         try:
#             info_dic['fabu'] = node.xpath(".//div[@class='followInfo']/text()")[0].replace(' ', '').split('/')[2]
#         except:
#             info_dic['fabu'] = '/'
#         try:
#             info_dic['totalprice'] = node.xpath(".//div[@class='totalPrice']/span/text()")[0]
#         except:
#             info_dic['totalprice'] = '/'
#         try:
#             info_dic['unitprice'] = node.xpath(".//div[@class='unitPrice']/span/text()")[0].replace('單價(jià)', '')
#         except:
#             info_dic['unitprice'] = '/'
#         if True in [info_dic['href'] in dic.values() for dic in info_lst]:
#             url_lst_duplicated.append(info_dic)
#         else:
#             info_lst.append(info_dic)
#         print('第{}條:    {}→房屋信息抓取完畢!'.format(index, info_dic['title']))
#         index += 1
#         info_dic = {}
# end = time.time()
# print('實(shí)際獲得{}條房源信息。'.format(len(info_lst)))
# print('總共耗時(shí){}秒'.format(end-start))
?著作權(quán)歸作者所有,轉(zhuǎn)載或內(nèi)容合作請(qǐng)聯(lián)系作者
【社區(qū)內(nèi)容提示】社區(qū)部分內(nèi)容疑似由AI輔助生成,瀏覽時(shí)請(qǐng)結(jié)合常識(shí)與多方信息審慎甄別。
平臺(tái)聲明:文章內(nèi)容(如有圖片或視頻亦包括在內(nèi))由作者上傳并發(fā)布,文章內(nèi)容僅代表作者本人觀點(diǎn),簡(jiǎn)書(shū)系信息發(fā)布平臺(tái),僅提供信息存儲(chǔ)服務(wù)。

相關(guān)閱讀更多精彩內(nèi)容

友情鏈接更多精彩內(nèi)容