1.代碼可以直接運行,請下載anaconda并安裝,用spyder方便查看變量
或者可以查看生成的excel文件
2.依賴庫,命令行運行(WIN10打開命令行快捷鍵:windows+x組合鍵,然后按a鍵):
pip install BeautifulSoup4
pip install requests
3.爬取的網站是藍房網(廈門),可以進入http://house.lanfw.com/xm/search-y1/進行觀察
4.關於如何判斷代碼是python2還是python3,print('')為python3,print ''為python2
簡而言之就是print需要用括號的就是python3,下面代碼如是。
5.爬取54個頁面并進行解析,程序運行后需要等待大概30秒
# -*- coding: utf-8 -*-
"""
Created on Mon Jan 15 21:35:52 2018
@author: Steven Lei
"""
# Sale-status id (taken from the status image file name) -> display text.
_SALE_STATUS_BY_ID = {
    1: "在售",
    3: "尾盤",
    5: "未售",
    15: "售罄",
}
_UNKNOWN_SALE_STATUS = '未知'

# Seconds to wait on the server before giving up, so that one stalled
# connection cannot hang the whole crawl.
_REQUEST_TIMEOUT = 30


def _fetchSoup(url):
    """Download *url* and return the page parsed as a BeautifulSoup tree."""
    from bs4 import BeautifulSoup
    import requests
    response = requests.get(url, timeout=_REQUEST_TIMEOUT)
    # Decode the body as UTF-8 regardless of the encoding requests inferred.
    response.encoding = 'utf-8'
    return BeautifulSoup(response.text, 'lxml')


def _stripLabel(text, label, suffix=''):
    """Return *text* without a leading field label and a trailing suffix.

    The label and suffix are removed as exact strings.  ``str.strip(chars)``
    is deliberately avoided: it treats its argument as a set of characters
    and would also eat characters that belong to the value itself.
    """
    value = text.strip()
    if value.startswith(label):
        # Drop the label, then the ASCII or full-width colon that follows it.
        value = value[len(label):].lstrip().lstrip(':\uff1a')
    if suffix and value.endswith(suffix):
        value = value[:-len(suffix)]
    return value.strip()


def _saleStatusFromImage(imgSrc):
    """Map a status image path such as ``.../state_15.jpg`` to its text."""
    fileName = imgSrc.rsplit('/', 1)[-1]
    stem = fileName.split('.', 1)[0]
    try:
        statusId = int(stem.rsplit('_', 1)[-1])
    except ValueError:
        # No numeric id in the file name: report it as unknown, the same
        # fallback that is used for ids missing from the table.
        return _UNKNOWN_SALE_STATUS
    return _SALE_STATUS_BY_ID.get(statusId, _UNKNOWN_SALE_STATUS)


def getHousesDetails(url):
    """Scrape one listing page and return the details of every property on it.

    Args:
        url: Address of a listing page; each ``.lpList`` element on it is
            treated as one property.

    Returns:
        A list with one dict per property, holding the keys ``houseName``,
        ``address``, ``openTime``, ``price`` and ``saleStatus``.

    Raises:
        IndexError: If an expected element is missing from the page.
        requests.exceptions.RequestException: If a download fails or
            times out.
    """
    # NOTE(review): the field labels below are matched exactly as written
    # here; confirm they match the script used by the live pages.
    listingSoup = _fetchSoup(url)
    housesDetails = []
    for house in listingSoup.select('.lpList'):
        titleLink = house.select('.title h2 a')[0]
        houseName = titleLink.text

        infoRows = house.select('.lpTxt div')[1].select('p')
        address = _stripLabel(infoRows[1].text, '樓盤地址', '查看地圖')
        if len(address) >= 16:
            # Long addresses are re-read from the property's own page --
            # presumably because the listing page truncates them (TODO
            # confirm).
            detailSoup = _fetchSoup(titleLink['href'])
            detailAddress = detailSoup.select('.toplpMsg ul li div i')[0].text
            address = _stripLabel(detailAddress, '樓盤地址')

        openTime = _stripLabel(infoRows[3].text, '開盤時間')
        price = house.select('.price p b')[0].text
        saleStatus = _saleStatusFromImage(
            house.select('.title p img')[0]['src'])

        housesDetails.append({
            'houseName': houseName,
            'address': address,
            'openTime': openTime,
            'price': price,
            'saleStatus': saleStatus,
        })
    return housesDetails
def getAllHousesDetails(maxPageNumber=54,
                        urlTemplate='http://house.lanfw.com/xm/search-y{}'):
    """Scrape listing pages 1..maxPageNumber and return them as one table.

    Args:
        maxPageNumber: Number of the last listing page to fetch (inclusive).
            A value below 1 fetches nothing.
        urlTemplate: Listing-page URL containing one ``{}`` placeholder that
            is filled with the page number.

    Returns:
        A pandas DataFrame built from the dicts returned by
        ``getHousesDetails`` for every page; empty when no page was fetched.
    """
    # Imported before the crawl so that a missing pandas installation fails
    # immediately instead of after every page has been downloaded.
    import pandas

    allHousesDetails = []
    for pageNumber in range(1, maxPageNumber + 1):
        pageUrl = urlTemplate.format(pageNumber)
        allHousesDetails.extend(getHousesDetails(pageUrl))
    return pandas.DataFrame(allHousesDetails)
if __name__ == '__main__':
    # Script entry point: scrape every listing page, then save the table as
    # an Excel workbook in the current working directory.  The DataFrame is
    # kept in a module-level name so it stays available after the run.
    allHousesDetails = getAllHousesDetails()
    allHousesDetails.to_excel('houseDetails2.xlsx')