本文先對鏈家網最新的武漢二手房數據進行爬取,提取房屋面積和每平米單價,並以逗號分隔存儲在txt文件中,最後對數據分別進行線性回歸和多項式回歸,並繪製相關圖查看效果。
1. 數據爬取
import requests
from bs4 import BeautifulSoup
import bs4
import os
def getHTMLText(url, timeout=10):
    """Download ``url`` and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout a stalled connection would block the crawl forever.

    Returns:
        The decoded page text, or an empty string when the request fails
        or the server answers with an HTTP error status.
    """
    try:
        r = requests.get(url, timeout=timeout)
        r.raise_for_status()
        # Decode with the encoding guessed from the content instead of the
        # one announced (or defaulted) from the HTTP headers.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Covers connection errors, timeouts and the HTTPError raised by
        # raise_for_status(); unrelated errors are no longer hidden.
        return ""
def getAreaPrice(url, path):
    """Append "<area>,<unit price>" lines for one listing page to ``path``.

    The page is fetched with ``getHTMLText``. Every tag that is a direct
    child of the first ``<ul class="sellListContent">`` is treated as one
    listing. Listings whose area or price cannot be parsed are skipped.
    If the page has no such ``<ul>`` (for example because the download
    failed and the HTML is empty) nothing is written and the file is not
    opened.

    Args:
        url: Address of the listing page.
        path: Text file to append to; it is created if missing.

    Raises:
        OSError: If ``path`` cannot be opened or written.
    """
    html = getHTMLText(url)
    soup = BeautifulSoup(html, 'html.parser')
    ul = soup.find('ul', class_="sellListContent")
    if ul is None:
        return
    with open(path, "a") as f:
        for li in ul.children:
            # Children also include whitespace text nodes; only tags are listings.
            if not isinstance(li, bs4.element.Tag):
                continue
            try:
                # Third '|'-separated field of houseInfo, minus its last three
                # characters (presumably the unit suffix -- verify against the page).
                area = li('div', class_="houseInfo")[0].text.split('|')[2][:-3]
                # unitPrice text minus its first two and last four characters
                # (presumably a label and a unit -- verify against the page).
                price = li('div', class_="unitPrice")[0].string[2:-4]
                row = str(float(area)) + "," + str(int(price)) + "\n"
            except (IndexError, TypeError, ValueError):
                # Missing div / too few fields, ``.string`` being None, or
                # non-numeric text: skip this listing and keep going.
                continue
            f.write(row)
if __name__ == '__main__':
    # Crawl the listing pages one by one and collect the rows in prices.txt.
    path = "prices.txt"
    # getAreaPrice appends, so start every run from an empty file.
    if os.path.exists(path):
        os.remove(path)
    pageNum = 97
    # NOTE(review): this requests pg0 .. pg96. If the site numbers its pages
    # from 1, pg0 may duplicate the first page and pg97 is never fetched --
    # confirm against the site before changing the range.
    for i in range(pageNum):
        url = "https://wh.lianjia.com/ershoufang/pg" + str(i)
        # "\r" with end="" redraws the progress on the same console line.
        print("\r房價數據正在下載,當前進度: {:.2f}%".format((i +1) * 100 / pageNum), end="")
        getAreaPrice(url, path)

爬取的原始數據,2908條
2. 回歸分析
import matplotlib.pyplot as plt
import numpy as np
from sklearn import linear_model
from sklearn.preprocessing import PolynomialFeatures
# Load the "<area>,<unit price>" rows from prices.txt.
datasets_X = []
datasets_Y = []
with open('prices.txt', 'r', encoding='utf-8') as fr:  # closed automatically
    for line in fr:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        items = line.split(',')
        datasets_X.append(float(items[0]))
        datasets_Y.append(int(items[1]))
length = len(datasets_X)
# scikit-learn expects a 2-D feature matrix: one row per sample, one column.
datasets_X = np.array(datasets_X).reshape([length, 1])
datasets_Y = np.array(datasets_Y)
# Use the array reductions so the bounds are scalars; the builtin min()/max()
# on an (n, 1) array would return 1-element arrays instead.
minX = datasets_X.min()
maxX = datasets_X.max()
# Evenly spaced areas (step 1) used to draw the fitted curves.
X = np.arange(minX, maxX).reshape([-1, 1])
# Linear regression
linear = linear_model.LinearRegression()
linear.fit(datasets_X, datasets_Y)
# Polynomial regression: expand the area into degree-4 polynomial features,
# then fit an ordinary linear model on the expanded features.
poly_reg = PolynomialFeatures(degree = 4)
X_poly = poly_reg.fit_transform(datasets_X)
lin_reg_2 = linear_model.LinearRegression()
lin_reg_2.fit(X_poly, datasets_Y)
# Plot
plt.figure(figsize=(12,7))  # figure size
plt.scatter(datasets_X, datasets_Y, color = 'green',marker='.')
plt.plot(X, linear.predict(X), color = 'red')
# The transformer is already fitted on the training data; only transform here.
plt.plot(X, lin_reg_2.predict(poly_reg.transform(X)), color = 'blue')
plt.xlabel('Area')
plt.ylabel('Price')
plt.show()

回歸圖
結論:如上圖所示,綠色點為樣本散點,紅色直線是線性回歸結果,藍色曲線是多項式回歸結果,很直觀地表明多項式曲線擬合得更好。從圖中可以看出,大多數二手房面積在80-110平米之間,同時該區間的房價基本是最低的;小戶型房價略有提升,而隨著戶型增大,房價上升趨勢比較明顯。