最近需要從網上下載空氣質量數據,看到真氣網的歷史數據整理得不錯,因此想利用腳本下載;查看網站之後發現通過數據流隱藏了數據,又懶得通過抓包分析,於是使用selenium來模擬瀏覽器進行下載。具體代碼如下:
ChromeDriver
歷史空氣質量數據
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Date : 2019-09-14 22:22:06
# @Author : Your Name (you@example.org)
# @Link : http://example.org
# @Version : $Id$
import datetime
import io
import os
import time
from urllib.parse import urlencode

import pandas as pd
import requests
from selenium import webdriver
# Example of a monthly page URL (city=北京, month=2016-11):
# https://www.aqistudy.cn/historydata/daydata.php?city=%E5%8C%97%E4%BA%AC&month=2016-11
url_base = 'https://www.aqistudy.cn/historydata/daydata.php'

# Cities to download. Each name is sent verbatim as the `city` query
# parameter and is also embedded in the output file name.
# NOTE(review): the names are written in Traditional Chinese here; the site
# presumably lists its cities in Simplified Chinese -- confirm that these
# spellings are accepted before relying on the downloaded data.
citys = [
    '北京',
    '天津',
    '石家莊',
    '保定',
    '唐山',
    '秦皇島',
    '邢臺',
    '邯鄲',
    '滄州',
]

sdt = datetime.datetime(2013, 12, 1)  # start of the download range
edt = datetime.datetime.now()  # end of the download range (current month)

# One timestamp per calendar month, from sdt's month through edt's month.
# Month-start anchoring ('MS') is used instead of month-end so that the
# current, still incomplete month is part of the range, matching the end
# month advertised in the output file name. `normalize=True` drops the
# time of day so the first day of edt's month is never cut off.
months = pd.date_range(sdt.replace(day=1), edt, freq='MS', normalize=True)

# Output goes to a `data` directory next to this script.
homedir = os.path.dirname(os.path.realpath(__file__))
output_path = os.path.join(homedir, 'data')
print(homedir)
print(output_path)
# exist_ok makes a separate existence check unnecessary.
os.makedirs(output_path, exist_ok=True)
def build_url(base, city, month):
    """Return the daily-data page URL for one city and one month.

    Args:
        base: URL of the page without a query string.
        city: City name; it is percent-encoded as UTF-8.
        month: Any object with ``strftime`` (datetime, pandas Timestamp);
            only its year and month are used, formatted as ``YYYY-MM``.

    Returns:
        ``base`` followed by ``?city=...&month=YYYY-MM``.
    """
    # urlencode's second positional parameter is `doseq`, not an encoding;
    # UTF-8 is already the default, so the whole query is built in one call.
    query = urlencode({'city': city, 'month': month.strftime('%Y-%m')})
    return '{}?{}'.format(base, query)


def append_month(data, filename):
    """Write one month's table to ``filename``, appending if it exists.

    The column header is written only when the file is created, so the
    resulting CSV has a single header row followed by all months.
    """
    is_new_file = not os.path.exists(filename)
    data.to_csv(
        filename,
        index=False,
        header=is_new_file,
        mode='w' if is_new_file else 'a',
    )


def main():
    """Download every configured month for every configured city.

    Each monthly page is opened in Chrome through selenium, the first HTML
    table of the rendered page is parsed with pandas, and the rows are
    appended to one CSV file per city inside ``output_path``.
    """
    # Selenium needs ChromeDriver; it can be downloaded from
    # http://npm.taobao.org/mirrors/chromedriver/ and, once downloaded,
    # placed in the directory that contains this script.
    driver = webdriver.Chrome()
    try:
        for vcity in citys:
            output_filename = os.path.join(
                output_path,
                '{}_{}_{}.csv'.format(
                    vcity, sdt.strftime('%Y%m'), edt.strftime('%Y%m')),
            )
            for vmonth in months:
                url = build_url(url_base, vcity, vmonth)
                driver.get(url)
                # Fixed pause so the page has time to render its table
                # before the HTML is read back from the browser.
                time.sleep(5)
                try:
                    # pandas rejects a bool `header`; 0 is the integer
                    # equivalent of the former False (first row = column
                    # names). StringIO is used because passing literal HTML
                    # text to read_html is deprecated.
                    tables = pd.read_html(
                        io.StringIO(driver.page_source), header=0)
                except ValueError as err:
                    # read_html raises ValueError when the page has no
                    # table; skip this month instead of aborting the run.
                    print('no table parsed, skipped:', vcity, vmonth, url, err)
                else:
                    append_month(tables[0], output_filename)
                    print(vcity, vmonth, url)
                # Pause between requests.
                time.sleep(2.5)
    finally:
        # Always close the browser, even when a download fails midway.
        driver.quit()


if __name__ == '__main__':
    main()
其他問題自行解決!