main01
這個程序通過selenium采用ChromeDriver的無頭瀏覽器方式,從 http://www.landchina.com/default.aspx?tabid=263&ComName=default 獲取交易信息的url,并且存入redis中。
對于交易信息,可以加時間與地區做篩選:
def llq_main(start, end, region_code='3205'):
    """Submit one date-range / region query on the open page and harvest its results.

    Fills the two date inputs and the administrative-region input of the
    query form held by the module-level ``driver``, clicks the query button
    and then calls ``page_zh`` with the module-level ``i`` and ``l``.

    Args:
        start: Start date typed into the first date input, e.g. '2005-01-01'.
        end: End date typed into the second date input.
        region_code: Administrative-region code typed into the region input.
            Defaults to '3205', the value that used to be hard-coded here;
            the original note gives '3701' (Jinan) and '37' (Shandong) as
            further examples.
    """
    # Local import: the generic find_element(By.ID, ...) form replaces the
    # deprecated find_element_by_id helper.
    from selenium.webdriver.common.by import By

    def fill(element_id, text):
        # Clear an input and type a new value. The element is looked up again
        # after clear(), exactly as the original code did.
        driver.find_element(By.ID, element_id).clear()
        driver.find_element(By.ID, element_id).send_keys(text)

    print(start, end)
    time.sleep(2)

    # Date-range condition.
    fill('TAB_queryDateItem_270_1', start)
    fill('TAB_queryDateItem_270_2', end)

    # Administrative-region condition. The script switches the value input's
    # type attribute to 'text' first -- presumably so that it can be typed
    # into; confirm against the page markup.
    driver.find_element(By.ID, 'TAB_QueryConditionItem256').click()
    driver.execute_script("document.getElementById('TAB_queryTblEnumItem_256_v').setAttribute('type', 'text');")
    fill('TAB_queryTblEnumItem_256_v', region_code)

    driver.find_element(By.ID, 'TAB_QueryButtonControl').click()  # run the query
    page_zh(i, l)
if __name__ == '__main__':
    # Start and end date of the query, handed to llq_main unchanged.
    query_window = ('2005-01-01', '2006-7-30')
    llq_main(*query_window)
比如這里的3205就是地區碼,最后一行的兩個時間就表示起止時間。
設置代理很簡單,我直接用了以前買的ssr,占用的是localhost的1080端口
# Local proxy endpoint as host:port.
proxy = '127.0.0.1:1080'
# requests-style proxy mapping: the same socks5 endpoint for both schemes.
proxies = {scheme: 'socks5://' + proxy for scheme in ('http', 'https')}
對于selenium用代理,需要如下設置:
# Hand the proxy endpoint to Chrome as a command-line switch.
proxy_flag = '--proxy-server=http://' + proxy
options.add_argument(proxy_flag)
完整代碼如下:
# coding=utf-8
import time
import re
import redis
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By

# Local proxy endpoint as host:port, plus a requests-style proxy mapping.
proxy = '127.0.0.1:1080'
proxies = {
    'http': 'socks5://' + proxy,
    'https': 'socks5://' + proxy,
}

# Redis connection that receives the harvested URLs; set host to your own server.
r = redis.Redis(host='127.0.0.1', port=6379, db=0)

# Headless Chrome routed through the proxy. The '--headless' switch and the
# options= keyword replace the deprecated set_headless() / chrome_options=.
options = webdriver.ChromeOptions()
options.add_argument('--proxy-server=http://' + proxy)
options.add_argument('--headless')
driver = webdriver.Chrome(options=options)
driver.get('http://www.landchina.com/default.aspx?tabid=263&ComName=default')  # open the query page

# Values later passed to page_zh(): i is the page number to start from, l is
# forwarded as-is.
i = 1
l = 0
date_list = []

# Wait for the page to load, then click condition item 270 -- presumably the
# checkbox enabling the date filter, since the date inputs share the 270 id.
time.sleep(8)
driver.find_element(By.ID, 'TAB_QueryConditionItem270').click()
def page_zh(i, l):
    """Walk the result pages from page *i* onward and store every detail link in Redis.

    For each page: read the total page count from the pager, jump to page
    *i*, wait for the table to refresh, then add every link found in the
    result table to the Redis set ``'mylist'``. Pages are visited in a loop
    (the original recursed once per page, which grew the call stack and kept
    every parsed page alive until the last one finished).

    Args:
        i: 1-based number of the first page to collect.
        l: Unused; kept so existing callers keep working.
    """
    # Local import: the generic find_element(s) form replaces the deprecated
    # find_element(s)_by_* helpers.
    from selenium.webdriver.common.by import By

    base_url = "http://www.landchina.com/"
    while True:
        pagers = driver.find_elements(By.CSS_SELECTOR, ".pager")
        if pagers:
            # The total page count is the first number in the second pager's text.
            pager_text = pagers[1].text
            numbers = re.findall(r'\d+', pager_text)
            pages = int(numbers[0])
            print("總頁數(shù)為:" + numbers[0])
            # First input receives the page number; the second is clicked to
            # jump -- presumably the "go" button.
            pager_inputs = driver.find_elements(By.CSS_SELECTOR, ".pager>input")
            pager_inputs[0].clear()
            pager_inputs[0].send_keys(str(i))
            print("第" + pager_inputs[0].get_attribute("value") + "頁")
            pager_inputs[1].click()
        else:
            # No pager on the page: treat the result as a single page.
            pages = 1

        time.sleep(4)

        # Parse the result table and collect every link it contains.
        html = driver.find_element(By.ID, 'TAB_contentTable').get_attribute('innerHTML')
        soup = BeautifulSoup(html, 'lxml')
        for anchor in soup.select('.queryCellBordy a'):
            link = base_url + anchor['href']
            print(link)
            r.sadd('mylist', link)

        if i >= pages:
            break
        i = i + 1

    print("本次采集結(jié)束!!!")
    # The browser is deliberately left open here (driver.quit() is not called).
def llq_main(start, end, region_code='3205'):
    """Submit one date-range / region query on the open page and harvest its results.

    Fills the two date inputs and the administrative-region input of the
    query form held by the module-level ``driver``, clicks the query button
    and then calls ``page_zh`` with the module-level ``i`` and ``l``.

    Args:
        start: Start date typed into the first date input, e.g. '2005-01-01'.
        end: End date typed into the second date input.
        region_code: Administrative-region code typed into the region input.
            Defaults to '3205', the value that used to be hard-coded here;
            the original note gives '3701' (Jinan) and '37' (Shandong) as
            further examples.
    """
    # Local import: the generic find_element(By.ID, ...) form replaces the
    # deprecated find_element_by_id helper.
    from selenium.webdriver.common.by import By

    def fill(element_id, text):
        # Clear an input and type a new value. The element is looked up again
        # after clear(), exactly as the original code did.
        driver.find_element(By.ID, element_id).clear()
        driver.find_element(By.ID, element_id).send_keys(text)

    print(start, end)
    time.sleep(2)

    # Date-range condition.
    fill('TAB_queryDateItem_270_1', start)
    fill('TAB_queryDateItem_270_2', end)

    # Administrative-region condition. The script switches the value input's
    # type attribute to 'text' first -- presumably so that it can be typed
    # into; confirm against the page markup.
    driver.find_element(By.ID, 'TAB_QueryConditionItem256').click()
    driver.execute_script("document.getElementById('TAB_queryTblEnumItem_256_v').setAttribute('type', 'text');")
    fill('TAB_queryTblEnumItem_256_v', region_code)

    driver.find_element(By.ID, 'TAB_QueryButtonControl').click()  # run the query
    page_zh(i, l)
if __name__ == '__main__':
    # Start and end date of the query, handed to llq_main unchanged.
    query_window = ('2005-01-01', '2006-7-30')
    llq_main(*query_window)
main02
該程序的思路很簡單:
0.selenium通過代理ip地址打開一個目標網站的界面,構造出一個可用的cookie:
def getCookie():
    """Load the query page in headless Chrome and return its cookies as one header string.

    Returns:
        ``name=value;`` pairs concatenated for the first six cookies the
        browser holds after the page has loaded (fewer when the site set
        fewer than six).
    """
    options = webdriver.ChromeOptions()
    options.add_argument('--proxy-server=http://' + proxy)
    # '--headless' and options= replace the deprecated set_headless() and
    # chrome_options=.
    options.add_argument('--headless')
    driver = webdriver.Chrome(options=options)
    try:
        driver.get('http://www.landchina.com/default.aspx?tabid=263&ComName=default')  # open the page
        time.sleep(5)
        cookies = driver.get_cookies()
    finally:
        # Always shut the browser down, even when loading the page fails, so
        # repeated attempts do not pile up Chrome processes.
        driver.quit()

    # The original accumulated into an undefined name (NameError) and indexed
    # exactly six cookies; slicing keeps the six-cookie limit without failing
    # when fewer are present.
    cookieStr = ''.join(c['name'] + '=' + c['value'] + ';' for c in cookies[:6])
    print(cookieStr)
    return cookieStr
1.從redis取一個url:
通過r.spop()函數隨機取一個url,調用parse()函數進行頁面解析
def checkRedis(sleepCounter, headers):
    """Pop URLs from the Redis set ``'mylist'`` and parse them until the set stays empty.

    Each popped URL is passed to ``parse`` after a two-second pause. While
    the set is empty the function sleeps one second per attempt; once
    ``sleepCounter`` reaches 100 it prints 'quit' and returns. The counter is
    never reset, so the limit applies to the total number of idle waits.

    Args:
        sleepCounter: Number of idle waits already used (callers pass 0).
        headers: Request headers forwarded to ``parse``.
    """
    while True:
        # Pop directly instead of checking the set size first: with several
        # workers sharing the set, another worker can take the last URL
        # between the size check and the pop. spop returns None when the set
        # is empty.
        url = r.spop('mylist')
        if url is not None:
            time.sleep(2)
            parse(url, headers)
        elif sleepCounter < 100:
            print('waiting...' + str(sleepCounter))
            sleepCounter += 1
            time.sleep(1)
        else:
            print('quit')
            break
2.通過代理ip地址,用最開始構造的cookie構造一個適用的headers;發出請求,完成頁面解析:
def createHeaders(cookie):
    """Return the browser-like request headers, carrying *cookie* as the Cookie header.

    Args:
        cookie: Cookie header value, e.g. the string produced by getCookie().

    Returns:
        A new dict of header name -> value.
    """
    header_pairs = [
        ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'),
        ('Accept-Encoding', 'gzip, deflate'),
        ('Accept-Language', 'zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7'),
        ('Cache-Control', 'max-age=0'),
        ('Connection', 'keep-alive'),
        ('Content-Type', 'application/x-www-form-urlencoded'),
        ('Host', 'www.landchina.com'),
        ('Origin', 'http://www.landchina.com'),
        ('Upgrade-Insecure-Requests', '1'),
        ('Cookie', cookie),
        ('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'),
    ]
    return dict(header_pairs)
具體的頁面解析我就沒有放出來了,有興趣的自己寫寫
def parse(url, headers):
    """Fetch *url* through the proxy and load the response body into PyQuery.

    Args:
        url: Address of the page to download.
        headers: Request headers sent with the GET.
    """
    response = requests.get(url, proxies=proxies, headers=headers)
    # Field extraction from `doc` is left out of this excerpt.
    doc = pq(response.text)
最后介紹一個我覺得很有用的包,叫做retry。顧名思義,用于方法出錯后重試的包,對方法進行修飾。
我這里把整個爬蟲的過程寫在一個方法中,并加上了retry的修飾(出錯了等待2秒后就重試,最多重試5次):
@retry(tries=5, delay=2)
def doTheJob():
    """Run one crawl: obtain a cookie, build headers from it, then drain the Redis queue.

    Wrapped by ``retry`` with tries=5 and delay=2, so a failure anywhere in
    the run restarts it from the cookie step.
    """
    request_headers = createHeaders(getCookie())
    checkRedis(0, headers=request_headers)


doTheJob()
最后完整代碼如下:
# coding=utf-8
import requests
from pyquery import PyQuery as pq
from redis import StrictRedis
from selenium import webdriver
import time
from pymongo import MongoClient
from retry import retry
# Redis connection; checkRedis() pops URLs from its 'mylist' set.
r = StrictRedis(host='127.0.0.1', port=6379, db=0)

# MongoDB via a default client; database and collection are both named 'landchina_qd'.
client = MongoClient()
db = client['landchina_qd']
collection = db['landchina_qd']

# Local proxy endpoint as host:port, and the requests-style mapping built from it.
proxy = '127.0.0.1:1080'
proxies = {scheme: 'socks5://' + proxy for scheme in ('http', 'https')}
def saveToMongo(data):
    """Insert *data* into the module-level MongoDB collection and report success.

    Uses ``insert_one`` / ``insert_many`` in place of ``Collection.insert``,
    which is deprecated and no longer available in current PyMongo releases.

    Args:
        data: A single document (dict) or a list of documents.
    """
    if isinstance(data, dict):
        saved = collection.insert_one(data).inserted_id is not None
    else:
        saved = bool(collection.insert_many(data).inserted_ids)
    if saved:
        print('Saved to mongo.')
def checkRedis(sleepCounter, headers):
    """Pop URLs from the Redis set ``'mylist'`` and parse them until the set stays empty.

    Each popped URL is passed to ``parse`` after a two-second pause. While
    the set is empty the function sleeps one second per attempt; once
    ``sleepCounter`` reaches 100 it prints 'quit' and returns. The counter is
    never reset, so the limit applies to the total number of idle waits.

    Args:
        sleepCounter: Number of idle waits already used (callers pass 0).
        headers: Request headers forwarded to ``parse``.
    """
    while True:
        # Pop directly instead of checking the set size first: with several
        # workers sharing the set, another worker can take the last URL
        # between the size check and the pop. spop returns None when the set
        # is empty.
        url = r.spop('mylist')
        if url is not None:
            time.sleep(2)
            parse(url, headers)
        elif sleepCounter < 100:
            print('waiting...' + str(sleepCounter))
            sleepCounter += 1
            time.sleep(1)
        else:
            print('quit')
            break
def getCookie():
    """Load the query page in headless Chrome and return its cookies as one header string.

    Returns:
        ``name=value;`` pairs concatenated for the first six cookies the
        browser holds after the page has loaded (fewer when the site set
        fewer than six).
    """
    options = webdriver.ChromeOptions()
    options.add_argument('--proxy-server=http://' + proxy)
    # '--headless' and options= replace the deprecated set_headless() and
    # chrome_options=.
    options.add_argument('--headless')
    driver = webdriver.Chrome(options=options)
    try:
        driver.get('http://www.landchina.com/default.aspx?tabid=263&ComName=default')  # open the page
        time.sleep(5)
        cookies = driver.get_cookies()
    finally:
        # Always shut the browser down, even when loading the page fails, so
        # retried runs do not pile up Chrome processes.
        driver.quit()

    # The original indexed exactly six cookies and raised IndexError when
    # fewer were present; slicing keeps the six-cookie limit without failing.
    strnew = ''.join(c['name'] + '=' + c['value'] + ';' for c in cookies[:6])
    print(strnew)
    return strnew
def createHeaders(cookie):
    """Return the browser-like request headers, carrying *cookie* as the Cookie header.

    Args:
        cookie: Cookie header value, e.g. the string produced by getCookie().

    Returns:
        A new dict of header name -> value.
    """
    header_pairs = [
        ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'),
        ('Accept-Encoding', 'gzip, deflate'),
        ('Accept-Language', 'zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7'),
        ('Cache-Control', 'max-age=0'),
        ('Connection', 'keep-alive'),
        ('Content-Type', 'application/x-www-form-urlencoded'),
        ('Host', 'www.landchina.com'),
        ('Origin', 'http://www.landchina.com'),
        ('Upgrade-Insecure-Requests', '1'),
        ('Cookie', cookie),
        ('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'),
    ]
    return dict(header_pairs)
def parse(url, headers):
    """Download one page through the proxy and store the extracted record in MongoDB.

    Args:
        url: Address of the page to download; also stored in the record.
        headers: Request headers sent with the GET.
    """
    response = requests.get(url, proxies=proxies, headers=headers)
    doc = pq(response.text)
    # The field-extraction step is deliberately omitted; write your own if interested.
    # NOTE(review): district, name, location, size, usage and price are not
    # assigned anywhere in the visible code, and `time` resolves to the
    # imported module unless the omitted step binds it locally -- confirm.
    record = {
        'district': district,
        'name': name,
        'location': location,
        'size': size,
        'usage': usage,
        'price': price,
        'time': time,
        'url': url
    }
    # saveToMongo receives a one-element list, as before.
    saveToMongo([record])
@retry(tries=5, delay=2)
def doTheJob():
    """Run one crawl: obtain a cookie, build headers from it, then drain the Redis queue.

    Wrapped by ``retry`` with tries=5 and delay=2, so a failure anywhere in
    the run restarts it from the cookie step.
    """
    request_headers = createHeaders(getCookie())
    checkRedis(0, headers=request_headers)


doTheJob()
最后程序運行結果很穩定,速度快的飛起

main01
該程序獲取交易信息的url,存入redis

main02
可以看到這里的main02 和 4 ,5 三個程序運行正常,他們都是跑的同一個程序。

分布式
勉強算個分布式爬蟲吧,畢竟只有一臺電腦。。。