一、目標
爬取轉轉二手商品數據
二、 設計工作流程
- 手繪工作流程,首先應從首頁獲得所有大類目鏈接。再從某大類里獲取所有商品的鏈接,并寫出第一個相關函數,對應爬蟲一,對應數據庫文件一
- 編寫獲得具體商品信息的函數,對應爬蟲二,對應數據庫文件二
- 爬蟲一獲得鏈接并存儲至數據庫一,爬蟲二從數據庫一獲取鏈接,將所有鏈接對應商品的信息保存至數據庫二。
三、 具體步驟
1. 新建項目,獲取所有類目鏈接
新增文件 channel_extract.py
import requests
from bs4 import BeautifulSoup
start_url = 'http://bj.ganji.com/wu/'


def get_category_urls(start_url):
    """Print and return the absolute URL of every top-level category.

    Downloads ``start_url``, selects the category anchors with the CSS
    selector ``div.main-pop dt a`` and resolves each ``href`` against
    ``start_url``.

    Args:
        start_url: URL of the index page that lists the categories.

    Returns:
        A list of absolute category URLs, in page order.
    """
    # Local import so this block does not depend on the file's import header.
    from urllib.parse import urljoin

    wb_data = requests.get(start_url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    # select() returns a list of Tag objects; Tag.get() reads an attribute
    # and returns None when the attribute is absent.
    category_urls = []
    for anchor in soup.select('div.main-pop dt a'):
        href = anchor.get('href')
        if not href:
            # An anchor without href would otherwise become ".../None".
            continue
        # urljoin keeps the scheme and host of start_url for site-relative
        # links and leaves already-absolute links untouched.
        category_url = urljoin(start_url, href)
        print(category_url)
        category_urls.append(category_url)
    return category_urls
#get_category_urls(start_url)
# Whitespace-separated category URLs kept as one multi-line string; the
# commented-out stage in main.py turns it into a list with .split().
# NOTE(review): the dotted last line looks like a placeholder for entries
# omitted from this listing -- .split() would yield it as an item, so replace
# it with the real URLs before use.
category_url_list = '''
http://bj.ganji.com/shouji/
http://bj.ganji.com/shoujipeijian/
http://bj.ganji.com/bijibendiannao/
http://bj.ganji.com/taishidiannaozhengji/
http://bj.ganji.com/diannaoyingjian/
...................... '''
2. 爬取商品鏈接及商品具體信息
新建文件page_parsing.py
import requests
from bs4 import BeautifulSoup
import time
import pymongo
# MongoDB handles shared by the crawler functions below and imported by
# main.py and counts.py.
client = pymongo.MongoClient(host='localhost', port=27017)
ganji1 = client['ganji1_db']
# Collection holding one {'url': ...} document per item link.
item_url_list = ganji1['item_url_list_db']
# Collection holding the scraped details of each item.
item_info = ganji1['item_info_db']
# ========= Collect the item links of one category listing page ========= #
def get_item_urls_from(category, pages):
    """Store the item links found on one listing page of a category.

    The page URL is ``category`` followed by ``o<pages>/``. Every link found
    is stripped of its query string, inserted into ``item_url_list`` as
    ``{'url': ...}`` and printed.

    Args:
        category: Category base URL, ending with a slash.
        pages: Number of the listing page to fetch.
    """
    listing_url = '{}o{}/'.format(category, str(pages))
    response = requests.get(listing_url)
    time.sleep(1)  # pause for one second after every request
    page = BeautifulSoup(response.text, 'lxml')

    # td.t is the item-title cell; a page without one lists no items, which
    # is how paging past the last page is detected.
    if not page.find('td', 't'):
        return

    # Links inside tr.zzinfo.jz rows are removed from the full set; per the
    # original author's note these are merchant listings, leaving only
    # listings posted by individuals.
    excluded_links = set(page.select('tr.zzinfo.jz td.t a'))
    all_links = set(page.select('td.t a'))
    for link in all_links - excluded_links:
        item_url = link.get('href').split('?')[0]
        item_url_list.insert_one({'url': item_url})
        print(item_url)
#get_item_urls_from('http://bj.ganji.com/shouji/',1)
# ========= Scrape the detail page of a single item ========= #
def get_item_info(url):
    """Scrape one item page and store its details in ``item_info``.

    Pages that answer 404, and pages that lack any of the expected elements,
    are skipped silently so that one odd page cannot abort a whole crawl.

    Args:
        url: Address of the item detail page.
    """
    wb_data = requests.get(url)
    # status_code is an attribute of the requests response; 404 means the
    # listing no longer exists.
    if wb_data.status_code == 404:
        return

    soup = BeautifulSoup(wb_data.text, 'lxml')
    try:
        data = {
            'title': soup.title.text,
            'price': soup.select('span.price_now i')[0].text,
            'area': soup.select('.palce_li i')[0].text,
            'url': url,
        }
    except (AttributeError, IndexError):
        # AttributeError: the page has no <title> (soup.title is None).
        # IndexError: a selector matched nothing, so [0] failed; the original
        # did not catch this and the exception escaped to the caller.
        return

    item_info.insert_one(data)
    print(data)
#get_item_info('http://zhuanzhuan.ganji.com/detail/923140825311903756z.shtml')
3. 新建運行主程序 main.py
from multiprocessing import Pool
from page_parsing import get_item_urls_from
from page_parsing import get_item_info
from page_parsing import item_url_list
from page_parsing import item_info
from channel_extract import category_url_list
# ============= Resume support: skip work already done before an interruption ============= #
# Every document in either collection carries its address under the 'url' key.
db_urls = [record['url'] for record in item_url_list.find()]    # links collected so far
index_urls = [record['url'] for record in item_info.find()]     # links whose details are already stored
x = set(db_urls)
y = set(index_urls)
# Links that were collected but not yet scraped.
rest_of_urls = x.difference(y)
def get_all_urls_from(category, last_page=4):
    """Collect the item links of listing pages 1..last_page of a category.

    Args:
        category: Category base URL passed through to get_item_urls_from.
        last_page: Number of the last listing page to fetch (inclusive).
            Defaults to 4, the range that used to be hard-coded.
    """
    for page in range(1, last_page + 1):
        get_item_urls_from(category, page)
def get_all_item_info(url):
    """Scrape the detail page of one pending item URL.

    This is the per-item worker handed to ``Pool.map``, which calls it once
    for every element of ``rest_of_urls``. It must therefore handle only the
    ``url`` it receives: looping over ``rest_of_urls`` here, as the original
    did, made every call re-scrape the complete set.

    Args:
        url: Address of the item detail page to scrape.
    """
    get_item_info(url)
if __name__ == '__main__':
    # ============= Stage 1: collect all item links (left commented out) ============= #
    # pool = Pool(processes=4)
    # pool.map(get_all_urls_from, category_url_list.split())  # split() turns the long string into a list; map takes its elements one by one
    # pool.close()
    # pool.join()
    # ============= Stage 2: scrape the details of every pending item ============= #
    workers = Pool(processes=4)
    workers.map(get_all_item_info, rest_of_urls)
    # No more tasks will be submitted; wait for the worker processes to exit.
    workers.close()
    workers.join()
4. 建立監控程序 counts.py
import time
from page_parsing import item_url_list
from page_parsing import item_info
# Progress monitor: once per second print how many links and how many item
# records are stored. Runs until interrupted.
while True:
    time.sleep(1)
    # Cursor.count() was deprecated in PyMongo 3.7 and removed in 4.0;
    # count_documents({}) counts every document in the collection.
    print(item_url_list.count_documents({}))
    print(item_info.count_documents({}))
5. 忽略報錯功能(防止程序中斷)
def try_to_make(ass):
    """Print the reciprocal of ``ass``, or 'ok~' when it cannot be computed.

    Demonstrates swallowing an expected error: division by zero and
    non-numeric operands are caught instead of stopping the program.
    """
    try:
        reciprocal = 1 / ass
    except (ZeroDivisionError, TypeError):
        print('ok~')
    else:
        print(reciprocal)


# A string operand raises TypeError, so this call prints 'ok~'.
try_to_make('0')
# 如上,通過 try except 實現忽略錯誤,可添加在正常程序中,這里只是示范(重要程序有錯必究,不能忽略的)
6. 運行
打開終端,開啟3個窗口,切換到程序文件夾中,第一個窗口輸入mongod,mongo,好了,mongo已開啟
第二個窗口輸入 python3 counts.py
第三個窗口輸入 python3 main.py
好了,開始抓取數據了,成功

運行截圖