Target site: http://gz.58.com/sale.shtml
Fields scraped: title, price, area, view count, number of interested buyers, item description, and number of item comments
Approach: enter each category's detail pages; parse with lxml (category URLs) and BeautifulSoup (listing and detail pages).
Storage: MongoDB & MySQL
First, get the URL of each Zhuanzhuan product category.
Next, enter a category page, taking second-hand phones as an example. The category link is http://gz.58.com//shouji/; after clicking through a few pages, the URL pattern for the personal listings in this category turns out to be http://gz.58.com//shouji/0/pn{}/, where substituting a page number for {} turns the page.
Promoted merchant listings still appear pinned at the top of each page, though; we will see later whether they can be filtered out during scraping (the detail-URL prefix check in page_spider.py below takes care of this).
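
To make the pagination rule concrete, here is a minimal sketch (main.py below walks pages 1–100; only three pages are printed here for illustration):

base = "http://gz.58.com/shouji/"
for page in range(1, 4):
    # Fill the pn{} slot exactly the way get_page() in page_spider.py does.
    print('{}0/pn{}/'.format(base, page))
# -> http://gz.58.com/shouji/0/pn1/
#    http://gz.58.com/shouji/0/pn2/
#    http://gz.58.com/shouji/0/pn3/
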
The fields scraped from each detail page are: title, view count, number of interested buyers, price, area, item description, and number of item comments.
Create a new folder on your machine, then create three .py files inside it:
- class_urls.py: gets the URL of each product category
- page_spider.py: collects the detail-page URLs and scrapes each page's information
- main.py: the main program

class_urls.py (only the simple second-level categories are used in this demo):
import requests
from lxml import etree

start_url = "http://gz.58.com/sale.shtml"
base_url = "http://gz.58.com"

r = requests.get(start_url)
html = etree.HTML(r.text)
infos = html.xpath('//li[@class="ym-tab"]')
for info in infos:
    urls = info.xpath('span/a/@href')          # simple second-level categories
    # urls = info.xpath('ul/li/span/a/@href')  # complex second-level categories
    for url in urls:
        class_url = base_url + url
        print(class_url)

# After printing the URLs, manually remove the ones that do not follow the page
# pattern, then store the remaining URLs as a string.
## The 11 simple second-level categories:
class_urls1 = '''
http://gz.58.com/shouji/
http://gz.58.com/danche/
http://gz.58.com//diandongche/
http://gz.58.com/diannao/
http://gz.58.com/shuma/
http://gz.58.com/jiadian/
http://gz.58.com/ershoujiaju/
http://gz.58.com/yingyou/
http://gz.58.com/fushi/
http://gz.58.com/meirong/
http://gz.58.com/wenti/
'''
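
Rather than pruning the printed URLs by hand, they could also be filtered automatically. A minimal sketch, assuming every valid category URL has the shape http://gz.58.com/<letters>/ (that pattern is an assumption on my part, not from the original post):

import re

# Assumed shape of a valid category URL; /+ tolerates the occasional double slash.
CATEGORY_RE = re.compile(r'^http://gz\.58\.com/+[a-z]+/$')

def filter_category_urls(urls):
    return [u for u in urls if CATEGORY_RE.match(u)]

print(filter_category_urls(class_urls1.split()))  # keeps all 11 category URLs
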
page_spider.py: in the single-process version, the detail-page links are returned via url_list. In the multi-process version, url_list is omitted (the links are written straight to MongoDB), and main.py has to be run differently, as shown further below.
import requests
from bs4 import BeautifulSoup
import time
import pymongo

client = pymongo.MongoClient('localhost', 27017)
mydb = client['mydb']
zhuanzhuan_url = mydb['zhuanzhuan_url']
zhuanzhuan_info = mydb['zhuanzhuan_info']

headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3294.6 Safari/537.36'}

def get_page(url, page):
    try:
        detail_url = '{}0/pn{}/'.format(url, page)
        r = requests.get(detail_url, headers=headers)
        soup = BeautifulSoup(r.text, "lxml")
        infos = soup.select("tr")
        # url_list = []  # single-process version
        for info in infos:
            detail_url = info.select("a.t")[0].get("href")
            # Personal Zhuanzhuan listings all live under this prefix, which
            # filters out the promoted merchant listings pinned to the top.
            if detail_url.startswith("http://zhuanzhuan.58.com/detail/"):
                # url_list.append(detail_url)  # single-process version
                zhuanzhuan_url.insert_one({'詳細頁面鏈接': detail_url})
            else:
                pass
        # return url_list  # single-process version
    except requests.exceptions.ConnectionError:
        pass

def get_info(url):
    try:
        r = requests.get(url, headers=headers)
        soup = BeautifulSoup(r.text, "lxml")
        title = soup.select("h1.info_titile")[0].text
        price = soup.select("span.price_now i")[0].text
        area = soup.select("div.palce_li span i")[0].text
        view = soup.select("span.look_time")[0].text
        want = soup.select("span.want_person")[0].text
        content = soup.select("div.baby_kuang p")[0].text
        left_num = soup.select("h3.box_title_h3 i")[0].text
        info = {'標題': title,
                '價格': price,
                '區(qū)域': area,
                '瀏覽量': view,
                '想買數(shù)': want,
                '寶貝描述內(nèi)容': content,
                '寶貝留言數(shù)': left_num,
                '鏈接': url
                }
        zhuanzhuan_info.insert_one(info)
        time.sleep(2)
    except IndexError:
        pass
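
One way to reduce the chance of being blocked is to randomize the request delay and rotate the User-Agent header instead of using the single fixed one above. A minimal sketch (the header list and delay range are my own choices, not from the original code):

import random
import time
import requests

# A couple of illustrative desktop User-Agent strings to rotate through.
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3294.6 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36',
]

def polite_get(url):
    time.sleep(random.uniform(1, 4))                 # irregular pacing looks less mechanical
    ua = {'user-agent': random.choice(USER_AGENTS)}  # vary the browser fingerprint
    return requests.get(url, headers=ua)
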
main.py (single-process version). Note that this version requires the three url_list lines in page_spider.py to be uncommented, so that get_page() actually returns the links:
from class_urls import class_urls1
from page_spider import get_page, get_info

url_list = class_urls1.split()
for url in url_list:
    for page in range(1, 101):
        detail_url_list = get_page(url, page)
        for detail_url in detail_url_list:
            get_info(detail_url)
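
One caveat with this loop: when get_page() swallows a ConnectionError, it implicitly returns None, and iterating over None raises a TypeError. A small defensive variant (my addition, not in the original):

for url in url_list:
    for page in range(1, 101):
        # get_page() returns None after a swallowed ConnectionError; default to [].
        for detail_url in get_page(url, page) or []:
            get_info(detail_url)
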
main.py (multi-process version):
- ① First, collect the detail-page links:
from multiprocessing import Pool
from class_urls import class_urls1
from page_spider import get_page, get_info

def get_links_from(url):
    for page in range(1, 101):
        get_page(url, page)

if __name__ == '__main__':
    url_list = class_urls1.split()       # turn the string into a list
    pool = Pool(processes=4)             # create a process pool
    pool.map(get_links_from, url_list)   # run the jobs through the pool
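
pool.map blocks until every category has been processed; after it returns, it is good practice to shut the pool down explicitly. Two lines one might append at the end of the __main__ block above:

    pool.close()   # stop accepting new tasks
    pool.join()    # wait for the worker processes to exit
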
- ② Then scrape the Zhuanzhuan item info (with resume support):
# Multi-process version
from multiprocessing import Pool
from class_urls import class_urls1
from page_spider import get_page, get_info
from page_spider import zhuanzhuan_url, zhuanzhuan_info

def get_links_from(url):
    for page in range(1, 101):
        get_page(url, page)

# Detail-page links already collected in MongoDB.
zz_urls = [item['詳細頁面鏈接'] for item in zhuanzhuan_url.find()]
# Links already scraped; this is simply empty when zhuanzhuan_info does not exist yet.
zz_urls_2 = [item['鏈接'] for item in zhuanzhuan_info.find()]
# The set difference drops the links that are already done, which is what makes
# interrupted runs resumable.
rest_urls = set(zz_urls) - set(zz_urls_2)

if __name__ == '__main__':
    url_list = class_urls1.split()          # turn the string into a list
    ## pool = Pool(processes=4)             # step ①: create a process pool
    ## pool.map(get_links_from, url_list)   # step ①: collect the links (run once, then comment out)
    pool = Pool(processes=4)                # create a process pool
    pool.map(get_info, rest_urls)           # scrape the remaining links through the pool
Because of network hiccups and other issues, scraping the item info does not always go smoothly, so it is worth adding a few more anti-blocking measures (see the sketch after page_spider.py above). Collecting the detail-page links, on the other hand, went smoothly, so that step only needs to run once before being commented out.
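
The header above lists MySQL alongside MongoDB, but only the MongoDB code appears in the post. A minimal sketch of the MySQL side, assuming pymysql, a local database mydb, and a pre-created table zhuanzhuan_info whose columns mirror the info dict (the connection parameters and schema are assumptions):

import pymysql

conn = pymysql.connect(host='localhost', user='root', password='password',
                       db='mydb', charset='utf8mb4')

def save_to_mysql(info):
    # info is the dict built in get_info(); map its keys onto the table columns.
    with conn.cursor() as cursor:
        cursor.execute(
            'INSERT INTO zhuanzhuan_info '
            '(title, price, area, views, wants, content, comments, url) '
            'VALUES (%s, %s, %s, %s, %s, %s, %s, %s)',
            (info['標題'], info['價格'], info['區(qū)域'], info['瀏覽量'],
             info['想買數(shù)'], info['寶貝描述內(nèi)容'], info['寶貝留言數(shù)'], info['鏈接']))
    conn.commit()

Calling save_to_mysql(info) right after zhuanzhuan_info.insert_one(info) inside get_info() would write each record to both stores.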


