參考資源:
1.Web Scraping with Python & 中文版《Python網絡數據采集》
2.廖雪峰hashlib教程
項目內容:
維基百科里面有很多內嵌的鏈接,我們想看看,從某個名人鏈接出發,到達另外一個鏈接的界面,需要經過哪些鏈接。
我們項目是從李嘉誠到周杰倫,最后的結果:
['/wiki/Jay_Chou', '/wiki/Liu_Xiang_(hurdler)', '/wiki/Chinese_name', '/wiki/Li_Ka-shing']
總共訪問了大概210個網頁
項目思路:
這有點像一棵樹的遍歷,可以層序遍歷或者前序遍歷,或者說像圖的廣度遍歷和深度遍歷。本項目采用廣度遍歷,因為廣度遍歷可以找出最短的路徑。
代碼用一個列表 added_urls 存放已經訪問過的鏈接和待訪問的鏈接,這個過程熟悉圖的廣度遍歷的人應該知道。值得注意的是,為了在找到目的鏈接時可以追溯到底是經過哪些節點找到目標節點的,列表的每個元素不僅僅保存鏈接,還保存父鏈接在整個列表中的序號。
另外為了方便調試,第一次訪問某個網站都會在硬盤保存對應的html文件,文件名為網站鏈接計算出來的哈希值,后面再訪問的時候根據鏈接首先計算出哈希值,再看看有沒有對應的文件,如果有,直接讀取硬盤上的文件即可,硬盤加載文件可比網絡要快得多。
def get_urls_md5_hash(self, url):
    """Return the hexadecimal MD5 digest of *url*.

    The digest is used as the base name of the on-disk cache file for the
    page, so the same URL always maps to the same file.

    Args:
        url: The URL string to hash.

    Returns:
        A 32-character lowercase hexadecimal string.
    """
    # MD5 only derives a stable file name here; it is not a security measure.
    return hashlib.md5(url.encode()).hexdigest()
# Fetch from the network the first time and save to disk; read from disk afterwards.
def get_urls_html(self, url):
    """Return the raw HTML bytes for *url*, using a file cache in the working directory.

    Args:
        url: An absolute URL or a path that ``self.checkurl`` turns into one.

    Returns:
        The page content as bytes, or ``None`` when the download failed.
    """
    # Make sure the link carries the 'https://en.wikipedia.org' prefix.
    url = self.checkurl(url)
    print('正在訪問鏈接{url}...'.format(url = url))
    filename = '{md5}.html'.format(md5=self.get_urls_md5_hash(url))
    if os.path.exists(filename):
        with open(filename, 'rb') as f:
            return f.read()
    # On a download error, report it and carry on instead of aborting the crawl.
    try:
        with request.urlopen(url) as response:
            html = response.read()
    except Exception as exc:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
        print('讀取網(wǎng)頁失敗:{html}'.format(html=exc))
        return None
    if html:
        # Only non-empty responses are cached.
        with open(filename, 'wb') as f:
            f.write(html)
    return html
本項目還采用集合來判斷鏈接之前是不是已經加入added_urls了,集合查找的速度比較快。
# Queue a link only the first time it is seen; the set mirrors added_urls
# so the membership test does not have to scan the list.
if url not in self.set:
    self.added_urls.append([url, url_index])
    self.set.add(url)
全部代碼:
import functools
import hashlib
import os
import re
import time
from urllib import request

from bs4 import BeautifulSoup
# The queue is used for a level-order (breadth-first) traversal of the link tree.
# The set is used to check whether a link was already queued; set lookups are fast.
# Site-relative wiki paths that main() passes to Spider as start and end pages.
LIKASHING = '/wiki/Li_Ka-shing'
JAYCHOU = '/wiki/Jay_Chou'
# Decorator that measures how long a call takes.
def timing(f):
    """Wrap *f* so that every call prints its elapsed wall-clock time.

    Args:
        f: The callable to time.

    Returns:
        A wrapper with the same name and docstring as *f* that forwards all
        positional and keyword arguments and returns *f*'s result unchanged.
        Nothing is printed when *f* raises.
    """
    @functools.wraps(f)
    def wrap(*args, **kwargs):
        started = time.time()
        ret = f(*args, **kwargs)
        elapsed = time.time() - started
        print('%s function took %0.3f s' % (f.__name__, elapsed))
        return ret
    return wrap
class Spider(object):
    """Breadth-first crawler that looks for a chain of wiki links between two pages.

    Attributes:
        start_url: Page the search starts from.
        end_url: Link the search is looking for.
        added_urls: BFS queue and visit history in one list. Each entry is
            ``[url, parent_index]``, where ``parent_index`` is the position in
            this list of the page the link was found on (``-1`` for the start).
        set: The URLs already placed in ``added_urls``, for fast membership tests.
    """

    _BASE_URL = 'https://en.wikipedia.org'
    # Matches hrefs that start with /wiki/ and contain no ':' afterwards.
    # Compiled once instead of on every page.
    _WIKI_LINK = re.compile('^(/wiki/)((?!:).)*$')

    def __init__(self, start_url, end_url):
        self.start_url = start_url
        self.end_url = end_url
        self.added_urls = [[start_url, -1]]
        self.set = set()
        self.set.add(start_url)

    def checkurl(self, url):
        """Return *url* as an absolute URL.

        URLs that already start with ``http://`` or ``https://`` are returned
        unchanged; anything else is treated as a site-relative path and gets
        the English Wikipedia host prepended.
        """
        if url.startswith(('http://', 'https://')):
            return url
        return self._BASE_URL + url

    def get_urls_md5_hash(self, url):
        """Return the hexadecimal MD5 digest of *url*, used as the cache file name."""
        # MD5 only derives a stable file name here; it is not a security measure.
        return hashlib.md5(url.encode()).hexdigest()

    # Fetch from the network the first time and save to disk; read from disk afterwards.
    def get_urls_html(self, url):
        """Return the raw HTML bytes for *url*, using a file cache in the working directory.

        Args:
            url: An absolute URL or a site-relative path.

        Returns:
            The page content as bytes, or ``None`` when the download failed.
        """
        url = self.checkurl(url)
        print('正在訪問鏈接{url}...'.format(url = url))
        filename = '{md5}.html'.format(md5=self.get_urls_md5_hash(url))
        if os.path.exists(filename):
            with open(filename, 'rb') as f:
                return f.read()
        # On a download error, report it and carry on instead of aborting the crawl.
        try:
            with request.urlopen(url) as response:
                html = response.read()
        except Exception as exc:
            # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
            print('讀取網(wǎng)頁失敗:{html}'.format(html=exc))
            return None
        if html:
            # Only non-empty responses are cached.
            with open(filename, 'wb') as f:
                f.write(html)
        return html

    @timing
    def get_all_wiki_urls(self, url):
        """Return the wiki links found inside the ``bodyContent`` div of *url*.

        Returns:
            A list of href strings matching ``_WIKI_LINK``, or ``None`` when the
            page could not be fetched or has no ``<div id="bodyContent">``.
        """
        html = self.get_urls_html(url)
        if html is None:
            # The download failed and was already reported; nothing to parse.
            return None
        bs_obj = BeautifulSoup(html, 'lxml')
        body_content = bs_obj.find('div', id='bodyContent')
        if body_content is None:
            print('網(wǎng)頁{url}找不到bodyContent'.format(url=url))
            return None
        # Filtering on href already guarantees every returned tag has one.
        return [a_tag['href']
                for a_tag in body_content.find_all('a', href=self._WIKI_LINK)]

    # Breadth-first traversal of the wiki link graph.
    def find_wiki_degree_bs(self):
        """Search breadth-first from ``start_url`` for a page that links to ``end_url``.

        Returns:
            A list of links ordered from ``end_url`` back to ``start_url``, or
            ``None`` when every reachable page was visited without a match.
        """
        url_index = 0
        while url_index < len(self.added_urls):
            print('當前的url_index{url_index},元素個數(shù){length}'.format(url_index = url_index,length = len(self.added_urls)))
            current_url, parent_index = self.added_urls[url_index]
            # A page that failed to load or parse contributes no links.
            urls = self.get_all_wiki_urls(current_url) or []
            if self.end_url in urls:
                print('在{url}找到目標鏈接了!!!'.format(url = current_url))
                result = [self.end_url, current_url]
                # Follow parent indices back to the start page (parent -1).
                while parent_index >= 0:
                    ancestor_url, parent_index = self.added_urls[parent_index]
                    result.append(ancestor_url)
                return result
            for url in urls:
                if url not in self.set:
                    self.added_urls.append([url, url_index])
                    self.set.add(url)
            url_index += 1
        # The queue ran dry without finding the target.
        return None
def main():
    """Search for a link path between the LIKASHING and JAYCHOU pages and print it."""
    myspider = Spider(LIKASHING, JAYCHOU)
    result = myspider.find_wiki_degree_bs()
    print(result)
# Run the crawl only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
項目后續改進:
1.還可以用深度遍歷進行遍歷,但是求不出最短的路徑,找到有很大的隨機性
2.異步實現網絡加載,這樣子下載網頁就會快很多。
3.多進程解析網頁和保存網頁等等加快程序執行速度。
4.自己實現布隆過濾器來過濾網站減少內存的占用,現在是采用python的集合來存儲和判斷某個鏈接是否已經存在,占用內存很大