def get_page(url):
    """Fetch *url* and return its body as text, or "" on any failure.

    This is a best-effort fetch: a failure to open or read the URL is
    reported to the caller as an empty string rather than an exception.

    Args:
        url: Address of the page to download.

    Returns:
        The page body as a text string, or "" if it could not be fetched.
    """
    # urllib2 only exists on Python 2; urllib.request is its Python 3 home.
    try:
        import urllib2 as request_lib
    except ImportError:
        import urllib.request as request_lib
    try:
        response = request_lib.urlopen(request_lib.Request(url))
        try:
            body = response.read()
        finally:
            # Always release the connection, even when read() fails.
            response.close()
    except Exception:
        # Deliberately broad so one bad page never aborts a crawl, but unlike
        # a bare ``except`` this lets KeyboardInterrupt/SystemExit through.
        return ""
    if not isinstance(body, str):
        # Python 3 returns bytes; the link parser searches with str patterns.
        # NOTE(review): UTF-8 is assumed; the response charset is not checked.
        body = body.decode("utf-8", "replace")
    return body
def get_next_target(page):
    """Locate the first double-quoted ``<a href=`` target in *page*.

    Args:
        page: HTML text to scan.

    Returns:
        A ``(url, end_quote)`` pair where ``end_quote`` is the index of the
        quote that closes the URL, or ``(None, 0)`` when the page has no
        ``<a href=`` or the quotes around its target are missing.
    """
    start_link = page.find('<a href=')
    if start_link == -1:
        return None, 0
    start_quote = page.find('"', start_link)
    if start_quote == -1:
        # No opening quote after the anchor. Carrying -1 forward would make
        # the next search start at index 0 and return text from before the
        # anchor as if it were a URL.
        return None, 0
    end_quote = page.find('"', start_quote + 1)
    if end_quote == -1:
        # Unterminated quote: slicing up to -1 would silently drop the last
        # character and hand back a negative end position.
        return None, 0
    return page[start_quote + 1:end_quote], end_quote
def get_all_links(page):
    """Collect every non-empty link target found in *page*, in page order.

    Repeatedly asks ``get_next_target`` for the next link and then continues
    scanning from the reported end position.

    Args:
        page: HTML text to scan.

    Returns:
        A list of URL strings; duplicates are kept.
    """
    links = []
    while True:
        url, endpos = get_next_target(page)
        if url is None:
            break
        # An empty href ("") is skipped instead of ending the scan, so links
        # that follow it are still collected.
        if url:
            links.append(url)
        if endpos <= 0:
            # Only a positive end position moves the scan forward; stopping
            # here guarantees the loop terminates.
            break
        page = page[endpos:]
    return links
def union(a, b):
    """Append to list *a*, in place, each item of *b* it does not yet hold.

    Membership is re-checked against the growing *a*, so repeated items in
    *b* are only added once. Nothing is returned.
    """
    for candidate in b:
        if candidate in a:
            continue
        a.append(candidate)
def crawl_web(seed):
    """Crawl every page reachable from *seed*, building an index and a graph.

    Pages are taken from the end of the pending list, so the most recently
    discovered link is visited next.

    Args:
        seed: URL of the page to start from.

    Returns:
        An ``(index, graph)`` pair. ``index`` is the keyword index filled in
        by ``add_page_to_index``; ``graph`` maps each crawled URL to the list
        of links found on that page (its outlinks).
    """
    tocrawl = [seed]
    # A set keeps the "already visited?" test constant-time per page.
    crawled = set()
    graph = {}  # <url> -> [list of pages it links to]
    index = {}
    while tocrawl:
        page = tocrawl.pop()
        if page in crawled:
            continue
        content = get_page(page)
        add_page_to_index(index, page, content)
        outlinks = get_all_links(content)
        graph[page] = outlinks
        union(tocrawl, outlinks)
        crawled.add(page)
    return index, graph
def add_page_to_index(index, url, content):
    """Register *url* in *index* under each whitespace-separated token of *content*.

    Tokens are passed to ``add_to_index`` exactly as ``str.split()`` yields
    them, so punctuation and markup stay attached to the words.
    """
    for token in content.split():
        add_to_index(index, token, url)
def add_to_index(index, keyword, url):
    """Record that *url* contains *keyword*, listing each URL at most once.

    Args:
        index: Dict mapping keyword -> list of URLs; modified in place.
        keyword: Word to file the URL under.
        url: Page on which the word was found.
    """
    urls = index.setdefault(keyword, [])
    # A word repeated on one page must not list that page repeatedly, or
    # lookups and ranked searches return the same URL several times.
    if url not in urls:
        urls.append(url)
def lookup(index, keyword):
    """Return the entry stored under *keyword* in *index*, or None if absent."""
    return index[keyword] if keyword in index else None
def compute_ranks(graph, d=0.8, numloops=10):
    """Compute a PageRank-style score for every page in *graph*.

    Every page starts at ``1 / npages``. On each pass a page's new rank is
    ``(1 - d) / npages`` plus, for every page that links to it,
    ``d * rank(linker) / len(linker's outlink list)``.

    Args:
        graph: Dict mapping each page to the list of pages it links to.
            Links to pages that are not keys of *graph* are ignored. A page
            listed several times by one linker is credited once, while the
            divisor still counts every entry of the list.
        d: Damping factor.
        numloops: Number of relaxation passes to run.

    Returns:
        Dict mapping page -> rank (empty for an empty graph).
    """
    npages = len(graph)
    ranks = {}
    for page in graph:
        ranks[page] = 1.0 / npages
    for _ in range(numloops):
        newranks = {}
        for page in graph:
            newranks[page] = (1 - d) / npages
        # Push each page's share along its outgoing links in a single pass
        # over the edges, instead of scanning every page's outlink list once
        # per target page.
        for node in graph:
            outlinks = graph[node]
            if not outlinks:
                continue
            share = d * (ranks[node] / len(outlinks))
            for target in set(outlinks):
                if target in newranks:
                    newranks[target] = newranks[target] + share
        ranks = newranks
    return ranks
def quick_sort(url_lst, ranks):
    """Return the URLs of *url_lst* ordered from highest to lowest rank.

    URLs with equal rank keep their original relative order.

    Args:
        url_lst: List of URLs to order.
        ranks: Dict mapping URL -> rank; must contain every URL whenever
            *url_lst* has more than one element.

    Returns:
        *url_lst* itself when it has at most one element, otherwise a new
        list sorted by descending rank.
    """
    if len(url_lst) <= 1:
        return url_lst
    # The built-in sort is stable (also with reverse=True) and not recursive,
    # so long or already-ordered inputs cannot exhaust the recursion limit
    # the way a first-element-pivot quicksort does.
    return sorted(url_lst, key=lambda url: ranks[url], reverse=True)
def ordered_search(index, ranks, keyword):
    """Return the URLs indexed under *keyword*, ordered by ``quick_sort``.

    Returns None when *keyword* is not present in *index*.
    """
    if keyword not in index:
        return None
    return quick_sort(index[keyword], ranks)
# Demo run: crawl the course test site, rank its pages, then run one query.
index, graph = crawl_web('http://udacity.com/cs101x/urank/index.html')
ranks = compute_ranks(graph)
# A parenthesised single argument prints the same under the Python 2 print
# statement and the Python 3 print function.
print(ranks)
# The keyword keeps its trailing markup because page text is indexed by
# splitting on whitespace only.
print(ordered_search(index, ranks, 'Chef</h1>'))
# Chef</h1>
搜索
最后编辑于 :
©著作权归作者所有,转载或内容合作请联系作者
【社区内容提示】社区部分内容疑似由AI辅助生成,浏览时请结合常识与多方信息审慎甄别。
平台声明:文章内容(如有图片或视频亦包括在内)由作者上传并发布,文章内容仅代表作者本人观点,简书系信息发布平台,仅提供信息存储服务。
【社区内容提示】社区部分内容疑似由AI辅助生成,浏览时请结合常识与多方信息审慎甄别。
平台声明:文章内容(如有图片或视频亦包括在内)由作者上传并发布,文章内容仅代表作者本人观点,简书系信息发布平台,仅提供信息存储服务。
相关阅读更多精彩内容
- 轻度集成,侵入性低,配合searchcore引擎一句代码集成显示文字高亮效果,匹配文字显示高亮逻辑参考微信,满足大...
- 依赖 建立索引 本次增加了Float、Int类型的域 搜索排序 范围搜索 查询score范围在1~5之间的文档,对...
- 1、构造数据 定义 mapping 查看 mapping 插入数据 查看数据 2、前缀搜索,搜索以 index 开...
- 如果想构建一个图像搜索引擎,那如何对图像进行搜索呢?一种方式是依赖于与图像相关联的标签、关键字和文字描述,这种称为...
- 我在一所規(guī)模并不很大的成人職業(yè)培訓(xùn)學(xué)校任教,因此認(rèn)識(shí)了挺多來(lái)自五湖四海的同學(xué),他們當(dāng)中,有家庭很富有的當(dāng)?shù)乩?..