完整代碼

# encoding:UTF-8

# from bs4 import BeautifulSoup

import collections
import csv
import datetime
import itertools
import Queue  # synchronized, thread-safe queue classes
import re
import robotparser
import time
import urllib2
import urlparse

import lxml.cssselect
import lxml.html

def crawl_sitemap(url, scrape_callback=None):
    """
    1. Crawl the pages listed in a sitemap.

    Downloads the sitemap at ``url``, extracts every ``<loc>`` entry and,
    when a callback is supplied, downloads each listed page and hands it to
    the callback.

    :param url: URL of the sitemap XML document
    :param scrape_callback: optional callable invoked as
        ``scrape_callback(link, html)`` for every page listed in the sitemap
    :return: None
    """
    sitemap = urllib2.urlopen(url).read()
    links = re.findall(r'<loc>(.*?)</loc>', sitemap)
    if scrape_callback:
        for link in links:
            html = urllib2.urlopen(link).read()
            scrape_callback(link, html)

def crawl_id(url, scrape_callback=None):
    """
    2. Crawl pages by iterating over consecutive numeric IDs.

    Page links are built as ``url + "/-<id>"`` starting from ID 1. The loop
    stops only after ``max_error`` consecutive download failures, so a single
    missing ID does not end the crawl.

    :param url: common prefix of the ID-based links, without a trailing "/"
    :param scrape_callback: optional callable invoked as
        ``scrape_callback(link, html)`` for every page that downloads
    :return: None
    """
    max_error = 5
    num_error = 0
    throttle = Throttle(5)
    for page in itertools.count(1):  # endless iterator starting at 1
        link = url + ("/-%d" % page)
        # Rate-limit before issuing the request, not after it.
        throttle.wait(link)
        try:
            html = urllib2.urlopen(link).read()
        except urllib2.URLError:
            # urlopen signals failure by raising (HTTPError is a URLError
            # subclass); map that to None so the error counter below works.
            html = None
        if html is None:
            num_error += 1
            if num_error == max_error:
                break
        else:  # the page exists
            if scrape_callback:
                scrape_callback(link, html)
            num_error = 0

def link_crawler(seed_url, link_regex=None, delay=-1, max_depth=1, max_urls=-1,
                 headers=None, user_agent="wswp", proxy=None, num_retries=2, scrape_callback=None):
    """
    3. Crawl by following links, depth-first.

    While the crawl queue is non-empty, each URL is checked against
    robots.txt, throttled, and downloaded. If the URL's depth differs from
    ``max_depth``, links matching ``link_regex`` are extracted and
    normalized; links not seen before are recorded, and those on the seed
    URL's domain are queued. The URL counter is incremented after every
    permitted visit. Passing a negative number disables the corresponding
    limit (``delay``, ``max_depth``, ``max_urls``).

    :param seed_url: URL the crawl starts from
    :param link_regex: regular expression identifying links to follow;
        when None no links are followed
    :param delay: minimum seconds between requests to the same domain
    :param max_depth: link depth at which link extraction stops
    :param max_urls: stop after this many visited URLs
    :param headers: optional dict of extra request headers (not modified)
    :param user_agent: user agent sent with requests and used for robots.txt
    :param proxy: optional proxy address passed to ``download``
    :param num_retries: retry count passed to ``download``
    :param scrape_callback: optional callable invoked as
        ``scrape_callback(url, html)`` for every downloaded page
    :return: None
    """
    crawl_queue = collections.deque([seed_url])
    seen = {seed_url: 0}  # URL -> depth at which it was discovered
    num_urls = 0
    rp = get_robots(seed_url)
    throttle = Throttle(delay)
    # Copy so the caller's dict is not mutated by the User-agent assignment.
    headers = dict(headers or {})
    if user_agent:
        headers['User-agent'] = user_agent
    while crawl_queue:
        url = crawl_queue.pop()  # pop from the right: LIFO, i.e. depth-first
        depth = seen[url]
        if not rp.can_fetch(user_agent, url):
            print("Blocked by robots.txt:", url)
            continue
        throttle.wait(url)
        html = download(url, headers, proxy=proxy, num_retries=num_retries)
        # download() returns None on failure; there is nothing to scrape or
        # to extract links from in that case.
        if html is not None:
            if scrape_callback:
                scrape_callback(url, html)
            if depth != max_depth and link_regex:
                for link in get_links(html):
                    if not re.match(link_regex, link):
                        continue
                    link = normalize(seed_url, link)
                    if link not in seen:
                        seen[link] = depth + 1
                        # Only same-domain links are queued for crawling.
                        if same_domain(seed_url, link):
                            crawl_queue.append(link)
        num_urls += 1
        if num_urls == max_urls:
            break

class Throttle(object):
    """
    Add a time delay between successive downloads from the same domain.
    """

    def __init__(self, delay):
        """
        :param delay: minimum number of seconds between two requests to the
            same domain; a value <= 0 disables throttling
        """
        self.delay = delay
        # Maps each domain to the datetime of its most recent access.
        self.domains = {}

    def wait(self, url):
        """
        Sleep if the domain of ``url`` was accessed less than ``delay``
        seconds ago, then record the current time as its last access.

        The domain is the ``netloc`` component produced by
        ``urlparse.urlparse``.

        :param url: URL about to be requested
        :return: None
        """
        domain = urlparse.urlparse(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            # total_seconds() covers the whole interval; the .seconds
            # attribute drops the days part and any fractional seconds.
            elapsed = (datetime.datetime.now() - last_accessed).total_seconds()
            sleep_secs = self.delay - elapsed
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.domains[domain] = datetime.datetime.now()

class ScrapeCallback(object):
    """
    Callback that extracts the country fields from "/view/" pages and
    appends them as rows to ``countries.csv``.
    """

    def __init__(self):
        # Keep the file handle so rows can be flushed and the file closed.
        self._file = open('countries.csv', 'w')
        self.writer = csv.writer(self._file)
        self.fields = ('area', 'population', 'iso', 'country', 'capital',
                       'continent', 'tld', 'currency_code', 'currency_name',
                       'phone', 'postal_code_format', 'postal_code_regex', 'languages')
        self.writer.writerow(self.fields)  # header row

    def __call__(self, url, html):
        """
        Write one CSV row for ``url`` if it is a target ("/view/") page.

        :param url: page URL, used to decide whether this is a target page
        :param html: downloaded page content; empty or None is ignored
        :return: None
        """
        if not html or not re.search('/view/', url):
            return
        tree = lxml.html.fromstring(html)
        row = []
        for field in self.fields:
            cells = tree.cssselect('table>tr#places_{}__row>td.w2p_fw'.format(field))
            # A page lacking a field yields an empty cell instead of raising.
            row.append(cells[0].text_content() if cells else '')
        self.writer.writerow(row)
        # Flush per row so an interrupted crawl keeps what was scraped.
        self._file.flush()

    def close(self):
        """Close the underlying CSV file."""
        self._file.close()

def download(url, headers, proxy, num_retries, data=None):
    """
    Download ``url`` and return its content.

    Builds a request with the given headers and, when ``proxy`` is set,
    routes it through a proxy handler keyed on the URL's scheme. On a 5xx
    HTTP error the download is retried up to ``num_retries`` times.

    :param url: URL to download
    :param headers: dict of request headers (None is treated as empty)
    :param proxy: proxy address handed to ``urllib2.ProxyHandler`` for the
        URL's scheme, or a false value for a direct connection
    :param num_retries: how many times to retry after a 5xx error
    :param data: optional request body
    :return: the page source, or None if the download failed
    """
    print("Downloading:", url)
    request = urllib2.Request(url, data, headers or {})
    opener = urllib2.build_opener()
    if proxy:
        proxy_params = {urlparse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib2.ProxyHandler(proxy_params))
    try:
        # Go through the opener so the proxy handler is actually applied.
        html = opener.open(request).read()
    except urllib2.URLError as e:
        print("Download error:", e.reason)
        html = None
        # Only server-side (5xx) errors are worth retrying.
        if num_retries > 0 and hasattr(e, 'code') and 500 <= e.code < 600:
            return download(url, headers, proxy, num_retries - 1, data)
    return html

def get_links(html):
    """
    Extract every link target found in ``<a ... href="...">`` tags.

    :param html: page source; None or an empty string yields no links
    :return: list of href values in document order
    """
    if not html:
        return []
    # Case-insensitive; accepts single- or double-quoted href values.
    webpage_regex = re.compile(r'<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return webpage_regex.findall(html)

def get_robots(url):
    """
    Fetch and parse the robots.txt of the site that ``url`` belongs to.

    :param url: any URL on the target site
    :return: a ``robotparser.RobotFileParser`` loaded with the site's
        robots.txt rules
    """
    rp = robotparser.RobotFileParser()
    rp.set_url(urlparse.urljoin(url, '/robots.txt'))
    rp.read()
    return rp

def normalize(seed_url, link):
    """
    Normalize a link: drop its fragment and make it absolute.

    :param seed_url: base URL that relative links are resolved against
    :param link: link as found in the page (relative or absolute)
    :return: absolute URL without the "#fragment" part
    """
    link, _ = urlparse.urldefrag(link)  # strip the part after "#"
    return urlparse.urljoin(seed_url, link)

def same_domain(url1, url2):
    """
    Return True when both URLs have the same network location.

    The comparison ignores letter case, since host names are
    case-insensitive.

    :param url1: first URL
    :param url2: second URL
    :return: True if the ``netloc`` components match, otherwise False
    """
    return urlparse.urlparse(url1).netloc.lower() == urlparse.urlparse(url2).netloc.lower()

def main():
    """Run the link crawler against the example site, saving rows to CSV."""
    # 1. Crawl via the links recorded in the sitemap
    # crawl_sitemap('http://example.webscraping.com/sitemap.xml', scrape_callback=ScrapeCallback())
    # 2. Crawl by ID
    # crawl_id('http://example.webscraping.com/places/default/view',scrape_callback=ScrapeCallback())
    # 3. Crawl by following links
    link_crawler('http://example.webscraping.com', '/places/default/(view|index)',
                 delay=0, num_retries=5, max_depth=2, user_agent='GoodCrawler', scrape_callback=ScrapeCallback())


# Guarded so that importing this module does not start a crawl.
if __name__ == '__main__':
    main()

色偷偷精品伊人,欧洲久久精品,欧美综合婷婷骚逼,国产AV主播,国产最新探花在线,九色在线视频一区,伊人大交九欧美,1769亚洲,黄色成人av

【Python】簡單的網絡爬蟲

【Python】簡單的網絡爬蟲

完整代碼

相關閱讀更多精彩內容

友情鏈接更多精彩內容

色偷偷精品伊人,欧洲久久精品,欧美综合婷婷骚逼,国产AV主播,国产最新探花在线,九色在线视频一区,伊人大交九 欧美,1769亚洲,黄色成人av

【Python】簡單的網絡爬蟲

完整代碼

相關閱讀更多精彩內容

友情鏈接更多精彩內容

色偷偷精品伊人,欧洲久久精品,欧美综合婷婷骚逼,国产AV主播,国产最新探花在线,九色在线视频一区,伊人大交九欧美,1769亚洲,黄色成人av