Crawler Proxy Middleware

Two pieces work together here: a Scrapy downloader middleware that rotates HTTP proxies, and a helper module that scrapes fresh free proxies off the web. Treat the code as a starting point and adapt the details to your own project.
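Assuming the module path used in the settings.py snippet further down ('myProject.HttpProxyMiddlewares.HttpProxyMiddleware'), a minimal layout could look like the sketch below; the file names are the ones the code itself references, though where proxyes.dat gets read from depends on the working directory Scrapy runs in:

myProject/
    HttpProxyMiddlewares.py   # the downloader middleware below
    fetch_free_proxy.py       # the free-proxy fetcher further down
    proxyes.dat               # optional seed list, one ip:port per line (no http://)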

#!/usr/bin/python
# -*- coding: utf-8 -*-
import os
import logging
from datetime import datetime, timedelta
from twisted.web._newclient import ResponseNeverReceived
from twisted.internet.error import TimeoutError, ConnectionRefusedError, ConnectError
import fetch_free_proxy

logger = logging.getLogger(__name__)


class HttpProxyMiddleware(object):
    # Errors of these types mean the proxy is unusable; handle them here instead of passing them on to the retry middleware
    DONT_RETRY_ERRORS = (TimeoutError, ConnectionRefusedError, ResponseNeverReceived, ConnectError, ValueError)

    def __init__(self, settings):
        # Timestamp of the last direct (proxy-less) connection
        self.last_no_proxy_time = datetime.now()
        # Switch back to direct connections after this many minutes, since proxies cost speed
        self.recover_interval = 10
        # If a proxy keeps timing out before its request count reaches this number, remove it
        # from the proxy file permanently. Set to 0 to leave the proxy file untouched.
        self.dump_count_threshold = 20
        # File holding the proxy list, one proxy per line in ip:port form (no http:// prefix).
        # Note that this file gets rewritten, so keep a backup.
        self.proxy_file = "proxyes.dat"
        # Whether to invalidate a proxy on timeout
        self.invalid_proxy_flag = True
        # When the number of valid proxies (including the direct connection) drops below this,
        # fetch new ones from the web. Size it so that each IP gets enough rest after being hit
        # with a captcha: rotating among, say, ten usable proxies means each IP only comes up
        # again after several minutes, long enough to serve some requests captcha-free.
        # If it is too small (say two), IP A gets banned after a few pages, then B does too,
        # and the whole crawler ends up busy-waiting, which hurts throughput.
        self.extend_proxy_threshold = 10
        # Initialize the proxy list
        self.proxyes = [{"proxy": None, "valid": True, "count": 0}]
        # Start with proxy 0, i.e. no proxy (direct connection)
        self.proxy_index = 0
        # Number of trusted proxies (e.g. HTTP proxies you run yourself), plus 1 for the direct connection
        self.fixed_proxy = len(self.proxyes)
        # When new proxies were last fetched
        self.last_fetch_proxy_time = datetime.now()
        # Force-fetch new proxies at this fixed interval (minutes)
        self.fetch_proxy_interval = 120
        # A proxy about to be invalidated is spared if it has already fetched more pages than this
        self.invalid_proxy_threshold = 200
        # Load the initial proxies from file
        if os.path.exists(self.proxy_file):
            with open(self.proxy_file, "r") as fd:
                lines = fd.readlines()
                for line in lines:
                    line = line.strip()
                    if not line or self.url_in_proxyes("http://" + line):
                        continue
                    self.proxyes.append({"proxy": "http://" + line,
                            "valid": True,
                            "count": 0})

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings)

    def url_in_proxyes(self, url):
        """
        Return whether a proxy URL is already in the proxy list
        """
        for p in self.proxyes:
            if url == p["proxy"]:
                return True
        return False

    def reset_proxyes(self):
        """
        Reset every proxy whose count >= dump_count_threshold back to valid
        """
        logger.info("reset proxyes to valid")
        for p in self.proxyes:
            if p["count"] >= self.dump_count_threshold:
                p["valid"] = True

    def fetch_new_proxyes(self):
        """
        Fetch new proxies from the web and append them to the proxy list
        """
        logger.info("extending proxyes using fetch_free_proxyes.py")
        new_proxyes = fetch_free_proxy.fetch_all()
        logger.info("new proxyes: %s" % new_proxyes)
        self.last_fetch_proxy_time = datetime.now()

        for np in new_proxyes:
            if self.url_in_proxyes("http://" + np):
                continue
            else:
                self.proxyes.append({"proxy": "http://"  + np,
                                     "valid": True,
                                     "count": 0})
        if self.len_valid_proxy() < self.extend_proxy_threshold: # 如果發(fā)現(xiàn)抓不到什么新的代理了, 縮小threshold以避免白費(fèi)功夫
            self.extend_proxy_threshold -= 1

    def len_valid_proxy(self):
        """
        Return the number of valid proxies in the list
        """
        count = 0
        for p in self.proxyes:
            if p["valid"]:
                count += 1
        return count

    def inc_proxy_index(self):
        """
        Advance proxy_index to the next valid proxy in the list.
        If only the fixed_proxy entries are still valid, reset the list.
        If the last fetch was longer ago than the configured interval, fetch new proxies.
        """
        assert self.proxyes[0]["valid"]
        while True:
            self.proxy_index = (self.proxy_index + 1) % len(self.proxyes)
            if self.proxyes[self.proxy_index]["valid"]:
                break

        # Two visits to proxy_index == 0 within a short interval means we are thrashing on captchas; extend the proxy list
        if self.proxy_index == 0 and datetime.now() < self.last_no_proxy_time + timedelta(minutes=2):
            logger.info("captcha thrashing")
            self.fetch_new_proxyes()

        if self.len_valid_proxy() <= self.fixed_proxy or self.len_valid_proxy() < self.extend_proxy_threshold:  # too few valid proxies left, reset them to valid
            self.reset_proxyes()

        if self.len_valid_proxy() < self.extend_proxy_threshold:  # still not enough proxies, fetch new ones
            logger.info("valid proxy < threshold: %d/%d" % (self.len_valid_proxy(), self.extend_proxy_threshold))
            self.fetch_new_proxyes()

        logger.info("now using new proxy: %s" % self.proxyes[self.proxy_index]["proxy"])

        # If nothing has been fetched for a while, we may be stuck cycling through the
        # current proxies and hitting captcha after captcha; force-fetch new ones.
        #if datetime.now() > self.last_fetch_proxy_time + timedelta(minutes=self.fetch_proxy_interval):
        #    logger.info("%d minutes since last fetch" % self.fetch_proxy_interval)
        #    self.fetch_new_proxyes()

    def set_proxy(self, request):
        """
        Set the request to use the current or next valid proxy
        """
        proxy = self.proxyes[self.proxy_index]
        if not proxy["valid"]:
            self.inc_proxy_index()
            proxy = self.proxyes[self.proxy_index]

        if self.proxy_index == 0:  # refresh self.last_no_proxy_time whenever we download directly without a proxy
            self.last_no_proxy_time = datetime.now()

        if proxy["proxy"]:
            request.meta["proxy"] = proxy["proxy"]
        elif "proxy" in request.meta.keys():
            del request.meta["proxy"]
        request.meta["proxy_index"] = self.proxy_index
        proxy["count"] += 1

    def invalid_proxy(self, index):
        """
        Mark the proxy at index as invalid and advance proxy_index
        to the next valid proxy.
        """
        if index < self.fixed_proxy:  # trusted proxies are never invalidated
            self.inc_proxy_index()
            return

        if self.proxyes[index]["valid"]:
            logger.info("invalidate %s" % self.proxyes[index])
            self.proxyes[index]["valid"] = False
            if index == self.proxy_index:
                self.inc_proxy_index()

            if self.proxyes[index]["count"] < self.dump_count_threshold:
                self.dump_valid_proxy()

    def dump_valid_proxy(self):
        """
        Save the valid proxies in the list to the proxy file
        """
        if self.dump_count_threshold <= 0:
            return
        logger.info("dumping proxyes to file")
        with open(self.proxy_file, "w") as fd:
            for i in range(self.fixed_proxy, len(self.proxyes)):
                p = self.proxyes[i]
                if p["valid"] or p["count"] >= self.dump_count_threshold:
                    fd.write(p["proxy"][7:]+"\n") # 只保存有效的代理

    def process_request(self, request, spider):
        """
        Set the request to use a proxy
        """
        if self.proxy_index > 0 and datetime.now() > (self.last_no_proxy_time + timedelta(minutes=self.recover_interval)):
            logger.info("%d minutes passed, recovering from proxy to direct connection" % self.recover_interval)
            self.last_no_proxy_time = datetime.now()
            self.proxy_index = 0
        request.meta["dont_redirect"] = True  # 有些代理會把請求重定向到一個莫名其妙的地址

        # The spider hit a parse error and asked for a proxy change
        if request.meta.get("change_proxy"):
            logger.info("proxy change requested by spider: %s" % request)
            self.invalid_proxy(request.meta["proxy_index"])
            request.meta["change_proxy"] = False
        self.set_proxy(request)

    def process_response(self, request, response, spider):
        """
        Check response.status; if it is not in the allowed status codes, switch to the next proxy or disable the current one
        """
        if "proxy" in request.meta.keys():
            logger.debug("%s %s %s" % (request.meta["proxy"], response.status, request.url))
        else:
            logger.debug("None %s %s" % (response.status, request.url))

        # If the status is not a normal 200/404 and not in the list of statuses the
        # spider declares as expected during a normal crawl, consider the proxy dead and switch
        if response.status not in [200, 404] \
                and (not hasattr(spider, "website_possible_httpstatus_list") \
                             or response.status not in spider.website_possible_httpstatus_list):
            logger.info("response status not in spider.website_possible_httpstatus_list")
            self.invalid_proxy(request.meta["proxy_index"])
            new_request = request.copy()
            new_request.dont_filter = True
            return new_request
        else:
            return response

    def process_exception(self, request, exception, spider):
        """
        處理由于使用代理導(dǎo)致的連接異常
        """
        # logger.debug("%s exception: %s" % (self.proxyes[request.meta["proxy_index"]]["proxy"], exception))
        request_proxy_index = request.meta["proxy_index"]

        # Only invalidate when proxy_index > fixed_proxy - 1, which guarantees the local direct connection always survives.
        if isinstance(exception, self.DONT_RETRY_ERRORS):
            if request_proxy_index > self.fixed_proxy - 1 and self.invalid_proxy_flag:  # WARNING: on a direct-connection timeout, switch to a proxy or just retry? A policy decision.
                if self.proxyes[request_proxy_index]["count"] < self.invalid_proxy_threshold:
                    self.invalid_proxy(request_proxy_index)
                elif request_proxy_index == self.proxy_index:  # it timed out, but it has served us well so far, so don't invalidate it
                    self.inc_proxy_index()
            else:  # just switch proxies without disabling anything
                if request_proxy_index == self.proxy_index:
                    self.inc_proxy_index()
            new_request = request.copy()
            new_request.dont_filter = True
            return new_request

Usage:

Add the middleware to DOWNLOADER_MIDDLEWARES in settings.py:

DOWNLOADER_MIDDLEWARES = {
    'myProject.HttpProxyMiddlewares.HttpProxyMiddleware': 543,
}
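
The middleware also exposes two spider-side hooks, both visible in the code above: setting request.meta["change_proxy"] = True makes the middleware invalidate the proxy that produced the response, and a website_possible_httpstatus_list attribute on the spider whitelists status codes that are normal for the target site. A minimal, untested sketch assuming Scrapy 1.x; the spider name, start URL, and the "div.item" selector are placeholders:

#!/usr/bin/python
# -*- coding: utf-8 -*-
import scrapy


class ExampleSpider(scrapy.Spider):
    name = "example"
    start_urls = ["http://example.com/list"]
    # Statuses the middleware should treat as normal for this site
    # instead of invalidating the proxy (see process_response above).
    website_possible_httpstatus_list = [301, 302]

    def parse(self, response):
        if not response.css("div.item"):
            # Looks like a ban/captcha page: ask HttpProxyMiddleware to
            # invalidate the proxy this request used and retry through a new one.
            retry = response.request.copy()
            retry.meta["change_proxy"] = True
            retry.dont_filter = True
            yield retry
            return
        for item in response.css("div.item"):
            yield {"text": item.css("::text").extract_first()}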

The companion module, fetch_free_proxy.py, which scrapes and sanity-checks free proxies:

#!/usr/bin/python
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import urllib2
import logging

logger = logging.getLogger(__name__)


def get_html(url):
    request = urllib2.Request(url)
    request.add_header("User-Agent",
                       "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.99 Safari/537.36")
    html = urllib2.urlopen(request)
    return html.read()


def get_soup(url):
    soup = BeautifulSoup(get_html(url), "lxml")
    return soup


def fetch_kxdaili(page):
    """
    Fetch free proxies from www.kxdaili.com
    """
    proxyes = []
    try:
        url = "http://www.kxdaili.com/dailiip/1/%d.html" % page
        soup = get_soup(url)
        table_tag = soup.find("table", attrs={"class": "segment"})
        trs = table_tag.tbody.find_all("tr")
        for tr in trs:
            tds = tr.find_all("td")
            ip = tds[0].text
            port = tds[1].text
            latency = tds[4].text.split(" ")[0]
            if float(latency) < 0.5:  # keep only proxies with latency under 0.5 seconds
                proxy = "%s:%s" % (ip, port)
                proxyes.append(proxy)
    except Exception:
        logger.warning("fail to fetch from kxdaili")
    return proxyes


def img2port(img_url):
    """
    mimvp.com renders port numbers as images; this function maps an image URL to a port. A temporary hack, and not accurate.
    """
    code = img_url.split("=")[-1]
    if code.find("AO0OO0O") > 0:
        return 80
    else:
        return None


def fetch_mimvp():
    """
    Fetch free proxies from http://proxy.mimvp.com/free.php
    """
    proxyes = []
    try:
        url = "http://proxy.mimvp.com/free.php?proxy=in_hp"
        soup = get_soup(url)
        table = soup.find("div", attrs={"id": "list"}).table
        tds = table.tbody.find_all("td")
        for i in range(0, len(tds), 10):
            ip = tds[i + 1].text
            port = img2port(tds[i + 2].img["src"])
            response_time = tds[i + 7]["title"][:-1]
            transport_time = tds[i + 8]["title"][:-1]
            if port is not None and float(response_time) < 1:
                proxy = "%s:%s" % (ip, port)
                proxyes.append(proxy)
    except Exception:
        logger.warning("fail to fetch from mimvp")
    return proxyes


def fetch_xici():
    """
    http://www.xicidaili.com/nn/
    """
    proxyes = []
    try:
        url = "http://www.xicidaili.com/nn/"
        soup = get_soup(url)
        table = soup.find("table", attrs={"id": "ip_list"})
        trs = table.find_all("tr")
        for i in range(1, len(trs)):
            tr = trs[i]
            tds = tr.find_all("td")
            ip = tds[2].text
            port = tds[3].text
            speed = tds[7].div["title"][:-1]
            latency = tds[8].div["title"][:-1]
            if float(speed) < 3 and float(latency) < 1:
                proxyes.append("%s:%s" % (ip, port))
    except Exception:
        logger.warning("fail to fetch from xici")
    return proxyes


def fetch_ip181():
    """
    http://www.ip181.com/
    """
    proxyes = []
    try:
        url = "http://www.ip181.com/"
        soup = get_soup(url)
        table = soup.find("table")
        trs = table.find_all("tr")
        for i in range(1, len(trs)):
            tds = trs[i].find_all("td")
            ip = tds[0].text
            port = tds[1].text
            latency = tds[4].text[:-2]
            if float(latency) < 1:
                proxyes.append("%s:%s" % (ip, port))
    except Exception as e:
        logger.warning("fail to fetch from ip181: %s" % e)
    return proxyes


def fetch_httpdaili():
    """
    http://www.httpdaili.com/mfdl/
    Updated fairly frequently
    """
    proxyes = []
    try:
        url = "http://www.httpdaili.com/mfdl/"
        soup = get_soup(url)
        table = soup.find("div", attrs={"kb-item-wrap11"}).table
        trs = table.find_all("tr")
        for i in range(1, len(trs)):
            try:
                tds = trs[i].find_all("td")
                ip = tds[0].text
                port = tds[1].text
                proxy_type = tds[2].text
                if proxy_type == u"匿名":  # "anonymous"
                    proxyes.append("%s:%s" % (ip, port))
            except Exception:
                pass
    except Exception as e:
        logger.warning("fail to fetch from httpdaili: %s" % e)
    return proxyes


def fetch_66ip():
    """
    http://www.66ip.cn/
    Each visit to this link returns a fresh batch of proxies; speed is not guaranteed
    """
    proxyes = []
    try:
        # Change getnum to fetch a different number of proxies per call
        url = "http://www.66ip.cn/nmtq.php?getnum=10&isp=0&anonymoustype=3&start=&ports=&export=&ipaddress=&area=1&proxytype=0&api=66ip"
        content = get_html(url)
        urls = content.split("</script>")[-1].split("<br />")
        for u in urls:
            if u.strip():
                proxyes.append(u.strip())
    except Exception as e:
        logger.warning("fail to fetch from httpdaili: %s" % e)
    return proxyes


def fetch_nianshao():
    """
    http://www.nianshao.me/
    """
    proxyes = []
    try:
        url = 'http://www.nianshao.me/'
        soup = get_soup(url)
        trs = soup.find("div", attrs={"mainPanel"}).table.tbody.find_all('tr')
        for i in range(1, len(trs)):
            try:
                tds = trs[i].find_all("td")
                ip = tds[0].text
                port = tds[1].text
                proxy_type = tds[3].text
                if proxy_type == u'高匿':  # "elite" (high-anonymity)
                    proxyes.append("%s:%s" % (ip, port))
            except Exception:
                pass
    except Exception as e:
        logger.warning("fail to fetch from httpdaili: %s" % e)
    return proxyes


def check(proxy):
    """
    Verify a proxy by fetching a small static resource through it.
    """
    import urllib2
    url = "http://www.baidu.com/js/bdsug.js?v=1.0.3.0"
    proxy_handler = urllib2.ProxyHandler({'http': "http://" + proxy})
    opener = urllib2.build_opener(proxy_handler, urllib2.HTTPHandler)
    try:
        response = opener.open(url, timeout=3)
        return response.code == 200
    except Exception:
        return False


def fetch_all(endpage=2):
    proxyes = []
    # for i in range(1, endpage):
    #     proxyes += fetch_kxdaili(i)
    # proxyes += fetch_mimvp()
    # proxyes += fetch_xici()
    # proxyes += fetch_ip181()
    proxyes += fetch_httpdaili()
    proxyes += fetch_66ip()
    proxyes += fetch_nianshao()
    valid_proxyes = []
    logger.info("checking proxyes validation")
    for p in proxyes:
        if check(p):
            valid_proxyes.append(p)
    return valid_proxyes


if __name__ == '__main__':
    import sys

    root_logger = logging.getLogger("")
    stream_handler = logging.StreamHandler(sys.stdout)
    formatter = logging.Formatter('%(name)-8s %(asctime)s %(levelname)-8s %(message)s', '%a, %d %b %Y %H:%M:%S', )
    stream_handler.setFormatter(formatter)
    root_logger.addHandler(stream_handler)
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)
    proxyes = fetch_all()
    # print check("202.29.238.242:3128")
    for p in proxyes:
        print p

The proxy-fetching module needs ongoing maintenance: sites change their markup or shut down from time to time, so you have to hunt for new sources and write new fetchers.
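
New fetchers all follow the same shape as the ones above: download the page with get_soup() or get_html(), pull out the ip and port columns, optionally filter on latency or anonymity, return a list of "ip:port" strings, and register the function in fetch_all(). A template to start from, where fetch_example, the URL, and the table selectors are placeholders to be replaced with a real site's markup:

def fetch_example():
    """
    Template for a new proxy source; the URL and selectors are placeholders.
    """
    proxyes = []
    try:
        url = "http://example-proxy-site.com/free"  # replace with a real source
        soup = get_soup(url)
        trs = soup.find("table").find_all("tr")
        for tr in trs[1:]:  # skip the header row
            tds = tr.find_all("td")
            ip = tds[0].text.strip()
            port = tds[1].text.strip()
            proxyes.append("%s:%s" % (ip, port))
    except Exception as e:
        logger.warning("fail to fetch from example: %s" % e)
    return proxyes

# and register it inside fetch_all():
#     proxyes += fetch_example()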

最后編輯于
?著作權(quán)歸作者所有,轉(zhuǎn)載或內(nèi)容合作請聯(lián)系作者
【社區(qū)內(nèi)容提示】社區(qū)部分內(nèi)容疑似由AI輔助生成,瀏覽時請結(jié)合常識與多方信息審慎甄別。
平臺聲明:文章內(nèi)容(如有圖片或視頻亦包括在內(nèi))由作者上傳并發(fā)布,文章內(nèi)容僅代表作者本人觀點,簡書系信息發(fā)布平臺,僅提供信息存儲服務(wù)。

相關(guān)閱讀更多精彩內(nèi)容

友情鏈接更多精彩內(nèi)容