IP代理池基于mongodb数据库

代码用的python2.7,抓取xici免费代理,检测后放入数据库中,为以后爬虫做准备。下面直接上代码

```

#-*-encoding=utf-8-*-

import time
from multiprocessing import Pool

import pymongo
import requests
from lxml import etree

class Getproxy(object):
    """Crawl free HTTP proxies from xicidaili and keep the usable ones in MongoDB.

    Records live in the ``xiciipinfo`` collection of the ``xici`` database on
    a MongoDB server at ``localhost:27017``.  Each record has the string
    fields ``ip``, ``port``, ``protocol`` and ``area``; ``ip`` is used as the
    upsert key, so re-crawling a page refreshes records instead of
    duplicating them.
    """

    # Seconds to wait for a listing page before giving up.
    PAGE_TIMEOUT = 10
    # Seconds a proxy is given to relay the probe request.
    PROBE_TIMEOUT = 6
    # Page fetched through each proxy to decide whether it works.
    PROBE_URL = 'http://wenshu.court.gov.cn/'

    def __init__(self):
        """Set up the request headers, listing URL and MongoDB handles."""
        self.headers = {
            'User-Agent': (
                'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
                'AppleWebKit/537.36 (KHTML, like Gecko) '
                'Chrome/59.0.3071.115 Safari/537.36'
            ),
        }
        self.url = 'http://www.xicidaili.com/wt/'
        self.client = pymongo.MongoClient('localhost', 27017)
        self.xici = self.client['xici']
        self.xiciipinfo = self.xici['xiciipinfo']

    def getip(self, num):
        """Scrape listing page ``num`` and upsert every proxy found on it.

        Args:
            num: Page number appended to the listing URL.

        Raises:
            requests.RequestException: If the page cannot be fetched within
                ``PAGE_TIMEOUT`` seconds.
        """
        url = self.url + str(num)
        wb_data = requests.get(url, headers=self.headers,
                               timeout=self.PAGE_TIMEOUT)
        html = etree.HTML(wb_data.text)

        # Read the cells row by row.  Zipping four independently extracted
        # column lists would shift every following record as soon as one row
        # lacks a cell (for example an area column without a link).
        # NOTE(review): only rows with class="odd" are selected, as in the
        # original query -- confirm this covers every row of the table.
        cell_paths = ('td[2]/text()', 'td[3]/text()',
                      'td[6]/text()', 'td[4]/a/text()')
        for row in html.xpath('//tr[@class="odd"]'):
            cells = [row.xpath(path) for path in cell_paths]
            if not all(cells):
                # Incomplete row: skip it instead of storing a partial record.
                continue
            ip, port, protocol, area = (values[0] for values in cells)
            data = {
                'ip': ip,
                'port': port,
                'protocol': protocol,
                'area': area,
            }
            print(data)
            # Upsert keyed on the address: insert new proxies, refresh known ones.
            self.xiciipinfo.update_one({'ip': ip}, {'$set': data}, upsert=True)

    def count(self, num):
        """Scrape listing pages ``1`` to ``num - 1``, pausing 2 s after each.

        Args:
            num: Exclusive upper bound of the page range, so ``count(2)``
                fetches page 1 only.
        """
        for page in range(1, num):
            self.getip(page)
            # Pause between page requests.
            time.sleep(2)

    def dbclose(self):
        """Close the MongoDB client connection."""
        self.client.close()

    def getiplist(self):
        """Return every stored proxy as a ``requests``-style proxies dict.

        Returns:
            A list of ``{'http': 'http://<ip>:<port>'}`` dicts, one per
            record in the collection (empty when the collection is empty).
        """
        return [
            {'http': 'http' + '://' + record['ip'] + ':' + record['port']}
            for record in self.xiciipinfo.find()
        ]

    def iptest(self, proxy):
        """Probe ``proxy`` and delete its record when the probe fails.

        Args:
            proxy: A ``{'http': 'http://<ip>:<port>'}`` dict as produced by
                ``getiplist``.
        """
        # Drop the 7-character 'http://' prefix, then the ':<port>' suffix.
        ip = proxy['http'][7:].split(':')[0]
        try:
            requests.get(self.PROBE_URL, proxies=proxy,
                         timeout=self.PROBE_TIMEOUT)
        except requests.RequestException:
            # Only request failures count as a dead proxy; a bare ``except``
            # would also delete the record on Ctrl-C or a programming error.
            print('field...............>>>>>>>>>>>>>>>>>>>>>>>>')
            self.xiciipinfo.delete_many({'ip': ip})
            print('remove it now.....{}'.format(ip))
        else:
            print('<<<<<<<<<<<<<<<<<.............success')
            print(proxy)

if __name__ == '__main__':
    # Crawl the first listing page, then probe every stored proxy and drop
    # the dead ones.
    proxy = Getproxy()
    try:
        proxy.count(2)
        # Probe sequentially with an explicit loop: a bare map() is lazy on
        # Python 3 and would never run the checks.  No worker pool is used
        # because Getproxy holds a MongoClient, which should not be shared
        # with forked worker processes.
        for candidate in proxy.getiplist():
            proxy.iptest(candidate)
    finally:
        # Release the MongoDB connection even if crawling or probing fails.
        proxy.dbclose()

```

最后编辑于
©著作权归作者所有,转载或内容合作请联系作者
【社区内容提示】社区部分内容疑似由AI辅助生成,浏览时请结合常识与多方信息审慎甄别。
平台声明:文章内容(如有图片或视频亦包括在内)由作者上传并发布,文章内容仅代表作者本人观点,简书系信息发布平台,仅提供信息存储服务。

相关阅读更多精彩内容

友情链接更多精彩内容