代碼用的是 Python 2.7,抓取 xici 免費代理,檢測後放入數據庫中,為以後的爬蟲做準備。下面直接上代碼。
```
#-*-encoding=utf-8-*-
import time
from multiprocessing import Pool

import pymongo
import requests
from lxml import etree
class Getproxy(object):
    """Scrape proxies from xicidaili, store them in MongoDB and prune dead ones.

    Workflow: ``count`` fetches listing pages through ``getip`` (which upserts
    every scraped row), ``getiplist`` turns the stored rows into
    ``requests``-style proxy mappings, and ``iptest`` probes one mapping and
    deletes its row when the probe fails.

    Attributes:
        headers: HTTP headers sent with listing-page requests.
        url: Base URL of the listing; the page number is appended to it.
        client: ``pymongo.MongoClient`` connected to localhost:27017.
        xici: The ``xici`` database.
        xiciipinfo: The ``xiciipinfo`` collection holding one document per IP.
    """

    # Seconds to wait for a listing page before giving up.
    page_timeout = 10
    # URL fetched through each proxy to decide whether it works.
    check_url = 'http://wenshu.court.gov.cn/'
    # Seconds a proxy gets to complete the probe request.
    check_timeout = 6

    def __init__(self):
        """Set up the request headers, listing URL and MongoDB handles."""
        self.headers = {
            'User-Agent': (
                'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
                'AppleWebKit/537.36 (KHTML, like Gecko) '
                'Chrome/59.0.3071.115 Safari/537.36'
            ),
        }
        self.url = 'http://www.xicidaili.com/wt/'
        self.client = pymongo.MongoClient('localhost', 27017)
        self.xici = self.client['xici']
        self.xiciipinfo = self.xici['xiciipinfo']

    @staticmethod
    def _cell_text(row, path):
        """Return the first text node matched by *path* under *row*, or ''."""
        found = row.xpath(path)
        return found[0] if found else ''

    def getip(self, num):
        """Scrape listing page *num* and upsert each proxy row into MongoDB.

        Only ``<tr class="odd">`` rows are read. Each row is parsed on its own
        so that a row lacking one cell (for example no area link) cannot shift
        the values of the rows that follow it.

        Args:
            num: Page number appended to ``self.url``.
        """
        url = self.url + str(num)
        # A timeout keeps an unresponsive server from blocking the run forever.
        wb_data = requests.get(url, headers=self.headers,
                               timeout=self.page_timeout)
        html = etree.HTML(wb_data.text)
        for row in html.xpath('//tr[@class="odd"]'):
            ip = self._cell_text(row, 'td[2]/text()')
            port = self._cell_text(row, 'td[3]/text()')
            if not ip or not port:
                # Without both an address and a port the row is unusable.
                continue
            data = {
                'ip': ip,
                'port': port,
                'protocol': self._cell_text(row, 'td[6]/text()'),
                'area': self._cell_text(row, 'td[4]/a/text()'),
            }
            print(data)
            # Upsert keyed on the IP so re-scraping refreshes the existing
            # document rather than inserting a duplicate.
            self.xiciipinfo.update_one({'ip': ip}, {'$set': data}, upsert=True)

    def count(self, num):
        """Scrape pages 1 .. num-1, pausing two seconds after each page.

        Note that *num* is exclusive: ``count(2)`` fetches page 1 only and
        ``count(1)`` fetches nothing.
        """
        for page in range(1, num):
            self.getip(page)
            time.sleep(2)

    def dbclose(self):
        """Close the MongoDB client."""
        self.client.close()

    def getiplist(self):
        """Return every stored proxy as a ``{"http": "http://ip:port"}`` dict.

        Returns:
            A list with one mapping per document in the collection, in the
            order the collection yields them.
        """
        return [
            {"http": "http" + "://" + doc['ip'] + ":" + doc['port']}
            for doc in self.xiciipinfo.find()
        ]

    def iptest(self, proxy):
        """Probe *proxy* and delete its document from MongoDB if it fails.

        Args:
            proxy: Mapping of the form ``{"http": "http://ip:port"}`` as
                produced by ``getiplist``.
        """
        # Strip the leading "http://" and the trailing ":port".
        ip = proxy['http'][len('http://'):].split(':')[0]
        try:
            requests.get(self.check_url, proxies=proxy,
                         timeout=self.check_timeout)
        except Exception:
            # Any failure counts as a dead proxy. KeyboardInterrupt and
            # SystemExit are deliberately not caught, so interrupting the run
            # does not delete the proxy being probed.
            print('field...............>>>>>>>>>>>>>>>>>>>>>>>>')
            self.xiciipinfo.delete_many({'ip': ip})
            print('remove it now.....{}'.format(ip))
        else:
            print('<<<<<<<<<<<<<<<<<.............success')
            print(proxy)
if __name__ == '__main__':
    # Scrape the first listing page, then probe every stored proxy and drop
    # the ones that fail.
    proxy = Getproxy()
    try:
        proxy.count(2)
        # An explicit loop is used instead of map(): on Python 3 map() is
        # lazy, so the probes would never actually run.
        for candidate in proxy.getiplist():
            proxy.iptest(candidate)
    finally:
        # Release the MongoDB connection even if scraping or probing raised.
        proxy.dbclose()
```