#coding:utf-8
import os
import sys
import tld
import time
import chardet
import get_header
import random
import socket
import requests
import builtwith
import dns.resolver
import urllib2
import pymongo
import urlparse
import mongo
from BeautifulSoup import BeautifulSoup
#from Config import FileConfig
# Date stamp (YYYY-MM-DD, local time) recorded with every MongoDB upsert.
# One-arg strftime formats the current local time, same as the explicit
# time.localtime(time.time()) form.
add_time = time.strftime('%Y-%m-%d')
class Url_Check(object):
    """Probe a single URL for signs that it sits behind a CDN."""

    def __init__(self, url):
        """Remember the target URL and prepare empty result slots.

        Also loads the CDN fingerprint table into self.cdn via cdninfo().
        """
        super(Url_Check, self).__init__()
        self.url = url
        self.cnames = []   # CNAME chain, filled by get_cnames()
        self.headers = []  # lowercased raw headers, filled by get_headers()
        self.cdninfo()     # populates self.cdn (independent of the fields above)
def get_cnames(self):
    """Resolve the CNAME chain of self.url's host into self.cnames.

    On any resolution failure (NXDOMAIN, timeout, no CNAME record, ...)
    self.cnames is set to None so check() can tell "lookup failed" from
    "chain collected".  The unused `rsv = dns.resolver.Resolver()` local
    from the original has been removed.
    """
    host = urlparse.urlparse(self.url).netloc
    try:
        answer = dns.resolver.query(host, 'CNAME')
    except Exception:
        # best-effort: any DNS error simply marks the chain as unavailable
        self.cnames = None
    else:
        cname = [record.to_text() for record in answer][0]
        self.cnames.append(cname)
        self.get_cname(cname)  # follow the rest of the chain recursively
def get_cname(self, cname):
    """Follow one step of the CNAME chain, appending each alias found.

    Stops on dns.resolver.NoAnswer (end of chain).  Also guards against
    CNAME loops (A -> B -> A), which made the original recurse without
    bound; a name already recorded in self.cnames ends the walk.
    """
    try:
        answer = dns.resolver.query(cname, 'CNAME')
        next_cname = [record.to_text() for record in answer][0]
        if next_cname in self.cnames:
            return  # loop detected -- stop instead of recursing forever
        self.cnames.append(next_cname)
        self.get_cname(next_cname)
    except dns.resolver.NoAnswer:
        pass
#----------------------------------------------------------------------
def conn_url(self):
    """Open self.url with the headers supplied by get_header.get_header().

    Returns the urllib2 response object on success.  On any failure the
    error is printed and None is returned implicitly (callers must cope).
    """
    try:
        request = urllib2.Request(self.url, headers=get_header.get_header())
        return urllib2.urlopen(request)
    except Exception as e:
        print('[-] self.url:' + self.url)
        print(str(e))
def get_headers(self):
    """Store the lowercased raw response headers in self.headers.

    self.headers becomes None when the request fails.  The original only
    guarded the conn_url() call, but conn_url() swallows its own errors
    and returns None, so `resp.headers` raised an unhandled
    AttributeError on failure -- the None case is now handled explicitly.
    """
    resp = self.conn_url()
    if resp is None:
        self.headers = None
    else:
        self.headers = str(resp.headers).lower()
#----------------------------------------------------------------------
def get_ip(self):
    """Resolve the URL's host to an IP address string, or None on failure.

    The original sliced off the scheme with a hard-coded `[7:]`, which
    only works for 'http://' URLs (https:// and bare hosts were
    mis-sliced).  urlparse extracts the host for any scheme; a bare host
    with no scheme falls back to the raw string.
    """
    try:
        target = self.url.strip()
        host = urlparse.urlparse(target).netloc or target
        return socket.getaddrinfo(host, 'http')[0][4][0]
    except Exception:
        pass  # unresolvable host -> implicit None, matching original behavior
#----------------------------------------------------------------------
def get_title(self):
    """Fetch the page and return its <title> text, or None on failure.

    chardet guesses the page encoding; GB2312 pages are decoded with the
    wider GB18030 superset to avoid mojibake on characters outside the
    GB2312 range.  Uses the py3-style `except ... as e` form already
    used elsewhere in this class (conn_url, get_cnames).
    """
    try:
        html = urllib2.urlopen(self.url).read()
        encoding = str(chardet.detect(html)['encoding'])
        if encoding == 'GB2312':
            soup = BeautifulSoup(html, fromEncoding="GB18030")
        else:
            soup = BeautifulSoup(html, fromEncoding=encoding)
        return soup.title.string
    except Exception as e:
        print(str(e))
#----------------------------------------------------------------------
def get_cms_url(self):
    """Fingerprint the site's technology stack via builtwith.

    Returns builtwith's result dict on success, None (implicitly) on any
    failure -- best-effort, consistent with the other probe methods.
    The unused exception variable from `except Exception,e` is dropped.
    """
    try:
        cms_url = builtwith.parse(self.url)
    except Exception:
        pass
    else:
        return cms_url
#----------------------------------------------------------------------
def matched(self, context, *args):
    """Return the first pattern from *args found as a substring of context.

    Non-string contexts (e.g. the CNAME list) are stringified first, so
    patterns are searched inside the list's repr.  Returns False when
    nothing matches.

    Note: the original tested `isinstance(context, basestring)`, which
    only exists on Python 2; every call site in this file passes either a
    str or a list, so `str` is an equivalent, portable check.  The lambda
    indirection and dead commented-out code are removed.
    """
    if not isinstance(context, str):
        context = str(context)
    for pattern in args:
        if pattern in context:
            return pattern
    return False
def check(self):
    """Decide whether self.url sits behind a known CDN.

    Returns one of:
      {'Status': True,  'CDN': <provider>}  -- CNAME matched a fingerprint
      {'Status': True,  'CDN': 'unknown'}   -- a CDN header substring matched
      {'Status': False, 'CNAME': ..., 'Headers': ...}  -- no match
      None -- an unexpected error occurred (swallowed; callers must cope)
    """
    try:
        self.get_cnames()
        self.get_headers()
        if self.cnames:
            # self.cdn['cname'] maps CNAME substrings -> provider names;
            # iterating the dict yields its keys as patterns
            flag = self.matched(self.cnames, *self.cdn['cname'])
            if flag:
                print('[+] ' + self.url + flag)
                return {'Status': True, 'CDN': self.cdn['cname'].get(flag)}
        # reached only when no CNAME pattern matched (early return above)
        if self.headers:
            flag = self.matched(self.headers, *self.cdn['headers'])
            if flag:
                return {'Status': True, 'CDN': 'unknown'}
        return {'Status': False, 'CNAME': self.cnames, 'Headers': self.headers}
    except Exception:
        # NOTE(review): swallows every error and returns None -- kept for
        # compatibility, but callers should check for a None result
        pass
# NOTE(review): this method is truncated in the copy under review -- the
# fingerprint data was lost when the file was scraped, so the dict
# literal below is left unclosed and the file does not parse as-is.
# Judging from check(), self.cdn must be restored to the shape
#   {'headers': set([<header substrings>...]),
#    'cname': {<cname substring>: <provider name>, ...}}
# before this script can run.  TODO: recover the original table.
def cdninfo(self):
self.cdn = {
'headers': set([
#----------------------------------------------------------------------
def update_mongo(self):
    """Gather everything known about self.url and upsert it into MongoDB.

    Probe order (CMS fingerprint, title, IP, CDN check) matches the
    original; each probe is best-effort and may yield None.
    """
    cms_info = self.get_cms_url()
    page_title = self.get_title()
    ip_addr = self.get_ip()
    cdn_info = self.check()
    mongo.ls_Info.update(
        {"URL": self.url},
        {"$set": {'add_time': add_time, 'title': page_title,
                  'IP': ip_addr, 'Info': cms_info, 'CDN': cdn_info}},
        upsert=True)
    print(self.url + ' end')
# Driver: read one URL per line from test.txt and print each CDN verdict.
if __name__ == '__main__':
    with open('test.txt') as f:
        for line in f:
            # strip('\r\n') removes any trailing CR/LF in one pass; the
            # original .strip('\r').strip('\n') stripped in the wrong
            # order and left a trailing '\r' on CRLF-terminated files.
            url = line.strip('\r\n')
            print(url)
            checker = Url_Check(url)
            print(checker.check())
python 網(wǎng)站CDN
最后編輯于 :
?著作權(quán)歸作者所有,轉(zhuǎn)載或內(nèi)容合作請聯(lián)系作者
【社區(qū)內(nèi)容提示】社區(qū)部分內(nèi)容疑似由AI輔助生成,瀏覽時請結(jié)合常識與多方信息審慎甄別。
平臺聲明:文章內(nèi)容(如有圖片或視頻亦包括在內(nèi))由作者上傳并發(fā)布,文章內(nèi)容僅代表作者本人觀點,簡書系信息發(fā)布平臺,僅提供信息存儲服務(wù)。
【社區(qū)內(nèi)容提示】社區(qū)部分內(nèi)容疑似由AI輔助生成,瀏覽時請結(jié)合常識與多方信息審慎甄別。
平臺聲明:文章內(nèi)容(如有圖片或視頻亦包括在內(nèi))由作者上傳并發(fā)布,文章內(nèi)容僅代表作者本人觀點,簡書系信息發(fā)布平臺,僅提供信息存儲服務(wù)。
相關(guān)閱讀更多精彩內(nèi)容
- 作為新人,對于如何學(xué)好PYTHON也是一頭霧雨,也很想能得到別人的幫助。今天看到這篇文章,感覺學(xué)習(xí)起來有了一個方向...
- 目的:使用爬蟲抓取網(wǎng)站異步加載數(shù)據(jù) part1:什么是異步加載? 異步加載即網(wǎng)頁上沒有頁碼跳轉(zhuǎn)按鈕,鼠標(biāo)往下滾即可...