機器學習學習筆記--樸素貝葉斯檢測DGA域名

DGA(域名生成算法)是一種利用隨機字符來生成C&C域名,從而逃避域名黑名單檢測的技術手段。例如,xeogrhxquuubt.com 是一個由Cryptolocker創建的DGA生成域名,如果我們的進程嘗試與其建立連接,那麼我們的機器就可能感染Cryptolocker勒索病毒。


# -*- coding:utf-8 -*-

import sys

import urllib

import urlparse

import re

from hmmlearn import hmm

import numpy as np

from sklearn.externals import joblib

import HTMLParser

import nltk

import csv

import matplotlib.pyplot as plt

import os

from sklearn.feature_extraction.text import CountVectorizer

from sklearn import cross_validation

import os

from sklearn.naive_bayes import GaussianNB

# Minimum length a domain must have to be kept by the loaders

MIN_LEN=10

# Number of hidden states (passed to the HMM as n_components)

N=8

# Maximum-likelihood score threshold
# NOTE(review): not referenced by any code visible in this file -- confirm usage

T=-50

# File name of the persisted model

FILE_MODEL="9-2.m"

def load_alexa(filename):
    """Load domain names from a CSV file whose second column is the domain.

    Only domains of at least ``MIN_LEN`` characters are kept.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list of domain strings, in file order.
    """
    domain_list = []
    # The context manager guarantees the handle is closed, even if a row
    # fails to parse part-way through the file.
    with open(filename) as csv_file:
        for row in csv.reader(csv_file):
            # Blank or malformed rows have no second column; skip them
            # instead of raising IndexError.
            if len(row) < 2:
                continue
            domain = row[1]
            if len(domain) >= MIN_LEN:
                domain_list.append(domain)
    return domain_list

def domain2ver(domain):
    """Convert a domain string into a list of one-element code-point rows.

    Args:
        domain: The domain name to convert.

    Returns:
        A list with one entry per character, each entry being ``[ord(char)]``.
        An empty string yields an empty list.
    """
    return [[ord(char)] for char in domain]

def train_hmm(domain_list):
    """Fit a Gaussian HMM on the given domains and persist it to FILE_MODEL.

    Every domain is converted with ``domain2ver`` and treated as one
    observation sequence; all sequences are stacked into a single array and
    passed to ``fit`` together with their individual lengths.

    Args:
        domain_list: Iterable of domain strings used as training sequences.

    Returns:
        The fitted ``hmm.GaussianHMM`` instance (also dumped to FILE_MODEL).
    """
    # NOTE(review): the one-sample [[0]] sequence is a placeholder seed kept
    # from the original implementation so the fitted model stays unchanged;
    # it is not real training data -- confirm whether it can be dropped.
    sequences = [np.array([[0]])]
    lengths = [1]
    for domain in domain_list:
        # A zero-length sequence cannot be stacked with the (n, 1) arrays.
        if not domain:
            continue
        observations = np.array(domain2ver(domain))
        sequences.append(observations)
        lengths.append(len(observations))

    # Stack once instead of re-concatenating (and re-copying) on every loop.
    X = np.concatenate(sequences)

    remodel = hmm.GaussianHMM(n_components=N, covariance_type="full", n_iter=100)
    remodel.fit(X, lengths)
    joblib.dump(remodel, FILE_MODEL)
    return remodel

def load_dga(filename):
    """Load DGA domain names from a comma-separated text feed.

    Each line is expected to start with the domain, followed by optional
    comma-separated metadata, e.g.::

        xsxqeadsbgvpdke.co.uk,Domain used by Cryptolocker - Flashback DGA for 13 Apr 2017,2017-04-13,

    Sample feed: http://osint.bambenekconsulting.com/manual/cl.txt

    Only domains of at least ``MIN_LEN`` characters are kept.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list of domain strings, in file order.
    """
    domain_list = []
    with open(filename) as f:
        for line in f:
            # Strip so a line holding only a domain does not carry its
            # trailing newline into the length check or the result.
            domain = line.split(",")[0].strip()
            if len(domain) >= MIN_LEN:
                domain_list.append(domain)
    return domain_list

def test_dga(remodel, filename):
    """Score every domain loaded by ``load_dga`` with a trained model.

    Args:
        remodel: Model object exposing ``score(observations)``.
        filename: Path of the DGA feed passed to ``load_dga``.

    Returns:
        A tuple ``(x, y)`` of parallel lists: ``x`` holds each domain's
        length and ``y`` the value returned by ``remodel.score`` for it.
    """
    x = []
    y = []
    for domain in load_dga(filename):
        observations = np.array(domain2ver(domain))
        score = remodel.score(observations)
        x.append(len(domain))
        y.append(score)
    return x, y

def test_alexa(remodel, filename):
    """Score every domain loaded by ``load_alexa`` with a trained model.

    Args:
        remodel: Model object exposing ``score(observations)``.
        filename: Path of the CSV file passed to ``load_alexa``.

    Returns:
        A tuple ``(x, y)`` of parallel lists: ``x`` holds each domain's
        length and ``y`` the value returned by ``remodel.score`` for it.
    """
    x = []
    y = []
    for domain in load_alexa(filename):
        observations = np.array(domain2ver(domain))
        score = remodel.score(observations)
        x.append(len(domain))
        y.append(score)
    return x, y

def show_hmm():
    """Scatter-plot HMM scores against domain length for three data sets.

    Loads the model from FILE_MODEL when that file exists; otherwise trains
    one on ``../data/top-1000.csv`` (``train_hmm`` also persists it).  The
    model then scores two DGA feeds and an Alexa test list, and the results
    are shown in a single matplotlib figure.
    """
    if os.path.exists(FILE_MODEL):
        # NOTE(review): joblib.load unpickles the file -- only load model
        # files from a trusted location.
        remodel = joblib.load(FILE_MODEL)
    else:
        # The training data is only needed when no cached model exists, and
        # train_hmm already returns the fitted model, so no reload is needed.
        remodel = train_hmm(load_alexa("../data/top-1000.csv"))

    # (scorer, data file, colour, legend label, marker), in plotting order.
    series = [
        (test_dga, "../data/dga-post-tovar-goz-1000.txt", 'b', "dga_post-tovar-goz", 'o'),
        (test_dga, "../data/dga-cryptolocke-1000.txt", 'g', "dga_cryptolock", 'v'),
        (test_alexa, "../data/test-top-1000.csv", 'r', "alexa", '*'),
    ]
    # Score everything first so a missing data file fails before a figure
    # is created.
    points = [scorer(remodel, path) for scorer, path, _, _, _ in series]

    fig, ax = plt.subplots()
    ax.set_xlabel('Domain Length')
    ax.set_ylabel('HMM Score')
    for (xs, ys), (_, _, color, label, marker) in zip(points, series):
        ax.scatter(xs, ys, color=color, label=label, marker=marker)
    ax.legend(loc='best')
    plt.show()

def get_aeiou(domain_list):
    """Compute the vowel ratio of each domain.

    The ratio is the number of characters in ``aeiou`` (case-insensitive)
    divided by the domain's length.

    Args:
        domain_list: Iterable of domain strings.

    Returns:
        A tuple ``(x, y)`` of parallel lists: ``x`` holds each domain's
        length and ``y`` its vowel ratio as a float.  An empty domain gets
        a ratio of ``0.0`` instead of raising ZeroDivisionError.
    """
    x = []
    y = []
    for domain in domain_list:
        length = len(domain)
        vowels = sum(1 for char in domain.lower() if char in "aeiou")
        x.append(length)
        # float() keeps true division under Python 2 as well.
        y.append(float(vowels) / length if length else 0.0)
    return x, y

def show_aeiou():
    """Scatter-plot the vowel ratio against domain length for three data sets.

    Loads the Alexa list and two DGA feeds, computes ``get_aeiou`` for each,
    and shows them in a single matplotlib figure.
    """
    alexa_points = get_aeiou(load_alexa("../data/top-1000.csv"))
    cryptolocker_points = get_aeiou(load_dga("../data/dga-cryptolocke-1000.txt"))
    goz_points = get_aeiou(load_dga("../data/dga-post-tovar-goz-1000.txt"))

    # (points, colour, legend label, marker), in plotting order.
    series = [
        (goz_points, 'b', "dga_post-tovar-goz", 'o'),
        (cryptolocker_points, 'g', "dga_cryptolock", 'v'),
        (alexa_points, 'r', "alexa", '*'),
    ]

    fig, ax = plt.subplots()
    ax.set_xlabel('Domain Length')
    ax.set_ylabel('AEIOU Score')
    for (xs, ys), color, label, marker in series:
        ax.scatter(xs, ys, color=color, label=label, marker=marker)
    ax.legend(loc='best')
    plt.show()

def get_uniq_char_num(domain_list):
    """Compute the distinct-character ratio of each domain.

    The ratio is the number of distinct characters in the domain divided by
    the domain's length.

    Args:
        domain_list: Iterable of domain strings.

    Returns:
        A tuple ``(x, y)`` of parallel lists: ``x`` holds each domain's
        length and ``y`` its distinct-character ratio as a float.  An empty
        domain gets a ratio of ``0.0`` instead of raising ZeroDivisionError.
    """
    x = []
    y = []
    for domain in domain_list:
        length = len(domain)
        distinct = len(set(domain))
        x.append(length)
        # float() keeps true division under Python 2 as well.
        y.append(float(distinct) / length if length else 0.0)
    return x, y

def show_uniq_char_num():
    """Scatter-plot the distinct-character ratio against domain length.

    Loads the Alexa list and two DGA feeds, computes ``get_uniq_char_num``
    for each, and shows them in a single matplotlib figure.
    """
    alexa_points = get_uniq_char_num(load_alexa("../data/top-1000.csv"))
    cryptolocker_points = get_uniq_char_num(
        load_dga("../data/dga-cryptolocke-1000.txt"))
    goz_points = get_uniq_char_num(
        load_dga("../data/dga-post-tovar-goz-1000.txt"))

    # (points, colour, legend label, marker), in plotting order.
    series = [
        (goz_points, 'b', "dga_post-tovar-goz", 'o'),
        (cryptolocker_points, 'g', "dga_cryptolock", 'v'),
        (alexa_points, 'r', "alexa", '*'),
    ]

    fig, ax = plt.subplots()
    ax.set_xlabel('Domain Length')
    ax.set_ylabel('UNIQ CHAR NUMBER')
    for (xs, ys), color, label, marker in series:
        ax.scatter(xs, ys, color=color, label=label, marker=marker)
    ax.legend(loc='best')
    plt.show()

def _padded_bigrams(text):
    """Return the set of character bigrams of *text* padded with one space per side."""
    padded = ' ' + text + ' '
    return set(padded[i:i + 2] for i in range(len(padded) - 1))


def count2string_jarccard_index(a, b):
    """Compute the Jaccard index of two strings over their character bigrams.

    Each string is padded with one leading and one trailing space so the
    first and last characters also form bigrams.  The index is the size of
    the intersection of the two bigram sets divided by the size of their
    union.

    Args:
        a: First string.
        b: Second string.

    Returns:
        A float in ``[0.0, 1.0]``; ``1.0`` means identical bigram sets.
        Empty strings are accepted: padding alone yields one bigram, so the
        union is never empty.
    """
    x = _padded_bigrams(a)
    y = _padded_bigrams(b)
    # Jaccard index is |x & y| / |x | y|.  The set difference (x - y) does
    # not measure overlap and is not a Jaccard index.
    return float(len(x & y)) / len(x | y)

def get_jarccard_index(a_list, b_list):
    """Average each string's ``count2string_jarccard_index`` against a reference list.

    Args:
        a_list: Strings to evaluate.
        b_list: Reference strings every entry of ``a_list`` is compared with.

    Returns:
        A tuple ``(x, y)`` of parallel lists: ``x`` holds the length of each
        string in ``a_list`` and ``y`` the mean of its pairwise index over
        ``b_list``.  With an empty ``b_list`` the mean is reported as
        ``0.0`` instead of raising ZeroDivisionError.
    """
    x = []
    y = []
    reference_count = len(b_list)
    for a in a_list:
        # Start from 0.0 so the total is a float even under Python 2.
        total = sum((count2string_jarccard_index(a, b) for b in b_list), 0.0)
        x.append(len(a))
        y.append(total / reference_count if reference_count else 0.0)
    return x, y

def show_jarccard_index():
    """Scatter-plot the mean pairwise index against the Alexa list by domain length.

    The Alexa list is compared with itself and each of the two DGA feeds is
    compared with the Alexa list via ``get_jarccard_index``; all three
    results are shown in a single matplotlib figure.
    """
    alexa_domains = load_alexa("../data/top-1000.csv")
    alexa_points = get_jarccard_index(alexa_domains, alexa_domains)
    cryptolocker_points = get_jarccard_index(
        load_dga("../data/dga-cryptolocke-1000.txt"), alexa_domains)
    goz_points = get_jarccard_index(
        load_dga("../data/dga-post-tovar-goz-1000.txt"), alexa_domains)

    # (points, colour, legend label, marker), in plotting order.
    series = [
        (goz_points, 'b', "dga_post-tovar-goz", 'o'),
        (cryptolocker_points, 'g', "dga_cryptolock", 'v'),
        (alexa_points, 'r', "alexa", '*'),
    ]

    fig, ax = plt.subplots()
    ax.set_xlabel('Domain Length')
    ax.set_ylabel('JARCCARD INDEX')
    for (xs, ys), color, label, marker in series:
        ax.scatter(xs, ys, color=color, label=label, marker=marker)
    ax.legend(loc='lower right')
    plt.show()

def nb_dga():
    """Cross-validate a Gaussian naive Bayes classifier on three domain sets.

    Domains from the Alexa list (label 0), the Cryptolocker feed (label 1)
    and the post-Tovar GOZ feed (label 2) are vectorised with a
    ``CountVectorizer`` using single-character tokens and 2-grams, then
    scored with 3-fold cross-validation.  The fold scores are printed.
    """
    x1_domain_list = load_alexa("../data/top-1000.csv")
    x2_domain_list = load_dga("../data/dga-cryptolocke-1000.txt")
    x3_domain_list = load_dga("../data/dga-post-tovar-goz-1000.txt")

    x_domain_list = np.concatenate(
        (x1_domain_list, x2_domain_list, x3_domain_list))
    y = np.concatenate((
        [0] * len(x1_domain_list),
        [1] * len(x2_domain_list),
        [2] * len(x3_domain_list),
    ))

    # token_pattern r"\w" makes every word character its own token, so the
    # (2, 2) n-gram range counts pairs of adjacent characters.
    cv = CountVectorizer(ngram_range=(2, 2), decode_error="ignore",
                         token_pattern=r"\w", min_df=1)
    x = cv.fit_transform(x_domain_list).toarray()

    clf = GaussianNB()
    # Single-argument print(...) is valid under both Python 2 and Python 3.
    print(cross_validation.cross_val_score(clf, x, y, n_jobs=-1, cv=3))

# Run the naive Bayes experiment only when executed as a script.
if __name__ == '__main__':
    nb_dga()


相關教程請參考以下連結:

http://www.freebuf.com/articles/network/139697.html

https://arxiv.org/abs/1611.00791

©著作權歸作者所有,轉載或內容合作請聯繫作者
【社區內容提示】社區部分內容疑似由AI輔助生成,瀏覽時請結合常識與多方信息審慎甄別。
平臺聲明:文章內容(如有圖片或視頻亦包括在內)由作者上傳並發布,文章內容僅代表作者本人觀點,簡書係信息發布平臺,僅提供信息存儲服務。

相關(guān)閱讀更多精彩內(nèi)容

友情鏈接更多精彩內(nèi)容