# -*- coding: utf-8 -*-
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import classification_report
from sklearn.naive_bayes import BernoulliNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cross_validation import train_test_split
import matplotlib.pyplot as plt
import pandas as pd
#### Data preparation (shared by every classifier section below) ####
# NOTE(review): train_test_split is imported at the top of this file from
# sklearn.cross_validation, a module that was removed in scikit-learn 0.20;
# newer releases provide it in sklearn.model_selection.
inputfile = 'd:/data/sales_data.xls'
data = pd.read_excel(inputfile, index_col=u'序號')  # load the spreadsheet
# The cells hold categorical labels, so they are encoded numerically:
# "好" / "是" / "高" become 1, and every remaining value becomes -1.
data[data == u'好'] = 1
data[data == u'是'] = 1
data[data == u'高'] = 1
data[data != 1] = -1
# The first three columns are used as features and the fourth as the target.
# `.values` is used instead of DataFrame.as_matrix(), which was removed in
# pandas 1.0; `.values` is available in old and new pandas alike.
x = data.iloc[:, :3].values.astype(int)
y = data.iloc[:, 3].values.astype(int)
# Split into training and test data, holding out 20% of the rows for testing.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
#### KNN (k-nearest neighbours) ####
# Train the KNN classifier, using a KD-tree for the neighbour search.
clf = KNeighborsClassifier(algorithm='kd_tree')
clf.fit(x_train, y_train)
# Test results: print the test features, the predictions, the true labels
# and the fraction of predictions that match the true labels.
answer = clf.predict(x_test)
print(x_test)
print(answer)
print(y_test)
print(np.mean(answer == y_test))
# Precision / recall computed on the training split.
# NOTE(review): the three returned arrays are not used anywhere in the
# visible code, and hard class predictions (not scores) are passed in --
# confirm whether this call is still needed.
precision, recall, thresholds = precision_recall_curve(y_train, clf.predict(x_train))
# classification_report pairs target_names with the labels in sorted order
# (-1 first, then 1). The labels were encoded as -1 for "低" and 1 for "高",
# so "低" has to be listed first; the previous ['高', '低'] swapped the rows.
print(classification_report(y_test, answer, target_names=['低', '高']))
#### Bernoulli naive Bayes ####
# Build the Bernoulli naive Bayes classifier and fit it on the training
# split in one step (fit returns the estimator itself).
clf = BernoulliNB().fit(x_train, y_train)
# Test results: test features, predictions, true labels, then the share of
# predictions that agree with the true labels.
answer = clf.predict(x_test)
print(x_test)
print(answer)
print(y_test)
print((answer == y_test).mean())
# Per-class report; names follow the sorted labels (-1 -> "低", 1 -> "高").
print(classification_report(y_test, answer, target_names=['低', '高']))
#### Decision tree ####
from sklearn.tree import DecisionTreeClassifier as DTC
dtc = DTC(criterion='entropy')  # decision tree model based on information entropy
dtc.fit(x_train, y_train)  # train the model
# Import the helper used to visualise the decision tree.
# The export is a .dot file; Graphviz must be installed to convert it to
# pdf, png or similar formats.
from sklearn.tree import export_graphviz
try:
    from sklearn.externals.six import StringIO
except ImportError:
    # sklearn.externals.six was removed in scikit-learn 0.23; fall back to
    # the standard library so the name stays defined for any later use.
    from io import StringIO
with open("tree.dot", 'w') as f:
    # The call must be indented under `with`. Its return value is no longer
    # assigned to `f`, which previously rebound the open file handle.
    export_graphviz(dtc, out_file=f)
# Test results: test features, predictions, true labels, then the share of
# predictions that agree with the true labels.
answer = dtc.predict(x_test)
print(x_test)
print(answer)
print(y_test)
print(np.mean(answer == y_test))
# Per-class report; names follow the sorted labels (-1 -> "低", 1 -> "高").
print(classification_report(y_test, answer, target_names=['低', '高']))
#### SVM ####
from sklearn.svm import SVC
# Support vector classifier with default settings, fitted on the training
# split in one step (fit returns the estimator itself).
clf = SVC().fit(x_train, y_train)
# Test results: test features, predictions, true labels, then the share of
# predictions that agree with the true labels.
answer = clf.predict(x_test)
print(x_test)
print(answer)
print(y_test)
print((answer == y_test).mean())
# Per-class report; names follow the sorted labels (-1 -> "低", 1 -> "高").
print(classification_report(y_test, answer, target_names=['低', '高']))