不解釋什麼是KNN,不說Python語法,不理解的地方請自行搜索,謝謝!
整理雜物時,發現多年前一本機器學習入門書籍,閒來無事翻閱一下,看能否溫故知新。因日常工作大多都是和pandas打交道,突發奇想,基於pandas將書內算法,按自己的理解,重新實現。
數據鏈接: https://pan.baidu.com/s/1PTeekZFbpzEefquyq7-1cw 提取碼: di6p
環境:Python 3.7
構建函數,完成數據讀取、數據處理、數據分類。
讀取數據:
def readData(filename):
    """Load a tab-separated sample file and split features from labels.

    Parameters
    ----------
    filename : str or path-like
        Path to a header-less, tab-delimited text file. The last column is
        treated as the class label.

    Returns
    -------
    tuple
        ``(features, labels)``: a DataFrame holding every column except the
        last one, and a Series holding the last column.
    """
    with open(filename) as f:
        data = pd.read_csv(f, sep='\t', header=None)
    # Separate the feature columns from the label column.
    labels = data.iloc[:, -1]
    return data.iloc[:, :-1], labels
拆分訓練數據和測試數據集
def dataSplit(data, lables, hoRatio=0.2):
    """Split samples into a leading training part and a trailing hold-out part.

    The split is purely positional (no shuffling): the last
    ``int(len(data) * hoRatio)`` rows form the test set and all rows before
    them form the training set.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature rows.
    lables : pandas.Series
        Class labels, one per row of ``data``. The parameter keeps its
        original spelling so existing keyword callers continue to work.
    hoRatio : float, default 0.2
        Fraction of rows to hold out for testing; must lie in ``[0, 1]``.

    Returns
    -------
    tuple
        ``(data_train, label_train, data_test, labels_test)``.

    Raises
    ------
    ValueError
        If ``hoRatio`` is outside ``[0, 1]``.
    """
    if not 0 <= hoRatio <= 1:
        raise ValueError("hoRatio must be between 0 and 1, got %r" % (hoRatio,))
    num_test = int(len(data) * hoRatio)
    # Slice from an explicit start position: ``iloc[-0:]`` would select every
    # row (not zero rows) when num_test is 0.
    split = len(data) - num_test
    data_train = data.iloc[:split]
    data_test = data.iloc[split:]
    # Use the ``lables`` argument itself rather than a same-named global.
    label_train = lables.iloc[:split]
    labels_test = lables.iloc[split:]
    return data_train, label_train, data_test, labels_test
數據歸一化
def aotuNorm(data):
    """Min-max scale every column of ``data`` into the range ``[0, 1]``.

    Parameters
    ----------
    data : pandas.DataFrame
        Numeric feature columns. The argument is not modified.

    Returns
    -------
    tuple
        ``(normalized, minVals, ranges)`` where ``normalized`` is
        ``(data - minVals) / ranges``, ``minVals`` is the per-column minimum
        and ``ranges`` is the per-column ``max - min``. A range of zero
        (constant column) is reported as 1 so that the column scales to 0
        instead of producing NaN/inf here and in later re-use of ``ranges``.
    """
    minVals = data.min()
    ranges = data.max() - minVals
    # Guard against division by zero for constant columns.
    if isinstance(ranges, pd.Series):
        ranges = ranges.mask(ranges == 0, 1)
    elif ranges == 0:
        ranges = 1
    normalized = (data - minVals) / ranges
    return normalized, minVals, ranges
knn分類器
def classify(inX, data, labels, k=5, exponent=2):
    """Predict the class of one sample by majority vote of its k nearest rows.

    The distance is the Minkowski distance of order ``exponent``
    (``exponent=2`` is Euclidean, ``exponent=1`` is Manhattan).

    Parameters
    ----------
    inX : sequence or pandas.Series
        Feature values of the sample to classify, in column order.
    data : pandas.DataFrame
        Training feature rows; only the first ``len(inX)`` columns are used.
    labels : pandas.Series or sequence
        Class label of each row of ``data``, in the same row order.
    k : int, default 5
        Number of nearest neighbours that vote.
    exponent : int or float, default 2
        Order of the Minkowski distance.

    Returns
    -------
    object
        The most frequent label among the ``k`` nearest training rows.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 or ``data`` has no rows.
    """
    if k < 1:
        raise ValueError("k must be at least 1, got %r" % (k,))
    if len(data) == 0:
        raise ValueError("data must contain at least one row")

    # Work positionally: take the sample's values in order and drop the
    # training index, so rows pair up by position no matter how ``data`` and
    # ``labels`` are indexed (e.g. after slicing or shuffling).
    point = list(inX)
    features = pd.DataFrame(data).iloc[:, :len(point)].reset_index(drop=True)

    # Absolute differences keep the distance valid for odd or fractional
    # exponents, where signed differences would cancel out or yield NaN.
    diffs = features.sub(point, axis='columns').abs()
    distance = diffs.pow(exponent).sum(axis=1).pow(1 / exponent)

    neighbours = pd.DataFrame({
        'distance': distance.values,
        'class': pd.Series(labels).values,
    })
    # A stable sort keeps equally distant rows in their original order.
    neighbours = neighbours.sort_values('distance', kind='mergesort')
    votes = neighbours['class'].iloc[:k].value_counts()
    return votes.index.tolist()[0]
訓練及預測
def datingClassTest(data_train=None,
                    label_train=None,
                    data_test=None,
                    labels_test=None,
                    minVals=None,
                    ranges=None,
                    k=5,
                    exponent=2):
    """Classify every test row with ``classify`` and attach the outcome.

    Each test row is scaled with ``(row - minVals) / ranges`` and then
    classified against the training data.

    Parameters
    ----------
    data_train, label_train :
        Training features and their labels, passed on to ``classify``.
    data_test, labels_test :
        Test features (unscaled) and their true labels.
    minVals, ranges :
        Offsets and divisors used to scale the test rows.
    k, exponent :
        Forwarded to ``classify``.

    Any of the first six arguments left as ``None`` is looked up by the same
    name among the module-level variables at call time. Binding those
    variables as default values directly would be evaluated when the function
    is defined, i.e. before they exist, and fail with a NameError.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data_test`` with two extra columns: ``'labels'`` (the
        true labels) and ``'test'`` (the predicted labels). The caller's
        ``data_test`` is left unmodified.

    Raises
    ------
    NameError
        If an argument is omitted and no module-level variable of that name
        exists.
    """
    def _resolve(name, value):
        # Fall back to the module-level variable of the same name.
        if value is not None:
            return value
        try:
            return globals()[name]
        except KeyError:
            raise NameError(
                "datingClassTest: argument %r was not given and no "
                "module-level variable of that name exists" % name
            ) from None

    data_train = _resolve('data_train', data_train)
    label_train = _resolve('label_train', label_train)
    data_test = _resolve('data_test', data_test)
    labels_test = _resolve('labels_test', labels_test)
    minVals = _resolve('minVals', minVals)
    ranges = _resolve('ranges', ranges)

    # Scale all test rows at once, then classify them one by one.
    scaled_test = (data_test - minVals) / ranges
    forecast = [
        classify(row, data_train, label_train, k=k, exponent=exponent)
        for _, row in scaled_test.iterrows()
    ]

    # Work on a copy so a slice of the caller's frame is never written to.
    result = data_test.copy()
    result['labels'] = labels_test
    result['test'] = forecast
    return result
約會網站數據分類預測
import pandas as pd
# Load the dating data set and separate features from labels.
data, lables = readData("datingTestSet.txt")
# Hold out the trailing rows for testing.
data_train, label_train, data_test, labels_test = dataSplit(data, lables)
# Scale the training features and keep the scaling parameters for the test rows.
data_train, minVals, ranges = aotuNorm(data_train)
# Classify every test row; the arguments are passed explicitly rather than
# relying on the function's defaults.
data_test = datingClassTest(data_train, label_train, data_test, labels_test,
                            minVals, ranges)
查看準確率
# Percentage of test rows whose predicted class ('test') equals the true
# class ('labels').
# NOTE(review): the message text below looks garbled (romanisation inserted in
# parentheses); it is kept unchanged here, so confirm the intended wording.
accuracy = (data_test['labels'] == data_test['test']).mean() * 100
print('測(cè)試結(jié)果準(zhǔn)確率:{0:.2f}%'.format(accuracy))