KNN，全稱K-Nearest Neighbors Algorithm，是一種非參數(shù)、監(jiān)督的分類方法。

那么引出一個問題：如何使用R語言編寫一個KNN算法呢？

首先，我們將KNN的編寫拆分為如下幾個問題，

1）observation和train數(shù)據(jù)集之間的距離如何計算？

2）得到distance matrix之后，如何判斷observation的所屬關系？

針對第一個問題，可以有許多種答案，本篇文章使用歐式距離進行距離計算，計算公式如下，
$d(x, y) = \sqrt{(x_{1}-y_{1})^2+(x_{2}-y_{2}^2)+……+(x_{n}-y_{n})^2}$
第二個問題，我的解決方案是，

給出一個向量（e.g. 從1開始，到20結束），作為K的所選值，隨后計算每一種K值的KNN分類結果準確率，并進行多次模擬，以箱線圖的結果對最終的K進行判斷，

即

1）set soft power parameter（參考WGCNA）

2）bootstraping

代碼部分

此部分自帶數(shù)據(jù)集鳶尾花為例，

這是對新手很硬核，但對R programmer非常easy的一部分內(nèi)容，

代碼總共可以拆分為如下幾個部分，

1）拆分原始數(shù)據(jù)集的函數(shù)：dataset.grouping

2）計算歐式距離的函數(shù)：euclidean_distance

3）KNN核心代碼函數(shù)：k.nearest.neighbors，k.picker

4）bootstrap函數(shù)：bootstrap.df

由于編寫代碼的過程中，忘記考慮到引入和test數(shù)據(jù)集的結合，因此最后重新編寫了一個函數(shù)k.nearest.neighbors.spec，

# test數(shù)據(jù)集
test_samples <- data.frame(Sepal.Length = c(6.1, 5.9, 6.7, 5.6, 7.0, 6.5),
                           Sepal.Width = c(2.5, 5.0, 4.0, 3.1, 3.6, 3.2),
                           Petal.Length = c(1.7, 2.0, 6.5, 1.5, 6.3, 4.8),
                           Petal.Width = c(0.3, 1.2, 2.2, 0.1, 2.5, 1.5),
                           row.names = paste('sample', 1:6, sep = ''))
test_samples


# Sub-functions
# 1. Generation of training dataset and testing dataset
dataset.grouping <- function(data, group) {
    # Cut dataset using group values
    index <- which(names(data)==group)
    grouping.vector <- names(table(data[, index]))

    # Build a empty dataframe
    train.df <- data.frame(matrix(0, nrow = nrow(data)*0.7, ncol = ncol(data)))
    test.df <- data.frame(matrix(0, nrow = nrow(data)*0.3, ncol = ncol(data)))

    # Create train dataset
    count.1 <- 1
    count.2 <- 1
    for (i in grouping.vector){
        df.i <- data[which(data[, index] == i), ]
        row.index = sample(nrow(df.i), nrow(data)*0.7*1/3, replace = FALSE)
        train.df.i <- df.i[row.index, ]
        test.df.i <- df.i[-row.index, ]
        
        # Append sample dataset to train and test dataset
        count.3 <- count.1 + nrow(train.df.i) - 1
        count.4 <- count.2 + nrow(test.df.i) - 1
        train.df[count.1:count.3, ] <- train.df.i
        test.df[count.2:count.4, ] <- test.df.i
        count.1 <- count.1 + nrow(train.df.i)
        count.2 <- count.2 + nrow(test.df.i)

    }
    
    # Reform the Grouping col
    case.vector <- unique(train.df[, index])
    for (i in 1:length(case.vector)){
        train.df[which(train.df[, index] == case.vector[i]), index] <- grouping.vector[i]
        test.df[which(test.df[, index] == case.vector[i]), index] <- grouping.vector[i]
    }
    # Return the train df and test df
    return(list(train.df, test.df))
}

# 2. Define basic Functions, e.g. distance
euclidean_distance <- function(trainline, testline){
  if(length(trainline) == length(testline)){
    sqrt(sum((trainline-testline)^2))  
  } else{
    stop('Vectors must be in the same length')
  }
}


# KNN
k.nearest.neighbors <- function(data, group, k){
    index <- which(names(data)==group)
    df.list <- dataset.grouping(data, group = group)
    train.df <- df.list[[1]]; test.df <- df.list[[2]]
    train.scale <- scale(train.df[, -index]); test.scale <- scale(test.df[, -index])
    train.classifier <- train.df[, index]; test.classifier <- test.df[, index]

    predict <- c()
    for (i in 1:nrow(test.df)) {
        dist.mat <- apply(train.scale, 1, euclidean_distance, test.scale[i, ])
        train.classifier <- train.classifier[order(dist.mat, decreasing = FALSE)]
        freq.df <- data.frame(table(train.classifier[1:k]))
        predict <- c(predict, as.character(freq.df$Var1[which(freq.df$Freq == max(freq.df$Freq))]))
    }

    info <- list(test.classifier, predict)
    names(info) <- c('original', 'predict')
    return(info)
}
# table(info)  # confusion matrix

# Pick the best K
k.picker <- function(data, group, case.vector) {
    # Note: case.vector is a serial vector to choose best K from
    err.vector <- c()
    for (i in case.vector){
        info <- k.nearest.neighbors(data, group, i)
        # print(length(info$original))
        # print(length(info$predict))
        err = mean(info$original != info$predict)
        # print(err)
        err.vector <- c(err.vector, err)
    }
    return(err.vector)
}
# Set soft power to pick
soft.power <- c(1:20)
# errs <- k.picker(iris, "Species", soft.power)
# errs

# Using bootstrap to pick the best to k
bootstrap.df <- data.frame(matrix(0, nrow = 1000, ncol = length(soft.power)))
for (i in 1:nrow(bootstrap.df)){
    bootstrap.df[i, ] <- k.picker(iris, "Species", soft.power)
}
boxplot(bootstrap.df)


k.nearest.neighbors.spec <- function(data, test, group, k){
    index <- which(names(data)==group)
    print(index)
    data.scale <- scale(data[, -index]); test.scale <- scale(test)
    data.classifier <- data[, index]
    # print(data.classifier)

    predict <- c()
    for (i in 1:nrow(test.scale)) {
        dist.mat <- apply(data.scale, 1, euclidean_distance, test.scale[i, ])
        data.classifier <- data.classifier[order(dist.mat, decreasing = FALSE)]
        freq.df <- data.frame(table(data.classifier[1:k]))
        print(freq.df)
        predict <- c(predict, as.character(freq.df$Var1[which(freq.df$Freq == max(freq.df$Freq))]))
    }

    info <- list(predict)
    names(info) <- c('predict')
    return(info)
}

k.nearest.neighbors.spec(iris, test_samples, "Species", 3)

色偷偷精品伊人,欧洲久久精品,欧美综合婷婷骚逼,国产AV主播,国产最新探花在线,九色在线视频一区,伊人大交九欧美,1769亚洲,黄色成人av

不調(diào)包！自己整一個KNN函數(shù)（更新版）

不調(diào)包！自己整一個KNN函數(shù)（更新版）

代碼部分

相關閱讀更多精彩內(nèi)容

友情鏈接更多精彩內(nèi)容

色偷偷精品伊人,欧洲久久精品,欧美综合婷婷骚逼,国产AV主播,国产最新探花在线,九色在线视频一区,伊人大交九 欧美,1769亚洲,黄色成人av

不調(diào)包！自己整一個KNN函數(shù)（更新版）

代碼部分

相關閱讀更多精彩內(nèi)容

友情鏈接更多精彩內(nèi)容

色偷偷精品伊人,欧洲久久精品,欧美综合婷婷骚逼,国产AV主播,国产最新探花在线,九色在线视频一区,伊人大交九欧美,1769亚洲,黄色成人av