k-means聚類算法研究

聚類是機(jī)器學(xué)習(xí)中一種重要的無監(jiān)督算法,它可以將數(shù)據(jù)點(diǎn)歸結(jié)為一系列特定的組合。理論上歸為一類的數(shù)據(jù)點(diǎn)具有相同的特性,而不同類別的數(shù)據(jù)點(diǎn)則具有各不相同的屬性。在數(shù)據(jù)科學(xué)中,聚類會(huì)從數(shù)據(jù)中發(fā)掘出很多分析和理解的視角,讓我們更深入地把握數(shù)據(jù)資源的價(jià)值,并據(jù)此指導(dǎo)生產(chǎn)生活。

K均值聚類

這一最著名的聚類算法主要基于數(shù)據(jù)點(diǎn)之間的均值以及各點(diǎn)與聚類中心距離的反復(fù)迭代而成。它主要的優(yōu)點(diǎn)是十分高效:由于每輪只需要計(jì)算數(shù)據(jù)點(diǎn)與聚類中心的距離,其計(jì)算復(fù)雜度只有O(n)。其工作原理主要分為以下四步:

1.首先我們需要預(yù)先給定聚類的數(shù)目,同時(shí)隨機(jī)初始化聚類中心。我們可以粗略地觀察數(shù)據(jù)并給出較為準(zhǔn)確的聚類數(shù)目;

2.每一個(gè)數(shù)據(jù)點(diǎn)通過計(jì)算與各聚類中心的距離,被分類到最鄰近的一類中;

3.根據(jù)分類結(jié)果,利用分類后的數(shù)據(jù)點(diǎn)重新計(jì)算聚類中心;

4.重復(fù)步驟二三直到聚類中心不再變化。(可以隨機(jī)初始化不同的聚類中心以選取最好的結(jié)果)

python實(shí)現(xiàn)

1. point.py

class Point:
    """A 2-D geographic point holding a latitude and a longitude.

    Used as the element type for clustering: the k-means code reads and
    writes the two public attributes directly.
    """

    def __init__(self, latit_, longit_):
        # self.id = id_  # an id which uniquely identifies a point
        self.latit = latit_    # latitude coordinate
        self.longit = longit_  # longitude coordinate

2. clustering.py

import math
import random as rand

import matplotlib.pyplot as plt
import numpy as np
from mpl_toolkits.mplot3d import Axes3D

from point import Point

# import pkg_resources
# pkg_resources.require("matplotlib")

class clustering:

def __init__(self, geo_locs_, k_):

self.geo_locations = geo_locs_

self.k = k_

self.clusters = []#clusters of nodes

? ? ? ? self.means = []#means of clusters

? ? ? ? self.debug =True? #debug flag

#this method returns the next random node

? ? def next_random(self, index, points, clusters):

#pick next node that has the maximum distance from other nodes

? ? ? ? print 'index: %d ' % (index)

dist = {}

for point_1in points:

#if self.debug:

#print 'point_1: %f %f' % (point_1.latit, point_1.longit)

#compute this node distance from all other points in cluster

? ? ? ? ? ? for clusterin clusters.values():

point_2 = cluster[0]

#if self.debug:

# print 'point_2: %f %f' % (point_2.latit, point_2.longit)

? ? ? ? ? ? ? ? if point_1not in dist:

dist[point_1] = math.sqrt(math.pow(point_1.latit - point_2.latit,2.0) + math.pow(point_1.longit - point_2.longit,2.0))

else:

dist[point_1] += math.sqrt(math.pow(point_1.latit - point_2.latit,2.0) + math.pow(point_1.longit - point_2.longit,2.0))

#if self.debug:

#for key, value in dist.items():

#print "(%f, %f) ==> %f" % (key.latit,key.longit,value)

#now let's return the point that has the maximum distance from previous nodes

? ? ? ? count_ =0

? ? ? ? max_ =0

? ? ? ? for key, valuein dist.items():

if count_ ==0:

max_ = value

max_point = key

count_ +=1

? ? ? ? ? ? else:

if value > max_:

max_ = value

max_point = key

return max_point

#this method computes the initial means

? ? def initial_means(self, points):

#pick the first node at random

? ? ? ? point_ = rand.choice(points)

if self.debug:

print 'point#0: %f %f' % (point_.latit, point_.longit)

clusters =dict()

clusters.setdefault(0, []).append(point_)

points.remove(point_)

#now let's pick k-1 more random points

? ? ? ? for iin range(1,self.k):

point_ =self.next_random(i, points, clusters)

if self.debug:

print 'point#%d: %f %f' % (i, point_.latit, point_.longit)

#clusters.append([point_])

? ? ? ? ? ? clusters.setdefault(i, []).append(point_)

points.remove(point_)

#compute mean of clusters

#self.print_clusters(clusters)

? ? ? ? self.means =self.compute_mean(clusters)

if self.debug:

print "initial means:"

? ? ? ? ? ? self.print_means(self.means)

def compute_mean(self, clusters):

means = []

for clusterin clusters.values():

mean_point = Point(0.0,0.0)

cnt =0.0

? ? ? ? ? ? for pointin cluster:

#print "compute: point(%f,%f)" % (point.latit, point.longit)

? ? ? ? ? ? ? ? mean_point.latit += point.latit

mean_point.longit += point.longit

cnt +=1.0

? ? ? ? ? ? mean_point.latit = mean_point.latit/cnt

mean_point.longit = mean_point.longit/cnt

means.append(mean_point)

return means

#this method assign nodes to the cluster with the smallest mean

? ? def assign_points(self, points):

if self.debug:

print "assign points"

? ? ? ? clusters =dict()

for pointin points:

dist = []

if self.debug:

print "point(%f,%f)" % (point.latit, point.longit)

#find the best cluster for this node

? ? ? ? ? ? for meanin self.means:

dist.append(math.sqrt(math.pow(point.latit - mean.latit,2.0) + math.pow(point.longit - mean.longit,2.0)))

#let's find the smallest mean

? ? ? ? ? ? if self.debug:

print dist

cnt_ =0

? ? ? ? ? ? index =0

? ? ? ? ? ? min_ = dist[0]

for din dist:

if d < min_:

min_ = d

index = cnt_

cnt_ +=1

? ? ? ? ? ? if self.debug:

print "index: %d" % index

clusters.setdefault(index, []).append(point)

return clusters

def update_means(self, means, threshold):

#check the current mean with the previous one to see if we should stop

? ? ? ? for iin range(len(self.means)):

mean_1 =self.means[i]

mean_2 = means[i]

if self.debug:

print "mean_1(%f,%f)" % (mean_1.latit, mean_1.longit)

print "mean_2(%f,%f)" % (mean_2.latit, mean_2.longit)

if math.sqrt(math.pow(mean_1.latit - mean_2.latit,2.0) + math.pow(mean_1.longit - mean_2.longit,2.0)) > threshold:

return False

? ? ? ? return True

? ? #debug function: print cluster points

? ? def print_clusters(self, clusters):

cluster_cnt =1

? ? ? ? for clusterin clusters.values():

print "nodes in cluster #%d" % cluster_cnt

cluster_cnt +=1

? ? ? ? ? ? for pointin cluster:

print "point(%f,%f)" % (point.latit, point.longit)

#print means

? ? def print_means(self, means):

for pointin means:

print "%f %f" % (point.latit, point.longit)

#k_means algorithm

? ? def k_means(self, plot_flag):

if len(self.geo_locations)

return -1? #error

? ? ? ? points_ = [pointfor pointin self.geo_locations]

#compute the initial means

? ? ? ? self.initial_means(points_)

stop =False

? ? ? ? while not stop:

#assignment step: assign each node to the cluster with the closest mean

? ? ? ? ? ? points_ = [pointfor pointin self.geo_locations]

clusters =self.assign_points(points_)

if self.debug:

self.print_clusters(clusters)

means =self.compute_mean(clusters)

if self.debug:

print "means:"

? ? ? ? ? ? ? ? print self.print_means(means)

print "update mean:"

? ? ? ? ? ? stop =self.update_means(means,0.01)

if not stop:

self.means = []

self.means = means

self.clusters = clusters

#plot cluster for evluation

? ? ? ? if plot_flag:

fig = plt.figure()

ax = fig.add_subplot(111)

markers = ['o','d','x','h','H',7,4,5,6,'8','p',',','+','.','s','*',3,0,1,2]

colors = ['r','k','b', [0,0,0], [0,0,1], [0,1,0], [0,1,1], [1,0,0], [1,0,1], [1,1,0], [1,1,1]]

cnt =0

? ? ? ? ? ? for clusterin clusters.values():

latits = []

longits = []

for pointin cluster:

latits.append(point.latit)

longits.append(point.longit)

ax.scatter(longits, latits,s=60,c=colors[cnt],marker=markers[cnt])

cnt +=1

? ? ? ? ? ? plt.show()

return 0

3. main.py

import csv
import random as rand

from clustering import clustering
from point import Point

# read the fountain locations from the csv input file and store each
# location as a Point(latit, longit) object
geo_locs = []
with open('./drinking_fountains.csv', 'r') as f:
    reader = csv.reader(f, delimiter=",")
    for line in reader:
        geo_locs.append(Point(float(line[0]), float(line[1])))

# run k-means clustering; the second argument is the number of clusters
cluster = clustering(geo_locs, 8)
flag = cluster.k_means(True)
if flag == -1:
    print("Error in arguments!")
else:
    # the clustering result maps each cluster index to its list of points
    print("clustering results:")
    cluster.print_clusters(cluster.clusters)


4. drinking_fountains.csv 示例數(shù)據(jù)

1,1

2,2

3,3

4,4

5,5

6,6

7,7

8,8

9,9

10,10

11,11

12,12

13,13

14,14

15,15


運(yùn)行main.py ,圖片如下:


?著作權(quán)歸作者所有,轉(zhuǎn)載或內(nèi)容合作請(qǐng)聯(lián)系作者
【社區(qū)內(nèi)容提示】社區(qū)部分內(nèi)容疑似由AI輔助生成,瀏覽時(shí)請(qǐng)結(jié)合常識(shí)與多方信息審慎甄別。
平臺(tái)聲明:文章內(nèi)容(如有圖片或視頻亦包括在內(nèi))由作者上傳并發(fā)布,文章內(nèi)容僅代表作者本人觀點(diǎn),簡書系信息發(fā)布平臺(tái),僅提供信息存儲(chǔ)服務(wù)。

相關(guān)閱讀更多精彩內(nèi)容

  • The Inner Game of Tennis W Timothy Gallwey Jonathan Cape ...
    網(wǎng)事_79a3閱讀 12,881評(píng)論 3 20
  • 最近的日子總有些讓我感到倉皇不安,年近不惑之年原本不會(huì)感情用事,理應(yīng)致力于改善家庭幸福指數(shù)及多抽點(diǎn)時(shí)間陪陪年邁的父...
    王善龍閱讀 207評(píng)論 0 0
  • 今天學(xué)習(xí)到了什么? 阿米巴模式包括精細(xì)的經(jīng)營會(huì)計(jì)制度(京瓷會(huì)計(jì)學(xué))+經(jīng)營哲學(xué) 本質(zhì)是:所有的員工擁有統(tǒng)一的經(jīng)營哲學(xué)...
    青春沸騰閱讀 628評(píng)論 0 0
  • 滿紙荒唐言,一把辛酸淚。 都云作者癡,誰解其中味? 好的作家,都是在用生命寫作,遠(yuǎn)古有司馬...
    回歸自然HAO閱讀 173評(píng)論 6 6

友情鏈接更多精彩內(nèi)容