forked from agartland/utils
-
Notifications
You must be signed in to change notification settings - Fork 0
/
kmedoids.py
566 lines (473 loc) · 23 KB
/
kmedoids.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
import numpy as np
from vectools import unique_rows
import scipy
import itertools
__all__ = ['kmedoids',
'fuzzycmedoids',
'tryallmedoids',
'precomputeWeightedDmat',
'assignClusters',
'computeInertia',
'computeMembership']
def kmedoids(dmat, k=3, weights=None, nPasses=1, maxIter=1000, initInds=None, potentialMedoidInds=None, seed=110820):
"""Identify the k points that minimize all intra-cluster distances.
The algorithm completes nPasses of the algorithm with random restarts.
Each pass consists of iteratively assigning/improving the medoids.
Uses Partioning Around Medoids (PAM) as the EM.
To apply to points in euclidean space pass dmat using:
dmat = sklearn.neighbors.DistanceMetric.get_metric('euclidean').pairwise(points_array)
Parameters
----------
dmat : array-like of floats, shape (n_samples, n_samples)
The pairwise distance matrix of observations to cluster.
weights : array-like of floats, shape (n_samples)
Relative weights for each observation in inertia computation.
k : int
The number of clusters to form as well as the number of
medoids to generate.
nPasses : int
Number of times the algorithm is restarted with random medoid initializations. The best solution is returned.
maxIter : int, optional, default None (inf)
Maximum number of iterations of the k-medoids algorithm to run.
initInds : ndarray
Medoid indices used for random initialization and restarts for each pass.
potentialMedoidInds : array of indices
If specified then medoids are constrained to be chosen from this array.
seed : int or False
If not False, sets np.random.seed for reproducible results.
Returns
-------
medoids : float ndarray with shape (k)
Indices into dmat that indicate medoids found at the last iteration of k-medoids.
labels : integer ndarray with shape (n_samples,)
label[i] is the code or index of the medoid the
i'th observation is closest to.
inertia : float
The final value of the inertia criterion (sum of squared distances to
the closest medoid for all observations).
nIter : int
Number of iterations run.
nFound : int
Number of unique solutions found (out of nPasses)"""
"""Use a seed to guarantee reproducibility"""
if seed:
np.random.seed(seed)
"""Number of points"""
N = dmat.shape[0]
if initInds is None:
initInds = np.arange(N)
wdmat = precomputeWeightedDmat(dmat, weights, squared=True)
if not potentialMedoidInds is None:
potentialMedoidSet = set(potentialMedoidInds)
initInds = np.array([i for i in potentialMedoidSet.intersection(set(initInds))], dtype=int)
else:
potentialMedoidSet = set(np.arange(N))
if len(initInds)==0:
print('No possible initInds provided.')
return
bestInertia = None
allMedoids = np.zeros((nPasses, k))
for passi in range(nPasses):
"""Pick k random medoids"""
currMedoids = np.random.permutation(initInds)[:k]
newMedoids = np.zeros(k, dtype=int)
labels = currMedoids[np.random.randint(k, size=N)]
for i in range(maxIter):
"""Assign each point to the closest cluster,
but don't reassign a point if the distance isn't an improvement."""
labels = assignClusters(dmat, currMedoids, oldLabels=labels)
"""If clusters are lost during (re)assignment step, pick random points
as new medoids and reassign until we have k clusters again"""
uLabels = np.unique(labels[potentialMedoidInds])
while uLabels.shape[0]<k:
for medi, med in enumerate(currMedoids):
if not med in uLabels:
choices = list(set(initInds).difference(set(uLabels)))
currMedoids[medi] = choices[np.random.randint(len(choices))]
labels = assignClusters(dmat, currMedoids, oldLabels=labels)
uLabels = np.unique(labels[potentialMedoidInds])
break
"""ISSUE: If len(unique(labels)) < k there is an error"""
"""Choose new medoids for each cluster, minimizing intra-cluster distance"""
totInertia = 0
for medi, med in enumerate(currMedoids):
clusterInd = np.where(labels == med)[0]
"""Limit medoids to those specified by indexing axis=0 with the intersection of potential medoids and all points in the cluster"""
potentialInds = np.array([poti for poti in potentialMedoidSet.intersection(set(clusterInd))])
"""Inertia is the sum of the distances (vec is shape (len(clusterInd))"""
inertiaVec = (wdmat[potentialInds,:][:, clusterInd]).sum(axis=1)
mnInd = np.argmin(inertiaVec)
newMedoids[medi] = potentialInds[mnInd]
"""Add inertia of this new medoid to the running total"""
totInertia += inertiaVec[mnInd]
if (newMedoids == currMedoids).all():
"""If the medoids didn't need to be updated then we're done!"""
allMedoids[passi,:] = sorted(currMedoids)
break
currMedoids = newMedoids.copy()
if bestInertia is None or totInertia < bestInertia:
"""For multiple passes, see if this pass was better than the others"""
bestInertia = totInertia
bestMedoids = currMedoids.copy()
bestLabels = labels.copy()
bestNIter = i + 1
"""nfound is the number of unique solutions (each row is a solution)"""
nfound = unique_rows(allMedoids).shape[0]
"""Return the results from the best pass"""
return bestMedoids, bestLabels, bestInertia, bestNIter, nfound
def precomputeWeightedDmat(dmat, weights, squared=False):
"""Compute the weighted distance matrix for kmedoids and FCMdd.
Adding weight to a point increases its impact on inertia linearly,
such that the algorithm will tend to favor minimization of distances
to that point.
Note: weights are applied along the rows so to correctly compute
inertia for a given medoid one would index the cluster along axis=1,
index the medoid along axis=0 and then sum along axis=1.
Parameters
----------
dmat : ndarray shape[N x N]
Pairwise distance matrix (unweighted).
weights : ndarray shape[N]
Should sum to one if one wants to preserve
inertial units with different weights.
Returns
-------
wdmat : ndarray shape[N x N]
Weighted distance matrix, ready for computing inertia."""
if squared:
dmat = dmat**2
if weights is None:
return dmat
else:
assert weights.shape[0] == dmat.shape[0]
return dmat * weights[None,:].values
def assignClusters(dmat, currMedoids, oldLabels=None):
"""Assigns/reassigns points to clusters based on the minimum (unweighted) distance.
Note: if oldLabels are specified then only reassigns points that
are not currently part of a cluster that minimizes their distance.
This ensures that when there are ties for best cluster with the current cluster,
the point is not reassigned to a new cluster.
Parameters
----------
dmat : ndarray shape[N x N]
Pairwise distance matrix (unweighted).
currMedoids : ndarray shape[k]
Index into points/dmat that specifies the k current medoids.
oldLabels : ndarray shape[N]
Old labels that will be reassigned.
Returns
-------
labels : ndarray shape[N]
New labels such that unique(labels) equals currMedoids."""
N = dmat.shape[0]
k = len(currMedoids)
"""Assign each point to the closest cluster,
but don't reassign a point if the distance isn't an improvement."""
if not oldLabels is None:
labels = oldLabels
oldD = dmat[np.arange(N), labels]
minD = (dmat[:, currMedoids]).min(axis=1)
"""Points where reassigning is neccessary"""
reassignInds = (minD<oldD) | ~np.any(np.tile(labels[:, None], (1, k)) == np.tile(currMedoids[None,:], (N, 1)), axis=1)
else:
reassignInds = np.arange(N)
labels = np.zeros(N)
labels[reassignInds] = currMedoids[np.argmin(dmat[reassignInds,:][:, currMedoids], axis=1)]
return labels
def computeInertia(wdmat2, labels, currMedoids):
"""Computes inertia for a set of clustered points using
a precomputed weighted distance matrix.
Note: wdmat needs to be summed along axis=1
assert all(sorted(unique(labels)) == sorted(currMedoids))
Parameters
----------
wdmat : ndarray shape[N x N]
Weighted and squared distance matrix, ready for computing inertia.
labels : ndarray shape[N]
Cluster assignment (medoid index) of each point
currMedoids : ndarray shape[k]
Index into points/dmat that specifies the k current medoids.
Returns
-------
inertia : float
Total inertia of all k clusters"""
assert np.all(np.unique(labels) == np.unique(currMedoids))
totInertia = 0
for medi, med in enumerate(currMedoids):
clusterInd = np.where(labels == med)[0]
"""Inertia is the sum of the distances"""
totInertia += wdmat2[med, clusterInd].sum()
return totInertia
def fuzzycmedoids(dmat, c, membershipMethod=('FCM', 2), weights=None, nPasses=1, maxIter=1000, initInds=None, potentialMedoidInds=None):
"""Implementation of fuzz c-medoids (FCMdd)
Krishnapuram, Raghu, Anupam Joshi, Liyu Yi, Computer Sciences, and Baltimore County.
"A Fuzzy Relative of the K-Medoids Algorithm with Application to Web Documents."
Electrical Engineering. doi:10.1109/FUZZY.1999.790086.
The algorithm completes nPasses of the algorithm with random restarts.
Each pass consists of iteratively assigning/improving the medoids.
To apply to points in euclidean space pass dmat using:
dmat = sklearn.neighbors.DistanceMetric.get_metric('euclidean').pairwise(points_array)
Parameters
----------
dmat : array-like of floats, shape (n_samples, n_samples)
Pairwise distance matrix of observations to cluster.
weights : array-like of floats, shape (n_samples)
Relative weights for each observation in inertia computation.
c : int
The number of clusters to form as well as the number of medoids to generate.
membershipMethod : tuple of (method str/int, param)
Method for computing membership matrix.
nPasses : int
Number of times the algorithm is restarted with random medoid initializations.
The best solution is returned.
maxIter : int, optional, default None (inf)
Maximum number of iterations of the c-medoids algorithm to run.
initInds : ndarray
Medoid indices used for random initialization and restarts for each pass.
potentialMedoidInds : array of indices
If specified, then medoids are constrained to be chosen from this array.
Returns
-------
medoids : float ndarray with shape (c)
Indices into dmat that indicate medoids found at the last iteration of FCMdd
membership : float ndarray with shape (n_samples, c)
Each row contains the membership of a point to each of the clusters.
nIter : int
Number of iterations run.
nFound : int
Number of unique solutions found (out of nPasses)"""
"""Number of points"""
N = dmat.shape[0]
if initInds is None:
initInds = np.arange(N)
wdmat = precomputeWeightedDmat(dmat, weights)
if not potentialMedoidInds is None:
initInds = np.array([i for i in initInds if i in potentialMedoidInds], dtype=int)
else:
potentialMedoidInds = np.arange(N)
if len(initInds) == 0:
print('No possible initInds provided.')
return
allMedoids = np.zeros((nPasses, c))
bestInertia = None
foundSame = 0
for passi in range(nPasses):
"""Pick c random medoids"""
currMedoids = np.random.permutation(initInds)[:c]
newMedoids = np.zeros(c, dtype=int)
for i in range(maxIter):
"""(Re)compute memberships [N x c]"""
membership = computeMembership(dmat, currMedoids, method=membershipMethod[0], param=membershipMethod[1])
"""Choose new medoid for each cluster, minimizing fuzzy objective function"""
totInertia = 0
for medi, med in enumerate(currMedoids):
"""Within each cluster find the new medoid
by minimizing the dissimilarities,
weighted by membership to the cluster"""
"""Inertia is the sum of the membership times the distance matrix over all points.
(membership for cluster medi [a column vector] is applied across the columns of wdmat
[and broadcast to all row vectors] before summing)"""
inertiaMat = np.tile(membership[:, medi][:, None].T, (len(potentialMedoidInds), 1)) * wdmat[potentialMedoidInds,:]
inertiaVec = inertiaMat.sum(axis=1)
mnInd = np.argmin(inertiaVec)
newMedoids[medi] = potentialMedoidInds[mnInd]
"""Add inertia of this new medoid to the running total"""
totInertia += inertiaVec[mnInd]
if (newMedoids == currMedoids).all():
"""If the medoids didn't need to be updated then we're done!"""
allMedoids[passi,:] = sorted(currMedoids)
break
currMedoids = newMedoids.copy()
if bestInertia is None or totInertia < bestInertia:
"""For multiple passes, see if this pass was better than the others"""
bestInertia = totInertia
bestMedoids = currMedoids.copy()
bestMembership = membership.copy()
bestNIter = i + 1
"""nfound is the number of unique solutions (each row is a solution)"""
nfound = unique_rows(allMedoids).shape[0]
"""Return the results from the best pass"""
return bestMedoids, bestMembership, bestNIter, nfound#, allMedoids
def fuzzyPartitionCoef(membership):
"""Fuzzy partition coefficient `fpc` relative to fuzzy c-partitioned
matrix membership. Measures 'fuzziness' in partitioned clustering.
Copied from from sckit-fuzzy:
https://github.com/scikit-fuzzy/scikit-fuzzy
Parameters
----------
membership : 2d array (C, N)
Fuzzy c-partitioned matrix; N = number of data points and C = number
of clusters.
Returns
-------
fpc : float
Fuzzy partition coefficient."""
n = membership.shape[1]
return np.trace(membership.dot(membership.T)) / float(n)
def computeMembership(dmat, medoids, method='FCM', param=2):
"""Compute membership of each instance in each cluster,
defined by the provided medoids.
Possible methods come from the manuscript by Krishnapuram et al.
and may include an additional parameter (typically a "fuzzifier")
Parameters
----------
dmat : ndarray shape[N x N]
Pairwise distance matrix (unweighted).
medoids : ndarray shape[c]
Index into points/dmat that specifies the c current medoids.
method : str
Method for computing memberships:
"FCM" (from fuzzy c-means)
Equations from Krishnapuram et al. (2, 3, 4 or 5)
param : float
Additional parameter required by the method.
Note: param must be shape[c,] for methods 4 or 5
Returns
-------
membership : ndarray shape[N x c]
New labels such that unique(labels) equals currMedoids."""
N = dmat.shape[0]
c = len(medoids)
r = dmat[:, medoids]
if method in ['FCM', 2, '2']:
assert param >= 1
tmp = (1 / r)**(1 / (param - 1))
elif method in [3, '3']:
assert param > 0
tmp = np.exp(-param * r)
elif method in [4, '4']:
assert param.shape == (c,)
tmp = 1/(1 + r/param[:, None])
elif method in [5, '5']:
assert param.shape == (c,)
tmp = np.exp(-r/param[:, None])
membership = tmp / tmp.sum(axis=1, keepdims=True)
for medi, med in enumerate(medoids):
membership[med,:] = 0.
membership[med, medi] = 1.
return membership
def tryallmedoids(dmat, c, weights=None, potentialMedoidInds=None, fuzzy=True, fuzzyParams=('FCM', 2)):
"""Brute force optimization of k-medoids or fuzzy c-medoids clustering.
To apply to points in euclidean space pass dmat using:
dmat = sklearn.neighbors.DistanceMetric.get_metric('euclidean').pairwise(points_array)
Parameters
----------
dmat : array-like of floats, shape (n_samples, n_samples)
Pairwise distance matrix of observations to cluster.
c : int
Number of clusters to form as well as the number of medoids to generate.
weights : array-like of floats, shape (n_samples)
Relative weights for each observation in inertia computation.
potentialMedoidInds : array of indices
If specified, then medoids are constrained to be chosen from this array.
fuzzy : boolean
If True, use fuzzy inertia function,
otherwis use crisp cluster definition.
fuzzyParams : tuple of (method str/int, param)
Method and parameter for computing fuzzy membership matrix.
Returns
-------
medoids : float ndarray with shape (c)
Indices into dmat that indicate optimal medoids.
membership or labels: float ndarray with shape (n_samples, c) or shape (n_samples,)
Each row contains the membership of a point to each of the clusters
OR with hard clusters, the medoid/cluster index of each point."""
if fuzzy:
wdmat = precomputeWeightedDmat(dmat, weights, squared=False)
else:
wdmat = precomputeWeightedDmat(dmat, weights, squared=True)
N = dmat.shape[0]
if potentialMedoidInds is None:
potentialMedoidInds = np.arange(N)
combinations = scipy.misc.comb(len(potentialMedoidInds), c)
if combinations > 1e7:
print("Too many combinations to try: %1.1g > 10M" % combinations)
bestInertia = None
for medInds in itertools.combinations(list(range(len(potentialMedoidInds))), c):
medoids = potentialMedoidInds[np.array(medInds)]
if fuzzy:
membership = computeMembership(dmat, medoids, method=fuzzyParams[0], param=fuzzyParams[1])
else:
membership = np.zeros((N, c))
membership[np.arange(N), np.argmin(dmat[:, medoids], axis=1)] = 1.
inertia = (wdmat[:, medoids] * membership).sum()
if bestInertia is None or inertia < bestInertia:
bestMedoids = medoids
bestInertia = inertia
bestMembership = membership
if not fuzzy:
membership = np.argmax(membership, axis=1)
return medoids, membership
def _rangenorm(vec, mx=1, mn=0):
"""Normazlize values of vec in-place to [mn, mx] interval"""
vec = vec - np.nanmin(vec)
vec = vec / np.nanmax(vec)
vec = vec * (mx-mn) + mn
return vec
def _test_plot(k=3, nPasses=20, maxIter=1000):
from sklearn import neighbors, datasets
from Bio.Cluster import kmedoids as biokmedoids
import time
import matplotlib.pyplot as plt
import palettable
import seaborn as sns
sns.set(style='darkgrid', palette='muted', font_scale=1.3)
cmap = palettable.colorbrewer.qualitative.Set1_9.mpl_colors
iris = datasets.load_iris()
obs = iris['data']
dmat = neighbors.DistanceMetric.get_metric('euclidean').pairwise(obs)
weights = np.random.rand(obs.shape[0])
plt.figure(2)
plt.clf()
plt.subplot(2, 2, 1)
startTime = time.time()
medoids, labels, inertia, niter, nfound = kmedoids(dmat, k=k, maxIter=maxIter, nPasses=nPasses)
et = time.time() - startTime
for medi, med in enumerate(medoids):
plt.scatter(obs[labels==med, 0], obs[labels==med, 1], color=cmap[medi])
plt.plot(obs[med, 0], obs[med, 1], 'sk', markersize=10, color=cmap[medi], alpha=0.5)
plt.title('K-medoids (%1.3f sec, %d iterations, %d solns)' % (et, niter, nfound))
plt.subplot(2, 2, 3)
startTime = time.time()
medoids, labels, inertia, niter, nfound = kmedoids(dmat, k=k, maxIter=maxIter, nPasses=nPasses, weights=weights)
et = time.time() - startTime
for medi, med in enumerate(medoids):
nWeights = _rangenorm(weights, mn=10, mx=200)
plt.scatter(obs[labels==med, 0], obs[labels==med, 1], color=cmap[medi], s=nWeights, edgecolor='black', alpha=0.5)
plt.plot(obs[med, 0], obs[med, 1], 'sk', markersize=10, color=cmap[medi])
plt.title('Weighted K-medoids (%1.3f sec, %d iterations, %d solns)' % (et, niter, nfound))
plt.subplot(2, 2, 2)
startTime = time.time()
biolabels, bioerror, bionfound = biokmedoids(dmat, nclusters=k, npass=nPasses)
biomedoids = np.unique(biolabels)
bioet = time.time() - startTime
for medi, med in enumerate(biomedoids):
plt.scatter(obs[biolabels==med, 0], obs[biolabels==med, 1], color=cmap[medi])
plt.plot(obs[med, 0], obs[med, 1], 'sk', color=cmap[medi], markersize=10, alpha = 0.5)
plt.title('Bio.Cluster K-medoids (%1.3f sec, %d solns)' % (bioet, bionfound))
plt.subplot(2, 2, 4)
startTime = time.time()
medoids, membership, niter, nfound = fuzzycmedoids(dmat, c=k, maxIter=maxIter, nPasses=nPasses)
labels = medoids[np.argmax(membership, axis=1)]
et = time.time() - startTime
for medi, med in enumerate(medoids):
ind = labels == med
sz = _rangenorm(membership[:, medi][ind], mn=10, mx=100)
sz[np.argmax(sz)] = 0.
plt.scatter(obs[ind, 0], obs[ind, 1], color=cmap[medi], s=sz, alpha=0.5)
plt.plot(obs[med, 0], obs[med, 1], 'sk', markersize=10, color=cmap[medi])
plt.title('Fuzzy c-medoids (%1.3f sec, %d iterations, %d solns)' % (et, niter, nfound))
def _test_kmedoids(nPasses=20, k=3, maxIter=1000):
from sklearn import neighbors, datasets
iris = datasets.load_iris()
obs = iris['data']
dmat = neighbors.DistanceMetric.get_metric('euclidean').pairwise(obs)
results = kmedoids(dmat, k=k, maxIter=maxIter, nPasses=nPasses)
return (dmat,) + results
def _test_FCMdd(nPasses=20, c=3, maxIter=1000, membershipMethod=('FCM', 2)):
from sklearn import neighbors, datasets
iris = datasets.load_iris()
obs = iris['data']
dmat = neighbors.DistanceMetric.get_metric('euclidean').pairwise(obs)
results = fuzzycmedoids(dmat, c=c, maxIter=maxIter, nPasses=nPasses, membershipMethod=membershipMethod)
return (dmat,) + results