1 import numpy
2
3 from Bio.Cluster.cluster import *
4
5 -def _treesort(order, nodeorder, nodecounts, tree):
6
7
8 nNodes = len(tree)
9 nElements = nNodes + 1
10 neworder = numpy.zeros(nElements)
11 clusterids = numpy.arange(nElements)
12 for i in range(nNodes):
13 i1 = tree[i].left
14 i2 = tree[i].right
15 if i1 < 0:
16 order1 = nodeorder[-i1-1]
17 count1 = nodecounts[-i1-1]
18 else:
19 order1 = order[i1]
20 count1 = 1
21 if i2 < 0:
22 order2 = nodeorder[-i2-1]
23 count2 = nodecounts[-i2-1]
24 else:
25 order2 = order[i2]
26 count2 = 1
27
28
29 if i1 < i2:
30 if order1 < order2:
31 increase = count1
32 else:
33 increase = count2
34 for j in range(nElements):
35 clusterid = clusterids[j]
36 if clusterid == i1 and order1 >= order2:
37 neworder[j] += increase
38 if clusterid == i2 and order1 < order2:
39 neworder[j] += increase
40 if clusterid == i1 or clusterid == i2:
41 clusterids[j] = -i-1
42 else:
43 if order1 <= order2:
44 increase = count1
45 else:
46 increase = count2
47 for j in range(nElements):
48 clusterid = clusterids[j]
49 if clusterid == i1 and order1 > order2:
50 neworder[j] += increase
51 if clusterid == i2 and order1 <= order2:
52 neworder[j] += increase
53 if clusterid == i1 or clusterid == i2:
54 clusterids[j] = -i-1
55 return numpy.argsort(neworder)
56
57
58 -def _savetree(jobname, tree, order, transpose):
59
60
61 if transpose == 0:
62 extension = ".gtr"
63 keyword = "GENE"
64 else:
65 extension = ".atr"
66 keyword = "ARRY"
67 nnodes = len(tree)
68 outputfile = open(jobname+extension, "w")
69 nodeindex = 0
70 nodeID = [''] * nnodes
71 nodecounts = numpy.zeros(nnodes, int)
72 nodeorder = numpy.zeros(nnodes)
73 nodedist = numpy.array([node.distance for node in tree])
74 for nodeindex in range(nnodes):
75 min1 = tree[nodeindex].left
76 min2 = tree[nodeindex].right
77 nodeID[nodeindex] = "NODE%dX" % (nodeindex+1)
78 outputfile.write(nodeID[nodeindex])
79 outputfile.write("\t")
80 if min1 < 0:
81 index1 = -min1-1
82 order1 = nodeorder[index1]
83 counts1 = nodecounts[index1]
84 outputfile.write(nodeID[index1]+"\t")
85 nodedist[nodeindex] = max(nodedist[nodeindex],nodedist[index1])
86 else:
87 order1 = order[min1]
88 counts1 = 1
89 outputfile.write("%s%dX\t" % (keyword, min1))
90 if min2 < 0:
91 index2 = -min2-1
92 order2 = nodeorder[index2]
93 counts2 = nodecounts[index2]
94 outputfile.write(nodeID[index2]+"\t")
95 nodedist[nodeindex] = max(nodedist[nodeindex],nodedist[index2])
96 else:
97 order2 = order[min2]
98 counts2 = 1
99 outputfile.write("%s%dX\t" % (keyword, min2))
100 outputfile.write(str(1.0-nodedist[nodeindex]))
101 outputfile.write("\n")
102 counts = counts1 + counts2
103 nodecounts[nodeindex] = counts
104 nodeorder[nodeindex] = (counts1*order1+counts2*order2) / counts
105 outputfile.close()
106
107 index = _treesort(order, nodeorder, nodecounts, tree)
108 return index
109
110
112 """Store gene expression data.
113
114 A Record stores the gene expression data and related information contained
115 in a data file following the file format defined for Michael Eisen's
116 Cluster/TreeView program. A Record has the following members:
117
118 data: a matrix containing the gene expression data
119 mask: a matrix containing only 1's and 0's, denoting which values
120 are present (1) or missing (0). If all elements of mask are
121 one (no missing data), then mask is set to None.
122 geneid: a list containing a unique identifier for each gene
123 (e.g., ORF name)
124 genename: a list containing an additional description for each gene
125 (e.g., gene name)
126 gweight: the weight to be used for each gene when calculating the
127 distance
128 gorder: an array of real numbers indicating the preferred order of the
129 genes in the output file
130 expid: a list containing a unique identifier for each experimental
131 condition
132 eweight: the weight to be used for each experimental condition when
133 calculating the distance
134 eorder: an array of real numbers indication the preferred order in the
135 output file of the experimental conditions
136 uniqid: the string that was used instead of UNIQID in the input file.
137
138 """
139
141 """Read gene expression data from the file handle and return a Record.
142
143 The file should be in the format defined for Michael Eisen's
144 Cluster/TreeView program.
145
146 """
147 self.data = None
148 self.mask = None
149 self.geneid = None
150 self.genename = None
151 self.gweight = None
152 self.gorder = None
153 self.expid = None
154 self.eweight = None
155 self.eorder = None
156 self.uniqid = None
157 if not handle:
158 return
159 line = handle.readline().strip("\r\n").split("\t")
160 n = len(line)
161 self.uniqid = line[0]
162 self.expid = []
163 cols = {0: "GENEID"}
164 for word in line[1:]:
165 if word == "NAME":
166 cols[line.index(word)] = word
167 self.genename = []
168 elif word == "GWEIGHT":
169 cols[line.index(word)] = word
170 self.gweight = []
171 elif word=="GORDER":
172 cols[line.index(word)] = word
173 self.gorder = []
174 else:
175 self.expid.append(word)
176 self.geneid = []
177 self.data = []
178 self.mask = []
179 needmask = 0
180 for line in handle:
181 line = line.strip("\r\n").split("\t")
182 if len(line) != n:
183 raise ValueError("Line with %d columns found (expected %d)" %
184 (len(line), n))
185 if line[0] == "EWEIGHT":
186 i = max(cols) + 1
187 self.eweight = numpy.array(line[i:], float)
188 continue
189 if line[0] == "EORDER":
190 i = max(cols) + 1
191 self.eorder = numpy.array(line[i:], float)
192 continue
193 rowdata = []
194 rowmask = []
195 n = len(line)
196 for i in range(n):
197 word = line[i]
198 if i in cols:
199 if cols[i] == "GENEID":
200 self.geneid.append(word)
201 if cols[i] == "NAME":
202 self.genename.append(word)
203 if cols[i] == "GWEIGHT":
204 self.gweight.append(float(word))
205 if cols[i] == "GORDER":
206 self.gorder.append(float(word))
207 continue
208 if not word:
209 rowdata.append(0.0)
210 rowmask.append(0)
211 needmask = 1
212 else:
213 rowdata.append(float(word))
214 rowmask.append(1)
215 self.data.append(rowdata)
216 self.mask.append(rowmask)
217 self.data = numpy.array(self.data)
218 if needmask:
219 self.mask = numpy.array(self.mask, int)
220 else:
221 self.mask = None
222 if self.gweight:
223 self.gweight = numpy.array(self.gweight)
224 if self.gorder:
225 self.gorder = numpy.array(self.gorder)
226
227 - def treecluster(self, transpose=0, method='m', dist='e'):
228 """Apply hierarchical clustering and return a Tree object.
229
230 The pairwise single, complete, centroid, and average linkage hierarchical
231 clustering methods are available.
232
233 transpose: if equal to 0, genes (rows) are clustered;
234 if equal to 1, microarrays (columns) are clustered.
235 dist : specifies the distance function to be used:
236 dist=='e': Euclidean distance
237 dist=='b': City Block distance
238 dist=='c': Pearson correlation
239 dist=='a': absolute value of the correlation
240 dist=='u': uncentered correlation
241 dist=='x': absolute uncentered correlation
242 dist=='s': Spearman's rank correlation
243 dist=='k': Kendall's tau
244 method : specifies which linkage method is used:
245 method=='s': Single pairwise linkage
246 method=='m': Complete (maximum) pairwise linkage (default)
247 method=='c': Centroid linkage
248 method=='a': Average pairwise linkage
249
250 See the description of the Tree class for more information about the Tree
251 object returned by this method.
252
253 """
254 if transpose == 0:
255 weight = self.eweight
256 else:
257 weight = self.gweight
258 return treecluster(self.data, self.mask, weight, transpose, method,
259 dist)
260
261 - def kcluster(self, nclusters=2, transpose=0, npass=1, method='a', dist='e',
262 initialid=None):
263 """Apply k-means or k-median clustering.
264
265 This method returns a tuple (clusterid, error, nfound).
266
267 nclusters: number of clusters (the 'k' in k-means)
268 transpose: if equal to 0, genes (rows) are clustered;
269 if equal to 1, microarrays (columns) are clustered.
270 npass : number of times the k-means clustering algorithm is
271 performed, each time with a different (random) initial
272 condition.
273 method : specifies how the center of a cluster is found:
274 method=='a': arithmetic mean
275 method=='m': median
276 dist : specifies the distance function to be used:
277 dist=='e': Euclidean distance
278 dist=='b': City Block distance
279 dist=='c': Pearson correlation
280 dist=='a': absolute value of the correlation
281 dist=='u': uncentered correlation
282 dist=='x': absolute uncentered correlation
283 dist=='s': Spearman's rank correlation
284 dist=='k': Kendall's tau
285 initialid: the initial clustering from which the algorithm should start.
286 If initialid is None, the routine carries out npass
287 repetitions of the EM algorithm, each time starting from a
288 different random initial clustering. If initialid is given,
289 the routine carries out the EM algorithm only once, starting
290 from the given initial clustering and without randomizing the
291 order in which items are assigned to clusters (i.e., using
292 the same order as in the data matrix). In that case, the
293 k-means algorithm is fully deterministic.
294
295 Return values:
296 clusterid: array containing the number of the cluster to which each
297 gene/microarray was assigned in the best k-means clustering
298 solution that was found in the npass runs;
299 error: the within-cluster sum of distances for the returned k-means
300 clustering solution;
301 nfound: the number of times this solution was found.
302
303 """
304
305 if transpose == 0:
306 weight = self.eweight
307 else:
308 weight = self.gweight
309 return kcluster(self.data, nclusters, self.mask, weight, transpose,
310 npass, method, dist, initialid)
311
312 - def somcluster(self, transpose=0, nxgrid=2, nygrid=1, inittau=0.02,
313 niter=1, dist='e'):
314 """Calculate a self-organizing map on a rectangular grid.
315
316 The somcluster method returns a tuple (clusterid, celldata).
317
318 transpose: if equal to 0, genes (rows) are clustered;
319 if equal to 1, microarrays (columns) are clustered.
320 nxgrid : the horizontal dimension of the rectangular SOM map
321 nygrid : the vertical dimension of the rectangular SOM map
322 inittau : the initial value of tau (the neighborbood function)
323 niter : the number of iterations
324 dist : specifies the distance function to be used:
325 dist=='e': Euclidean distance
326 dist=='b': City Block distance
327 dist=='c': Pearson correlation
328 dist=='a': absolute value of the correlation
329 dist=='u': uncentered correlation
330 dist=='x': absolute uncentered correlation
331 dist=='s': Spearman's rank correlation
332 dist=='k': Kendall's tau
333
334 Return values:
335 clusterid: array with two columns, while the number of rows is equal to
336 the number of genes or the number of microarrays depending on
337 whether genes or microarrays are being clustered. Each row in
338 the array contains the x and y coordinates of the cell in the
339 rectangular SOM grid to which the gene or microarray was
340 assigned.
341 celldata: an array with dimensions (nxgrid, nygrid, number of
342 microarrays) if genes are being clustered, or (nxgrid,
343 nygrid, number of genes) if microarrays are being clustered.
344 Each element [ix][iy] of this array is a 1D vector containing
345 the gene expression data for the centroid of the cluster in
346 the SOM grid cell with coordinates (ix, iy).
347
348 """
349
350 if transpose == 0:
351 weight = self.eweight
352 else:
353 weight = self.gweight
354 return somcluster(self.data, self.mask, weight, transpose,
355 nxgrid, nygrid, inittau, niter, dist)
356
358 """Calculate the cluster centroids and return a tuple (cdata, cmask).
359
360 The centroid is defined as either the mean or the median over all elements
361 for each dimension.
362
363 data : nrows x ncolumns array containing the expression data
364 mask : nrows x ncolumns array of integers, showing which data are
365 missing. If mask[i][j]==0, then data[i][j] is missing.
366 transpose: if equal to 0, gene (row) clusters are considered;
367 if equal to 1, microarray (column) clusters are considered.
368 clusterid: array containing the cluster number for each gene or
369 microarray. The cluster number should be non-negative.
370 method : specifies how the centroid is calculated:
371 method=='a': arithmetic mean over each dimension. (default)
372 method=='m': median over each dimension.
373
374 Return values:
375 cdata : 2D array containing the cluster centroids. If transpose==0,
376 then the dimensions of cdata are nclusters x ncolumns. If
377 transpose==1, then the dimensions of cdata are
378 nrows x nclusters.
379 cmask : 2D array of integers describing which elements in cdata,
380 if any, are missing.
381
382 """
383 return clustercentroids(self.data, self.mask, clusterid, method,
384 transpose)
385
386 - def clusterdistance(self, index1=[0], index2=[0], method='a', dist='e',
387 transpose=0):
388 """Calculate the distance between two clusters.
389
390 index1 : 1D array identifying which genes/microarrays belong to the
391 first cluster. If the cluster contains only one gene, then
392 index1 can also be written as a single integer.
393 index2 : 1D array identifying which genes/microarrays belong to the
394 second cluster. If the cluster contains only one gene, then
395 index2 can also be written as a single integer.
396 transpose: if equal to 0, genes (rows) are clustered;
397 if equal to 1, microarrays (columns) are clustered.
398 dist : specifies the distance function to be used:
399 dist=='e': Euclidean distance
400 dist=='b': City Block distance
401 dist=='c': Pearson correlation
402 dist=='a': absolute value of the correlation
403 dist=='u': uncentered correlation
404 dist=='x': absolute uncentered correlation
405 dist=='s': Spearman's rank correlation
406 dist=='k': Kendall's tau
407 method : specifies how the distance between two clusters is defined:
408 method=='a': the distance between the arithmetic means of the
409 two clusters
410 method=='m': the distance between the medians of the two
411 clusters
412 method=='s': the smallest pairwise distance between members
413 of the two clusters
414 method=='x': the largest pairwise distance between members of
415 the two clusters
416 method=='v': average of the pairwise distances between
417 members of the clusters
418 transpose: if equal to 0: clusters of genes (rows) are considered;
419 if equal to 1: clusters of microarrays (columns) are
420 considered.
421
422 """
423
424 if transpose == 0:
425 weight = self.eweight
426 else:
427 weight = self.gweight
428 return clusterdistance(self.data, self.mask, weight,
429 index1, index2, method, dist, transpose)
430
432 """Calculate the distance matrix and return it as a list of arrays
433
434 transpose: if equal to 0: calculate the distances between genes (rows);
435 if equal to 1: calculate the distances beteeen microarrays
436 (columns).
437 dist : specifies the distance function to be used:
438 dist=='e': Euclidean distance
439 dist=='b': City Block distance
440 dist=='c': Pearson correlation
441 dist=='a': absolute value of the correlation
442 dist=='u': uncentered correlation
443 dist=='x': absolute uncentered correlation
444 dist=='s': Spearman's rank correlation
445 dist=='k': Kendall's tau
446
447 Return value:
448 The distance matrix is returned as a list of 1D arrays containing the
449 distance matrix between the gene expression data. The number of columns
450 in each row is equal to the row number. Hence, the first row has zero
451 elements. An example of the return value is
452 matrix = [[],
453 array([1.]),
454 array([7., 3.]),
455 array([4., 2., 6.])]
456 This corresponds to the distance matrix
457 [0., 1., 7., 4.]
458 [1., 0., 3., 2.]
459 [7., 3., 0., 6.]
460 [4., 2., 6., 0.]
461
462 """
463 if transpose == 0:
464 weight = self.eweight
465 else:
466 weight = self.gweight
467 return distancematrix(self.data, self.mask, weight, transpose, dist)
468
469 - def save(self, jobname, geneclusters=None, expclusters=None):
470 """Save the clustering results.
471
472 The saved files follow the convention for the Java TreeView program,
473 which can therefore be used to view the clustering result.
474
475 Arguments:
476 jobname: The base name of the files to be saved. The filenames are
477 jobname.cdt, jobname.gtr, and jobname.atr for
478 hierarchical clustering, and jobname-K*.cdt,
479 jobname-K*.kgg, jobname-K*.kag for k-means clustering
480 results.
481 geneclusters=None: For hierarchical clustering results, geneclusters
482 is a Tree object as returned by the treecluster method.
483 For k-means clustering results, geneclusters is a vector
484 containing ngenes integers, describing to which cluster a
485 given gene belongs. This vector can be calculated by
486 kcluster.
487 expclusters=None: For hierarchical clustering results, expclusters
488 is a Tree object as returned by the treecluster method.
489 For k-means clustering results, expclusters is a vector
490 containing nexps integers, describing to which cluster a
491 given experimental condition belongs. This vector can be
492 calculated by kcluster.
493
494 """
495 (ngenes,nexps) = numpy.shape(self.data)
496 if self.gorder == None:
497 gorder = numpy.arange(ngenes)
498 else:
499 gorder = self.gorder
500 if self.eorder == None:
501 eorder = numpy.arange(nexps)
502 else:
503 eorder = self.eorder
504 if geneclusters!=None and expclusters!=None and \
505 type(geneclusters) != type(expclusters):
506 raise ValueError("found one k-means and one hierarchical "
507 + "clustering solution in geneclusters and "
508 + "expclusters")
509 gid = 0
510 aid = 0
511 filename = jobname
512 postfix = ""
513 if type(geneclusters) == Tree:
514
515 geneindex = _savetree(jobname, geneclusters, gorder, 0)
516 gid = 1
517 elif geneclusters!=None:
518
519 filename = jobname + "_K"
520 k = max(geneclusters+1)
521 kggfilename = "%s_K_G%d.kgg" % (jobname, k)
522 geneindex = self._savekmeans(kggfilename, geneclusters, gorder, 0)
523 postfix = "_G%d" % k
524 else:
525 geneindex = numpy.argsort(gorder)
526 if type(expclusters) == Tree:
527
528 expindex = _savetree(jobname, expclusters, eorder, 1)
529 aid = 1
530 elif expclusters!=None:
531
532 filename = jobname + "_K"
533 k = max(expclusters+1)
534 kagfilename = "%s_K_A%d.kag" % (jobname, k)
535 expindex = self._savekmeans(kagfilename, expclusters, eorder, 1)
536 postfix += "_A%d" % k
537 else:
538 expindex = numpy.argsort(eorder)
539 filename = filename + postfix
540 self._savedata(filename,gid,aid,geneindex,expindex)
541
542 - def _savekmeans(self, filename, clusterids, order, transpose):
543
544 if transpose == 0:
545 label = self.uniqid
546 names = self.geneid
547 else:
548 label = "ARRAY"
549 names = self.expid
550 try:
551 outputfile = open(filename, "w")
552 except IOError:
553 raise IOError("Unable to open output file")
554 outputfile.write(label + "\tGROUP\n")
555 index = numpy.argsort(order)
556 n = len(names)
557 sortedindex = numpy.zeros(n, int)
558 counter = 0
559 cluster = 0
560 while counter < n:
561 for j in index:
562 if clusterids[j] == cluster:
563 outputfile.write("%s\t%s\n" % (names[j], cluster))
564 sortedindex[counter] = j
565 counter += 1
566 cluster += 1
567 outputfile.close()
568 return sortedindex
569
570 - def _savedata(self, jobname, gid, aid, geneindex, expindex):
571
572 if self.genename == None:
573 genename = self.geneid
574 else:
575 genename = self.genename
576 (ngenes, nexps) = numpy.shape(self.data)
577 try:
578 outputfile = open(jobname+'.cdt', 'w')
579 except IOError:
580 raise IOError("Unable to open output file")
581 if self.mask!=None:
582 mask = self.mask
583 else:
584 mask = numpy.ones((ngenes,nexps), int)
585 if self.gweight!=None:
586 gweight = self.gweight
587 else:
588 gweight = numpy.ones(ngenes)
589 if self.eweight!=None:
590 eweight = self.eweight
591 else:
592 eweight = numpy.ones(nexps)
593 if gid:
594 outputfile.write('GID\t')
595 outputfile.write(self.uniqid)
596 outputfile.write('\tNAME\tGWEIGHT')
597
598 for j in expindex:
599 outputfile.write('\t%s' % self.expid[j])
600 outputfile.write('\n')
601 if aid:
602 outputfile.write("AID")
603 if gid:
604 outputfile.write('\t')
605 outputfile.write("\t\t")
606 for j in expindex:
607 outputfile.write('\tARRY%dX' % j)
608 outputfile.write('\n')
609 outputfile.write('EWEIGHT')
610 if gid:
611 outputfile.write('\t')
612 outputfile.write('\t\t')
613 for j in expindex:
614 outputfile.write('\t%f' % eweight[j])
615 outputfile.write('\n')
616 for i in geneindex:
617 if gid:
618 outputfile.write('GENE%dX\t' % i)
619 outputfile.write("%s\t%s\t%f" %
620 (self.geneid[i], genename[i], gweight[i]))
621 for j in expindex:
622 outputfile.write('\t')
623 if mask[i,j]:
624 outputfile.write(str(self.data[i,j]))
625 outputfile.write('\n')
626 outputfile.close()
627
628
630 """Read gene expression data from the file handle and return a Record.
631
632 The file should be in the file format defined for Michael Eisen's
633 Cluster/TreeView program.
634
635 """
636 return Record(handle)
637