Package Bio :: Package Cluster
[hide private]
[frames] | no frames]

Source Code for Package Bio.Cluster

  1  import numpy 
  2   
  3  from Bio.Cluster.cluster import * 
  4   
5 -def _treesort(order, nodeorder, nodecounts, tree):
6 # Find the order of the nodes consistent with the hierarchical clustering 7 # tree, taking into account the preferred order of nodes. 8 nNodes = len(tree) 9 nElements = nNodes + 1 10 neworder = numpy.zeros(nElements) 11 clusterids = numpy.arange(nElements) 12 for i in range(nNodes): 13 i1 = tree[i].left 14 i2 = tree[i].right 15 if i1 < 0: 16 order1 = nodeorder[-i1-1] 17 count1 = nodecounts[-i1-1] 18 else: 19 order1 = order[i1] 20 count1 = 1 21 if i2 < 0: 22 order2 = nodeorder[-i2-1] 23 count2 = nodecounts[-i2-1] 24 else: 25 order2 = order[i2] 26 count2 = 1 27 # If order1 and order2 are equal, their order is determined 28 # by the order in which they were clustered 29 if i1 < i2: 30 if order1 < order2: 31 increase = count1 32 else: 33 increase = count2 34 for j in range(nElements): 35 clusterid = clusterids[j] 36 if clusterid == i1 and order1 >= order2: 37 neworder[j] += increase 38 if clusterid == i2 and order1 < order2: 39 neworder[j] += increase 40 if clusterid == i1 or clusterid == i2: 41 clusterids[j] = -i-1 42 else: 43 if order1 <= order2: 44 increase = count1 45 else: 46 increase = count2 47 for j in range(nElements): 48 clusterid = clusterids[j] 49 if clusterid == i1 and order1 > order2: 50 neworder[j] += increase 51 if clusterid == i2 and order1 <= order2: 52 neworder[j] += increase 53 if clusterid == i1 or clusterid == i2: 54 clusterids[j] = -i-1 55 return numpy.argsort(neworder)
56 57
58 -def _savetree(jobname, tree, order, transpose):
59 # Save the hierarchical clustering solution given by the tree, following 60 # the specified order, in a file whose name is based on jobname. 61 if transpose == 0: 62 extension = ".gtr" 63 keyword = "GENE" 64 else: 65 extension = ".atr" 66 keyword = "ARRY" 67 nnodes = len(tree) 68 outputfile = open(jobname+extension, "w") 69 nodeindex = 0 70 nodeID = [''] * nnodes 71 nodecounts = numpy.zeros(nnodes, int) 72 nodeorder = numpy.zeros(nnodes) 73 nodedist = numpy.array([node.distance for node in tree]) 74 for nodeindex in range(nnodes): 75 min1 = tree[nodeindex].left 76 min2 = tree[nodeindex].right 77 nodeID[nodeindex] = "NODE%dX" % (nodeindex+1) 78 outputfile.write(nodeID[nodeindex]) 79 outputfile.write("\t") 80 if min1 < 0: 81 index1 = -min1-1 82 order1 = nodeorder[index1] 83 counts1 = nodecounts[index1] 84 outputfile.write(nodeID[index1]+"\t") 85 nodedist[nodeindex] = max(nodedist[nodeindex],nodedist[index1]) 86 else: 87 order1 = order[min1] 88 counts1 = 1 89 outputfile.write("%s%dX\t" % (keyword, min1)) 90 if min2 < 0: 91 index2 = -min2-1 92 order2 = nodeorder[index2] 93 counts2 = nodecounts[index2] 94 outputfile.write(nodeID[index2]+"\t") 95 nodedist[nodeindex] = max(nodedist[nodeindex],nodedist[index2]) 96 else: 97 order2 = order[min2] 98 counts2 = 1 99 outputfile.write("%s%dX\t" % (keyword, min2)) 100 outputfile.write(str(1.0-nodedist[nodeindex])) 101 outputfile.write("\n") 102 counts = counts1 + counts2 103 nodecounts[nodeindex] = counts 104 nodeorder[nodeindex] = (counts1*order1+counts2*order2) / counts 105 outputfile.close() 106 # Now set up order based on the tree structure 107 index = _treesort(order, nodeorder, nodecounts, tree) 108 return index
109 110
111 -class Record(object):
112 """Store gene expression data. 113 114 A Record stores the gene expression data and related information contained 115 in a data file following the file format defined for Michael Eisen's 116 Cluster/TreeView program. A Record has the following members: 117 118 data: a matrix containing the gene expression data 119 mask: a matrix containing only 1's and 0's, denoting which values 120 are present (1) or missing (0). If all elements of mask are 121 one (no missing data), then mask is set to None. 122 geneid: a list containing a unique identifier for each gene 123 (e.g., ORF name) 124 genename: a list containing an additional description for each gene 125 (e.g., gene name) 126 gweight: the weight to be used for each gene when calculating the 127 distance 128 gorder: an array of real numbers indicating the preferred order of the 129 genes in the output file 130 expid: a list containing a unique identifier for each experimental 131 condition 132 eweight: the weight to be used for each experimental condition when 133 calculating the distance 134 eorder: an array of real numbers indication the preferred order in the 135 output file of the experimental conditions 136 uniqid: the string that was used instead of UNIQID in the input file. 137 138 """ 139
140 - def __init__(self, handle=None):
141 """Read gene expression data from the file handle and return a Record. 142 143 The file should be in the format defined for Michael Eisen's 144 Cluster/TreeView program. 145 146 """ 147 self.data = None 148 self.mask = None 149 self.geneid = None 150 self.genename = None 151 self.gweight = None 152 self.gorder = None 153 self.expid = None 154 self.eweight = None 155 self.eorder = None 156 self.uniqid = None 157 if not handle: 158 return 159 line = handle.readline().strip("\r\n").split("\t") 160 n = len(line) 161 self.uniqid = line[0] 162 self.expid = [] 163 cols = {0: "GENEID"} 164 for word in line[1:]: 165 if word == "NAME": 166 cols[line.index(word)] = word 167 self.genename = [] 168 elif word == "GWEIGHT": 169 cols[line.index(word)] = word 170 self.gweight = [] 171 elif word=="GORDER": 172 cols[line.index(word)] = word 173 self.gorder = [] 174 else: 175 self.expid.append(word) 176 self.geneid = [] 177 self.data = [] 178 self.mask = [] 179 needmask = 0 180 for line in handle: 181 line = line.strip("\r\n").split("\t") 182 if len(line) != n: 183 raise ValueError("Line with %d columns found (expected %d)" % 184 (len(line), n)) 185 if line[0] == "EWEIGHT": 186 i = max(cols) + 1 187 self.eweight = numpy.array(line[i:], float) 188 continue 189 if line[0] == "EORDER": 190 i = max(cols) + 1 191 self.eorder = numpy.array(line[i:], float) 192 continue 193 rowdata = [] 194 rowmask = [] 195 n = len(line) 196 for i in range(n): 197 word = line[i] 198 if i in cols: 199 if cols[i] == "GENEID": 200 self.geneid.append(word) 201 if cols[i] == "NAME": 202 self.genename.append(word) 203 if cols[i] == "GWEIGHT": 204 self.gweight.append(float(word)) 205 if cols[i] == "GORDER": 206 self.gorder.append(float(word)) 207 continue 208 if not word: 209 rowdata.append(0.0) 210 rowmask.append(0) 211 needmask = 1 212 else: 213 rowdata.append(float(word)) 214 rowmask.append(1) 215 self.data.append(rowdata) 216 self.mask.append(rowmask) 217 self.data = numpy.array(self.data) 218 if needmask: 219 self.mask = numpy.array(self.mask, int) 220 else: 221 self.mask = None 222 if self.gweight: 223 self.gweight = numpy.array(self.gweight) 224 if self.gorder: 225 self.gorder = numpy.array(self.gorder)
226
227 - def treecluster(self, transpose=0, method='m', dist='e'):
228 """Apply hierarchical clustering and return a Tree object. 229 230 The pairwise single, complete, centroid, and average linkage hierarchical 231 clustering methods are available. 232 233 transpose: if equal to 0, genes (rows) are clustered; 234 if equal to 1, microarrays (columns) are clustered. 235 dist : specifies the distance function to be used: 236 dist=='e': Euclidean distance 237 dist=='b': City Block distance 238 dist=='c': Pearson correlation 239 dist=='a': absolute value of the correlation 240 dist=='u': uncentered correlation 241 dist=='x': absolute uncentered correlation 242 dist=='s': Spearman's rank correlation 243 dist=='k': Kendall's tau 244 method : specifies which linkage method is used: 245 method=='s': Single pairwise linkage 246 method=='m': Complete (maximum) pairwise linkage (default) 247 method=='c': Centroid linkage 248 method=='a': Average pairwise linkage 249 250 See the description of the Tree class for more information about the Tree 251 object returned by this method. 252 253 """ 254 if transpose == 0: 255 weight = self.eweight 256 else: 257 weight = self.gweight 258 return treecluster(self.data, self.mask, weight, transpose, method, 259 dist)
260
261 - def kcluster(self, nclusters=2, transpose=0, npass=1, method='a', dist='e', 262 initialid=None):
263 """Apply k-means or k-median clustering. 264 265 This method returns a tuple (clusterid, error, nfound). 266 267 nclusters: number of clusters (the 'k' in k-means) 268 transpose: if equal to 0, genes (rows) are clustered; 269 if equal to 1, microarrays (columns) are clustered. 270 npass : number of times the k-means clustering algorithm is 271 performed, each time with a different (random) initial 272 condition. 273 method : specifies how the center of a cluster is found: 274 method=='a': arithmetic mean 275 method=='m': median 276 dist : specifies the distance function to be used: 277 dist=='e': Euclidean distance 278 dist=='b': City Block distance 279 dist=='c': Pearson correlation 280 dist=='a': absolute value of the correlation 281 dist=='u': uncentered correlation 282 dist=='x': absolute uncentered correlation 283 dist=='s': Spearman's rank correlation 284 dist=='k': Kendall's tau 285 initialid: the initial clustering from which the algorithm should start. 286 If initialid is None, the routine carries out npass 287 repetitions of the EM algorithm, each time starting from a 288 different random initial clustering. If initialid is given, 289 the routine carries out the EM algorithm only once, starting 290 from the given initial clustering and without randomizing the 291 order in which items are assigned to clusters (i.e., using 292 the same order as in the data matrix). In that case, the 293 k-means algorithm is fully deterministic. 294 295 Return values: 296 clusterid: array containing the number of the cluster to which each 297 gene/microarray was assigned in the best k-means clustering 298 solution that was found in the npass runs; 299 error: the within-cluster sum of distances for the returned k-means 300 clustering solution; 301 nfound: the number of times this solution was found. 302 303 """ 304 305 if transpose == 0: 306 weight = self.eweight 307 else: 308 weight = self.gweight 309 return kcluster(self.data, nclusters, self.mask, weight, transpose, 310 npass, method, dist, initialid)
311
312 - def somcluster(self, transpose=0, nxgrid=2, nygrid=1, inittau=0.02, 313 niter=1, dist='e'):
314 """Calculate a self-organizing map on a rectangular grid. 315 316 The somcluster method returns a tuple (clusterid, celldata). 317 318 transpose: if equal to 0, genes (rows) are clustered; 319 if equal to 1, microarrays (columns) are clustered. 320 nxgrid : the horizontal dimension of the rectangular SOM map 321 nygrid : the vertical dimension of the rectangular SOM map 322 inittau : the initial value of tau (the neighborbood function) 323 niter : the number of iterations 324 dist : specifies the distance function to be used: 325 dist=='e': Euclidean distance 326 dist=='b': City Block distance 327 dist=='c': Pearson correlation 328 dist=='a': absolute value of the correlation 329 dist=='u': uncentered correlation 330 dist=='x': absolute uncentered correlation 331 dist=='s': Spearman's rank correlation 332 dist=='k': Kendall's tau 333 334 Return values: 335 clusterid: array with two columns, while the number of rows is equal to 336 the number of genes or the number of microarrays depending on 337 whether genes or microarrays are being clustered. Each row in 338 the array contains the x and y coordinates of the cell in the 339 rectangular SOM grid to which the gene or microarray was 340 assigned. 341 celldata: an array with dimensions (nxgrid, nygrid, number of 342 microarrays) if genes are being clustered, or (nxgrid, 343 nygrid, number of genes) if microarrays are being clustered. 344 Each element [ix][iy] of this array is a 1D vector containing 345 the gene expression data for the centroid of the cluster in 346 the SOM grid cell with coordinates (ix, iy). 347 348 """ 349 350 if transpose == 0: 351 weight = self.eweight 352 else: 353 weight = self.gweight 354 return somcluster(self.data, self.mask, weight, transpose, 355 nxgrid, nygrid, inittau, niter, dist)
356
357 - def clustercentroids(self, clusterid=None, method='a', transpose=0):
358 """Calculate the cluster centroids and return a tuple (cdata, cmask). 359 360 The centroid is defined as either the mean or the median over all elements 361 for each dimension. 362 363 data : nrows x ncolumns array containing the expression data 364 mask : nrows x ncolumns array of integers, showing which data are 365 missing. If mask[i][j]==0, then data[i][j] is missing. 366 transpose: if equal to 0, gene (row) clusters are considered; 367 if equal to 1, microarray (column) clusters are considered. 368 clusterid: array containing the cluster number for each gene or 369 microarray. The cluster number should be non-negative. 370 method : specifies how the centroid is calculated: 371 method=='a': arithmetic mean over each dimension. (default) 372 method=='m': median over each dimension. 373 374 Return values: 375 cdata : 2D array containing the cluster centroids. If transpose==0, 376 then the dimensions of cdata are nclusters x ncolumns. If 377 transpose==1, then the dimensions of cdata are 378 nrows x nclusters. 379 cmask : 2D array of integers describing which elements in cdata, 380 if any, are missing. 381 382 """ 383 return clustercentroids(self.data, self.mask, clusterid, method, 384 transpose)
385
386 - def clusterdistance(self, index1=[0], index2=[0], method='a', dist='e', 387 transpose=0):
388 """Calculate the distance between two clusters. 389 390 index1 : 1D array identifying which genes/microarrays belong to the 391 first cluster. If the cluster contains only one gene, then 392 index1 can also be written as a single integer. 393 index2 : 1D array identifying which genes/microarrays belong to the 394 second cluster. If the cluster contains only one gene, then 395 index2 can also be written as a single integer. 396 transpose: if equal to 0, genes (rows) are clustered; 397 if equal to 1, microarrays (columns) are clustered. 398 dist : specifies the distance function to be used: 399 dist=='e': Euclidean distance 400 dist=='b': City Block distance 401 dist=='c': Pearson correlation 402 dist=='a': absolute value of the correlation 403 dist=='u': uncentered correlation 404 dist=='x': absolute uncentered correlation 405 dist=='s': Spearman's rank correlation 406 dist=='k': Kendall's tau 407 method : specifies how the distance between two clusters is defined: 408 method=='a': the distance between the arithmetic means of the 409 two clusters 410 method=='m': the distance between the medians of the two 411 clusters 412 method=='s': the smallest pairwise distance between members 413 of the two clusters 414 method=='x': the largest pairwise distance between members of 415 the two clusters 416 method=='v': average of the pairwise distances between 417 members of the clusters 418 transpose: if equal to 0: clusters of genes (rows) are considered; 419 if equal to 1: clusters of microarrays (columns) are 420 considered. 421 422 """ 423 424 if transpose == 0: 425 weight = self.eweight 426 else: 427 weight = self.gweight 428 return clusterdistance(self.data, self.mask, weight, 429 index1, index2, method, dist, transpose)
430
431 - def distancematrix(self, transpose=0, dist='e'):
432 """Calculate the distance matrix and return it as a list of arrays 433 434 transpose: if equal to 0: calculate the distances between genes (rows); 435 if equal to 1: calculate the distances beteeen microarrays 436 (columns). 437 dist : specifies the distance function to be used: 438 dist=='e': Euclidean distance 439 dist=='b': City Block distance 440 dist=='c': Pearson correlation 441 dist=='a': absolute value of the correlation 442 dist=='u': uncentered correlation 443 dist=='x': absolute uncentered correlation 444 dist=='s': Spearman's rank correlation 445 dist=='k': Kendall's tau 446 447 Return value: 448 The distance matrix is returned as a list of 1D arrays containing the 449 distance matrix between the gene expression data. The number of columns 450 in each row is equal to the row number. Hence, the first row has zero 451 elements. An example of the return value is 452 matrix = [[], 453 array([1.]), 454 array([7., 3.]), 455 array([4., 2., 6.])] 456 This corresponds to the distance matrix 457 [0., 1., 7., 4.] 458 [1., 0., 3., 2.] 459 [7., 3., 0., 6.] 460 [4., 2., 6., 0.] 461 462 """ 463 if transpose == 0: 464 weight = self.eweight 465 else: 466 weight = self.gweight 467 return distancematrix(self.data, self.mask, weight, transpose, dist)
468
469 - def save(self, jobname, geneclusters=None, expclusters=None):
470 """Save the clustering results. 471 472 The saved files follow the convention for the Java TreeView program, 473 which can therefore be used to view the clustering result. 474 475 Arguments: 476 jobname: The base name of the files to be saved. The filenames are 477 jobname.cdt, jobname.gtr, and jobname.atr for 478 hierarchical clustering, and jobname-K*.cdt, 479 jobname-K*.kgg, jobname-K*.kag for k-means clustering 480 results. 481 geneclusters=None: For hierarchical clustering results, geneclusters 482 is a Tree object as returned by the treecluster method. 483 For k-means clustering results, geneclusters is a vector 484 containing ngenes integers, describing to which cluster a 485 given gene belongs. This vector can be calculated by 486 kcluster. 487 expclusters=None: For hierarchical clustering results, expclusters 488 is a Tree object as returned by the treecluster method. 489 For k-means clustering results, expclusters is a vector 490 containing nexps integers, describing to which cluster a 491 given experimental condition belongs. This vector can be 492 calculated by kcluster. 493 494 """ 495 (ngenes,nexps) = numpy.shape(self.data) 496 if self.gorder == None: 497 gorder = numpy.arange(ngenes) 498 else: 499 gorder = self.gorder 500 if self.eorder == None: 501 eorder = numpy.arange(nexps) 502 else: 503 eorder = self.eorder 504 if geneclusters!=None and expclusters!=None and \ 505 type(geneclusters) != type(expclusters): 506 raise ValueError("found one k-means and one hierarchical " 507 + "clustering solution in geneclusters and " 508 + "expclusters") 509 gid = 0 510 aid = 0 511 filename = jobname 512 postfix = "" 513 if type(geneclusters) == Tree: 514 # This is a hierarchical clustering result. 515 geneindex = _savetree(jobname, geneclusters, gorder, 0) 516 gid = 1 517 elif geneclusters!=None: 518 # This is a k-means clustering result. 519 filename = jobname + "_K" 520 k = max(geneclusters+1) 521 kggfilename = "%s_K_G%d.kgg" % (jobname, k) 522 geneindex = self._savekmeans(kggfilename, geneclusters, gorder, 0) 523 postfix = "_G%d" % k 524 else: 525 geneindex = numpy.argsort(gorder) 526 if type(expclusters) == Tree: 527 # This is a hierarchical clustering result. 528 expindex = _savetree(jobname, expclusters, eorder, 1) 529 aid = 1 530 elif expclusters!=None: 531 # This is a k-means clustering result. 532 filename = jobname + "_K" 533 k = max(expclusters+1) 534 kagfilename = "%s_K_A%d.kag" % (jobname, k) 535 expindex = self._savekmeans(kagfilename, expclusters, eorder, 1) 536 postfix += "_A%d" % k 537 else: 538 expindex = numpy.argsort(eorder) 539 filename = filename + postfix 540 self._savedata(filename,gid,aid,geneindex,expindex)
541
542 - def _savekmeans(self, filename, clusterids, order, transpose):
543 # Save a k-means clustering solution 544 if transpose == 0: 545 label = self.uniqid 546 names = self.geneid 547 else: 548 label = "ARRAY" 549 names = self.expid 550 try: 551 outputfile = open(filename, "w") 552 except IOError: 553 raise IOError("Unable to open output file") 554 outputfile.write(label + "\tGROUP\n") 555 index = numpy.argsort(order) 556 n = len(names) 557 sortedindex = numpy.zeros(n, int) 558 counter = 0 559 cluster = 0 560 while counter < n: 561 for j in index: 562 if clusterids[j] == cluster: 563 outputfile.write("%s\t%s\n" % (names[j], cluster)) 564 sortedindex[counter] = j 565 counter += 1 566 cluster += 1 567 outputfile.close() 568 return sortedindex
569
570 - def _savedata(self, jobname, gid, aid, geneindex, expindex):
571 # Save the clustered data. 572 if self.genename == None: 573 genename = self.geneid 574 else: 575 genename = self.genename 576 (ngenes, nexps) = numpy.shape(self.data) 577 try: 578 outputfile = open(jobname+'.cdt', 'w') 579 except IOError: 580 raise IOError("Unable to open output file") 581 if self.mask!=None: 582 mask = self.mask 583 else: 584 mask = numpy.ones((ngenes,nexps), int) 585 if self.gweight!=None: 586 gweight = self.gweight 587 else: 588 gweight = numpy.ones(ngenes) 589 if self.eweight!=None: 590 eweight = self.eweight 591 else: 592 eweight = numpy.ones(nexps) 593 if gid: 594 outputfile.write('GID\t') 595 outputfile.write(self.uniqid) 596 outputfile.write('\tNAME\tGWEIGHT') 597 # Now add headers for data columns. 598 for j in expindex: 599 outputfile.write('\t%s' % self.expid[j]) 600 outputfile.write('\n') 601 if aid: 602 outputfile.write("AID") 603 if gid: 604 outputfile.write('\t') 605 outputfile.write("\t\t") 606 for j in expindex: 607 outputfile.write('\tARRY%dX' % j) 608 outputfile.write('\n') 609 outputfile.write('EWEIGHT') 610 if gid: 611 outputfile.write('\t') 612 outputfile.write('\t\t') 613 for j in expindex: 614 outputfile.write('\t%f' % eweight[j]) 615 outputfile.write('\n') 616 for i in geneindex: 617 if gid: 618 outputfile.write('GENE%dX\t' % i) 619 outputfile.write("%s\t%s\t%f" % 620 (self.geneid[i], genename[i], gweight[i])) 621 for j in expindex: 622 outputfile.write('\t') 623 if mask[i,j]: 624 outputfile.write(str(self.data[i,j])) 625 outputfile.write('\n') 626 outputfile.close()
627 628
629 -def read(handle):
630 """Read gene expression data from the file handle and return a Record. 631 632 The file should be in the file format defined for Michael Eisen's 633 Cluster/TreeView program. 634 635 """ 636 return Record(handle)
637