Package Bio :: Package Align :: Module Generic
[hide private]
[frames] | no frames]

Source Code for Module Bio.Align.Generic

  1  # Copyright 2000-2004 Brad Chapman. 
  2  # Copyright 2001 Iddo Friedberg. 
  3  # Copyright 2007-2010 by Peter Cock. 
  4  # All rights reserved. 
  5  # This code is part of the Biopython distribution and governed by its 
  6  # license.  Please see the LICENSE file that should have been included 
  7  # as part of this package. 
  8  """ 
  9  Contains classes to deal with generic sequence alignment stuff not 
 10  specific to a particular program or format. 
 11   
 12  Classes: 
 13   - Alignment 
 14  """ 
 15  __docformat__ = "epytext en" #Don't just use plain text in epydoc API pages! 
 16   
 17  # biopython 
 18  from Bio.Seq import Seq 
 19  from Bio.SeqRecord import SeqRecord 
 20  from Bio import Alphabet 
 21   
22 -class Alignment(object):
23 """Represent a set of alignments (DEPRECATED). 24 25 This is a base class to represent alignments, which can be subclassed 26 to deal with an alignment in a specific format. 27 28 With the introduction of the MultipleSeqAlignment class in Bio.Align, 29 this base class is deprecated and is likely to be removed in future 30 releases of Biopython. 31 """
32 - def __init__(self, alphabet):
33 """Initialize a new Alignment object. 34 35 Arguments: 36 - alphabet - The alphabet to use for the sequence objects that are 37 created. This alphabet must be a gapped type. 38 39 e.g. 40 41 >>> from Bio.Alphabet import IUPAC, Gapped 42 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-")) 43 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG") 44 >>> align.add_sequence("Beta", "ACT-CTAGCTAG") 45 >>> align.add_sequence("Gamma", "ACTGCTAGATAG") 46 >>> print align 47 Gapped(IUPACUnambiguousDNA(), '-') alignment with 3 rows and 12 columns 48 ACTGCTAGCTAG Alpha 49 ACT-CTAGCTAG Beta 50 ACTGCTAGATAG Gamma 51 """ 52 import warnings 53 import Bio 54 warnings.warn("With the introduction of the MultipleSeqAlignment class in Bio.Align, this base class is deprecated and is likely to be removed in a future release of Biopython.", Bio.BiopythonDeprecationWarning) 55 if not (isinstance(alphabet, Alphabet.Alphabet) \ 56 or isinstance(alphabet, Alphabet.AlphabetEncoder)): 57 raise ValueError("Invalid alphabet argument") 58 self._alphabet = alphabet 59 # hold everything at a list of SeqRecord objects 60 self._records = []
61
62 - def _str_line(self, record):
63 """Returns a truncated string representation of a SeqRecord (PRIVATE). 64 65 This is a PRIVATE function used by the __str__ method. 66 """ 67 if len(record.seq) <= 50: 68 return "%s %s" % (record.seq, record.id) 69 else: 70 return "%s...%s %s" \ 71 % (record.seq[:44], record.seq[-3:], record.id)
72
73 - def __str__(self):
74 """Returns a multi-line string summary of the alignment. 75 76 This output is intended to be readable, but large alignments are 77 shown truncated. A maximum of 20 rows (sequences) and 50 columns 78 are shown, with the record identifiers. This should fit nicely on a 79 single screen. e.g. 80 81 >>> from Bio.Alphabet import IUPAC, Gapped 82 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-")) 83 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG") 84 >>> align.add_sequence("Beta", "ACT-CTAGCTAG") 85 >>> align.add_sequence("Gamma", "ACTGCTAGATAG") 86 >>> print align 87 Gapped(IUPACUnambiguousDNA(), '-') alignment with 3 rows and 12 columns 88 ACTGCTAGCTAG Alpha 89 ACT-CTAGCTAG Beta 90 ACTGCTAGATAG Gamma 91 92 See also the alignment's format method. 93 """ 94 rows = len(self._records) 95 lines = ["%s alignment with %i rows and %i columns" \ 96 % (str(self._alphabet), rows, self.get_alignment_length())] 97 if rows <= 20: 98 lines.extend([self._str_line(rec) for rec in self._records]) 99 else: 100 lines.extend([self._str_line(rec) for rec in self._records[:18]]) 101 lines.append("...") 102 lines.append(self._str_line(self._records[-1])) 103 return "\n".join(lines)
104
105 - def __repr__(self):
106 """Returns a representation of the object for debugging. 107 108 The representation cannot be used with eval() to recreate the object, 109 which is usually possible with simple python ojects. For example: 110 111 <Bio.Align.Generic.Alignment instance (2 records of length 14, 112 SingleLetterAlphabet()) at a3c184c> 113 114 The hex string is the memory address of the object, see help(id). 115 This provides a simple way to visually distinguish alignments of 116 the same size. 117 """ 118 #A doctest for __repr__ would be nice, but __class__ comes out differently 119 #if run via the __main__ trick. 120 return "<%s instance (%i records of length %i, %s) at %x>" % \ 121 (self.__class__, len(self._records), 122 self.get_alignment_length(), repr(self._alphabet), id(self))
123 #This version is useful for doing eval(repr(alignment)), 124 #but it can be VERY long: 125 #return "%s(%s, %s)" \ 126 # % (self.__class__, repr(self._records), repr(self._alphabet)) 127
128 - def format(self, format):
129 """Returns the alignment as a string in the specified file format. 130 131 The format should be a lower case string supported as an output 132 format by Bio.AlignIO (such as "fasta", "clustal", "phylip", 133 "stockholm", etc), which is used to turn the alignment into a 134 string. 135 136 e.g. 137 138 >>> from Bio.Alphabet import IUPAC, Gapped 139 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-")) 140 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG") 141 >>> align.add_sequence("Beta", "ACT-CTAGCTAG") 142 >>> align.add_sequence("Gamma", "ACTGCTAGATAG") 143 >>> print align.format("fasta") 144 >Alpha 145 ACTGCTAGCTAG 146 >Beta 147 ACT-CTAGCTAG 148 >Gamma 149 ACTGCTAGATAG 150 <BLANKLINE> 151 >>> print align.format("phylip") 152 3 12 153 Alpha ACTGCTAGCT AG 154 Beta ACT-CTAGCT AG 155 Gamma ACTGCTAGAT AG 156 <BLANKLINE> 157 158 For Python 2.6, 3.0 or later see also the built in format() function. 159 """ 160 #See also the __format__ added for Python 2.6 / 3.0, PEP 3101 161 #See also the SeqRecord class and its format() method using Bio.SeqIO 162 return self.__format__(format)
163 164
165 - def __format__(self, format_spec):
166 """Returns the alignment as a string in the specified file format. 167 168 This method supports the python format() function added in 169 Python 2.6/3.0. The format_spec should be a lower case 170 string supported by Bio.AlignIO as an output file format. 171 See also the alignment's format() method.""" 172 if format_spec: 173 from StringIO import StringIO 174 from Bio import AlignIO 175 handle = StringIO() 176 AlignIO.write([self], handle, format_spec) 177 return handle.getvalue() 178 else: 179 #Follow python convention and default to using __str__ 180 return str(self)
181
182 - def get_all_seqs(self):
183 """Return all of the sequences involved in the alignment (DEPRECATED). 184 185 The return value is a list of SeqRecord objects. 186 187 This method is deprecated, as the Alignment object itself now offers 188 much of the functionality of a list of SeqRecord objects (e.g. 189 iteration or slicing to create a sub-alignment). Instead use the 190 Python builtin function list, i.e. my_list = list(my_align) 191 """ 192 import warnings 193 import Bio 194 warnings.warn("This method is deprecated, since the alignment object" 195 "now acts more like a list. Instead of calling " 196 "align.get_all_seqs() you can use list(align)", 197 Bio.BiopythonDeprecationWarning) 198 return self._records
199
200 - def __iter__(self):
201 """Iterate over alignment rows as SeqRecord objects. 202 203 e.g. 204 205 >>> from Bio.Alphabet import IUPAC, Gapped 206 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-")) 207 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG") 208 >>> align.add_sequence("Beta", "ACT-CTAGCTAG") 209 >>> align.add_sequence("Gamma", "ACTGCTAGATAG") 210 >>> for record in align: 211 ... print record.id 212 ... print record.seq 213 Alpha 214 ACTGCTAGCTAG 215 Beta 216 ACT-CTAGCTAG 217 Gamma 218 ACTGCTAGATAG 219 """ 220 return iter(self._records)
221
222 - def get_seq_by_num(self, number):
223 """Retrieve a sequence by row number (DEPRECATED). 224 225 Returns: 226 - A Seq object for the requested sequence. 227 228 Raises: 229 - IndexError - If the specified number is out of range. 230 231 NOTE: This is a legacy method. In new code where you need to access 232 the rows of the alignment (i.e. the sequences) consider iterating 233 over them or accessing them as SeqRecord objects. 234 """ 235 import warnings 236 import Bio 237 warnings.warn("This is a legacy method and is likely to be removed in a future release of Biopython. In new code where you need to access the rows of the alignment (i.e. the sequences) consider iterating over them or accessing them as SeqRecord objects.", Bio.BiopythonDeprecationWarning) 238 return self._records[number].seq
239
240 - def __len__(self):
241 """Returns the number of sequences in the alignment. 242 243 Use len(alignment) to get the number of sequences (i.e. the number of 244 rows), and alignment.get_alignment_length() to get the length of the 245 longest sequence (i.e. the number of columns). 246 247 This is easy to remember if you think of the alignment as being like a 248 list of SeqRecord objects. 249 """ 250 return len(self._records)
251
252 - def get_alignment_length(self):
253 """Return the maximum length of the alignment. 254 255 All objects in the alignment should (hopefully) have the same 256 length. This function will go through and find this length 257 by finding the maximum length of sequences in the alignment. 258 259 >>> from Bio.Alphabet import IUPAC, Gapped 260 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-")) 261 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG") 262 >>> align.add_sequence("Beta", "ACT-CTAGCTAG") 263 >>> align.add_sequence("Gamma", "ACTGCTAGATAG") 264 >>> align.get_alignment_length() 265 12 266 267 If you want to know the number of sequences in the alignment, 268 use len(align) instead: 269 270 >>> len(align) 271 3 272 273 """ 274 max_length = 0 275 276 for record in self._records: 277 if len(record.seq) > max_length: 278 max_length = len(record.seq) 279 280 return max_length
281
282 - def add_sequence(self, descriptor, sequence, start = None, end = None, 283 weight = 1.0):
284 """Add a sequence to the alignment. 285 286 This doesn't do any kind of alignment, it just adds in the sequence 287 object, which is assumed to be prealigned with the existing 288 sequences. 289 290 Arguments: 291 - descriptor - The descriptive id of the sequence being added. 292 This will be used as the resulting SeqRecord's 293 .id property (and, for historical compatibility, 294 also the .description property) 295 - sequence - A string with sequence info. 296 - start - You can explicitly set the start point of the sequence. 297 This is useful (at least) for BLAST alignments, which can 298 just be partial alignments of sequences. 299 - end - Specify the end of the sequence, which is important 300 for the same reason as the start. 301 - weight - The weight to place on the sequence in the alignment. 302 By default, all sequences have the same weight. (0.0 => 303 no weight, 1.0 => highest weight) 304 """ 305 new_seq = Seq(sequence, self._alphabet) 306 307 #We are now effectively using the SeqRecord's .id as 308 #the primary identifier (e.g. in Bio.SeqIO) so we should 309 #populate it with the descriptor. 310 #For backwards compatibility, also store this in the 311 #SeqRecord's description property. 312 new_record = SeqRecord(new_seq, 313 id = descriptor, 314 description = descriptor) 315 316 # hack! We really need to work out how to deal with annotations 317 # and features in biopython. Right now, I'll just use the 318 # generic annotations dictionary we've got to store the start 319 # and end, but we should think up something better. I don't know 320 # if I'm really a big fan of the LocatableSeq thing they've got 321 # in BioPerl, but I'm not positive what the best thing to do on 322 # this is... 323 if start: 324 new_record.annotations['start'] = start 325 if end: 326 new_record.annotations['end'] = end 327 328 # another hack to add weight information to the sequence 329 new_record.annotations['weight'] = weight 330 331 self._records.append(new_record)
332
333 - def get_column(self,col):
334 """Returns a string containing a given column. 335 336 e.g. 337 338 >>> from Bio.Alphabet import IUPAC, Gapped 339 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-")) 340 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG") 341 >>> align.add_sequence("Beta", "ACT-CTAGCTAG") 342 >>> align.add_sequence("Gamma", "ACTGCTAGATAG") 343 >>> align.get_column(0) 344 'AAA' 345 >>> align.get_column(3) 346 'G-G' 347 """ 348 #TODO - Support negative indices? 349 col_str = '' 350 assert col >= 0 and col <= self.get_alignment_length() 351 for rec in self._records: 352 col_str += rec.seq[col] 353 return col_str
354
355 - def __getitem__(self, index):
356 """Access part of the alignment. 357 358 We'll use the following example alignment here for illustration: 359 360 >>> from Bio.Alphabet import IUPAC, Gapped 361 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-")) 362 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG") 363 >>> align.add_sequence("Beta", "ACT-CTAGCTAG") 364 >>> align.add_sequence("Gamma", "ACTGCTAGATAG") 365 >>> align.add_sequence("Delta", "ACTGCTTGCTAG") 366 >>> align.add_sequence("Epsilon","ACTGCTTGATAG") 367 368 You can access a row of the alignment as a SeqRecord using an integer 369 index (think of the alignment as a list of SeqRecord objects here): 370 371 >>> first_record = align[0] 372 >>> print first_record.id, first_record.seq 373 Alpha ACTGCTAGCTAG 374 >>> last_record = align[-1] 375 >>> print last_record.id, last_record.seq 376 Epsilon ACTGCTTGATAG 377 378 You can also access use python's slice notation to create a sub-alignment 379 containing only some of the SeqRecord objects: 380 381 >>> sub_alignment = align[2:5] 382 >>> print sub_alignment 383 Gapped(IUPACUnambiguousDNA(), '-') alignment with 3 rows and 12 columns 384 ACTGCTAGATAG Gamma 385 ACTGCTTGCTAG Delta 386 ACTGCTTGATAG Epsilon 387 388 This includes support for a step, i.e. align[start:end:step], which 389 can be used to select every second sequence: 390 391 >>> sub_alignment = align[::2] 392 >>> print sub_alignment 393 Gapped(IUPACUnambiguousDNA(), '-') alignment with 3 rows and 12 columns 394 ACTGCTAGCTAG Alpha 395 ACTGCTAGATAG Gamma 396 ACTGCTTGATAG Epsilon 397 398 Or to get a copy of the alignment with the rows in reverse order: 399 400 >>> rev_alignment = align[::-1] 401 >>> print rev_alignment 402 Gapped(IUPACUnambiguousDNA(), '-') alignment with 5 rows and 12 columns 403 ACTGCTTGATAG Epsilon 404 ACTGCTTGCTAG Delta 405 ACTGCTAGATAG Gamma 406 ACT-CTAGCTAG Beta 407 ACTGCTAGCTAG Alpha 408 409 Right now, these are the ONLY indexing operations supported. The use of 410 a second column based index is under discussion for a future update. 411 """ 412 if isinstance(index, int): 413 #e.g. result = align[x] 414 #Return a SeqRecord 415 return self._records[index] 416 elif isinstance(index, slice): 417 #e.g. sub_aling = align[i:j:k] 418 #Return a new Alignment using only the specified records. 419 #TODO - See Bug 2554 for changing the __init__ method 420 #to allow us to do this more cleanly. 421 sub_align = Alignment(self._alphabet) 422 sub_align._records = self._records[index] 423 return sub_align 424 elif len(index)==2: 425 raise TypeError("Row and Column indexing is not currently supported,"\ 426 +"but may be in future.") 427 else: 428 raise TypeError("Invalid index type.")
429
430 -def _test():
431 """Run the Bio.Align.Generic module's doctests.""" 432 print "Running doctests..." 433 import doctest 434 doctest.testmod() 435 print "Done"
436 437 if __name__ == "__main__": 438 _test() 439