Package Bio :: Package AlignIO :: Module FastaIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.AlignIO.FastaIO

  1  # Copyright 2008-2011 by Peter Cock.  All rights reserved. 
  2  # 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6  """ 
  7  Bio.AlignIO support for "fasta-m10" output from Bill Pearson's FASTA tools. 
  8   
  9  You are expected to use this module via the Bio.AlignIO functions (or the 
 10  Bio.SeqIO functions if you want to work directly with the gapped sequences). 
 11   
 12  This module contains a parser for the pairwise alignments produced by Bill 
 13  Pearson's FASTA tools, for use from the Bio.AlignIO interface where it is 
 14  refered to as the "fasta-m10" file format (as we only support the machine 
 15  readable output format selected with the -m 10 command line option). 
 16   
 17  This module does NOT cover the generic "fasta" file format originally 
 18  developed as an input format to the FASTA tools.  The Bio.AlignIO and 
 19  Bio.SeqIO both use the Bio.SeqIO.FastaIO module to deal with these files, 
 20  which can also be used to store a multiple sequence alignments. 
 21  """ 
 22   
 23  from Bio.Seq import Seq 
 24  from Bio.SeqRecord import SeqRecord 
 25  from Bio.Align import MultipleSeqAlignment 
 26  from Interfaces import AlignmentIterator 
 27  from Bio.Alphabet import single_letter_alphabet, generic_dna, generic_protein 
 28  from Bio.Alphabet import Gapped 
 29   
 30   
31 -def _extract_alignment_region(alignment_seq_with_flanking, annotation):
32 """Helper function for the main parsing code (PRIVATE). 33 34 To get the actual pairwise alignment sequences, we must first 35 translate the un-gapped sequence based coordinates into positions 36 in the gapped sequence (which may have a flanking region shown 37 using leading - characters). To date, I have never seen any 38 trailing flanking region shown in the m10 file, but the 39 following code should also cope with that. 40 41 Note that this code seems to work fine even when the "sq_offset" 42 entries are prsent as a result of using the -X command line option. 43 """ 44 align_stripped = alignment_seq_with_flanking.strip("-") 45 display_start = int(annotation['al_display_start']) 46 if int(annotation['al_start']) <= int(annotation['al_stop']): 47 start = int(annotation['al_start']) \ 48 - display_start 49 end = int(annotation['al_stop']) \ 50 - display_start + 1 51 else: 52 #FASTA has flipped this sequence... 53 start = display_start \ 54 - int(annotation['al_start']) 55 end = display_start \ 56 - int(annotation['al_stop']) + 1 57 end += align_stripped.count("-") 58 assert 0 <= start and start < end and end <= len(align_stripped), \ 59 "Problem with sequence start/stop,\n%s[%i:%i]\n%s" \ 60 % (alignment_seq_with_flanking, start, end, annotation) 61 return align_stripped[start:end]
62
63 -def FastaM10Iterator(handle, alphabet = single_letter_alphabet):
64 """Alignment iterator for the FASTA tool's pairwise alignment output. 65 66 This is for reading the pairwise alignments output by Bill Pearson's 67 FASTA program when called with the -m 10 command line option for machine 68 readable output. For more details about the FASTA tools, see the website 69 http://fasta.bioch.virginia.edu/ and the paper: 70 71 W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448 72 73 This class is intended to be used via the Bio.AlignIO.parse() function 74 by specifying the format as "fasta-m10" as shown in the following code: 75 76 from Bio import AlignIO 77 handle = ... 78 for a in AlignIO.parse(handle, "fasta-m10"): 79 assert len(a) == 2, "Should be pairwise!" 80 print "Alignment length %i" % a.get_alignment_length() 81 for record in a: 82 print record.seq, record.name, record.id 83 84 Note that this is not a full blown parser for all the information 85 in the FASTA output - for example, most of the header and all of the 86 footer is ignored. Also, the alignments are not batched according to 87 the input queries. 88 89 Also note that there can be up to about 30 letters of flanking region 90 included in the raw FASTA output as contextual information. This is NOT 91 part of the alignment itself, and is not included in the resulting 92 MultipleSeqAlignment objects returned. 93 """ 94 if alphabet is None: 95 alphabet = single_letter_alphabet 96 97 state_PREAMBLE = -1 98 state_NONE = 0 99 state_QUERY_HEADER = 1 100 state_ALIGN_HEADER = 2 101 state_ALIGN_QUERY = 3 102 state_ALIGN_MATCH = 4 103 state_ALIGN_CONS = 5 104 105 def build_hsp(): 106 assert query_tags, query_tags 107 assert match_tags, match_tags 108 evalue = align_tags.get("fa_expect", None) 109 q = "?" #Just for printing len(q) in debug below 110 m = "?" #Just for printing len(m) in debug below 111 tool = global_tags.get("tool", "").upper() 112 try: 113 q = _extract_alignment_region(query_seq, query_tags) 114 if tool in ["TFASTX"] and len(match_seq) == len(q): 115 m = match_seq 116 #Quick hack until I can work out how -, * and / characters 117 #and the apparent mix of aa and bp coordindates works. 118 else: 119 m = _extract_alignment_region(match_seq, match_tags) 120 assert len(q) == len(m) 121 except AssertionError, err: 122 print "Darn... amino acids vs nucleotide coordinates?" 123 print tool 124 print query_seq 125 print query_tags 126 print q, len(q) 127 print match_seq 128 print match_tags 129 print m, len(m) 130 print handle.name 131 raise err 132 133 assert alphabet is not None 134 alignment = MultipleSeqAlignment([], alphabet) 135 136 #TODO - Introduce an annotated alignment class? 137 #For now, store the annotation a new private property: 138 alignment._annotations = {} 139 140 #Want to record both the query header tags, and the alignment tags. 141 for key, value in header_tags.iteritems(): 142 alignment._annotations[key] = value 143 for key, value in align_tags.iteritems(): 144 alignment._annotations[key] = value 145 146 #Query 147 #===== 148 record = SeqRecord(Seq(q, alphabet), 149 id = query_id, 150 name = "query", 151 description = query_descr, 152 annotations = {"original_length" : int(query_tags["sq_len"])}) 153 #TODO - handle start/end coordinates properly. Short term hack for now: 154 record._al_start = int(query_tags["al_start"]) 155 record._al_stop = int(query_tags["al_stop"]) 156 alignment.append(record) 157 158 #TODO - What if a specific alphabet has been requested? 159 #TODO - Use an IUPAC alphabet? 160 #TODO - Can FASTA output RNA? 161 if alphabet == single_letter_alphabet and "sq_type" in query_tags: 162 if query_tags["sq_type"] == "D": 163 record.seq.alphabet = generic_dna 164 elif query_tags["sq_type"] == "p": 165 record.seq.alphabet = generic_protein 166 if "-" in q: 167 if not hasattr(record.seq.alphabet,"gap_char"): 168 record.seq.alphabet = Gapped(record.seq.alphabet, "-") 169 170 #Match 171 #===== 172 record = SeqRecord(Seq(m, alphabet), 173 id = match_id, 174 name = "match", 175 description = match_descr, 176 annotations = {"original_length" : int(match_tags["sq_len"])}) 177 #TODO - handle start/end coordinates properly. Short term hack for now: 178 record._al_start = int(match_tags["al_start"]) 179 record._al_stop = int(match_tags["al_stop"]) 180 alignment.append(record) 181 182 #This is still a very crude way of dealing with the alphabet: 183 if alphabet == single_letter_alphabet and "sq_type" in match_tags: 184 if match_tags["sq_type"] == "D": 185 record.seq.alphabet = generic_dna 186 elif match_tags["sq_type"] == "p": 187 record.seq.alphabet = generic_protein 188 if "-" in m: 189 if not hasattr(record.seq.alphabet,"gap_char"): 190 record.seq.alphabet = Gapped(record.seq.alphabet, "-") 191 192 return alignment
193 194 state = state_PREAMBLE 195 query_id = None 196 match_id = None 197 query_descr = "" 198 match_descr = "" 199 global_tags = {} 200 header_tags = {} 201 align_tags = {} 202 query_tags = {} 203 match_tags = {} 204 query_seq = "" 205 match_seq = "" 206 cons_seq = "" 207 for line in handle: 208 if ">>>" in line and not line.startswith(">>>"): 209 if query_id and match_id: 210 #This happens on old FASTA output which lacked an end of 211 #query >>><<< marker line. 212 yield build_hsp() 213 state = state_NONE 214 query_descr = line[line.find(">>>")+3:].strip() 215 query_id = query_descr.split(None,1)[0] 216 match_id = None 217 header_tags = {} 218 align_tags = {} 219 query_tags = {} 220 match_tags = {} 221 query_seq = "" 222 match_seq = "" 223 cons_seq = "" 224 elif line.startswith("!! No "): 225 #e.g. 226 #!! No library sequences with E() < 0.5 227 #or on more recent versions, 228 #No sequences with E() < 0.05 229 assert state == state_NONE 230 assert not header_tags 231 assert not align_tags 232 assert not match_tags 233 assert not query_tags 234 assert match_id is None 235 assert not query_seq 236 assert not match_seq 237 assert not cons_seq 238 query_id = None 239 elif line.strip() in [">>><<<", ">>>///"]: 240 #End of query, possible end of all queries 241 if query_id and match_id: 242 yield build_hsp() 243 state = state_NONE 244 query_id = None 245 match_id = None 246 header_tags = {} 247 align_tags = {} 248 query_tags = {} 249 match_tags = {} 250 query_seq = "" 251 match_seq = "" 252 cons_seq = "" 253 elif line.startswith(">>>"): 254 #Should be start of a match! 255 assert query_id is not None 256 assert line[3:].split(", ",1)[0] == query_id, line 257 assert match_id is None 258 assert not header_tags 259 assert not align_tags 260 assert not query_tags 261 assert not match_tags 262 assert not match_seq 263 assert not query_seq 264 assert not cons_seq 265 state = state_QUERY_HEADER 266 elif line.startswith(">>"): 267 #Should now be at start of a match alignment! 268 if query_id and match_id: 269 yield build_hsp() 270 align_tags = {} 271 query_tags = {} 272 match_tags = {} 273 query_seq = "" 274 match_seq = "" 275 cons_seq = "" 276 match_descr = line[2:].strip() 277 match_id = match_descr.split(None,1)[0] 278 state = state_ALIGN_HEADER 279 elif line.startswith(">--"): 280 #End of one HSP 281 assert query_id and match_id, line 282 yield build_hsp() 283 #Clean up read for next HSP 284 #but reuse header_tags 285 align_tags = {} 286 query_tags = {} 287 match_tags = {} 288 query_seq = "" 289 match_seq = "" 290 cons_seq = "" 291 state = state_ALIGN_HEADER 292 elif line.startswith(">"): 293 if state == state_ALIGN_HEADER: 294 #Should be start of query alignment seq... 295 assert query_id is not None, line 296 assert match_id is not None, line 297 assert query_id.startswith(line[1:].split(None,1)[0]), line 298 state = state_ALIGN_QUERY 299 elif state == state_ALIGN_QUERY: 300 #Should be start of match alignment seq 301 assert query_id is not None, line 302 assert match_id is not None, line 303 assert match_id.startswith(line[1:].split(None,1)[0]), line 304 state = state_ALIGN_MATCH 305 elif state == state_NONE: 306 #Can get > as the last line of a histogram 307 pass 308 else: 309 assert False, "state %i got %r" % (state, line) 310 elif line.startswith("; al_cons"): 311 assert state == state_ALIGN_MATCH, line 312 state = state_ALIGN_CONS 313 #Next line(s) should be consensus seq... 314 elif line.startswith("; "): 315 key, value = [s.strip() for s in line[2:].split(": ",1)] 316 if state == state_QUERY_HEADER: 317 header_tags[key] = value 318 elif state == state_ALIGN_HEADER: 319 align_tags[key] = value 320 elif state == state_ALIGN_QUERY: 321 query_tags[key] = value 322 elif state == state_ALIGN_MATCH: 323 match_tags[key] = value 324 else: 325 assert False, "Unexpected state %r, %r" % (state, line) 326 elif state == state_ALIGN_QUERY: 327 query_seq += line.strip() 328 elif state == state_ALIGN_MATCH: 329 match_seq += line.strip() 330 elif state == state_ALIGN_CONS: 331 cons_seq += line.strip("\n") 332 elif state == state_PREAMBLE: 333 if line.startswith("#"): 334 global_tags["command"] = line[1:].strip() 335 elif line.startswith(" version "): 336 global_tags["version"] = line[9:].strip() 337 elif " compares a " in line: 338 global_tags["tool"] = line[:line.find(" compares a ")].strip() 339 elif " searches a " in line: 340 global_tags["tool"] = line[:line.find(" searches a ")].strip() 341 else: 342 pass 343 344 345 if __name__ == "__main__": 346 print "Running a quick self-test" 347 348 #http://emboss.sourceforge.net/docs/themes/alnformats/align.simple 349 simple_example = \ 350 """# /opt/fasta/fasta34 -Q -H -E 1 -m 10 NC_002127.faa NC_009649.faa 351 FASTA searches a protein or DNA sequence data bank 352 version 34.26 January 12, 2007 353 Please cite: 354 W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448 355 356 Query library NC_002127.faa vs NC_009649.faa library 357 searching NC_009649.faa library 358 359 1>>>gi|10955263|ref|NP_052604.1| plasmid mobilization [Escherichia coli O157:H7 s 107 aa - 107 aa 360 vs NC_009649.faa library 361 362 45119 residues in 180 sequences 363 Expectation_n fit: rho(ln(x))= 6.9146+/-0.0249; mu= -5.7948+/- 1.273 364 mean_var=53.6859+/-13.609, 0's: 0 Z-trim: 1 B-trim: 9 in 1/25 365 Lambda= 0.175043 366 367 FASTA (3.5 Sept 2006) function [optimized, BL50 matrix (15:-5)] ktup: 2 368 join: 36, opt: 24, open/ext: -10/-2, width: 16 369 Scan time: 0.000 370 The best scores are: opt bits E(180) 371 gi|152973457|ref|YP_001338508.1| ATPase with chape ( 931) 71 24.9 0.58 372 gi|152973588|ref|YP_001338639.1| F pilus assembly ( 459) 63 23.1 0.99 373 374 >>>gi|10955263|ref|NP_052604.1|, 107 aa vs NC_009649.faa library 375 ; pg_name: /opt/fasta/fasta34 376 ; pg_ver: 34.26 377 ; pg_argv: /opt/fasta/fasta34 -Q -H -E 1 -m 10 NC_002127.faa NC_009649.faa 378 ; pg_name: FASTA 379 ; pg_ver: 3.5 Sept 2006 380 ; pg_matrix: BL50 (15:-5) 381 ; pg_open-ext: -10 -2 382 ; pg_ktup: 2 383 ; pg_optcut: 24 384 ; pg_cgap: 36 385 ; mp_extrap: 60000 180 386 ; mp_stats: Expectation_n fit: rho(ln(x))= 6.9146+/-0.0249; mu= -5.7948+/- 1.273 mean_var=53.6859+/-13.609, 0's: 0 Z-trim: 1 B-trim: 9 in 1/25 Lambda= 0.175043 387 ; mp_KS: -0.0000 (N=0) at 8159228 388 >>gi|152973457|ref|YP_001338508.1| ATPase with chaperone activity, ATP-binding subunit [Klebsiella pneumoniae subsp. pneumoniae MGH 78578] 389 ; fa_frame: f 390 ; fa_initn: 65 391 ; fa_init1: 43 392 ; fa_opt: 71 393 ; fa_z-score: 90.3 394 ; fa_bits: 24.9 395 ; fa_expect: 0.58 396 ; sw_score: 71 397 ; sw_ident: 0.250 398 ; sw_sim: 0.574 399 ; sw_overlap: 108 400 >gi|10955263| .. 401 ; sq_len: 107 402 ; sq_offset: 1 403 ; sq_type: p 404 ; al_start: 5 405 ; al_stop: 103 406 ; al_display_start: 1 407 --------------------------MTKRSGSNT-RRRAISRPVRLTAE 408 ED---QEIRKRAAECGKTVSGFLRAAALGKKVNSLTDDRVLKEVM----- 409 RLGALQKKLFIDGKRVGDREYAEVLIAITEYHRALLSRLMAD 410 >gi|152973457|ref|YP_001338508.1| .. 411 ; sq_len: 931 412 ; sq_type: p 413 ; al_start: 96 414 ; al_stop: 195 415 ; al_display_start: 66 416 SDFFRIGDDATPVAADTDDVVDASFGEPAAAGSGAPRRRGSGLASRISEQ 417 SEALLQEAAKHAAEFGRS------EVDTEHLLLALADSDVVKTILGQFKI 418 KVDDLKRQIESEAKR-GDKPF-EGEIGVSPRVKDALSRAFVASNELGHSY 419 VGPEHFLIGLAEEGEGLAANLLRRYGLTPQ 420 >>gi|152973588|ref|YP_001338639.1| F pilus assembly protein [Klebsiella pneumoniae subsp. pneumoniae MGH 78578] 421 ; fa_frame: f 422 ; fa_initn: 33 423 ; fa_init1: 33 424 ; fa_opt: 63 425 ; fa_z-score: 86.1 426 ; fa_bits: 23.1 427 ; fa_expect: 0.99 428 ; sw_score: 63 429 ; sw_ident: 0.266 430 ; sw_sim: 0.656 431 ; sw_overlap: 64 432 >gi|10955263| .. 433 ; sq_len: 107 434 ; sq_offset: 1 435 ; sq_type: p 436 ; al_start: 32 437 ; al_stop: 94 438 ; al_display_start: 2 439 TKRSGSNTRRRAISRPVRLTAEEDQEIRKRAAECGKTVSGFLRAAALGKK 440 VNSLTDDRVLKEV-MRLGALQKKLFIDGKRVGDREYAEVLIAITEYHRAL 441 LSRLMAD 442 >gi|152973588|ref|YP_001338639.1| .. 443 ; sq_len: 459 444 ; sq_type: p 445 ; al_start: 191 446 ; al_stop: 248 447 ; al_display_start: 161 448 VGGLFPRTQVAQQKVCQDIAGESNIFSDWAASRQGCTVGG--KMDSVQDK 449 ASDKDKERVMKNINIMWNALSKNRLFDG----NKELKEFIMTLTGTLIFG 450 ENSEITPLPARTTDQDLIRAMMEGGTAKIYHCNDSDKCLKVVADATVTIT 451 SNKALKSQISALLSSIQNKAVADEKLTDQE 452 2>>>gi|10955264|ref|NP_052605.1| hypothetical protein pOSAK1_02 [Escherichia coli O157:H7 s 126 aa - 126 aa 453 vs NC_009649.faa library 454 455 45119 residues in 180 sequences 456 Expectation_n fit: rho(ln(x))= 7.1374+/-0.0246; mu= -7.6540+/- 1.313 457 mean_var=51.1189+/-13.171, 0's: 0 Z-trim: 1 B-trim: 8 in 1/25 458 Lambda= 0.179384 459 460 FASTA (3.5 Sept 2006) function [optimized, BL50 matrix (15:-5)] ktup: 2 461 join: 36, opt: 24, open/ext: -10/-2, width: 16 462 Scan time: 0.000 463 The best scores are: opt bits E(180) 464 gi|152973462|ref|YP_001338513.1| hypothetical prot ( 101) 58 22.9 0.29 465 466 >>>gi|10955264|ref|NP_052605.1|, 126 aa vs NC_009649.faa library 467 ; pg_name: /opt/fasta/fasta34 468 ; pg_ver: 34.26 469 ; pg_argv: /opt/fasta/fasta34 -Q -H -E 1 -m 10 NC_002127.faa NC_009649.faa 470 ; pg_name: FASTA 471 ; pg_ver: 3.5 Sept 2006 472 ; pg_matrix: BL50 (15:-5) 473 ; pg_open-ext: -10 -2 474 ; pg_ktup: 2 475 ; pg_optcut: 24 476 ; pg_cgap: 36 477 ; mp_extrap: 60000 180 478 ; mp_stats: Expectation_n fit: rho(ln(x))= 7.1374+/-0.0246; mu= -7.6540+/- 1.313 mean_var=51.1189+/-13.171, 0's: 0 Z-trim: 1 B-trim: 8 in 1/25 Lambda= 0.179384 479 ; mp_KS: -0.0000 (N=0) at 8159228 480 >>gi|152973462|ref|YP_001338513.1| hypothetical protein KPN_pKPN3p05904 [Klebsiella pneumoniae subsp. pneumoniae MGH 78578] 481 ; fa_frame: f 482 ; fa_initn: 50 483 ; fa_init1: 50 484 ; fa_opt: 58 485 ; fa_z-score: 95.8 486 ; fa_bits: 22.9 487 ; fa_expect: 0.29 488 ; sw_score: 58 489 ; sw_ident: 0.289 490 ; sw_sim: 0.632 491 ; sw_overlap: 38 492 >gi|10955264| .. 493 ; sq_len: 126 494 ; sq_offset: 1 495 ; sq_type: p 496 ; al_start: 1 497 ; al_stop: 38 498 ; al_display_start: 1 499 ------------------------------MKKDKKYQIEAIKNKDKTLF 500 IVYATDIYSPSEFFSKIESDLKKKKSKGDVFFDLIIPNGGKKDRYVYTSF 501 NGEKFSSYTLNKVTKTDEYN 502 >gi|152973462|ref|YP_001338513.1| .. 503 ; sq_len: 101 504 ; sq_type: p 505 ; al_start: 44 506 ; al_stop: 81 507 ; al_display_start: 14 508 DALLGEIQRLRKQVHQLQLERDILTKANELIKKDLGVSFLKLKNREKTLI 509 VDALKKKYPVAELLSVLQLARSCYFYQNVCTISMRKYA 510 3>>>gi|10955265|ref|NP_052606.1| hypothetical protein pOSAK1_03 [Escherichia coli O157:H7 s 346 aa - 346 aa 511 vs NC_009649.faa library 512 513 45119 residues in 180 sequences 514 Expectation_n fit: rho(ln(x))= 6.0276+/-0.0276; mu= 3.0670+/- 1.461 515 mean_var=37.1634+/- 8.980, 0's: 0 Z-trim: 1 B-trim: 14 in 1/25 516 Lambda= 0.210386 517 518 FASTA (3.5 Sept 2006) function [optimized, BL50 matrix (15:-5)] ktup: 2 519 join: 37, opt: 25, open/ext: -10/-2, width: 16 520 Scan time: 0.020 521 The best scores are: opt bits E(180) 522 gi|152973545|ref|YP_001338596.1| putative plasmid ( 242) 70 27.5 0.082 523 524 >>>gi|10955265|ref|NP_052606.1|, 346 aa vs NC_009649.faa library 525 ; pg_name: /opt/fasta/fasta34 526 ; pg_ver: 34.26 527 ; pg_argv: /opt/fasta/fasta34 -Q -H -E 1 -m 10 NC_002127.faa NC_009649.faa 528 ; pg_name: FASTA 529 ; pg_ver: 3.5 Sept 2006 530 ; pg_matrix: BL50 (15:-5) 531 ; pg_open-ext: -10 -2 532 ; pg_ktup: 2 533 ; pg_optcut: 25 534 ; pg_cgap: 37 535 ; mp_extrap: 60000 180 536 ; mp_stats: Expectation_n fit: rho(ln(x))= 6.0276+/-0.0276; mu= 3.0670+/- 1.461 mean_var=37.1634+/- 8.980, 0's: 0 Z-trim: 1 B-trim: 14 in 1/25 Lambda= 0.210386 537 ; mp_KS: -0.0000 (N=0) at 8159228 538 >>gi|152973545|ref|YP_001338596.1| putative plasmid SOS inhibition protein A [Klebsiella pneumoniae subsp. pneumoniae MGH 78578] 539 ; fa_frame: f 540 ; fa_initn: 52 541 ; fa_init1: 52 542 ; fa_opt: 70 543 ; fa_z-score: 105.5 544 ; fa_bits: 27.5 545 ; fa_expect: 0.082 546 ; sw_score: 70 547 ; sw_ident: 0.279 548 ; sw_sim: 0.651 549 ; sw_overlap: 43 550 >gi|10955265| .. 551 ; sq_len: 346 552 ; sq_offset: 1 553 ; sq_type: p 554 ; al_start: 197 555 ; al_stop: 238 556 ; al_display_start: 167 557 DFMCSILNMKEIVEQKNKEFNVDIKKETIESELHSKLPKSIDKIHEDIKK 558 QLSC-SLIMKKIDVEMEDYSTYCFSALRAIEGFIYQILNDVCNPSSSKNL 559 GEYFTENKPKYIIREIHQET 560 >gi|152973545|ref|YP_001338596.1| .. 561 ; sq_len: 242 562 ; sq_type: p 563 ; al_start: 52 564 ; al_stop: 94 565 ; al_display_start: 22 566 IMTVEEARQRGARLPSMPHVRTFLRLLTGCSRINSDVARRIPGIHRDPKD 567 RLSSLKQVEEALDMLISSHGEYCPLPLTMDVQAENFPEVLHTRTVRRLKR 568 QDFAFTRKMRREARQVEQSW 569 >>><<< 570 571 572 579 residues in 3 query sequences 573 45119 residues in 180 library sequences 574 Scomplib [34.26] 575 start: Tue May 20 16:38:45 2008 done: Tue May 20 16:38:45 2008 576 Total Scan time: 0.020 Total Display time: 0.010 577 578 Function used was FASTA [version 34.26 January 12, 2007] 579 580 """ 581 582 583 from StringIO import StringIO 584 585 alignments = list(FastaM10Iterator(StringIO(simple_example))) 586 assert len(alignments) == 4, len(alignments) 587 assert len(alignments[0]) == 2 588 for a in alignments: 589 print "Alignment %i sequences of length %i" \ 590 % (len(a), a.get_alignment_length()) 591 for r in a: 592 print "%s %s %i" % (r.seq, r.id, r.annotations["original_length"]) 593 #print a.annotations 594 print "Done" 595 596 import os 597 path = "../../Tests/Fasta/" 598 files = [f for f in os.listdir(path) if os.path.splitext(f)[-1] == ".m10"] 599 files.sort() 600 for filename in files: 601 if os.path.splitext(filename)[-1] == ".m10": 602 print 603 print filename 604 print "="*len(filename) 605 for i,a in enumerate(FastaM10Iterator(open(os.path.join(path,filename)))): 606 print "#%i, %s" % (i+1,a) 607 for r in a: 608 if "-" in r.seq: 609 assert r.seq.alphabet.gap_char == "-" 610 else: 611 assert not hasattr(r.seq.alphabet, "gap_char") 612