Package Bio :: Package Sequencing :: Module Ace
[hide private]
[frames] | no frames]

Source Code for Module Bio.Sequencing.Ace

  1  # Copyright 2004 by Frank Kauff and Cymon J. Cox.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  """ 
  6  Parser for ACE files output by PHRAP. 
  7   
  8  Written by Frank Kauff (fkauff@duke.edu) and 
  9  Cymon J. Cox (cymon@duke.edu) 
 10   
 11  Uses the Biopython Parser interface: ParserSupport.py 
 12   
 13  Usage: 
 14   
 15  There are two ways of reading an ace file: 
 16  1) The function 'read' reads the whole file at once; 
 17  2) The function 'parse' reads the file contig after contig. 
 18       
 19  1) Parse whole ace file at once: 
 20   
 21          from Bio.Sequencing import Ace 
 22          acefilerecord=Ace.read(open('my_ace_file.ace')) 
 23   
 24  This gives you: 
 25          acefilerecord.ncontigs (the number of contigs in the ace file) 
 26          acefilerecord.nreads (the number of reads in the ace file) 
 27          acefilerecord.contigs[] (one instance of the Contig class for each contig) 
 28   
 29  The Contig class holds the info of the CO tag, CT and WA tags, and all the reads used 
 30  for this contig in a list of instances of the Read class, e.g.: 
 31   
 32          contig3=acefilerecord.contigs[2] 
 33          read4=contig3.reads[3] 
 34          RD_of_read4=read4.rd 
 35          DS_of_read4=read4.ds 
 36   
 37  CT, WA, RT tags from the end of the file can appear anywhere are automatically 
 38  sorted into the right place. 
 39   
 40  see _RecordConsumer for details. 
 41   
 42  2) Or you can iterate over the contigs of an ace file one by one in the ususal way: 
 43   
 44          from Bio.Sequencing import Ace 
 45          contigs=Ace.parse(open('my_ace_file.ace')) 
 46          for contig in contigs: 
 47              print contig.name 
 48              ... 
 49   
 50  Please note that for memory efficiency, when using the iterator approach, only one 
 51  contig is kept in memory at once.  However, there can be a footer to the ACE file 
 52  containing WA, CT, RT or WR tags which contain additional meta-data on the contigs. 
 53  Because the parser doesn't see this data until the final record, it cannot be added to 
 54  the appropriate records.  Instead these tags will be returned with the last contig record. 
 55  Thus an ace file does not entirerly suit the concept of iterating. If WA, CT, RT, WR tags 
 56  are needed, the 'read' function rather than the 'parse' function might be more appropriate. 
 57  """ 
 58   
 59   
60 -class rd(object):
61 """RD (reads), store a read with its name, sequence etc. 62 63 The location and strand each read is mapped to is held in the AF lines. 64 """
65 - def __init__(self):
66 self.name='' 67 self.padded_bases=None 68 self.info_items=None 69 self.read_tags=None 70 self.sequence=''
71
72 -class qa(object):
73 """QA (read quality), including which part if any was used as the consensus."""
74 - def __init__(self, line=None):
75 self.qual_clipping_start=None 76 self.qual_clipping_end=None 77 self.align_clipping_start=None 78 self.align_clipping_end=None 79 if line: 80 header=map(eval,line.split()[1:]) 81 self.qual_clipping_start=header[0] 82 self.qual_clipping_end=header[1] 83 self.align_clipping_start=header[2] 84 self.align_clipping_end=header[3]
85
86 -class ds(object):
87 """DS lines, include file name of a read's chromatogram file."""
88 - def __init__(self, line=None):
89 self.chromat_file='' 90 self.phd_file='' 91 self.time='' 92 self.chem='' 93 self.dye='' 94 self.template='' 95 self.direction='' 96 if line: 97 tags=['CHROMAT_FILE','PHD_FILE','TIME','CHEM','DYE','TEMPLATE','DIRECTION'] 98 poss=map(line.find,tags) 99 tagpos=dict(zip(poss,tags)) 100 if -1 in tagpos: 101 del tagpos[-1] 102 ps=tagpos.keys() 103 ps.sort() 104 for (p1,p2) in zip(ps,ps[1:]+[len(line)+1]): 105 setattr(self,tagpos[p1].lower(),line[p1+len(tagpos[p1])+1:p2].strip())
106 107
108 -class af(object):
109 """AF lines, define the location of the read within the contig. 110 111 Note attribute coru is short for complemented (C) or uncomplemented (U), 112 since the strand information is stored in an ACE file using either the 113 C or U character. 114 """
115 - def __init__(self, line=None):
116 self.name='' 117 self.coru=None 118 self.padded_start=None 119 if line: 120 header = line.split() 121 self.name = header[1] 122 self.coru = header[2] 123 self.padded_start = int(header[3])
124
125 -class bs(object):
126 """"BS (base segment), which read was chosen as the consensus at each position."""
127 - def __init__(self, line=None):
128 self.name='' 129 self.padded_start=None 130 self.padded_end=None 131 if line: 132 header = line.split() 133 self.padded_start = int(header[1]) 134 self.padded_end = int(header[2]) 135 self.name = header[3]
136
137 -class rt(object):
138 """RT (transient read tags), generated by crossmatch and phrap."""
139 - def __init__(self, line=None):
140 self.name='' 141 self.tag_type='' 142 self.program='' 143 self.padded_start=None 144 self.padded_end=None 145 self.date='' 146 self.comment=[] 147 if line: 148 header=line.split() 149 self.name=header[0] 150 self.tag_type=header[1] 151 self.program=header[2] 152 self.padded_start=int(header[3]) 153 self.padded_end=int(header[4]) 154 self.date=header[5]
155
156 -class ct(object):
157 """CT (consensus tags)."""
158 - def __init__(self, line=None):
159 self.name='' 160 self.tag_type='' 161 self.program='' 162 self.padded_start=None 163 self.padded_end=None 164 self.date='' 165 self.notrans='' 166 self.info=[] 167 self.comment=[] 168 if line: 169 header=line.split() 170 self.name = header[0] 171 self.tag_type = header[1] 172 self.program = header[2] 173 self.padded_start = int(header[3]) 174 self.padded_end = int(header[4]) 175 self.date = header[5] 176 if len(header)==7: 177 self.notrans = header[6]
178
179 -class wa(object):
180 """WA (whole assembly tag), holds the assembly program name, version, etc."""
181 - def __init__(self, line=None):
182 self.tag_type='' 183 self.program='' 184 self.date='' 185 self.info=[] 186 if line: 187 header = line.split() 188 self.tag_type = header[0] 189 self.program = header[1] 190 self.date = header[2]
191
192 -class wr(object):
193 """WR lines."""
194 - def __init__(self, line=None):
195 self.name='' 196 self.aligned='' 197 self.program='' 198 self.date=[] 199 if line: 200 header = line.split() 201 self.name = header[0] 202 self.aligned = header[1] 203 self.program = header[2] 204 self.date = header[3]
205
206 -class Reads(object):
207 """Holds information about a read supporting an ACE contig."""
208 - def __init__(self, line=None):
209 self.rd=None # one per read 210 self.qa=None # one per read 211 self.ds=None # none or one per read 212 self.rt=None # none or many per read 213 self.wr=None # none or many per read 214 if line: 215 self.rd = rd() 216 header = line.split() 217 self.rd.name = header[1] 218 self.rd.padded_bases = int(header[2]) 219 self.rd.info_items = int(header[3]) 220 self.rd.read_tags = int(header[4])
221
222 -class Contig(object):
223 """Holds information about a contig from an ACE record."""
224 - def __init__(self, line=None):
225 self.name = '' 226 self.nbases=None 227 self.nreads=None 228 self.nsegments=None 229 self.uorc=None 230 self.sequence="" 231 self.quality=[] 232 self.af=[] 233 self.bs=[] 234 self.reads=[] 235 self.ct=None # none or many 236 self.wa=None # none or many 237 if line: 238 header = line.split() 239 self.name = header[1] 240 self.nbases = int(header[2]) 241 self.nreads = int(header[3]) 242 self.nsegments = int(header[4]) 243 self.uorc = header[5]
244
245 -def parse(handle):
246 """parse(handle) 247 248 where handle is a file-like object. 249 250 This function returns an iterator that allows you to iterate 251 over the ACE file record by record: 252 253 records = parse(handle) 254 for record in records: 255 # do something with the record 256 257 where each record is a Contig object. 258 """ 259 260 handle = iter(handle) 261 262 line = "" 263 while True: 264 # at beginning, skip the AS and look for first CO command 265 try: 266 while True: 267 if line.startswith('CO'): 268 break 269 line = handle.next() 270 except StopIteration: 271 return 272 273 record = Contig(line) 274 275 for line in handle: 276 line = line.strip() 277 if not line: 278 break 279 record.sequence+=line 280 281 for line in handle: 282 if line.strip(): 283 break 284 if not line.startswith("BQ"): 285 raise ValueError("Failed to find BQ line") 286 287 for line in handle: 288 if not line.strip(): 289 break 290 record.quality.extend(map(int,line.split())) 291 292 for line in handle: 293 if line.strip(): 294 break 295 296 while True: 297 if not line.startswith("AF "): 298 break 299 record.af.append(af(line)) 300 try: 301 line = handle.next() 302 except StopIteration: 303 raise ValueError("Unexpected end of AF block") 304 305 while True: 306 if line.strip(): 307 break 308 try: 309 line = handle.next() 310 except StopIteration: 311 raise ValueError("Unexpected end of file") 312 313 while True: 314 if not line.startswith("BS "): 315 break 316 record.bs.append(bs(line)) 317 try: 318 line = handle.next() 319 except StopIteration: 320 raise ValueError("Failed to find end of BS block") 321 322 # now read all the read data 323 # it starts with a 'RD', and then a mandatory QA 324 # then follows an optional DS 325 # CT,RT,WA,WR may or may not be there in unlimited quantity. They might refer to the actual read or contig, 326 # or, if encountered at the end of file, to any previous read or contig. the sort() method deals 327 # with that later. 328 while True: 329 330 # each read must have a rd and qa 331 try: 332 while True: 333 # If I've met the condition, then stop reading the line. 334 if line.startswith("RD "): 335 break 336 line = handle.next() 337 except StopIteration: 338 raise ValueError("Failed to find RD line") 339 340 record.reads.append(Reads(line)) 341 342 for line in handle: 343 line = line.strip() 344 if not line: 345 break 346 record.reads[-1].rd.sequence+=line 347 348 for line in handle: 349 if line.strip(): 350 break 351 if not line.startswith("QA "): 352 raise ValueError("Failed to find QA line") 353 record.reads[-1].qa = qa(line) 354 355 # now one ds can follow 356 for line in handle: 357 if line.strip(): 358 break 359 else: 360 break 361 362 if line.startswith("DS "): 363 record.reads[-1].ds = ds(line) 364 line = "" 365 # the file could just end, or there's some more stuff. In ace files, anything can happen. 366 # the following tags are interspersed between reads and can appear multiple times. 367 while True: 368 # something left 369 try: 370 while True: 371 if line.strip(): 372 break 373 line = handle.next() 374 except StopIteration: 375 # file ends here 376 break 377 if line.startswith("RT{"): 378 # now if we're at the end of the file, this rt could 379 # belong to a previous read, not the actual one. 380 # we store it here were it appears, the user can sort later. 381 if record.reads[-1].rt is None: 382 record.reads[-1].rt=[] 383 for line in handle: 384 line=line.strip() 385 #if line=="COMMENT{": 386 if line.startswith("COMMENT{"): 387 if line[8:].strip(): 388 #MIRA 3.0.5 would miss the new line out :( 389 record.reads[-1].rt[-1].comment.append(line[8:]) 390 for line in handle: 391 line = line.strip() 392 if line.endswith("C}"): 393 break 394 record.reads[-1].rt[-1].comment.append(line) 395 elif line=='}': 396 break 397 else: 398 record.reads[-1].rt.append(rt(line)) 399 line = "" 400 elif line.startswith("WR{"): 401 if record.reads[-1].wr is None: 402 record.reads[-1].wr=[] 403 for line in handle: 404 line=line.strip() 405 if line=='}': break 406 record.reads[-1].wr.append(wr(line)) 407 line = "" 408 elif line.startswith("WA{"): 409 if record.wa is None: 410 record.wa=[] 411 try: 412 line = handle.next() 413 except StopIteration: 414 raise ValueError("Failed to read WA block") 415 record.wa.append(wa(line)) 416 for line in handle: 417 line=line.strip() 418 if line=='}': break 419 record.wa[-1].info.append(line) 420 line = "" 421 elif line.startswith("CT{"): 422 if record.ct is None: 423 record.ct=[] 424 try: 425 line = handle.next() 426 except StopIteration: 427 raise ValueError("Failed to read CT block") 428 record.ct.append(ct(line)) 429 for line in handle: 430 line=line.strip() 431 if line=="COMMENT{": 432 for line in handle: 433 line = line.strip() 434 if line.endswith("C}"): 435 break 436 record.ct[-1].comment.append(line) 437 elif line=='}': 438 break 439 else: 440 record.ct[-1].info.append(line) 441 line = "" 442 else: 443 break 444 445 if not line.startswith('RD'): # another read? 446 break 447 448 yield record
449
450 -class ACEFileRecord(object):
451 """Holds data of an ACE file. 452 """
453 - def __init__(self):
454 self.ncontigs=None 455 self.nreads=None 456 self.contigs=[] 457 self.wa=None # none or many
458
459 - def sort(self):
460 """Sorts wr, rt and ct tags into the appropriate contig / read instance, if possible. """ 461 462 ct=[] 463 rt=[] 464 wr=[] 465 # search for tags that aren't in the right position 466 for i in range(len(self.contigs)): 467 c = self.contigs[i] 468 if c.wa: 469 if not self.wa: 470 self.wa=[] 471 self.wa.extend(c.wa) 472 if c.ct: 473 newcts=[ct_tag for ct_tag in c.ct if ct_tag.name!=c.name] 474 for x in newcts: 475 self.contigs[i].ct.remove(x) 476 ct.extend(newcts) 477 for j in range(len(c.reads)): 478 r = c.reads[j] 479 if r.rt: 480 newrts=[rt_tag for rt_tag in r.rt if rt_tag.name!=r.rd.name] 481 for x in newrts: 482 self.contigs[i].reads[j].rt.remove(x) 483 rt.extend(newrts) 484 if r.wr: 485 newwrs=[wr_tag for wr_tag in r.wr if wr_tag.name!=r.rd.name] 486 for x in newwrs: 487 self.contigs[i].reads[j].wr.remove(x) 488 wr.extend(newwrs) 489 # now sort them into their proper place 490 for i in range(len(self.contigs)): 491 c = self.contigs[i] 492 for ct_tag in ct: 493 if ct_tag.name==c.name: 494 if self.contigs[i].ct is None: 495 self.contigs[i].ct=[] 496 self.contigs[i].ct.append(ct_tag) 497 if rt or wr: 498 for j in range(len(c.reads)): 499 r = c.reads[j] 500 for rt_tag in rt: 501 if rt_tag.name==r.rd.name: 502 if self.contigs[i].reads[j].rt is None: 503 self.contigs[i].reads[j].rt=[] 504 self.contigs[i].reads[j].rt.append(rt_tag) 505 for wr_tag in wr: 506 if wr_tag.name==r.rd.name: 507 if self.contigs[i].reads[j].wr is None: 508 self.contigs[i].reads[j].wr=[] 509 self.contigs[i].reads[j].wr.append(wr_tag)
510
511 -def read(handle):
512 """Parses the full ACE file in list of contigs. 513 514 """ 515 516 handle = iter(handle) 517 518 record=ACEFileRecord() 519 520 try: 521 line = handle.next() 522 except StopIteration: 523 raise ValueError("Premature end of file") 524 525 # check if the file starts correctly 526 if not line.startswith('AS'): 527 raise ValueError("File does not start with 'AS'.") 528 529 words = line.split() 530 record.ncontigs, record.nreads = map(int, words[1:3]) 531 532 # now read all the records 533 record.contigs = list(parse(handle)) 534 # wa, ct, rt rags are usually at the end of the file, but not necessarily (correct?). 535 # If the iterator is used, the tags are returned with the contig or the read after which they appear, 536 # if all tags are at the end, they are read with the last contig. The concept of an 537 # iterator leaves no other choice. But if the user uses the ACEParser, we can check 538 # them and put them into the appropriate contig/read instance. 539 # Conclusion: An ACE file is not a filetype for which iteration is 100% suitable... 540 record.sort() 541 return record
542