1
2
3
4
5 """
6 Parser for ACE files output by PHRAP.
7
8 Written by Frank Kauff (fkauff@duke.edu) and
9 Cymon J. Cox (cymon@duke.edu)
10
11 Uses the Biopython Parser interface: ParserSupport.py
12
13 Usage:
14
15 There are two ways of reading an ace file:
16 1) The function 'read' reads the whole file at once;
17 2) The function 'parse' reads the file contig after contig.
18
19 1) Parse whole ace file at once:
20
21 from Bio.Sequencing import Ace
22 acefilerecord=Ace.read(open('my_ace_file.ace'))
23
24 This gives you:
25 acefilerecord.ncontigs (the number of contigs in the ace file)
26 acefilerecord.nreads (the number of reads in the ace file)
27 acefilerecord.contigs[] (one instance of the Contig class for each contig)
28
29 The Contig class holds the info of the CO tag, CT and WA tags, and all the reads used
30 for this contig in a list of instances of the Read class, e.g.:
31
32 contig3=acefilerecord.contigs[2]
33 read4=contig3.reads[3]
34 RD_of_read4=read4.rd
35 DS_of_read4=read4.ds
36
37 CT, WA, RT tags from the end of the file can appear anywhere are automatically
38 sorted into the right place.
39
40 see _RecordConsumer for details.
41
42 2) Or you can iterate over the contigs of an ace file one by one in the ususal way:
43
44 from Bio.Sequencing import Ace
45 contigs=Ace.parse(open('my_ace_file.ace'))
46 for contig in contigs:
47 print contig.name
48 ...
49
50 Please note that for memory efficiency, when using the iterator approach, only one
51 contig is kept in memory at once. However, there can be a footer to the ACE file
52 containing WA, CT, RT or WR tags which contain additional meta-data on the contigs.
53 Because the parser doesn't see this data until the final record, it cannot be added to
54 the appropriate records. Instead these tags will be returned with the last contig record.
55 Thus an ace file does not entirerly suit the concept of iterating. If WA, CT, RT, WR tags
56 are needed, the 'read' function rather than the 'parse' function might be more appropriate.
57 """
58
59
61 """RD (reads), store a read with its name, sequence etc.
62
63 The location and strand each read is mapped to is held in the AF lines.
64 """
66 self.name=''
67 self.padded_bases=None
68 self.info_items=None
69 self.read_tags=None
70 self.sequence=''
71
73 """QA (read quality), including which part if any was used as the consensus."""
75 self.qual_clipping_start=None
76 self.qual_clipping_end=None
77 self.align_clipping_start=None
78 self.align_clipping_end=None
79 if line:
80 header=map(eval,line.split()[1:])
81 self.qual_clipping_start=header[0]
82 self.qual_clipping_end=header[1]
83 self.align_clipping_start=header[2]
84 self.align_clipping_end=header[3]
85
87 """DS lines, include file name of a read's chromatogram file."""
89 self.chromat_file=''
90 self.phd_file=''
91 self.time=''
92 self.chem=''
93 self.dye=''
94 self.template=''
95 self.direction=''
96 if line:
97 tags=['CHROMAT_FILE','PHD_FILE','TIME','CHEM','DYE','TEMPLATE','DIRECTION']
98 poss=map(line.find,tags)
99 tagpos=dict(zip(poss,tags))
100 if -1 in tagpos:
101 del tagpos[-1]
102 ps=tagpos.keys()
103 ps.sort()
104 for (p1,p2) in zip(ps,ps[1:]+[len(line)+1]):
105 setattr(self,tagpos[p1].lower(),line[p1+len(tagpos[p1])+1:p2].strip())
106
107
109 """AF lines, define the location of the read within the contig.
110
111 Note attribute coru is short for complemented (C) or uncomplemented (U),
112 since the strand information is stored in an ACE file using either the
113 C or U character.
114 """
116 self.name=''
117 self.coru=None
118 self.padded_start=None
119 if line:
120 header = line.split()
121 self.name = header[1]
122 self.coru = header[2]
123 self.padded_start = int(header[3])
124
126 """"BS (base segment), which read was chosen as the consensus at each position."""
128 self.name=''
129 self.padded_start=None
130 self.padded_end=None
131 if line:
132 header = line.split()
133 self.padded_start = int(header[1])
134 self.padded_end = int(header[2])
135 self.name = header[3]
136
138 """RT (transient read tags), generated by crossmatch and phrap."""
140 self.name=''
141 self.tag_type=''
142 self.program=''
143 self.padded_start=None
144 self.padded_end=None
145 self.date=''
146 self.comment=[]
147 if line:
148 header=line.split()
149 self.name=header[0]
150 self.tag_type=header[1]
151 self.program=header[2]
152 self.padded_start=int(header[3])
153 self.padded_end=int(header[4])
154 self.date=header[5]
155
157 """CT (consensus tags)."""
159 self.name=''
160 self.tag_type=''
161 self.program=''
162 self.padded_start=None
163 self.padded_end=None
164 self.date=''
165 self.notrans=''
166 self.info=[]
167 self.comment=[]
168 if line:
169 header=line.split()
170 self.name = header[0]
171 self.tag_type = header[1]
172 self.program = header[2]
173 self.padded_start = int(header[3])
174 self.padded_end = int(header[4])
175 self.date = header[5]
176 if len(header)==7:
177 self.notrans = header[6]
178
180 """WA (whole assembly tag), holds the assembly program name, version, etc."""
182 self.tag_type=''
183 self.program=''
184 self.date=''
185 self.info=[]
186 if line:
187 header = line.split()
188 self.tag_type = header[0]
189 self.program = header[1]
190 self.date = header[2]
191
205
207 """Holds information about a read supporting an ACE contig."""
221
223 """Holds information about a contig from an ACE record."""
225 self.name = ''
226 self.nbases=None
227 self.nreads=None
228 self.nsegments=None
229 self.uorc=None
230 self.sequence=""
231 self.quality=[]
232 self.af=[]
233 self.bs=[]
234 self.reads=[]
235 self.ct=None
236 self.wa=None
237 if line:
238 header = line.split()
239 self.name = header[1]
240 self.nbases = int(header[2])
241 self.nreads = int(header[3])
242 self.nsegments = int(header[4])
243 self.uorc = header[5]
244
246 """parse(handle)
247
248 where handle is a file-like object.
249
250 This function returns an iterator that allows you to iterate
251 over the ACE file record by record:
252
253 records = parse(handle)
254 for record in records:
255 # do something with the record
256
257 where each record is a Contig object.
258 """
259
260 handle = iter(handle)
261
262 line = ""
263 while True:
264
265 try:
266 while True:
267 if line.startswith('CO'):
268 break
269 line = handle.next()
270 except StopIteration:
271 return
272
273 record = Contig(line)
274
275 for line in handle:
276 line = line.strip()
277 if not line:
278 break
279 record.sequence+=line
280
281 for line in handle:
282 if line.strip():
283 break
284 if not line.startswith("BQ"):
285 raise ValueError("Failed to find BQ line")
286
287 for line in handle:
288 if not line.strip():
289 break
290 record.quality.extend(map(int,line.split()))
291
292 for line in handle:
293 if line.strip():
294 break
295
296 while True:
297 if not line.startswith("AF "):
298 break
299 record.af.append(af(line))
300 try:
301 line = handle.next()
302 except StopIteration:
303 raise ValueError("Unexpected end of AF block")
304
305 while True:
306 if line.strip():
307 break
308 try:
309 line = handle.next()
310 except StopIteration:
311 raise ValueError("Unexpected end of file")
312
313 while True:
314 if not line.startswith("BS "):
315 break
316 record.bs.append(bs(line))
317 try:
318 line = handle.next()
319 except StopIteration:
320 raise ValueError("Failed to find end of BS block")
321
322
323
324
325
326
327
328 while True:
329
330
331 try:
332 while True:
333
334 if line.startswith("RD "):
335 break
336 line = handle.next()
337 except StopIteration:
338 raise ValueError("Failed to find RD line")
339
340 record.reads.append(Reads(line))
341
342 for line in handle:
343 line = line.strip()
344 if not line:
345 break
346 record.reads[-1].rd.sequence+=line
347
348 for line in handle:
349 if line.strip():
350 break
351 if not line.startswith("QA "):
352 raise ValueError("Failed to find QA line")
353 record.reads[-1].qa = qa(line)
354
355
356 for line in handle:
357 if line.strip():
358 break
359 else:
360 break
361
362 if line.startswith("DS "):
363 record.reads[-1].ds = ds(line)
364 line = ""
365
366
367 while True:
368
369 try:
370 while True:
371 if line.strip():
372 break
373 line = handle.next()
374 except StopIteration:
375
376 break
377 if line.startswith("RT{"):
378
379
380
381 if record.reads[-1].rt is None:
382 record.reads[-1].rt=[]
383 for line in handle:
384 line=line.strip()
385
386 if line.startswith("COMMENT{"):
387 if line[8:].strip():
388
389 record.reads[-1].rt[-1].comment.append(line[8:])
390 for line in handle:
391 line = line.strip()
392 if line.endswith("C}"):
393 break
394 record.reads[-1].rt[-1].comment.append(line)
395 elif line=='}':
396 break
397 else:
398 record.reads[-1].rt.append(rt(line))
399 line = ""
400 elif line.startswith("WR{"):
401 if record.reads[-1].wr is None:
402 record.reads[-1].wr=[]
403 for line in handle:
404 line=line.strip()
405 if line=='}': break
406 record.reads[-1].wr.append(wr(line))
407 line = ""
408 elif line.startswith("WA{"):
409 if record.wa is None:
410 record.wa=[]
411 try:
412 line = handle.next()
413 except StopIteration:
414 raise ValueError("Failed to read WA block")
415 record.wa.append(wa(line))
416 for line in handle:
417 line=line.strip()
418 if line=='}': break
419 record.wa[-1].info.append(line)
420 line = ""
421 elif line.startswith("CT{"):
422 if record.ct is None:
423 record.ct=[]
424 try:
425 line = handle.next()
426 except StopIteration:
427 raise ValueError("Failed to read CT block")
428 record.ct.append(ct(line))
429 for line in handle:
430 line=line.strip()
431 if line=="COMMENT{":
432 for line in handle:
433 line = line.strip()
434 if line.endswith("C}"):
435 break
436 record.ct[-1].comment.append(line)
437 elif line=='}':
438 break
439 else:
440 record.ct[-1].info.append(line)
441 line = ""
442 else:
443 break
444
445 if not line.startswith('RD'):
446 break
447
448 yield record
449
451 """Holds data of an ACE file.
452 """
454 self.ncontigs=None
455 self.nreads=None
456 self.contigs=[]
457 self.wa=None
458
460 """Sorts wr, rt and ct tags into the appropriate contig / read instance, if possible. """
461
462 ct=[]
463 rt=[]
464 wr=[]
465
466 for i in range(len(self.contigs)):
467 c = self.contigs[i]
468 if c.wa:
469 if not self.wa:
470 self.wa=[]
471 self.wa.extend(c.wa)
472 if c.ct:
473 newcts=[ct_tag for ct_tag in c.ct if ct_tag.name!=c.name]
474 for x in newcts:
475 self.contigs[i].ct.remove(x)
476 ct.extend(newcts)
477 for j in range(len(c.reads)):
478 r = c.reads[j]
479 if r.rt:
480 newrts=[rt_tag for rt_tag in r.rt if rt_tag.name!=r.rd.name]
481 for x in newrts:
482 self.contigs[i].reads[j].rt.remove(x)
483 rt.extend(newrts)
484 if r.wr:
485 newwrs=[wr_tag for wr_tag in r.wr if wr_tag.name!=r.rd.name]
486 for x in newwrs:
487 self.contigs[i].reads[j].wr.remove(x)
488 wr.extend(newwrs)
489
490 for i in range(len(self.contigs)):
491 c = self.contigs[i]
492 for ct_tag in ct:
493 if ct_tag.name==c.name:
494 if self.contigs[i].ct is None:
495 self.contigs[i].ct=[]
496 self.contigs[i].ct.append(ct_tag)
497 if rt or wr:
498 for j in range(len(c.reads)):
499 r = c.reads[j]
500 for rt_tag in rt:
501 if rt_tag.name==r.rd.name:
502 if self.contigs[i].reads[j].rt is None:
503 self.contigs[i].reads[j].rt=[]
504 self.contigs[i].reads[j].rt.append(rt_tag)
505 for wr_tag in wr:
506 if wr_tag.name==r.rd.name:
507 if self.contigs[i].reads[j].wr is None:
508 self.contigs[i].reads[j].wr=[]
509 self.contigs[i].reads[j].wr.append(wr_tag)
510
512 """Parses the full ACE file in list of contigs.
513
514 """
515
516 handle = iter(handle)
517
518 record=ACEFileRecord()
519
520 try:
521 line = handle.next()
522 except StopIteration:
523 raise ValueError("Premature end of file")
524
525
526 if not line.startswith('AS'):
527 raise ValueError("File does not start with 'AS'.")
528
529 words = line.split()
530 record.ncontigs, record.nreads = map(int, words[1:3])
531
532
533 record.contigs = list(parse(handle))
534
535
536
537
538
539
540 record.sort()
541 return record
542