1
2
3
4
5
6 import re
7
8 line_floats_re = re.compile("-*\d+\.\d+")
9
10 try:
11 float("nan")
12 _nan_float = float
13 except ValueError:
14
16 try:
17 return float(text)
18 except ValueError:
19 if text.lower()=="nan":
20 import struct
21 return struct.unpack('d', struct.pack('Q', 0xfff8000000000000))[0]
22 else:
23 raise
24
26 """Parse the basic information that should be present in most codeml output files.
27 """
28
29
30 multi_models = False
31 version_re = re.compile(".+ \(in paml version (\d+\.\d+[a-z]*).*")
32 model_re = re.compile("Model:\s+(.+)")
33 codon_freq_re = re.compile("Codon frequency model:\s+(.+)")
34 siteclass_re = re.compile("Site-class models:\s*(.*)")
35 for line in lines:
36
37 line_floats_res = line_floats_re.findall(line)
38 line_floats = [_nan_float(val) for val in line_floats_res]
39
40 version_res = version_re.match(line)
41 if version_res is not None:
42 results["version"] = version_res.group(1)
43 continue
44
45 model_res = model_re.match(line)
46 if model_res is not None:
47 results["model"] = model_res.group(1)
48 continue
49
50 codon_freq_res = codon_freq_re.match(line)
51 if codon_freq_res is not None:
52 results["codon model"] = codon_freq_res.group(1)
53 continue
54
55
56
57
58
59 siteclass_res = siteclass_re.match(line)
60 if siteclass_res is not None:
61 siteclass_model = siteclass_res.group(1)
62 if siteclass_model != "":
63 results["site-class model"] = siteclass_model
64 multi_models = False
65 else:
66 multi_models = True
67
68 if "ln Lmax" in line and len(line_floats) > 0:
69 results["lnL max"] = line_floats[0]
70 return (results, multi_models)
71
73 """Determine which NSsites models are present and parse them.
74 """
75
76 ns_sites = {}
77 model_re = re.compile("Model (\d+):\s+(.+)")
78 siteclass_model = results.get("site-class model")
79 if not multi_models:
80
81
82 if siteclass_model is None:
83 siteclass_model = "one-ratio"
84 current_model = {"one-ratio" : 0,
85 "NearlyNeutral" : 1,
86 "PositiveSelection" : 2,
87 "discrete (4 categories)" : 3,
88 "beta (4 categories)" : 7,
89 "beta&w>1 (5 categories)" : 8}[siteclass_model]
90 model_results = {"description" : siteclass_model}
91 model_results = parse_model(lines, model_results)
92 ns_sites[current_model] = model_results
93 else:
94
95
96 current_model = None
97 model_start = None
98 for line_num in range(len(lines)):
99
100
101
102
103 model_res = model_re.match(lines[line_num])
104 if model_res:
105 if current_model is not None:
106
107
108
109 parse_model(lines[model_start:line_num], model_results)
110 ns_sites[current_model] = model_results
111 model_start = line_num
112 current_model = int(model_res.group(1))
113 model_results = {"description":model_res.group(2)}
114 if ns_sites.get(current_model) is None:
115
116
117 model_results = parse_model(lines[model_start:], model_results)
118 ns_sites[current_model] = model_results
119
120
121
122 if len(ns_sites) == 1:
123 m0 = ns_sites.get(0)
124 if not m0 or len(m0) > 1:
125 results["NSsites"] = ns_sites
126 elif len(ns_sites) > 1:
127 results["NSsites"] = ns_sites
128 return results
129
131 """Parse an individual NSsites model's results.
132 """
133 parameters = {}
134 SEs_flag = False
135 dS_tree_flag = False
136 dN_tree_flag = False
137 w_tree_flag = False
138 num_params = None
139 tree_re = re.compile("\(\(+")
140 branch_re = re.compile("\s+(\d+\.\.\d+)[\s+\d+\.\d+]+")
141 model_params_re = re.compile("([a-z]\d?)=\s+(\d+\.\d+)")
142 for line in lines:
143
144 line_floats_res = line_floats_re.findall(line)
145 line_floats = [_nan_float(val) for val in line_floats_res]
146
147 branch_res = branch_re.match(line)
148
149 model_params = model_params_re.findall(line)
150
151
152
153 if "lnL(ntime:" in line and len(line_floats) > 0:
154 results["lnL"] = line_floats[0]
155 np_res = re.match("lnL\(ntime:\s+\d+\s+np:\s+(\d+)\)",line)
156 if np_res is not None:
157 num_params = int(np_res.group(1))
158
159
160
161
162
163
164 elif len(line_floats) == num_params and not SEs_flag:
165 parameters["parameter list"] = line.strip()
166
167
168
169
170
171
172 elif "SEs for parameters:" in line:
173 SEs_flag = True
174 elif SEs_flag and len(line_floats) == num_params:
175 parameters["SEs"] = line.strip()
176 SEs_flag = False
177
178
179 elif "tree length =" in line and len(line_floats) > 0:
180 results["tree length"] = line_floats[0]
181
182
183 elif tree_re.match(line) is not None:
184 if ":" in line:
185 if dS_tree_flag:
186 results["dS tree"] = line.strip()
187 dS_tree_flag = False
188 elif dN_tree_flag:
189 results["dN tree"] = line.strip()
190 dN_tree_flag = False
191 elif w_tree_flag:
192 line_edit = line.replace(" '#", ": ")
193 line_edit = line_edit.replace("'", "")
194 line_edit = line_edit.replace(" ,", ",")
195 line_edit = line_edit.replace(" )", ")")
196 results["omega tree"] = line_edit.strip()
197 w_tree_flag = False
198 else:
199 results["tree"] = line.strip()
200 elif "dS tree:" in line:
201 dS_tree_flag = True
202 elif "dN tree:" in line:
203 dN_tree_flag = True
204 elif "w ratios as labels for TreeView:" in line:
205 w_tree_flag = True
206
207
208 elif "rates for" in line and len(line_floats) > 0:
209 line_floats.insert(0, 1.0)
210 parameters["rates"] = line_floats
211
212
213 elif "kappa (ts/tv)" in line and len(line_floats) > 0:
214 parameters["kappa"] = line_floats[0]
215
216
217 elif "omega (dN/dS)" in line and len(line_floats) > 0:
218 parameters["omega"] = line_floats[0]
219 elif "w (dN/dS)" in line and len(line_floats) > 0:
220 parameters["omega"] = line_floats
221
222
223 elif "gene # " in line:
224 gene_num = int(re.match("gene # (\d+)", line).group(1))
225 if parameters.get("genes") is None:
226 parameters["genes"] = {}
227 parameters["genes"][gene_num] = {"kappa":line_floats[0],
228 "omega":line_floats[1]}
229
230
231 elif "tree length for dN" in line and len(line_floats) > 0:
232 parameters["dN"] = line_floats[0]
233
234
235 elif "tree length for dS" in line and len(line_floats) > 0:
236 parameters["dS"] = line_floats[0]
237
238
239
240
241
242 elif line[0:2] == "p:" or line[0:10] == "proportion":
243 site_classes = parse_siteclass_proportions(line_floats)
244 parameters["site classes"] = site_classes
245
246
247 elif line[0:2] == "w:":
248 site_classes = parameters.get("site classes")
249 site_classes = parse_siteclass_omegas(line, site_classes)
250 parameters["site classes"] = site_classes
251
252
253
254
255 elif "branch type " in line:
256 branch_type = re.match("branch type (\d)", line)
257 if branch_type:
258 site_classes = parameters.get("site classes")
259 branch_type_no = int(branch_type.group(1))
260 site_classes = parse_clademodelc(branch_type_no, line_floats,
261 site_classes)
262 parameters["site classes"] = site_classes
263
264
265
266
267 elif line[0:12] == "foreground w":
268 site_classes = parameters.get("site classes")
269 site_classes = parse_branch_site_a(True, line_floats, site_classes)
270 parameters["site classes"] = site_classes
271
272
273
274
275 elif line[0:12] == "background w":
276 site_classes = parameters.get("site classes")
277 site_classes = parse_branch_site_a(False, line_floats, site_classes)
278 parameters["site classes"] = site_classes
279
280
281
282
283
284 elif branch_res is not None and len(line_floats) > 0:
285 branch = branch_res.group(1)
286 if parameters.get("branches") is None:
287 parameters["branches"] = {}
288
289 line = line.replace(" -nan", " nan")
290 params = line.strip().split()[1:]
291 parameters["branches"][branch]= {
292 "t" : _nan_float(params[0].strip()),
293 "N" : _nan_float(params[1].strip()),
294 "S" : _nan_float(params[2].strip()),
295 "omega" :_nan_float(params[3].strip()),
296 "dN" : _nan_float(params[4].strip()),
297 "dS" : _nan_float(params[5].strip()),
298 "N*dN" : _nan_float(params[6].strip()),
299 "S*dS" : _nan_float(params[7].strip())}
300
301
302
303
304
305 elif len(model_params) > 0:
306 float_model_params = []
307 for param in model_params:
308 float_model_params.append((param[0], _nan_float(param[1])))
309 parameters = dict(parameters.items() + float_model_params)
310 if len(parameters) > 0:
311 results["parameters"] = parameters
312 return results
313
315 """For models which have multiple site classes, find the proportion of the alignment assigned to each class.
316 """
317 site_classes = {}
318 if len(line_floats) > 0:
319 for n in range(len(line_floats)):
320 site_classes[n] = {"proportion" : line_floats[n]}
321 return site_classes
322
324 """For models which have multiple site classes, find the omega estimated for each class.
325 """
326
327
328
329
330
331 line_floats = re.findall("\d{1,3}\.\d{5}", line)
332 if not site_classes or len(line_floats) == 0:
333 return
334 for n in range(len(line_floats)):
335 site_classes[n]["omega"] = line_floats[n]
336 return site_classes
337
339 """Parse results specific to the clade model C.
340 """
341 if not site_classes or len(line_floats) == 0:
342 return
343 for n in range(len(line_floats)):
344 if site_classes[n].get("branch types") is None:
345 site_classes[n]["branch types"] = {}
346 site_classes[n]["branch types"][branch_type_no] = line_floats[n]
347 return site_classes
348
350 """Parse results specific to the branch site A model.
351 """
352 if not site_classes or len(line_floats) == 0:
353 return
354 for n in range(len(line_floats)):
355 if site_classes[n].get("branch types") is None:
356 site_classes[n]["branch types"] = {}
357 if foreground:
358 site_classes[n]["branch types"]["foreground"] =\
359 line_floats[n]
360 else:
361 site_classes[n]["branch types"]["background"] =\
362 line_floats[n]
363 return site_classes
364
366 """Parse results from pairwise comparisons.
367 """
368
369
370
371
372
373
374
375 pair_re = re.compile("\d+ \((.+)\) ... \d+ \((.+)\)")
376 pairwise = {}
377 for line in lines:
378
379 line_floats_res = line_floats_re.findall(line)
380 line_floats = [_nan_float(val) for val in line_floats_res]
381 pair_res = pair_re.match(line)
382 if pair_res:
383 seq1 = pair_res.group(1)
384 seq2 = pair_res.group(2)
385 if pairwise.get(seq1) is None:
386 pairwise[seq1] = {}
387 if pairwise.get(seq2) is None:
388 pairwise[seq2] = {}
389 if len(line_floats) == 1:
390 pairwise[seq1][seq2] = {"lnL" : line_floats[0]}
391 pairwise[seq2][seq1] = pairwise[seq1][seq2]
392 elif len(line_floats) == 6:
393 pairwise[seq1][seq2] = {"t" : line_floats[0],
394 "S" : line_floats[1],
395 "N" : line_floats[2],
396 "omega" : line_floats[3],
397 "dN" : line_floats[4],
398 "dS" : line_floats[5]}
399 pairwise[seq2][seq1] = pairwise[seq1][seq2]
400 if len(pairwise) > 0:
401 results["pairwise"] = pairwise
402 return results
403
405 """Parse amino acid sequence distance results.
406 """
407 distances = {}
408 sequences = []
409 raw_aa_distances_flag = False
410 ml_aa_distances_flag = False
411 matrix_row_re = re.compile("(.+)\s{5,15}")
412 for line in lines:
413
414 line_floats_res = line_floats_re.findall(line)
415 line_floats = [_nan_float(val) for val in line_floats_res]
416 if "AA distances" in line:
417 raw_aa_distances_flag = True
418
419
420 ml_aa_distances_flag = False
421 elif "ML distances of aa seqs." in line:
422 ml_aa_distances_flag = True
423 raw_aa_distances_flag = False
424
425 matrix_row_res = matrix_row_re.match(line)
426 if matrix_row_res and (raw_aa_distances_flag or \
427 ml_aa_distances_flag):
428 seq_name = matrix_row_res.group(1).strip()
429 if seq_name not in sequences:
430 sequences.append(seq_name)
431 if raw_aa_distances_flag:
432 if distances.get("raw") is None:
433 distances["raw"] = {}
434 distances["raw"][seq_name] = {}
435 for i in range(0, len(line_floats)):
436 distances["raw"][seq_name][sequences[i]] = line_floats[i]
437 distances["raw"][sequences[i]][seq_name] = line_floats[i]
438 else:
439 if distances.get("ml") is None:
440 distances["ml"] = {}
441 distances["ml"][seq_name] = {}
442 for i in range(0, len(line_floats)):
443 distances["ml"][seq_name][sequences[i]] = line_floats[i]
444 distances["ml"][sequences[i]][seq_name] = line_floats[i]
445 if len(distances) > 0:
446 results["distances"] = distances
447 return results
448