Package Bio :: Package Phylo :: Package PAML :: Module _parse_codeml
[hide private]
[frames] | no frames]

Source Code for Module Bio.Phylo.PAML._parse_codeml

  1  # Copyright (C) 2011 by Brandon Invergo (b.invergo@gmail.com) 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license. Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  import re 
  7   
  8  line_floats_re = re.compile("-*\d+\.\d+") 
  9   
 10  try: 
 11      float("nan") 
 12      _nan_float = float 
 13  except ValueError: 
 14      #Happens prior to Python 2.6 depending on C library, e.g. breaks on WinXP 
15 - def _nan_float(text):
16 try: 17 return float(text) 18 except ValueError: 19 if text.lower()=="nan": 20 import struct 21 return struct.unpack('d', struct.pack('Q', 0xfff8000000000000))[0] 22 else: 23 raise
24
25 -def parse_basics(lines, results):
26 """Parse the basic information that should be present in most codeml output files. 27 """ 28 # multi_models is used to designate there being results for more than one 29 # model in the file 30 multi_models = False 31 version_re = re.compile(".+ \(in paml version (\d+\.\d+[a-z]*).*") 32 model_re = re.compile("Model:\s+(.+)") 33 codon_freq_re = re.compile("Codon frequency model:\s+(.+)") 34 siteclass_re = re.compile("Site-class models:\s*(.*)") 35 for line in lines: 36 # Find all floating point numbers in this line 37 line_floats_res = line_floats_re.findall(line) 38 line_floats = [_nan_float(val) for val in line_floats_res] 39 # Get the program version number 40 version_res = version_re.match(line) 41 if version_res is not None: 42 results["version"] = version_res.group(1) 43 continue 44 # Find the model description at the beginning of the file. 45 model_res = model_re.match(line) 46 if model_res is not None: 47 results["model"] = model_res.group(1) 48 continue 49 # Get the codon substitution model 50 codon_freq_res = codon_freq_re.match(line) 51 if codon_freq_res is not None: 52 results["codon model"] = codon_freq_res.group(1) 53 continue 54 # Find the site-class model name at the beginning of the file. 55 # This exists only if a NSsites class other than 0 is used. 56 # If there's no site class model listed, then there are multiple 57 # models in the results file 58 # Example match: "Site-class models: PositiveSelection" 59 siteclass_res = siteclass_re.match(line) 60 if siteclass_res is not None: 61 siteclass_model = siteclass_res.group(1) 62 if siteclass_model != "": 63 results["site-class model"] = siteclass_model 64 multi_models = False 65 else: 66 multi_models = True 67 # Get the maximum log-likelihood 68 if "ln Lmax" in line and len(line_floats) > 0: 69 results["lnL max"] = line_floats[0] 70 return (results, multi_models)
71
72 -def parse_nssites(lines, results, multi_models):
73 """Determine which NSsites models are present and parse them. 74 """ 75 76 ns_sites = {} 77 model_re = re.compile("Model (\d+):\s+(.+)") 78 siteclass_model = results.get("site-class model") 79 if not multi_models: 80 # If there's only one model in the results, find out 81 # which one it is and then parse it. 82 if siteclass_model is None: 83 siteclass_model = "one-ratio" 84 current_model = {"one-ratio" : 0, 85 "NearlyNeutral" : 1, 86 "PositiveSelection" : 2, 87 "discrete (4 categories)" : 3, 88 "beta (4 categories)" : 7, 89 "beta&w>1 (5 categories)" : 8}[siteclass_model] 90 model_results = {"description" : siteclass_model} 91 model_results = parse_model(lines, model_results) 92 ns_sites[current_model] = model_results 93 else: 94 # If there are multiple models in the results, scan through 95 # the file and send each model's text to be parsed individually. 96 current_model = None 97 model_start = None 98 for line_num in range(len(lines)): 99 # Find model names. If this is found on a line, 100 # all of the following lines until the next time this matches 101 # contain results for Model X. 102 # Example match: "Model 1: NearlyNeutral (2 categories)" 103 model_res = model_re.match(lines[line_num]) 104 if model_res: 105 if current_model is not None: 106 # We've already been tracking a model, so it's time 107 # to send those lines off for parsing before beginning 108 # a new one. 109 parse_model(lines[model_start:line_num], model_results) 110 ns_sites[current_model] = model_results 111 model_start = line_num 112 current_model = int(model_res.group(1)) 113 model_results = {"description":model_res.group(2)} 114 if ns_sites.get(current_model) is None: 115 # When we reach the end of the file, we'll still have one more 116 # model to parse. 117 model_results = parse_model(lines[model_start:], model_results) 118 ns_sites[current_model] = model_results 119 # Only add the ns_sites dict to the results if we really have results. 120 # Model M0 is added by default in some cases, so if it exists, make sure 121 # it's not empty 122 if len(ns_sites) == 1: 123 m0 = ns_sites.get(0) 124 if not m0 or len(m0) > 1: 125 results["NSsites"] = ns_sites 126 elif len(ns_sites) > 1: 127 results["NSsites"] = ns_sites 128 return results
129
130 -def parse_model(lines, results):
131 """Parse an individual NSsites model's results. 132 """ 133 parameters = {} 134 SEs_flag = False 135 dS_tree_flag = False 136 dN_tree_flag = False 137 w_tree_flag = False 138 num_params = None 139 tree_re = re.compile("\(\(+") 140 branch_re = re.compile("\s+(\d+\.\.\d+)[\s+\d+\.\d+]+") 141 model_params_re = re.compile("([a-z]\d?)=\s+(\d+\.\d+)") 142 for line in lines: 143 # Find all floating point numbers in this line 144 line_floats_res = line_floats_re.findall(line) 145 line_floats = [_nan_float(val) for val in line_floats_res] 146 # Check if branch-specific results are in the line 147 branch_res = branch_re.match(line) 148 # Check if additional model parameters are in the line 149 model_params = model_params_re.findall(line) 150 # Find lnL values. 151 # Example match (lnL = -2021.348300): 152 # "lnL(ntime: 19 np: 22): -2021.348300 +0.000000" 153 if "lnL(ntime:" in line and len(line_floats) > 0: 154 results["lnL"] = line_floats[0] 155 np_res = re.match("lnL\(ntime:\s+\d+\s+np:\s+(\d+)\)",line) 156 if np_res is not None: 157 num_params = int(np_res.group(1)) 158 # Get parameter list. This can be useful for specifying starting 159 # parameters in another run by copying the list of parameters 160 # to a file called in.codeml. Since the parameters must be in 161 # a fixed order and format, copying and pasting to the file is 162 # best. For this reason, they are grabbed here just as a long 163 # string and not as individual numbers. 164 elif len(line_floats) == num_params and not SEs_flag: 165 parameters["parameter list"] = line.strip() 166 # Find SEs. The same format as parameters above is maintained 167 # since there is a correspondance between the SE format and 168 # the parameter format. 169 # Example match: 170 # "SEs for parameters: 171 # -1.00000 -1.00000 -1.00000 801727.63247 730462.67590 -1.00000 172 elif "SEs for parameters:" in line: 173 SEs_flag = True 174 elif SEs_flag and len(line_floats) == num_params: 175 parameters["SEs"] = line.strip() 176 SEs_flag = False 177 # Find tree lengths. 178 # Example match: "tree length = 1.71931" 179 elif "tree length =" in line and len(line_floats) > 0: 180 results["tree length"] = line_floats[0] 181 # Find the estimated trees only taking the tree if it has 182 # lengths or rate estimates on the branches 183 elif tree_re.match(line) is not None: 184 if ":" in line: 185 if dS_tree_flag: 186 results["dS tree"] = line.strip() 187 dS_tree_flag = False 188 elif dN_tree_flag: 189 results["dN tree"] = line.strip() 190 dN_tree_flag = False 191 elif w_tree_flag: 192 line_edit = line.replace(" '#", ": ") 193 line_edit = line_edit.replace("'", "") 194 line_edit = line_edit.replace(" ,", ",") 195 line_edit = line_edit.replace(" )", ")") 196 results["omega tree"] = line_edit.strip() 197 w_tree_flag = False 198 else: 199 results["tree"] = line.strip() 200 elif "dS tree:" in line: 201 dS_tree_flag = True 202 elif "dN tree:" in line: 203 dN_tree_flag = True 204 elif "w ratios as labels for TreeView:" in line: 205 w_tree_flag = True 206 # Find rates for multiple genes 207 # Example match: "rates for 2 genes: 1 2.75551" 208 elif "rates for" in line and len(line_floats) > 0: 209 line_floats.insert(0, 1.0) 210 parameters["rates"] = line_floats 211 # Find kappa values. 212 # Example match: "kappa (ts/tv) = 2.77541" 213 elif "kappa (ts/tv)" in line and len(line_floats) > 0: 214 parameters["kappa"] = line_floats[0] 215 # Find omega values. 216 # Example match: "omega (dN/dS) = 0.25122" 217 elif "omega (dN/dS)" in line and len(line_floats) > 0: 218 parameters["omega"] = line_floats[0] 219 elif "w (dN/dS)" in line and len(line_floats) > 0: 220 parameters["omega"] = line_floats 221 # Find omega and kappa values for multi-gene files 222 # Example match: "gene # 1: kappa = 1.72615 omega = 0.39333" 223 elif "gene # " in line: 224 gene_num = int(re.match("gene # (\d+)", line).group(1)) 225 if parameters.get("genes") is None: 226 parameters["genes"] = {} 227 parameters["genes"][gene_num] = {"kappa":line_floats[0], 228 "omega":line_floats[1]} 229 # Find dN values. 230 # Example match: "tree length for dN: 0.2990" 231 elif "tree length for dN" in line and len(line_floats) > 0: 232 parameters["dN"] = line_floats[0] 233 # Find dS values 234 # Example match: "tree length for dS: 1.1901" 235 elif "tree length for dS" in line and len(line_floats) > 0: 236 parameters["dS"] = line_floats[0] 237 # Find site class distributions. 238 # Example match 1 (normal model, 2 site classes): 239 # "p: 0.77615 0.22385" 240 # Example match 2 (branch site A model, 4 site classes): 241 # "proportion 0.00000 0.00000 0.73921 0.26079" 242 elif line[0:2] == "p:" or line[0:10] == "proportion": 243 site_classes = parse_siteclass_proportions(line_floats) 244 parameters["site classes"] = site_classes 245 # Find the omega value corresponding to each site class 246 # Example match (2 site classes): "w: 0.10224 1.00000" 247 elif line[0:2] == "w:": 248 site_classes = parameters.get("site classes") 249 site_classes = parse_siteclass_omegas(line, site_classes) 250 parameters["site classes"] = site_classes 251 # Find the omega values corresponding to a branch type from 252 # the clade model C for each site class 253 # Example match: 254 # "branch type 0: 0.31022 1.00000 0.00000" 255 elif "branch type " in line: 256 branch_type = re.match("branch type (\d)", line) 257 if branch_type: 258 site_classes = parameters.get("site classes") 259 branch_type_no = int(branch_type.group(1)) 260 site_classes = parse_clademodelc(branch_type_no, line_floats, 261 site_classes) 262 parameters["site classes"] = site_classes 263 # Find the omega values of the foreground branch for each site 264 # class in the branch site A model 265 # Example match: 266 # "foreground w 0.07992 1.00000 134.54218 134.54218" 267 elif line[0:12] == "foreground w": 268 site_classes = parameters.get("site classes") 269 site_classes = parse_branch_site_a(True, line_floats, site_classes) 270 parameters["site classes"] = site_classes 271 # Find the omega values of the background for each site 272 # class in the branch site A model 273 # Example match: 274 # "background w 0.07992 1.00000 0.07992 1.00000" 275 elif line[0:12] == "background w": 276 site_classes = parameters.get("site classes") 277 site_classes = parse_branch_site_a(False, line_floats, site_classes) 278 parameters["site classes"] = site_classes 279 # Find dN & dS for each branch, which is organized in a table 280 # The possibility of NaNs forces me to not use the line_floats 281 # method. 282 # Example row (some spaces removed to make it smaller...). 283 # " 6..7 0.000 167.7 54.3 0.0000 0.0000 0.0000 0.0 0.0" 284 elif branch_res is not None and len(line_floats) > 0: 285 branch = branch_res.group(1) 286 if parameters.get("branches") is None: 287 parameters["branches"] = {} 288 #Hack for Jython http://bugs.jython.org/issue1762 float("-nan") 289 line = line.replace(" -nan", " nan") 290 params = line.strip().split()[1:] 291 parameters["branches"][branch]= { 292 "t" : _nan_float(params[0].strip()), 293 "N" : _nan_float(params[1].strip()), 294 "S" : _nan_float(params[2].strip()), 295 "omega" :_nan_float(params[3].strip()), 296 "dN" : _nan_float(params[4].strip()), 297 "dS" : _nan_float(params[5].strip()), 298 "N*dN" : _nan_float(params[6].strip()), 299 "S*dS" : _nan_float(params[7].strip())} 300 # Find model parameters, which can be spread across multiple 301 # lines. 302 # Example matches: 303 # " p0= 0.99043 p= 0.36657 q= 1.04445 304 #" (p1= 0.00957) w= 3.25530" 305 elif len(model_params) > 0: 306 float_model_params = [] 307 for param in model_params: 308 float_model_params.append((param[0], _nan_float(param[1]))) 309 parameters = dict(parameters.items() + float_model_params) 310 if len(parameters) > 0: 311 results["parameters"] = parameters 312 return results
313
314 -def parse_siteclass_proportions(line_floats):
315 """For models which have multiple site classes, find the proportion of the alignment assigned to each class. 316 """ 317 site_classes = {} 318 if len(line_floats) > 0: 319 for n in range(len(line_floats)): 320 site_classes[n] = {"proportion" : line_floats[n]} 321 return site_classes
322
323 -def parse_siteclass_omegas(line, site_classes):
324 """For models which have multiple site classes, find the omega estimated for each class. 325 """ 326 # The omega results are tabular with strictly 9 characters per column 327 # (1 to 3 digits before the decimal point and 5 after). This causes 328 # numbers to sometimes run into each other, so we must use a different 329 # regular expression to account for this. i.e.: 330 # w: 0.00012 1.00000109.87121 331 line_floats = re.findall("\d{1,3}\.\d{5}", line) 332 if not site_classes or len(line_floats) == 0: 333 return 334 for n in range(len(line_floats)): 335 site_classes[n]["omega"] = line_floats[n] 336 return site_classes
337
338 -def parse_clademodelc(branch_type_no, line_floats, site_classes):
339 """Parse results specific to the clade model C. 340 """ 341 if not site_classes or len(line_floats) == 0: 342 return 343 for n in range(len(line_floats)): 344 if site_classes[n].get("branch types") is None: 345 site_classes[n]["branch types"] = {} 346 site_classes[n]["branch types"][branch_type_no] = line_floats[n] 347 return site_classes
348
349 -def parse_branch_site_a(foreground, line_floats, site_classes):
350 """Parse results specific to the branch site A model. 351 """ 352 if not site_classes or len(line_floats) == 0: 353 return 354 for n in range(len(line_floats)): 355 if site_classes[n].get("branch types") is None: 356 site_classes[n]["branch types"] = {} 357 if foreground: 358 site_classes[n]["branch types"]["foreground"] =\ 359 line_floats[n] 360 else: 361 site_classes[n]["branch types"]["background"] =\ 362 line_floats[n] 363 return site_classes
364
365 -def parse_pairwise(lines, results):
366 """Parse results from pairwise comparisons. 367 """ 368 # Find pairwise comparisons 369 # Example: 370 # 2 (Pan_troglo) ... 1 (Homo_sapie) 371 # lnL = -291.465693 372 # 0.01262 999.00000 0.00100 373 # 374 # t= 0.0126 S= 81.4 N= 140.6 dN/dS= 0.0010 dN= 0.0000 dS= 0.0115 375 pair_re = re.compile("\d+ \((.+)\) ... \d+ \((.+)\)") 376 pairwise = {} 377 for line in lines: 378 # Find all floating point numbers in this line 379 line_floats_res = line_floats_re.findall(line) 380 line_floats = [_nan_float(val) for val in line_floats_res] 381 pair_res = pair_re.match(line) 382 if pair_res: 383 seq1 = pair_res.group(1) 384 seq2 = pair_res.group(2) 385 if pairwise.get(seq1) is None: 386 pairwise[seq1] = {} 387 if pairwise.get(seq2) is None: 388 pairwise[seq2] = {} 389 if len(line_floats) == 1: 390 pairwise[seq1][seq2] = {"lnL" : line_floats[0]} 391 pairwise[seq2][seq1] = pairwise[seq1][seq2] 392 elif len(line_floats) == 6: 393 pairwise[seq1][seq2] = {"t" : line_floats[0], 394 "S" : line_floats[1], 395 "N" : line_floats[2], 396 "omega" : line_floats[3], 397 "dN" : line_floats[4], 398 "dS" : line_floats[5]} 399 pairwise[seq2][seq1] = pairwise[seq1][seq2] 400 if len(pairwise) > 0: 401 results["pairwise"] = pairwise 402 return results
403
404 -def parse_distances(lines, results):
405 """Parse amino acid sequence distance results. 406 """ 407 distances = {} 408 sequences = [] 409 raw_aa_distances_flag = False 410 ml_aa_distances_flag = False 411 matrix_row_re = re.compile("(.+)\s{5,15}") 412 for line in lines: 413 # Find all floating point numbers in this line 414 line_floats_res = line_floats_re.findall(line) 415 line_floats = [_nan_float(val) for val in line_floats_res] 416 if "AA distances" in line: 417 raw_aa_distances_flag = True 418 # In current versions, the raw distances always come 419 # first but I don't trust this to always be true 420 ml_aa_distances_flag = False 421 elif "ML distances of aa seqs." in line: 422 ml_aa_distances_flag = True 423 raw_aa_distances_flag = False 424 # Parse AA distances (raw or ML), in a lower diagonal matrix 425 matrix_row_res = matrix_row_re.match(line) 426 if matrix_row_res and (raw_aa_distances_flag or \ 427 ml_aa_distances_flag): 428 seq_name = matrix_row_res.group(1).strip() 429 if seq_name not in sequences: 430 sequences.append(seq_name) 431 if raw_aa_distances_flag: 432 if distances.get("raw") is None: 433 distances["raw"] = {} 434 distances["raw"][seq_name] = {} 435 for i in range(0, len(line_floats)): 436 distances["raw"][seq_name][sequences[i]] = line_floats[i] 437 distances["raw"][sequences[i]][seq_name] = line_floats[i] 438 else: 439 if distances.get("ml") is None: 440 distances["ml"] = {} 441 distances["ml"][seq_name] = {} 442 for i in range(0, len(line_floats)): 443 distances["ml"][seq_name][sequences[i]] = line_floats[i] 444 distances["ml"][sequences[i]][seq_name] = line_floats[i] 445 if len(distances) > 0: 446 results["distances"] = distances 447 return results
448