Package Bio :: Package Phylo :: Package PAML :: Module _parse_baseml
[hide private]
[frames] | no frames]

Source Code for Module Bio.Phylo.PAML._parse_baseml

  1  # Copyright (C) 2011 by Brandon Invergo (b.invergo@gmail.com) 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license. Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  import re 
  7   
  8  line_floats_re = re.compile("-*\d+\.\d+") 
  9   
10 -def parse_basics(lines, results):
11 """Parse the basics that should be present in most baseml results files. 12 """ 13 version_re = re.compile("BASEML \(in paml version (\d+\.\d+[a-z]*).*") 14 np_re = re.compile("lnL\(ntime:\s+\d+\s+np:\s+(\d+)\)") 15 num_params = -1 16 for line in lines: 17 # Find all floating point numbers in this line 18 line_floats_res = line_floats_re.findall(line) 19 line_floats = [float(val) for val in line_floats_res] 20 # Find the version number 21 # Example match: 22 # "BASEML (in paml version 4.3, August 2009) alignment.phylip" 23 version_res = version_re.match(line) 24 if version_res is not None: 25 results["version"] = version_res.group(1) 26 # Find max lnL 27 # Example match: 28 # ln Lmax (unconstrained) = -316.049385 29 if "ln Lmax" in line and len(line_floats) == 1: 30 results["lnL max"] = line_floats[0] 31 # Find lnL values. 32 # Example match (lnL = -2021.348300): 33 # "lnL(ntime: 19 np: 22): -2021.348300 +0.000000" 34 elif "lnL(ntime:" in line and len(line_floats) > 0: 35 results["lnL"] = line_floats[0] 36 np_res = np_re.match(line) 37 if np_res is not None: 38 num_params = int(np_res.group(1)) 39 # Find tree lengths. 40 # Example match: "tree length = 1.71931" 41 elif "tree length" in line and len(line_floats) == 1: 42 results["tree length"] = line_floats[0] 43 # Find the estimated tree, only taking the tree if it has 44 # branch lengths 45 elif re.match("\(+", line) is not None: 46 if ":" in line: 47 results["tree"] = line.strip() 48 return (results, num_params)
49
50 -def parse_parameters(lines, results, num_params):
51 """Parse the various parameters from the file. 52 """ 53 parameters = {} 54 parameters = parse_parameter_list(lines, parameters, num_params) 55 parameters = parse_kappas(lines, parameters) 56 parameters = parse_rates(lines, parameters) 57 parameters = parse_freqs(lines, parameters) 58 results["parameters"] = parameters 59 return results 60
61 -def parse_parameter_list(lines, parameters, num_params):
62 """ Parse the parameters list, which is just an unlabeled list of numeric values. 63 """ 64 for line_num in range(len(lines)): 65 line = lines[line_num] 66 # Find all floating point numbers in this line 67 line_floats_res = line_floats_re.findall(line) 68 line_floats = [float(val) for val in line_floats_res] 69 # Get parameter list. This can be useful for specifying starting 70 # parameters in another run by copying the list of parameters 71 # to a file called in.baseml. Since the parameters must be in 72 # a fixed order and format, copying and pasting to the file is 73 # best. For this reason, they are grabbed here just as a long 74 # string and not as individual numbers. 75 if len(line_floats) == num_params: 76 parameters["parameter list"] = line.strip() 77 # Find SEs. The same format as parameters above is maintained 78 # since there is a correspondance between the SE format and 79 # the parameter format. 80 # Example match: 81 # "SEs for parameters: 82 # -1.00000 -1.00000 -1.00000 801727.63247 730462.67590 -1.00000 83 if "SEs for parameters:" in lines[line_num + 1]: 84 SEs_line = lines[line_num + 2] 85 parameters["SEs"] = SEs_line.strip() 86 break 87 return parameters
88
89 -def parse_kappas(lines, parameters):
90 """Parse out the kappa parameters. 91 """ 92 kappa_found = False 93 for line in lines: 94 # Find all floating point numbers in this line 95 line_floats_res = line_floats_re.findall(line) 96 line_floats = [float(val) for val in line_floats_res] 97 # Find kappa parameter (F84, HKY85, T92 model) 98 # Example match: 99 # "Parameters (kappa) in the rate matrix (F84) (Yang 1994 J Mol Evol 39:105-111): 100 # 3.00749" 101 if "Parameters (kappa)" in line: 102 kappa_found = True 103 elif kappa_found and len(line_floats) > 0: 104 branch_res = re.match("\s(\d+\.\.\d+)", line) 105 if branch_res is None: 106 if len(line_floats) == 1: 107 parameters["kappa"] = line_floats[0] 108 else: 109 parameters["kappa"] = line_floats 110 kappa_found = False 111 else: 112 if parameters.get("branches") is None: 113 parameters["branches"] = {} 114 branch = branch_res.group(1) 115 if len(line_floats) > 0: 116 parameters["branches"][branch] = \ 117 {"t":line_floats[0], "kappa":line_floats[1], 118 "TS":line_floats[2], "TV":line_floats[3]} 119 # Find kappa under REV 120 # Example match: 121 # kappa under REV: 999.00000 145.76453 0.00001 0.00001 0.00001 122 elif "kappa under" in line and len(line_floats) > 0: 123 if len(line_floats) == 1: 124 parameters["kappa"] = line_floats[0] 125 else: 126 parameters["kappa"] = line_floats 127 return parameters
128
129 -def parse_rates(lines, parameters):
130 """Parse the rate parameters. 131 """ 132 Q_mat_found = False 133 for line in lines: 134 # Find all floating point numbers in this line 135 line_floats_res = line_floats_re.findall(line) 136 line_floats = [float(val) for val in line_floats_res] 137 # Find rate parameters 138 # Example match: 139 # "Rate parameters: 999.00000 145.59775 0.00001 0.00001 0.00001" 140 if "Rate parameters:" in line and len(line_floats) > 0: 141 parameters["rate parameters"] = line_floats 142 # Find rates 143 # Example match: 144 # "rate: 0.90121 0.96051 0.99831 1.03711 1.10287" 145 elif "rate: " in line and len(line_floats) > 0: 146 parameters["rates"] = line_floats 147 # Find Rate matrix Q & average kappa (REV model) 148 # Example match: 149 # Rate matrix Q, Average Ts/Tv = 3.0308 150 # -2.483179 1.865730 0.617449 0.000000 151 # 2.298662 -2.298662 0.000000 0.000000 152 # 0.335015 0.000000 -0.338059 0.003044 153 # 0.000000 0.000000 0.004241 -0.004241 154 elif "matrix Q" in line: 155 parameters["Q matrix"] = {"matrix":[]} 156 if len(line_floats) > 0: 157 parameters["Q matrix"]["average Ts/Tv"] = \ 158 line_floats[0] 159 Q_mat_found = True 160 elif Q_mat_found and len(line_floats) > 0: 161 parameters["Q matrix"]["matrix"].append(line_floats) 162 if len(parameters["Q matrix"]["matrix"]) == 4: 163 Q_mat_found = False 164 # Find alpha (gamma shape parameter for variable rates) 165 # Example match: "alpha (gamma, K=5) = 192.47918" 166 elif "alpha" in line and len(line_floats) > 0: 167 parameters["alpha"] = line_floats[0] 168 return parameters
169
170 -def parse_freqs(lines, parameters):
171 """Parse the basepair frequencies. 172 """ 173 root_re = re.compile("Note: node (\d+) is root.") 174 branch_freqs_found = False 175 for line in lines: 176 # Find all floating point numbers in this line 177 line_floats_res = line_floats_re.findall(line) 178 line_floats = [float(val) for val in line_floats_res] 179 # Find base frequencies 180 # Example match: 181 # "Base frequencies: 0.20090 0.16306 0.37027 0.26577" 182 if "Base frequencies" in line and len(line_floats) > 0: 183 base_frequencies = {} 184 base_frequencies["T"] = line_floats[0] 185 base_frequencies["C"] = line_floats[1] 186 base_frequencies["A"] = line_floats[2] 187 base_frequencies["G"] = line_floats[3] 188 parameters["base frequencies"] = base_frequencies 189 # Find frequencies 190 # Example match: 191 # "freq: 0.90121 0.96051 0.99831 1.03711 1.10287" 192 elif "freq: " in line and len(line_floats) > 0: 193 parameters["rate frequencies"] = line_floats 194 # Find branch-specific frequency parameters 195 # Example match (note: I think it's possible to have 4 more 196 # values per line, enclosed in brackets, so I'll account for 197 # this): 198 # (frequency parameters for branches) [frequencies at nodes] (see Yang & Roberts 1995 fig 1) 199 # 200 # Node #1 ( 0.25824 0.24176 0.25824 0.24176 ) 201 # Node #2 ( 0.00000 0.50000 0.00000 0.50000 ) 202 elif "(frequency parameters for branches)" in line: 203 parameters["nodes"] = {} 204 branch_freqs_found = True 205 elif branch_freqs_found is True: 206 if len(line_floats) > 0: 207 node_res = re.match("Node \#(\d+)", line) 208 node_num = int(node_res.group(1)) 209 node = {"root":False} 210 node["frequency parameters"] = line_floats[:4] 211 if len(line_floats) > 4: 212 node["base frequencies"] = {"T":line_floats[4], 213 "C":line_floats[5], 214 "A":line_floats[6], 215 "G":line_floats[7]} 216 parameters["nodes"][node_num] = node 217 else: 218 root_res = root_re.match(line) 219 if root_res is not None: 220 root_node = int(root_res.group(1)) 221 parameters["nodes"][root_node]["root"] =\ 222 True 223 branch_freqs_found = False 224 return parameters
225