Package Bio :: Package Phylo :: Package PAML :: Module _parse_yn00
[hide private]
[frames] | no frames]

Source Code for Module Bio.Phylo.PAML._parse_yn00

  1  # Copyright (C) 2011 by Brandon Invergo (b.invergo@gmail.com) 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license. Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  import re 
  7   
def parse_ng86(lines, results):
    """Parse the Nei & Gojobori (1986) section of the results.

    Nei & Gojobori results are organized in a lower triangular matrix,
    with the sequence names labeling the rows and statistics in the
    format:
        w (dN dS) per column
    Example row (2 columns):
        0.0000 (0.0000 0.0207) 0.0000 (0.0000 0.0421)

    Arguments:
     - lines - iterable of text lines from the NG86 section.
     - results - dict that is filled in place. For every matrix row
       ``results[name]`` is (re)set to a fresh dict, and for each
       column ``results[name][other]`` and ``results[other][name]``
       are set to ``{"NG86": {"omega": ..., "dN": ..., "dS": ...}}``
       (two separate outer dicts sharing one inner statistics dict).

    Returns a tuple ``(results, sequences)`` where ``sequences`` lists
    the row names in the order they were encountered.

    An IndexError is raised if a row does not hold complete
    (omega, dN, dS) triples or has more columns than preceding rows.
    """
    sequences = []
    for line in lines:
        # A matrix row starts with the sequence name followed by a run
        # of 5-15 whitespace characters.
        matrix_row_res = re.match(r"(.+)\s{5,15}", line)
        if matrix_row_res is None:
            continue
        # Only search for numbers *after* the name: sequence names may
        # themselves contain fragments that look like floating point
        # values (e.g. "seq1.5") and must not be read as statistics.
        row_data = line[matrix_row_res.end(1):]
        line_floats = [
            float(val) for val in re.findall(r"-*\d+\.\d+", row_data)
        ]
        seq_name = matrix_row_res.group(1).strip()
        sequences.append(seq_name)
        results[seq_name] = {}
        # Each column is an (omega, dN, dS) triple; column k compares
        # this row's sequence with the k-th sequence seen so far.
        for i in range(0, len(line_floats), 3):
            ng86 = {
                "omega": line_floats[i],
                "dN": line_floats[i + 1],
                "dS": line_floats[i + 2],
            }
            other_name = sequences[i // 3]
            results[seq_name][other_name] = {"NG86": ng86}
            results[other_name][seq_name] = {"NG86": ng86}
    return (results, sequences)
34
def parse_yn00(lines, results, sequences):
    """Parse the Yang & Nielsen (2000) part of the results.

    Yang & Nielsen results are organized in a table with each row
    comprising one pairwise species comparison. Rows are labeled by
    sequence number rather than by sequence name.

    Arguments:
     - lines - iterable of text lines from the YN00 section.
     - results - nested dict updated in place;
       ``results[name1][name2]`` and ``results[name2][name1]`` must
       already exist and both receive the same dict under ``"YN00"``.
     - sequences - list of sequence names; the 1-based numbers in the
       table index into this list.

    Returns the ``results`` dict.

    An IndexError is raised if a table row holds fewer than nine
    floating point values.
    """
    # Example (header row and first table row):
    # seq. seq.     S       N        t   kappa   omega     dN +- SE    dS +- SE
    #    2    1    67.3   154.7   0.0136  3.6564  0.0000 -0.0000 +- 0.0000  0.0150
    # +- 0.0151
    #
    # Result keys, in the order the values appear in a table row.
    stat_names = (
        "S", "N", "t", "kappa", "omega", "dN", "dN SE", "dS", "dS SE",
    )
    for line in lines:
        # A table row is leading whitespace followed by two integers.
        row_res = re.match(r"\s+(\d+)\s+(\d+)", line)
        if row_res is None:
            continue
        # Find all floating point numbers in this line; the integer
        # sequence numbers are not matched because they lack a ".".
        line_floats = [
            float(val) for val in re.findall(r"-*\d+\.\d+", line)
        ]
        seq_name1 = sequences[int(row_res.group(1)) - 1]
        seq_name2 = sequences[int(row_res.group(2)) - 1]
        yn00 = {
            name: line_floats[pos] for pos, name in enumerate(stat_names)
        }
        # The comparison is symmetric: store the same dict both ways.
        results[seq_name1][seq_name2]["YN00"] = yn00
        results[seq_name2][seq_name1]["YN00"] = yn00
    return results
71
def parse_others(lines, results, sequences):
    """Parse the results from the other methods.

    The remaining methods (LWL85, LWL85m and LPB93) are grouped
    together. Statistics for all three are listed for each of the
    pairwise species comparisons, with each method's results on its
    own line.

    The stats in this section must be handled differently from the
    NG86 and YN00 sections due to the possible presence of NaN values,
    which a plain "find all floats" regular expression would miss.

    Arguments:
     - lines - iterable of text lines from this section.
     - results - nested dict updated in place;
       ``results[name1][name2]`` and ``results[name2][name1]`` must
       already exist and both receive the same statistics dict under
       the method name.
     - sequences - unused; kept for a signature parallel to the
       other section parsers.

    Each statistic is converted with ``float()``; tokens that
    ``float()`` rejects (e.g. the Windows NaN spelling ``-1.#IND``)
    are stored as ``None``, while ``nan`` / ``-nan`` tokens become a
    float NaN.

    Returns the ``results`` dict.
    """
    # Example:
    # 2 (Pan_troglo) vs. 1 (Homo_sapie)
    #
    # L(i):      143.0      51.0      28.0  sum=    222.0
    # Ns(i):    0.0000    1.0000    0.0000  sum=   1.0000
    # Nv(i):    0.0000    0.0000    0.0000  sum=   0.0000
    # A(i):     0.0000    0.0200    0.0000
    # B(i):    -0.0000   -0.0000   -0.0000
    # LWL85:  dS =  0.0227 dN =  0.0000 w = 0.0000 S =   45.0 N =  177.0
    # LWL85m: dS =    -nan dN =    -nan w =   -nan S =   -nan N =   -nan (rho =  -nan)
    # LPB93:  dS =  0.0129 dN =  0.0000 w = 0.0000
    #
    # (text searched for in the line, key used in the results dict),
    # checked in this order so "LWL85:" wins over the "LWL85m" test.
    methods = (
        ("LWL85:", "LWL85"),
        ("LWL85m", "LWL85m"),
        ("LPB93", "LPB93"),
    )
    seq_name1 = None
    seq_name2 = None
    for line in lines:
        comp_res = re.match(r"\d+ \((.+)\) vs. \d+ \((.+)\)", line)
        if comp_res is not None:
            # Header of a new pairwise comparison.
            seq_name1, seq_name2 = comp_res.groups()
            continue
        if seq_name1 is None or seq_name2 is None or "dS =" not in line:
            continue
        line_stats = line.split(":")[1].strip()
        # Find all of the "xx = ######" pairs in the row, e.g.
        # "dS =  0.0227". The value is taken as the whole
        # whitespace-delimited token after the equals sign rather than
        # a fixed number of characters: the column width differs
        # between statistics (a fixed 7-character window drops the
        # last digit of dS and dN), and "rho" is closed by a ")" that
        # is not part of the number.
        stats = {}
        pairs = re.findall(r"([dSNwrho]{1,3}) =\s*([^\s()]+)", line_stats)
        for stat, value in pairs:
            try:
                stats[stat] = float(value)
            except ValueError:
                # Not a number Python understands, e.g. "-1.#IND".
                stats[stat] = None
        for marker, method in methods:
            if marker in line:
                # The comparison is symmetric: store it both ways.
                results[seq_name1][seq_name2][method] = stats
                results[seq_name2][seq_name1][method] = stats
                break
    return results
130