1
2
3
4
5
6 import re
7
def parse_ng86(lines, results):
    """Parse the Nei & Gojobori (1986) section of the results.

    Nei-Gojobori results are organized in a lower triangular matrix,
    with the sequence names labeling the rows and statistics in the
    format::

        w (dN dS)

    per column.  Example row (2 columns)::

        0.0000 (0.0000 0.0207) 0.0000 (0.0000 0.0421)

    Args:
        lines: iterable of text lines to scan for matrix rows.
        results: dict updated in place.  Every matrix row (re)initialises
            ``results[name]`` to an empty dict and then stores one
            ``{"NG86": {"omega": ..., "dN": ..., "dS": ...}}`` entry per
            column, mirrored under both sequence names.

    Returns:
        The tuple ``(results, sequences)``, where ``sequences`` lists the
        row names in the order they were read.
    """
    # NOTE(review): the original ``def`` line is not visible in this chunk;
    # the name and signature were reconstructed from the names the body
    # uses -- confirm against callers.
    sequences = []
    for line in lines:
        # A matrix row is recognised as some text followed by a run of
        # 5 to 15 whitespace characters (the padding after the name).
        matrix_row_res = re.match(r"(.+)\s{5,15}", line)
        if matrix_row_res is None:
            continue
        seq_name = matrix_row_res.group(1).strip()
        sequences.append(seq_name)
        results[seq_name] = {}
        # Only search for numbers *after* the name and its padding, so a
        # name containing float-like text (e.g. "sp1.0") cannot leak into
        # the statistics.
        data = line[matrix_row_res.end():]
        values = [float(val) for val in re.findall(r"-*\d+\.\d+", data)]
        # Group the flat value list into complete (omega, dN, dS) triples;
        # a trailing incomplete group is ignored instead of raising.
        triples = zip(values[0::3], values[1::3], values[2::3])
        # Column k of a row refers to the k-th sequence read so far.
        for other_name, (omega, dn, ds) in zip(sequences, triples):
            ng86 = {"omega": omega, "dN": dn, "dS": ds}
            results[seq_name][other_name] = {"NG86": ng86}
            results[other_name][seq_name] = {"NG86": ng86}
    return (results, sequences)
34
def parse_yn00(lines, results, sequences):
    """Parse the Yang & Nielsen (2000) part of the results.

    Yang & Nielsen results are organized in a table with each row
    comprising one pairwise species comparison.  Rows are labeled by
    sequence number rather than by sequence name.

    Args:
        lines: iterable of text lines to scan for table rows.
        results: nested dict updated in place; ``results[a][b]`` must
            already exist for every pair that appears in the table.
        sequences: list of sequence names; the 1-based row labels are
            used as indexes into it.

    Returns:
        The same ``results`` dict, with a ``"YN00"`` entry added under
        both orderings of each compared pair.

    Raises:
        IndexError: if a row label is outside ``sequences`` or a row has
            fewer than nine decimal values.
        KeyError: if ``results`` has no entry for a compared pair.
    """
    # NOTE(review): the original ``def`` line is not visible in this chunk;
    # the name and signature were reconstructed from the names the body
    # uses -- confirm against callers.
    fields = ("S", "N", "t", "kappa", "omega", "dN", "dN SE", "dS", "dS SE")
    for line in lines:
        # Data rows start with whitespace followed by two integer labels.
        row_res = re.match(r"\s+(\d+)\s+(\d+)", line)
        if row_res is None:
            continue
        seq_name1 = sequences[int(row_res.group(1)) - 1]
        seq_name2 = sequences[int(row_res.group(2)) - 1]
        # The integer labels have no decimal point, so they are not picked
        # up here; the "+-" separators are skipped the same way.
        values = [float(val) for val in re.findall(r"-*\d+\.\d+", line)]
        if len(values) < len(fields):
            raise IndexError(
                "expected %d values in YN00 row, found %d: %r"
                % (len(fields), len(values), line)
            )
        # zip() stops after the nine known columns; extras are ignored.
        yn00 = dict(zip(fields, values))
        results[seq_name1][seq_name2]["YN00"] = yn00
        results[seq_name2][seq_name1]["YN00"] = yn00
    return results
71
def parse_others(lines, results, sequences=None):
    """Parse the results from the other methods.

    The remaining methods are grouped together.  Statistics for all
    three (LWL85, LWL85m and LPB93) are listed for each of the pairwise
    species comparisons, with each method's results on its own line.

    The stats in this section must be handled differently due to the
    possible presence of NaN values, which are not matched by the plain
    decimal-number regex used for the other sections.  Values that are
    NaN or cannot be converted to ``float`` are stored as ``None``.

    Args:
        lines: iterable of text lines to scan.
        results: nested dict updated in place; ``results[a][b]`` must
            already exist for every compared pair.
        sequences: not used by this function.

    Returns:
        The same ``results`` dict, with ``"LWL85"``, ``"LWL85m"`` and
        ``"LPB93"`` entries added under both orderings of each pair.

    Raises:
        KeyError: if ``results`` has no entry for a compared pair.
    """
    # NOTE(review): the original ``def`` line is not visible in this chunk;
    # the name and signature were reconstructed from the names the body
    # uses (``sequences`` is optional because the body never reads it) --
    # confirm against callers.

    # (text searched for in the line, key stored in results); order matters
    # only in that the first marker found wins.
    methods = (("LWL85:", "LWL85"), ("LWL85m", "LWL85m"), ("LPB93", "LPB93"))
    seq_name1 = None
    seq_name2 = None
    for line in lines:
        # Pair header, e.g. "2 (Pan_troglo) vs. 1 (Homo_sapie)".
        comp_res = re.match(r"\d+ \((.+)\) vs. \d+ \((.+)\)", line)
        if comp_res is not None:
            seq_name1 = comp_res.group(1)
            seq_name2 = comp_res.group(2)
            continue
        if seq_name1 is None or seq_name2 is None:
            continue
        if "dS =" not in line:
            continue
        # Everything after the "METHOD:" label holds the "name = value" pairs.
        line_stats = line.partition(":")[2].strip()
        stats = {}
        # Capture the whole value token after each "name =" instead of a
        # fixed number of characters, so wide fields are not truncated and
        # the ")" closing "(rho = ...)" is left out of the value.
        pairs = re.findall(r"([dSNwrho]{1,3}) =\s*([^\s()]+)", line_stats)
        for stat, value in pairs:
            try:
                number = float(value)
            except ValueError:
                number = None
            else:
                # float() accepts "nan"/"-nan"; NaN is the only float that
                # is not equal to itself.  Report it as None.
                if number != number:
                    number = None
            stats[stat] = number
        for marker, key in methods:
            if marker in line:
                results[seq_name1][seq_name2][key] = stats
                results[seq_name2][seq_name1][key] = stats
                break
    return results
130