1
2
3
4
5
6
7 from Bio.Alphabet import IUPAC
8 from Bio import Seq
9 import re
10 from math import sqrt
11 import sys
12 from Bio.Motif import Motif
13
14
15
def read(handle):
    """Parse the text output of the MEME program into a MEMERecord object.

    Arguments:
    handle -- an iterator over the lines of a MEME text output file
              (for example an open file object).

    Returns the populated MEMERecord.

    Raises ValueError when the stream ends before the expected section is
    found, or when a line does not match the expected layout.

    Example:

    >>> f = open("meme.output.txt")
    >>> from Bio.Motif.Parsers import MEME
    >>> record = MEME.read(f)
    >>> for motif in record.motifs:
    ...     for instance in motif.instances:
    ...         print("%s %s %s %s" % (instance.motif_name, instance.sequence_name, instance.strand, instance.pvalue))

    """
    record = MEMERecord()
    # The header sections are consumed strictly in this order.
    __read_version(record, handle)
    __read_datafile(record, handle)
    __read_alphabet(record, handle)
    __read_sequence_names(record, handle)
    __read_command(record, handle)
    # Skip ahead to the header line of the first motif.
    # NOTE(review): marker text is reproduced verbatim from this file;
    # confirm its spacing against real MEME output.
    for line in handle:
        if line.startswith('MOTIF 1'):
            break
    else:
        raise ValueError('Unexpected end of stream')
    while True:
        # 'line' always holds the header line of the motif being parsed.
        motif = __create_motif(line)
        motif.alphabet = record.alphabet
        record.motifs.append(motif)
        __read_motif_name(motif, handle)
        # Site rows carry an extra strand column when MEME ran with revcomp.
        __read_motif_sequences(motif, handle, 'revcomp' in record.command)
        __skip_unused_lines(handle)
        try:
            # next() works on both Python 2 and Python 3 iterators,
            # unlike the Python-2-only handle.next().
            line = next(handle)
        except StopIteration:
            raise ValueError('Unexpected end of stream: Expected to find new motif, or the summary of motifs')
        if line.startswith("SUMMARY OF MOTIFS"):
            break
        if not line.startswith('MOTIF'):
            raise ValueError("Line does not start with 'MOTIF':\n%s" % line)
    return record
56
57
59 """A subclass of Motif used in parsing MEME (and MAST) output.
60
61 This subclass defines functions and data specific to MEME motifs.
62 This includes the evalue for a motif and the PSSM of the motif.
63
64 Methods:
65 add_instance_from_values (name = 'default', pvalue = 1, sequence = 'ATA', start = 0, strand = +): create a new instance of the motif with the specified values.
66 add_to_pssm (position): add a new position to the pssm. The position should be a list of nucleotide/amino acid frequencies
67 add_to_logodds (position): add a new position to the log odds matrix. The position should be a tuple of log odds values for the nucleotide/amino acid at that position.
68 compare_motifs (other_motif): returns the maximum correlation between this motif and other_motif
69 """
73
75 if type(number) == int:
76 self.num_occurrences = number
77 else:
78 number = int(number)
79 self.num_occurrences = number
80
82 for i in self.instances:
83 if i.sequence_name == name:
84 return i
85 return None
86
100
102 if type(evalue) == float:
103 self.evalue = evalue
104 else:
105 evalue = float(evalue)
106 self.evalue = evalue
107
108
110 """A class describing the instances of a MEME motif, and the data thereof.
111 """
113 Seq.Seq.__init__(self,*args,**kwds)
114 self.sequence_name = ""
115 self.start = 0
116 self.pvalue = 1.0
117 self.strand = 0
118 self.length = 0
119 self.motif_name = ""
120
121
123 self.sequence_name = name
124
126 self.motif_name = name
127
131
133 pval = float(pval)
134 self.pvalue = pval
135
139
142
145
146
148 """A class for holding the results of a MEME run.
149
150 A MEMERecord is an object that holds the results from running
151 MEME. It implements no methods of its own.
152
153 """
155 """__init__ (self)"""
156 self.motifs = []
157 self.version = ""
158 self.datafile = ""
159 self.command = ""
160 self.alphabet = None
161 self.sequence_names = []
162
164 for m in self.motifs:
165 if m.name == name:
166 return m
167
168
169
170
171
def __read_version(record, handle):
    """Find the 'MEME version' line and store the version on the record.

    Lines are consumed from handle up to and including the first one that
    starts with 'MEME version'. The third whitespace-separated token of
    that line is assigned to record.version.

    Raises ValueError if no such line is found before the stream ends.
    """
    version_line = None
    for candidate in handle:
        if candidate.startswith('MEME version'):
            version_line = candidate
            break
    if version_line is None:
        raise ValueError("Improper input file. File should contain a line starting MEME version.")
    tokens = version_line.strip().split()
    record.version = tokens[2]
181
182
def __read_datafile(record, handle):
    """Read the DATAFILE entry of the 'TRAINING SET' section.

    Skips lines until one starts with 'TRAINING SET', then requires the
    next two lines to start with '****' and 'DATAFILE' respectively. The
    DATAFILE line, stripped and with 'DATAFILE= ' removed, is stored in
    record.datafile.

    Raises ValueError if the stream ends early or a line does not have
    the expected prefix.
    """
    for line in handle:
        if line.startswith('TRAINING SET'):
            break
    else:
        raise ValueError("Unexpected end of stream: 'TRAINING SET' not found.")
    try:
        # next() works on both Python 2 and Python 3 iterators,
        # unlike the Python-2-only handle.next().
        line = next(handle)
    except StopIteration:
        raise ValueError("Unexpected end of stream: Expected to find line starting with '****'")
    if not line.startswith('****'):
        raise ValueError("Line does not start with '****':\n%s" % line)
    try:
        line = next(handle)
    except StopIteration:
        raise ValueError("Unexpected end of stream: Expected to find line starting with 'DATAFILE'")
    if not line.startswith('DATAFILE'):
        raise ValueError("Line does not start with 'DATAFILE':\n%s" % line)
    line = line.strip()
    line = line.replace('DATAFILE= ', '')
    record.datafile = line
204
205
def __read_alphabet(record, handle):
    """Read the ALPHABET line and set record.alphabet accordingly.

    The next line of handle must start with 'ALPHABET'. If the text left
    after stripping and removing 'ALPHABET= ' is exactly 'ACGT', the
    alphabet is IUPAC.unambiguous_dna; any other value selects
    IUPAC.protein.

    Raises ValueError if the stream has ended or the line does not start
    with 'ALPHABET'.
    """
    try:
        # next() works on both Python 2 and Python 3 iterators,
        # unlike the Python-2-only handle.next().
        line = next(handle)
    except StopIteration:
        raise ValueError("Unexpected end of stream: Expected to find line starting with 'ALPHABET'")
    if not line.startswith('ALPHABET'):
        raise ValueError("Line does not start with 'ALPHABET':\n%s" % line)
    line = line.strip()
    line = line.replace('ALPHABET= ', '')
    if line == 'ACGT':
        al = IUPAC.unambiguous_dna
    else:
        al = IUPAC.protein
    record.alphabet = al
220
221
def __read_sequence_names(record, handle):
    """Collect the sequence names listed in the training-set table.

    The next two lines of handle must start with 'Sequence name' and
    '----'. Every following row, up to a line starting with '***',
    contributes its first whitespace-separated field to
    record.sequence_names. A row with exactly six fields also contributes
    its fourth field (presumably a second table column -- confirm against
    real MEME output). Blank rows are skipped.

    Raises ValueError if the stream ends early or a header line does not
    have the expected prefix.
    """
    try:
        # next() works on both Python 2 and Python 3 iterators,
        # unlike the Python-2-only handle.next().
        line = next(handle)
    except StopIteration:
        raise ValueError("Unexpected end of stream: Expected to find line starting with 'Sequence name'")
    if not line.startswith('Sequence name'):
        raise ValueError("Line does not start with 'Sequence name':\n%s" % line)
    try:
        line = next(handle)
    except StopIteration:
        raise ValueError("Unexpected end of stream: Expected to find line starting with '----'")
    if not line.startswith('----'):
        raise ValueError("Line does not start with '----':\n%s" % line)
    for line in handle:
        if line.startswith('***'):
            break
        fields = line.split()
        if not fields:
            # A blank row carries no name; indexing it would raise IndexError.
            continue
        record.sequence_names.append(fields[0])
        if len(fields) == 6:
            record.sequence_names.append(fields[3])
    else:
        raise ValueError("Unexpected end of stream: Expected to find line starting with '***'")
245
246
def __read_command(record, handle):
    """Find the 'command:' line and store the command on the record.

    Lines are consumed from handle up to and including the first one that
    starts with 'command:'. That line, stripped and with 'command: '
    removed, is assigned to record.command.

    Raises ValueError if no such line is found before the stream ends.
    """
    # The generator stops pulling from handle as soon as a match is found.
    matches = (text for text in handle if text.startswith('command:'))
    command_line = next(matches, None)
    if command_line is None:
        raise ValueError("Unexpected end of stream: Expected to find line starting with 'command'")
    record.command = command_line.strip().replace('command: ', '')
256
257
266
267
def __read_motif_name(motif, handle):
    """Find the 'sorted by position p-value' line and set the motif name.

    Lines are consumed from handle up to and including the first one that
    contains 'sorted by position p-value'. The first two
    whitespace-separated tokens of that line, joined by a single space,
    are assigned to motif.name.

    Raises ValueError if no such line is found before the stream ends.
    """
    header = None
    for candidate in handle:
        if 'sorted by position p-value' in candidate:
            header = candidate
            break
    if header is None:
        raise ValueError('Unexpected end of stream: Failed to find motif name')
    tokens = header.strip().split()
    motif.name = " ".join(tokens[:2])
278
279
def __read_motif_sequences(motif, handle, rv):
    """Read the site table of a motif and add one instance per row.

    The next three lines of handle must start with '---', 'Sequence name'
    and '---'. Each following row, up to a line starting with '---', is
    split on whitespace and passed to motif.add_instance_from_values.

    Arguments:
    motif  -- object receiving the instances.
    handle -- iterator over the lines of the MEME output.
    rv     -- true when rows carry a strand column. Fields 0, 1, 2, 3
              and 5 are then used as name, strand, start, pvalue and
              sequence; otherwise fields 0, 1, 2 and 4 are used as name,
              start, pvalue and sequence.

    Raises ValueError if the stream ends early or a header line does not
    have the expected prefix.
    """
    try:
        # next() works on both Python 2 and Python 3 iterators,
        # unlike the Python-2-only handle.next().
        line = next(handle)
    except StopIteration:
        raise ValueError('Unexpected end of stream: Failed to find motif sequences')
    if not line.startswith('---'):
        raise ValueError("Line does not start with '---':\n%s" % line)
    try:
        line = next(handle)
    except StopIteration:
        raise ValueError("Unexpected end of stream: Expected to find line starting with 'Sequence name'")
    if not line.startswith('Sequence name'):
        raise ValueError("Line does not start with 'Sequence name':\n%s" % line)
    try:
        line = next(handle)
    except StopIteration:
        raise ValueError('Unexpected end of stream: Failed to find motif sequences')
    if not line.startswith('---'):
        raise ValueError("Line does not start with '---':\n%s" % line)
    for line in handle:
        if line.startswith('---'):
            break
        fields = line.strip().split()
        if rv:
            motif.add_instance_from_values(name=fields[0], sequence=fields[5], start=fields[2], pvalue=fields[3], strand=fields[1])
        else:
            motif.add_instance_from_values(name=fields[0], sequence=fields[4], start=fields[1], pvalue=fields[2])
    else:
        raise ValueError('Unexpected end of stream')
312
313
def __skip_unused_lines(handle):
    """Advance handle past the parts of a motif section that are not parsed.

    In order, this skips to a line starting with each of
    'log-odds matrix', '---', 'letter-probability matrix', '---' and
    'Time'. It then requires a blank line and a line starting with '***',
    skips any further blank lines, and requires the next non-blank line
    to start with '***' as well.

    Raises ValueError if the stream ends early or one of the required
    lines does not have the expected form.
    """
    # Each marker is searched for in turn; the scan resumes where the
    # previous one stopped, so the order of the markers matters.
    for prefix in ('log-odds matrix', '---', 'letter-probability matrix', '---', 'Time'):
        for line in handle:
            if line.startswith(prefix):
                break
        else:
            raise ValueError("Unexpected end of stream: Expected to find line starting with '%s'" % prefix)
    try:
        # next() works on both Python 2 and Python 3 iterators,
        # unlike the Python-2-only handle.next().
        line = next(handle)
    except StopIteration:
        raise ValueError('Unexpected end of stream: Expected to find blank line')
    if line.strip():
        raise ValueError("Expected blank line, but got:\n%s" % line)
    try:
        line = next(handle)
    except StopIteration:
        raise ValueError("Unexpected end of stream: Expected to find line starting with '***'")
    if not line.startswith('***'):
        raise ValueError("Line does not start with '***':\n%s" % line)
    # Skip blank lines; the first non-blank line must be another '***' rule.
    for line in handle:
        if line.strip():
            break
    else:
        raise ValueError("Unexpected end of stream: Expected to find line starting with '***'")
    if not line.startswith('***'):
        raise ValueError("Line does not start with '***':\n%s" % line)
359