Package Bio :: Package Motif :: Package Parsers :: Module MEME
[hide private]
[frames] | no frames]

Source Code for Module Bio.Motif.Parsers.MEME

  1  # Copyright 2008 by Bartek Wilczynski 
  2  # Adapted from  Bio.MEME.Parser by Jason A. Hackney.  All rights reserved. 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6   
  7  from Bio.Alphabet import IUPAC 
  8  from Bio import Seq 
  9  import re 
 10  from math import sqrt 
 11  import sys 
 12  from Bio.Motif import Motif 
 13   
 14   
 15   
def read(handle):
    """Parse the text output of the MEME program into a MEMERecord object.

    handle must be an iterator over the lines of the MEME text output
    (for example an open file); it is consumed as parsing proceeds.

    Example:

    >>> f = open("meme.output.txt")
    >>> from Bio.Motif.Parsers import MEME
    >>> record = MEME.read(f)
    >>> for motif in record.motifs:
    ...     for instance in motif.instances:
    ...         print("%s %s %s %s" % (instance.motif_name, instance.sequence_name, instance.strand, instance.pvalue))

    Returns the populated MEMERecord.

    Raises ValueError if the stream ends early or a line does not have
    the expected prefix.
    """
    record = MEMERecord()
    __read_version(record, handle)
    __read_datafile(record, handle)
    __read_alphabet(record, handle)
    __read_sequence_names(record, handle)
    __read_command(record, handle)
    # Skip ahead to the header line of the first motif.
    for line in handle:
        if line.startswith('MOTIF 1'):
            break
    else:
        raise ValueError('Unexpected end of stream')
    # The command line does not change while parsing, so test it once.
    # When it contains 'revcomp' the site rows carry an extra strand column.
    revcomp = 'revcomp' in record.command
    while True:
        # 'line' holds the current "MOTIF ..." header at this point.
        motif = __create_motif(line)
        motif.alphabet = record.alphabet
        record.motifs.append(motif)
        __read_motif_name(motif, handle)
        __read_motif_sequences(motif, handle, revcomp)
        __skip_unused_lines(handle)
        # Use the builtin next() rather than handle.next(): the method form
        # only exists on Python 2 iterators.
        try:
            line = next(handle)
        except StopIteration:
            raise ValueError('Unexpected end of stream: Expected to find new motif, or the summary of motifs')
        if line.startswith("SUMMARY OF MOTIFS"):
            break
        if not line.startswith('MOTIF'):
            raise ValueError("Line does not start with 'MOTIF':\n%s" % line)
    return record
56 57
class MEMEMotif(Motif):
    """A subclass of Motif used in parsing MEME (and MAST) output.

    On top of what the Motif base class provides, this subclass keeps the
    E-value reported for the motif (evalue) and, once set by the parser,
    the number of occurrences (num_occurrences).

    Methods:
    get_instance_by_name (name): return the first instance whose
        sequence_name equals name, or None if there is none.
    add_instance_from_values (name = 'default', pvalue = 1, sequence = 'ATA', start = 0, strand = '+'):
        create a new MEMEInstance with the specified values and add it
        to the motif.
    """

    def __init__(self):
        Motif.__init__(self)
        self.evalue = 0.0

    def _numoccurrences(self, number):
        """Store the number of occurrences, coerced to int."""
        # int() hands back an exact int unchanged, so a separate
        # "already an int" branch is not needed.
        self.num_occurrences = int(number)

    def get_instance_by_name(self, name):
        """Return the first instance found in sequence 'name', else None."""
        for instance in self.instances:
            if instance.sequence_name == name:
                return instance
        return None

    def add_instance_from_values(self, name='default', pvalue=1, sequence='ATA', start=0, strand='+'):
        """Build a MEMEInstance from plain values and add it to this motif."""
        instance = MEMEInstance(sequence, self.alphabet)
        instance._pvalue(pvalue)
        instance._seqname(name)
        instance._start(start)
        instance._strand(strand)
        # Prefer the motif's own length; fall back to the site length when
        # the motif length is unset (falsy).
        instance._length(self.length if self.length else len(sequence))
        if self.name:
            instance._motifname(self.name)
        self.add_instance(instance)

    def _evalue(self, evalue):
        """Store the motif E-value, coerced to float."""
        # float() hands back an exact float unchanged.
        self.evalue = float(evalue)
107 108
class MEMEInstance(Seq.Seq):
    """A single instance (site) of a MEME motif, and the data thereof.

    Besides the sequence itself (handled by Seq.Seq) each instance records
    the name of the sequence it was found in, its start, p-value, strand,
    length and the name of the motif it belongs to. A score attribute
    exists only after _score() has been called.
    """

    def __init__(self, *args, **kwds):
        # All arguments are forwarded untouched to the Seq constructor.
        Seq.Seq.__init__(self, *args, **kwds)
        self.sequence_name = ""
        self.start = 0
        self.pvalue = 1.0
        self.strand = 0
        self.length = 0
        self.motif_name = ""

    def _seqname(self, name):
        """Record the name of the sequence containing this instance."""
        self.sequence_name = name

    def _motifname(self, name):
        """Record the name of the motif this instance belongs to."""
        self.motif_name = name

    def _start(self, start):
        """Record the start position, coerced to int."""
        self.start = int(start)

    def _pvalue(self, pval):
        """Record the p-value, coerced to float."""
        self.pvalue = float(pval)

    def _score(self, score):
        """Record the score, coerced to float."""
        self.score = float(score)

    def _strand(self, strand):
        """Record the strand exactly as given."""
        self.strand = strand

    def _length(self, length):
        """Record the length exactly as given."""
        self.length = length
145 146
class MEMERecord(object):
    """A class for holding the results of a MEME run.

    A MEMERecord collects what the parser extracts from MEME output: the
    program version, the data file name, the command line, the alphabet,
    the sequence names and the list of motifs. Its only method is
    get_motif_by_name.
    """

    def __init__(self):
        """Create an empty record; the parser fills the fields in."""
        self.motifs = []
        self.version = ""
        self.datafile = ""
        self.command = ""
        self.alphabet = None
        self.sequence_names = []

    def get_motif_by_name(self, name):
        """Return the first motif whose name equals 'name', else None."""
        for candidate in self.motifs:
            if candidate.name == name:
                return candidate
        return None
167 168 169 # Everything below is private 170 171
def __read_version(record, handle):
    """Find the 'MEME version' line and store the version on record.

    Lines are consumed from handle up to and including the first one that
    starts with 'MEME version'; the third whitespace-separated token of
    that line becomes record.version.

    Raises ValueError if no such line is found.
    """
    for version_line in handle:
        if version_line.startswith('MEME version'):
            break
    else:
        raise ValueError("Improper input file. File should contain a line starting MEME version.")
    # split() with no arguments already ignores surrounding whitespace.
    tokens = version_line.split()
    record.version = tokens[2]
181 182
def __read_datafile(record, handle):
    """Find the TRAINING SET section and store the data file name on record.

    Lines are consumed up to the one starting with 'TRAINING SET'. The two
    lines after it must start with '****' and 'DATAFILE' respectively; the
    DATAFILE line, stripped and with the 'DATAFILE= ' text removed, becomes
    record.datafile.

    Raises ValueError if the stream ends early or an expected line has the
    wrong prefix.
    """
    for line in handle:
        if line.startswith('TRAINING SET'):
            break
    else:
        raise ValueError("Unexpected end of stream: 'TRAINING SET' not found.")
    # Builtin next() instead of handle.next(): the method form only exists
    # on Python 2 iterators.
    try:
        line = next(handle)
    except StopIteration:
        raise ValueError("Unexpected end of stream: Expected to find line starting with '****'")
    if not line.startswith('****'):
        raise ValueError("Line does not start with '****':\n%s" % line)
    try:
        line = next(handle)
    except StopIteration:
        raise ValueError("Unexpected end of stream: Expected to find line starting with 'DATAFILE'")
    if not line.startswith('DATAFILE'):
        raise ValueError("Line does not start with 'DATAFILE':\n%s" % line)
    record.datafile = line.strip().replace('DATAFILE= ', '')
204 205
def __read_alphabet(record, handle):
    """Read the ALPHABET line and store the matching alphabet on record.

    The next line of handle must start with 'ALPHABET'. If, after
    stripping and removing the 'ALPHABET= ' text, the line is exactly
    'ACGT', record.alphabet is set to IUPAC.unambiguous_dna; any other
    value selects IUPAC.protein.

    Raises ValueError if the stream ends or the line has the wrong prefix.
    """
    # Builtin next() instead of handle.next(): the method form only exists
    # on Python 2 iterators.
    try:
        line = next(handle)
    except StopIteration:
        raise ValueError("Unexpected end of stream: Expected to find line starting with 'ALPHABET'")
    if not line.startswith('ALPHABET'):
        raise ValueError("Line does not start with 'ALPHABET':\n%s" % line)
    letters = line.strip().replace('ALPHABET= ', '')
    if letters == 'ACGT':
        al = IUPAC.unambiguous_dna
    else:
        al = IUPAC.protein
    record.alphabet = al
220 221
def __read_sequence_names(record, handle):
    """Read the sequence-name table and append the names to record.

    The next two lines of handle must start with 'Sequence name' and
    '----'. Every following line up to the first one starting with '***'
    is split on whitespace: the first field is appended to
    record.sequence_names and, when the line has exactly six fields (two
    sequences listed side by side), the fourth field is appended as well.
    Blank lines inside the table are ignored.

    Raises ValueError if the stream ends early or a header line has the
    wrong prefix.
    """
    # Builtin next() instead of handle.next(): the method form only exists
    # on Python 2 iterators.
    try:
        line = next(handle)
    except StopIteration:
        raise ValueError("Unexpected end of stream: Expected to find line starting with 'Sequence name'")
    if not line.startswith('Sequence name'):
        raise ValueError("Line does not start with 'Sequence name':\n%s" % line)
    try:
        line = next(handle)
    except StopIteration:
        raise ValueError("Unexpected end of stream: Expected to find line starting with '----'")
    if not line.startswith('----'):
        raise ValueError("Line does not start with '----':\n%s" % line)
    for line in handle:
        if line.startswith('***'):
            break
        ls = line.split()
        if not ls:
            # A blank line carries no names; skipping it avoids an
            # uninformative IndexError on ls[0].
            continue
        record.sequence_names.append(ls[0])
        if len(ls) == 6:
            record.sequence_names.append(ls[3])
    else:
        raise ValueError("Unexpected end of stream: Expected to find line starting with '***'")
245 246
def __read_command(record, handle):
    """Find the 'command:' line and store the command text on record.

    Lines are consumed up to and including the first one starting with
    'command:'. That line, stripped and with the 'command: ' text removed,
    becomes record.command.

    Raises ValueError if no such line is found.
    """
    for command_line in handle:
        if command_line.startswith('command:'):
            break
    else:
        raise ValueError("Unexpected end of stream: Expected to find line starting with 'command'")
    record.command = command_line.strip().replace('command: ', '')
256 257
def __create_motif(line):
    """Build a MEMEMotif from a 'MOTIF ...' header line.

    The first five characters of line are dropped and the remainder is
    split on whitespace. Field 3 is used as the motif length, field 6 as
    the number of occurrences and field 12 as the E-value (zero-based
    indices into the split fields).

    Returns the new MEMEMotif.
    """
    fields = line[5:].strip().split()
    motif = MEMEMotif()
    motif.length = int(fields[3])
    motif._numoccurrences(fields[6])
    motif._evalue(fields[12])
    return motif
266 267
def __read_motif_name(motif, handle):
    """Find the 'sorted by position p-value' line and name the motif.

    Lines are consumed up to and including the first one containing
    'sorted by position p-value'. The first two whitespace-separated
    tokens of that line, joined by a single space, become motif.name.

    Raises ValueError if no such line is found.
    """
    for header in handle:
        if 'sorted by position p-value' in header:
            break
    else:
        raise ValueError('Unexpected end of stream: Failed to find motif name')
    # split() with no arguments already ignores surrounding whitespace.
    tokens = header.split()
    motif.name = " ".join(tokens[:2])
278 279
def __read_motif_sequences(motif, handle, rv):
    """Read the table of motif sites and add each one to motif.

    The next three lines of handle must start with '---',
    'Sequence name' and '---'. Every following line up to the next one
    starting with '---' is split on whitespace and passed to
    motif.add_instance_from_values. When rv is true the row carries an
    extra strand column (field 1), which shifts start, p-value and site
    sequence one field to the right. Blank rows are ignored.

    Raises ValueError if the stream ends early or a header line has the
    wrong prefix.
    """
    # Builtin next() instead of handle.next(): the method form only exists
    # on Python 2 iterators.
    try:
        line = next(handle)
    except StopIteration:
        raise ValueError('Unexpected end of stream: Failed to find motif sequences')
    if not line.startswith('---'):
        raise ValueError("Line does not start with '---':\n%s" % line)
    try:
        line = next(handle)
    except StopIteration:
        raise ValueError("Unexpected end of stream: Expected to find line starting with 'Sequence name'")
    if not line.startswith('Sequence name'):
        raise ValueError("Line does not start with 'Sequence name':\n%s" % line)
    try:
        line = next(handle)
    except StopIteration:
        raise ValueError('Unexpected end of stream: Failed to find motif sequences')
    if not line.startswith('---'):
        raise ValueError("Line does not start with '---':\n%s" % line)
    for line in handle:
        if line.startswith('---'):
            break
        ls = line.split()
        if not ls:
            # A blank row describes no site; skipping it avoids an
            # uninformative IndexError below.
            continue
        if rv:
            motif.add_instance_from_values(name=ls[0], sequence=ls[5], start=ls[2], pvalue=ls[3], strand=ls[1])
        else:
            motif.add_instance_from_values(name=ls[0], sequence=ls[4], start=ls[1], pvalue=ls[2])
    else:
        raise ValueError('Unexpected end of stream')
312 313
def __skip_unused_lines(handle):
    """Consume the remainder of a motif section that the parser ignores.

    In order, this skips forward to lines starting with
    'log-odds matrix', '---', 'letter-probability matrix', '---' and
    'Time'. The line after 'Time' must be blank and the one after that
    must start with '***'. Finally any blank lines are skipped and the
    first non-blank line must again start with '***'.

    Raises ValueError if the stream ends early or a line does not match
    what is expected.
    """
    # Each prefix is searched for in turn, starting from wherever the
    # previous search stopped.
    for prefix in ('log-odds matrix', '---', 'letter-probability matrix', '---', 'Time'):
        for line in handle:
            if line.startswith(prefix):
                break
        else:
            raise ValueError("Unexpected end of stream: Expected to find line starting with '%s'" % prefix)
    # Builtin next() instead of handle.next(): the method form only exists
    # on Python 2 iterators.
    try:
        line = next(handle)
    except StopIteration:
        raise ValueError('Unexpected end of stream: Expected to find blank line')
    if line.strip():
        raise ValueError("Expected blank line, but got:\n%s" % line)
    try:
        line = next(handle)
    except StopIteration:
        raise ValueError("Unexpected end of stream: Expected to find line starting with '***'")
    if not line.startswith('***'):
        raise ValueError("Line does not start with '***':\n%s" % line)
    # Skip blank lines; the first non-blank one closes the section.
    for line in handle:
        if line.strip():
            break
    else:
        raise ValueError("Unexpected end of stream: Expected to find line starting with '***'")
    if not line.startswith('***'):
        raise ValueError("Line does not start with '***':\n%s" % line)
359