1
2
3
4
5 """Implementation of sequence motifs (PRIVATE).
6 """
7 from Bio.Seq import Seq
8 from Bio.SubsMat import FreqTable
9 from Bio.Alphabet import IUPAC
10 import math,random
11
13 """
14 A class representing sequence motifs.
15 """
17 self.instances = []
18 self.has_instances=False
19 self.counts = {}
20 self.has_counts=False
21 self.mask = []
22 self._pwm_is_current = False
23 self._pwm = []
24 self._log_odds_is_current = False
25 self._log_odds = []
26 self.alphabet=alphabet
27 self.length=None
28 self.background=dict((n, 1.0/len(self.alphabet.letters)) \
29 for n in self.alphabet.letters)
30 self.beta=1.0
31 self.info=None
32 self.name=""
33
35 if self.length==None:
36 self.length = len
37 elif self.length != len:
38 print "len",self.length,self.instances, len
39 raise ValueError("You can't change the length of the motif")
40
42 if self.alphabet==None:
43 self.alphabet=alphabet
44 elif self.alphabet != alphabet:
45 raise ValueError("Wrong Alphabet")
46
48 """
49 adds new instance to the motif
50 """
51 self._check_alphabet(instance.alphabet)
52 self._check_length(len(instance))
53 if self.has_counts:
54 for i in range(self.length):
55 let=instance[i]
56 self.counts[let][i]+=1
57
58 if self.has_instances or not self.has_counts:
59 self.instances.append(instance)
60 self.has_instances=True
61
62 self._pwm_is_current = False
63 self._log_odds_is_current = False
64
65
67 """
68 sets the mask for the motif
69
70 The mask should be a string containing asterisks in the position of significant columns and spaces in other columns
71 """
72 self._check_length(len(mask))
73 self.mask=[]
74 for char in mask:
75 if char=="*":
76 self.mask.append(1)
77 elif char==" ":
78 self.mask.append(0)
79 else:
80 raise ValueError("Mask should contain only '*' or ' ' and not a '%s'"%char)
81
82 - def pwm(self,laplace=True):
83 """
84 returns the PWM computed for the set of instances
85
86 if laplace=True (default), pseudocounts equal to self.background multiplied by self.beta are added to all positions.
87 """
88
89 if self._pwm_is_current:
90 return self._pwm
91
92 self._pwm = []
93 for i in xrange(self.length):
94 dict = {}
95
96 for letter in self.alphabet.letters:
97 if laplace:
98 dict[letter]=self.beta*self.background[letter]
99 else:
100 dict[letter]=0.0
101 if self.has_counts:
102
103 for letter in self.alphabet.letters:
104 dict[letter]+=self.counts[letter][i]
105 elif self.has_instances:
106
107 for seq in self.instances:
108
109 try:
110 dict[seq[i]]+=1
111 except KeyError:
112 pass
113 self._pwm.append(FreqTable.FreqTable(dict,FreqTable.COUNT,self.alphabet))
114 self._pwm_is_current=1
115 return self._pwm
116
118 """
119 returns the logg odds matrix computed for the set of instances
120 """
121
122 if self._log_odds_is_current:
123 return self._log_odds
124
125 self._log_odds = []
126 pwm=self.pwm(laplace)
127 for i in xrange(self.length):
128 d = {}
129 for a in self.alphabet.letters:
130 d[a]=math.log(pwm[i][a]/self.background[a],2)
131 self._log_odds.append(d)
132 self._log_odds_is_current=1
133 return self._log_odds
134
136 """Method returning the information content of a motif.
137 """
138 res=0
139 pwm=self.pwm()
140 for i in range(self.length):
141 res+=2
142 for a in self.alphabet.letters:
143 if pwm[i][a]!=0:
144 res+=pwm[i][a]*math.log(pwm[i][a],2)
145 return res
146
148 """
149 Computes expected score of motif's instance and its standard deviation
150 """
151 exs=0.0
152 var=0.0
153 pwm=self.pwm()
154 for i in range(self.length):
155 ex1=0.0
156 ex2=0.0
157 for a in self.alphabet.letters:
158 if pwm[i][a]!=0:
159 ex1+=pwm[i][a]*(math.log(pwm[i][a],2)-math.log(self.background[a],2))
160 ex2+=pwm[i][a]*(math.log(pwm[i][a],2)-math.log(self.background[a],2))**2
161 exs+=ex1
162 var+=ex2-ex1**2
163 if st_dev:
164 return exs,math.sqrt(var)
165 else:
166 return exs
167
169 """
170 a generator function, returning found positions of instances of the motif in a given sequence
171 """
172 if not self.has_instances:
173 raise ValueError ("This motif has no instances")
174 for pos in xrange(0,len(sequence)-self.length+1):
175 for instance in self.instances:
176 if instance.tostring()==sequence[pos:pos+self.length].tostring():
177 yield(pos,instance)
178 break
179
180 - def score_hit(self,sequence,position,normalized=0,masked=0):
181 """
182 give the pwm score for a given position
183 """
184 lo=self.log_odds()
185 score = 0.0
186 for pos in xrange(self.length):
187 a = sequence[position+pos]
188 if not masked or self.mask[pos]:
189 try:
190 score += lo[pos][a]
191 except:
192 pass
193 if normalized:
194 if not masked:
195 score/=self.length
196 else:
197 score/=len([x for x in self.mask if x])
198 return score
199
200 - def search_pwm(self,sequence,normalized=0,masked=0,threshold=0.0,both=True):
201 """
202 a generator function, returning found hits in a given sequence with the pwm score higher than the threshold
203 """
204 if both:
205 rc = self.reverse_complement()
206
207 sequence=sequence.tostring().upper()
208 for pos in xrange(0,len(sequence)-self.length+1):
209 score = self.score_hit(sequence,pos,normalized,masked)
210 if score > threshold:
211 yield (pos,score)
212 if both:
213 rev_score = rc.score_hit(sequence,pos,normalized,masked)
214 if rev_score > threshold:
215 yield (-pos,rev_score)
216
218 """
219 return the similarity score based on pearson correlation for the given motif against self.
220
221 We use the Pearson's correlation of the respective probabilities.
222 """
223
224 if self.alphabet != motif.alphabet:
225 raise ValueError("Cannot compare motifs with different alphabets")
226
227 max_p=-2
228 for offset in range(-self.length+1,motif.length):
229 if offset<0:
230 p = self.dist_pearson_at(motif,-offset)
231 else:
232 p = motif.dist_pearson_at(self,offset)
233
234 if max_p<p:
235 max_p=p
236 max_o=-offset
237 return 1-max_p,max_o
238
240 sxx = 0
241 sxy = 0
242 sx = 0
243 sy = 0
244 syy = 0
245 norm=max(self.length,offset+motif.length)
246
247 for pos in range(max(self.length,offset+motif.length)):
248 for l in self.alphabet.letters:
249 xi = self[pos][l]
250 yi = motif[pos-offset][l]
251 sx = sx + xi
252 sy = sy + yi
253 sxx = sxx + xi * xi
254 syy = syy + yi * yi
255 sxy = sxy + xi * yi
256
257 norm *= len(self.alphabet.letters)
258 s1 = (sxy - sx*sy*1.0/norm)
259 s2 = (norm*sxx - sx*sx*1.0)*(norm*syy- sy*sy*1.0)
260 return s1/math.sqrt(s2)
261
263 """
264 A similarity measure taking into account a product probability of generating overlaping instances of two motifs
265 """
266 max_p=0.0
267 for offset in range(-self.length+1,other.length):
268 if offset<0:
269 p = self.dist_product_at(other,-offset)
270 else:
271 p = other.dist_product_at(self,offset)
272 if max_p<p:
273 max_p=p
274 max_o=-offset
275 return 1-max_p/self.dist_product_at(self,0),max_o
276
285
287 r"""Calculates the DPQ distance measure between motifs.
288
289 It is calculated as a maximal value of DPQ formula (shown using LaTeX
290 markup, familiar to mathematicians):
291
292 \sqrt{\sum_{i=1}^{alignment.len()} \sum_{k=1}^alphabet.len() \
293 \{ m1[i].freq(alphabet[k])*log_2(m1[i].freq(alphabet[k])/m2[i].freq(alphabet[k])) +
294 m2[i].freq(alphabet[k])*log_2(m2[i].freq(alphabet[k])/m1[i].freq(alphabet[k]))
295 }
296
297 over possible non-spaced alignemts of two motifs. See this reference:
298
299 D. M Endres and J. E Schindelin, "A new metric for probability
300 distributions", IEEE transactions on Information Theory 49, no. 7
301 (July 2003): 1858-1860.
302 """
303
304 min_d=float("inf")
305 min_o=-1
306 d_s=[]
307 for offset in range(-self.length+1,other.length):
308
309 if offset<0:
310 d = self.dist_dpq_at(other,-offset)
311 overlap = self.length+offset
312 else:
313 d = other.dist_dpq_at(self,offset)
314 overlap = other.length-offset
315 overlap = min(self.length,other.length,overlap)
316 out = self.length+other.length-2*overlap
317
318
319 d = (d/(out+overlap))*(2*overlap+out)/(2*overlap)
320
321 d_s.append((offset,d))
322 if min_d> d:
323 min_d=d
324 min_o=-offset
325 return min_d,min_o
326
328 """
329 calculates the dist_dpq measure with a given offset.
330
331 offset should satisfy 0<=offset<=len(self)
332 """
333 def dpq (f1,f2,alpha):
334 s=0
335 for n in alpha.letters:
336 avg=(f1[n]+f2[n])/2
337 s+=f1[n]*math.log(f1[n]/avg,2)+f2[n]*math.log(f2[n]/avg,2)
338 return math.sqrt(s)
339
340 s=0
341 for i in range(max(self.length,offset+other.length)):
342 f1=self[i]
343 f2=other[i-offset]
344 s+=dpq(f1,f2,self.alphabet)
345 return s
346
348 """Reads the motif from the stream (in AlignAce format).
349
350 the self.alphabet variable must be set beforehand.
351 If the last line contains asterisks it is used for setting mask
352 """
353
354 while 1:
355 ln = stream.readline()
356 if "*" in ln:
357 self.set_mask(ln.strip("\n\c"))
358 break
359 self.add_instance(Seq(ln.strip(),self.alphabet))
360
362 """ string representation of a motif.
363 """
364 str = ""
365 for inst in self.instances:
366 str = str + inst.tostring() + "\n"
367
368 if masked:
369 for i in xrange(self.length):
370 if self.mask[i]:
371 str = str + "*"
372 else:
373 str = str + " "
374 str = str + "\n"
375 return str
376
378 """return the length of a motif
379
380 Please use this method (i.e. invoke len(m)) instead of refering to the m.length directly.
381 """
382 if self.length==None:
383 return 0
384 else:
385 return self.length
386
388 """
389 writes the motif to the stream
390 """
391
392 stream.write(self.__str__())
393
394
395
397 """
398 FASTA representation of motif
399 """
400 if not self.has_instances:
401 self.make_instances_from_counts()
402 str = ""
403 for i,inst in enumerate(self.instances):
404 str = str + ">instance%d\n"%i + inst.tostring() + "\n"
405
406 return str
407
409 """
410 Gives the reverse complement of the motif
411 """
412 res = Motif()
413 if self.has_instances:
414 for i in self.instances:
415 res.add_instance(i.reverse_complement())
416 else:
417 res.has_counts=True
418 res.counts["A"]=self.counts["T"][:]
419 res.counts["T"]=self.counts["A"][:]
420 res.counts["G"]=self.counts["C"][:]
421 res.counts["C"]=self.counts["G"][:]
422 res.counts["A"].reverse()
423 res.counts["C"].reverse()
424 res.counts["G"].reverse()
425 res.counts["T"].reverse()
426 res.length=self.length
427 res.mask = self.mask
428 return res
429
430
432 """
433 reads the motif from Jaspar .pfm file
434
435 The instances are fake, but the pwm is accurate.
436 """
437 return self._from_horiz_matrix(stream,letters="ACGT",make_instances=make_instances)
438
459
461 """reads a horizontal count matrix from stream and fill in the counts.
462 """
463 if letters==None:
464 letters=self.alphabet.letters
465 self.counts = {}
466 self.has_counts=True
467
468 for i in letters:
469 ln = stream.readline().strip().split()
470
471 if ln[0]==i:
472 ln=ln[1:]
473
474 try:
475 self.counts[i]=map(int,ln)
476 except ValueError:
477 self.counts[i]=map(float,ln)
478
479
480 s = sum(self.counts[nuc][0] for nuc in letters)
481 l = len(self.counts[letters[0]])
482 self.length=l
483 self.set_mask("*"*l)
484 if make_instances==True:
485 self.make_instances_from_counts()
486 return self
487
488
490 """Creates "fake" instances for a motif created from a count matrix.
491
492 In case the sums of counts are different for different columnes, the
493 shorter columns are padded with background.
494 """
495 alpha="".join(self.alphabet.letters)
496
497 col=[]
498 self.has_instances=True
499 self.instances=[]
500 s = sum(map(lambda nuc: self.counts[nuc][0],self.alphabet.letters))
501 for i in range(self.length):
502 col.append("")
503 for n in self.alphabet.letters:
504 col[i] = col[i]+ (n*(self.counts[n][i]))
505 if len(col[i])<s:
506 print "WARNING, column too short",len(col[i]),s
507 col[i]+=(alpha*s)[:(s-len(col[i]))]
508
509
510 for i in range(s):
511 inst=""
512 for j in range(self.length):
513 inst+=col[j][i]
514
515 inst=Seq(inst,self.alphabet)
516 self.add_instance(inst)
517 return self.instances
518
520 """Creates the count matrix for a motif with instances.
521
522 """
523
524
525 counts={}
526 for a in self.alphabet.letters:
527 counts[a]=[]
528 self.has_counts=True
529 s = len(self.instances)
530 for i in range(self.length):
531 ci = dict((a,0) for a in self.alphabet.letters)
532 for inst in self.instances:
533 ci[inst[i]]+=1
534 for a in self.alphabet.letters:
535 counts[a].append(ci[a])
536 self.counts=counts
537 return counts
538
540 """
541 reads the motif from Jaspar .sites file
542
543 The instances and pwm are OK.
544 """
545
546 while True:
547 ln = stream.readline()
548 if ln=="" or ln[0]!=">":
549 break
550
551 ln=stream.readline().strip()
552 i=0
553 while ln[i]==ln[i].lower():
554 i+=1
555 inst=""
556 while i<len(ln) and ln[i]==ln[i].upper():
557 inst+=ln[i]
558 i+=1
559 inst=Seq(inst,self.alphabet)
560 self.add_instance(inst)
561
562 self.set_mask("*"*len(inst))
563 return self
564
565
567 """Returns the probability distribution over symbols at a given position, padding with background.
568
569 If the requested index is out of bounds, the returned distribution comes from background.
570 """
571 if index in range(self.length):
572 return self.pwm()[index]
573 else:
574 return self.background
575
577 """Returns the consensus sequence of a motif.
578 """
579 res=""
580 for i in range(self.length):
581 max_f=0
582 max_n="X"
583 for n in sorted(self[i]):
584 if self[i][n]>max_f:
585 max_f=self[i][n]
586 max_n=n
587 res+=max_n
588 return Seq(res,self.alphabet)
589
591 """returns the least probable pattern to be generated from this motif.
592 """
593 res=""
594 for i in range(self.length):
595 min_f=10.0
596 min_n="X"
597 for n in sorted(self[i]):
598 if self[i][n]<min_f:
599 min_f=self[i][n]
600 min_n=n
601 res+=min_n
602 return Seq(res,self.alphabet)
603
605 """Maximal possible score for this motif.
606
607 returns the score computed for the consensus sequence.
608 """
609 return self.score_hit(self.consensus(),0)
610
612 """Minimal possible score for this motif.
613
614 returns the score computed for the anticonsensus sequence.
615 """
616 return self.score_hit(self.anticonsensus(),0)
617
618 - def weblogo(self,fname,format="PNG",**kwds):
619 """
620 uses the Berkeley weblogo service to download and save a weblogo of itself
621
622 requires an internet connection.
623 The parameters from **kwds are passed directly to the weblogo server.
624 """
625 import urllib
626 import urllib2
627 al= self._to_fasta()
628 url = 'http://weblogo.berkeley.edu/logo.cgi'
629 values = {'sequence' : al,
630 'format' : format,
631 'logowidth' : '18',
632 'logoheight' : '5',
633 'logounits' : 'cm',
634 'kind' : 'AUTO',
635 'firstnum' : "1",
636 'command' : 'Create Logo',
637 'smallsamplecorrection' : "on",
638 'symbolsperline' : 32,
639 'res' : '96',
640 'res_units' : 'ppi',
641 'antialias' : 'on',
642 'title' : '',
643 'barbits' : '',
644 'xaxis': 'on',
645 'xaxis_label' : '',
646 'yaxis': 'on',
647 'yaxis_label' : '',
648 'showends' : 'on',
649 'shrink' : '0.5',
650 'fineprint' : 'on',
651 'ticbits' : '1',
652 'colorscheme' : 'DEFAULT',
653 'color1' : 'green',
654 'color2' : 'blue',
655 'color3' : 'red',
656 'color4' : 'black',
657 'color5' : 'purple',
658 'color6' : 'orange',
659 'color1' : 'black',
660 }
661 for k,v in kwds.iteritems():
662 values[k]=str(v)
663
664 data = urllib.urlencode(values)
665 req = urllib2.Request(url, data)
666 response = urllib2.urlopen(req)
667 f=open(fname,"w")
668 im=response.read()
669
670 f.write(im)
671 f.close()
672
673
675 """Write the representation of a motif in TRANSFAC format
676 """
677 res="XX\nTY Motif\n"
678 try:
679 res+="ID %s\n"%self.name
680 except:
681 pass
682 res+="BF undef\nP0"
683 for a in self.alphabet.letters:
684 res+=" %s"%a
685 res+="\n"
686 if not self.has_counts:
687 self.make_counts_from_instances()
688 for i in range(self.length):
689 if i<9:
690 res+="0%d"%(i+1)
691 else:
692 res+="%d"%(i+1)
693 for a in self.alphabet.letters:
694 res+=" %d"%self.counts[a][i]
695 res+="\n"
696 res+="XX\n"
697 return res
698
700 """Return string representation of the motif as a matrix.
701
702 """
703 if letters==None:
704 letters=self.alphabet.letters
705 self._pwm_is_current=False
706 pwm=self.pwm(laplace=False)
707 res=""
708 for i in range(self.length):
709 res+="\t".join([str(pwm[i][a]) for a in letters])
710 res+="\n"
711 return res
712
714 """Return string representation of the motif as a matrix.
715
716 """
717 if letters==None:
718 letters=self.alphabet.letters
719 res=""
720 if normalized:
721 self._pwm_is_current=False
722 mat=self.pwm(laplace=False)
723 for a in letters:
724 res+="\t".join([str(mat[i][a]) for i in range(self.length)])
725 res+="\n"
726 else:
727 if not self.has_counts:
728 self.make_counts_from_instances()
729 mat=self.counts
730 for a in letters:
731 res+="\t".join([str(mat[a][i]) for i in range(self.length)])
732 res+="\n"
733 return res
734
739
759
761 """Matrix of log-odds scores for a nucleotide sequence.
762
763 scans a nucleotide sequence and returns the matrix of log-odds
764 scores for all positions.
765
766 - the result is a one-dimensional list or numpy array
767 - the sequence can only be a DNA sequence
768 - the search is performed only on one strand
769 """
770 if self.alphabet!=IUPAC.unambiguous_dna:
771 raise ValueError("Wrong alphabet! Use only with DNA motifs")
772 if seq.alphabet!=IUPAC.unambiguous_dna:
773 raise ValueError("Wrong alphabet! Use only with DNA sequences")
774
775 seq = seq.tostring()
776
777
778 try:
779 import _pwm
780 except ImportError:
781
782 return self._pwm_calculate(seq)
783
784
785
786 logodds=[[y[1] for y in sorted(x.items())] for x in self.log_odds()]
787 return _pwm.calculate(seq, logodds)
788
790 logodds = self.log_odds()
791 m = len(logodds)
792 s = len(sequence)
793 n = s - m + 1
794 result = [None] * n
795 for i in xrange(n):
796 score = 0.0
797 for j in xrange(m):
798 c = sequence[i+j]
799 temp = logodds[j].get(c)
800 if temp==None:
801 break
802 score += temp
803 else:
804 result[i] = score
805 return result
806