Package Bio :: Package SeqUtils
[hide private]
[frames] | no frames]

Source Code for Package Bio.SeqUtils

  1  #!/usr/bin/env python 
  2  # Created: Wed May 29 08:07:18 2002 
  3  # thomas@cbs.dtu.dk, Cecilia.Alsmark@ebc.uu.se 
  4  # Copyright 2001 by Thomas Sicheritz-Ponten and Cecilia Alsmark. 
  5  # All rights reserved. 
  6  # This code is part of the Biopython distribution and governed by its 
  7  # license.  Please see the LICENSE file that should have been included 
  8  # as part of this package. 
  9   
 10  """Miscellaneous functions for dealing with sequences.""" 
 11   
 12  import re, time 
 13  from Bio import SeqIO 
 14  from Bio.Seq import Seq 
 15  from Bio import Alphabet 
 16  from Bio.Alphabet import IUPAC 
 17  from Bio.Data import IUPACData, CodonTable 
 18   
 19   
 20  ###################################### 
 21  # DNA 
 22  ###################### 
 23  # {{{  
 24   
 25   
26 -def GC(seq):
27 """Calculates G+C content, returns the percentage (float between 0 and 100). 28 29 Copes mixed case sequences, and with the ambiguous nucleotide S (G or C) 30 when counting the G and C content. The percentage is calculated against 31 the full length, e.g.: 32 33 >>> from Bio.SeqUtils import GC 34 >>> GC("ACTGN") 35 40.0 36 37 Note that this will return zero for an empty sequence. 38 """ 39 try: 40 gc = sum(map(seq.count,['G','C','g','c','S','s'])) 41 return gc*100.0/len(seq) 42 except ZeroDivisionError: 43 return 0.0
44 45
46 -def GC123(seq):
47 """Calculates total G+C content plus first, second and third positions. 48 49 Returns a tuple of four floats (percentages between 0 and 100) for the 50 entire sequence, and the three codon positions. e.g. 51 52 >>> from Bio.SeqUtils import GC123 53 >>> GC123("ACTGTN") 54 (40.0, 50.0, 50.0, 0.0) 55 56 Copes with mixed case sequences, but does NOT deal with ambiguous 57 nucleotides. 58 """ 59 d= {} 60 for nt in ['A','T','G','C']: 61 d[nt] = [0,0,0] 62 63 for i in range(0,len(seq),3): 64 codon = seq[i:i+3] 65 if len(codon) <3: codon += ' ' 66 for pos in range(0,3): 67 for nt in ['A','T','G','C']: 68 if codon[pos] == nt or codon[pos] == nt.lower(): 69 d[nt][pos] += 1 70 gc = {} 71 gcall = 0 72 nall = 0 73 for i in range(0,3): 74 try: 75 n = d['G'][i] + d['C'][i] +d['T'][i] + d['A'][i] 76 gc[i] = (d['G'][i] + d['C'][i])*100.0/n 77 except: 78 gc[i] = 0 79 80 gcall = gcall + d['G'][i] + d['C'][i] 81 nall = nall + n 82 83 gcall = 100.0*gcall/nall 84 return gcall, gc[0], gc[1], gc[2]
85
86 -def GC_skew(seq, window = 100):
87 """Calculates GC skew (G-C)/(G+C) for multuple windows along the sequence. 88 89 Returns a list of ratios (floats), controlled by the length of the sequence 90 and the size of the window. 91 92 Does NOT look at any ambiguous nucleotides. 93 """ 94 # 8/19/03: Iddo: added lowercase 95 values = [] 96 for i in range(0, len(seq), window): 97 s = seq[i: i + window] 98 g = s.count('G') + s.count('g') 99 c = s.count('C') + s.count('c') 100 skew = (g-c)/float(g+c) 101 values.append(skew) 102 return values
103 104 from math import pi, sin, cos, log
105 -def xGC_skew(seq, window = 1000, zoom = 100, 106 r = 300, px = 100, py = 100):
107 """Calculates and plots normal and accumulated GC skew (GRAPHICS !!!).""" 108 from Tkinter import Scrollbar, Canvas, BOTTOM, BOTH, ALL, \ 109 VERTICAL, HORIZONTAL, RIGHT, LEFT, X, Y 110 yscroll = Scrollbar(orient = VERTICAL) 111 xscroll = Scrollbar(orient = HORIZONTAL) 112 canvas = Canvas(yscrollcommand = yscroll.set, 113 xscrollcommand = xscroll.set, background = 'white') 114 win = canvas.winfo_toplevel() 115 win.geometry('700x700') 116 117 yscroll.config(command = canvas.yview) 118 xscroll.config(command = canvas.xview) 119 yscroll.pack(side = RIGHT, fill = Y) 120 xscroll.pack(side = BOTTOM, fill = X) 121 canvas.pack(fill=BOTH, side = LEFT, expand = 1) 122 canvas.update() 123 124 X0, Y0 = r + px, r + py 125 x1, x2, y1, y2 = X0 - r, X0 + r, Y0 -r, Y0 + r 126 127 ty = Y0 128 canvas.create_text(X0, ty, text = '%s...%s (%d nt)' % (seq[:7], seq[-7:], len(seq))) 129 ty +=20 130 canvas.create_text(X0, ty, text = 'GC %3.2f%%' % (GC(seq))) 131 ty +=20 132 canvas.create_text(X0, ty, text = 'GC Skew', fill = 'blue') 133 ty +=20 134 canvas.create_text(X0, ty, text = 'Accumulated GC Skew', fill = 'magenta') 135 ty +=20 136 canvas.create_oval(x1,y1, x2, y2) 137 138 acc = 0 139 start = 0 140 for gc in GC_skew(seq, window): 141 r1 = r 142 acc+=gc 143 # GC skew 144 alpha = pi - (2*pi*start)/len(seq) 145 r2 = r1 - gc*zoom 146 x1 = X0 + r1 * sin(alpha) 147 y1 = Y0 + r1 * cos(alpha) 148 x2 = X0 + r2 * sin(alpha) 149 y2 = Y0 + r2 * cos(alpha) 150 canvas.create_line(x1,y1,x2,y2, fill = 'blue') 151 # accumulated GC skew 152 r1 = r - 50 153 r2 = r1 - acc 154 x1 = X0 + r1 * sin(alpha) 155 y1 = Y0 + r1 * cos(alpha) 156 x2 = X0 + r2 * sin(alpha) 157 y2 = Y0 + r2 * cos(alpha) 158 canvas.create_line(x1,y1,x2,y2, fill = 'magenta') 159 160 canvas.update() 161 start += window 162 163 canvas.configure(scrollregion = canvas.bbox(ALL))
164
165 -def molecular_weight(seq):
166 """Calculate the molecular weight of a DNA sequence.""" 167 if type(seq) == type(''): seq = Seq(seq, IUPAC.unambiguous_dna) 168 weight_table = IUPACData.unambiguous_dna_weights 169 return sum(weight_table[x] for x in seq)
170
171 -def nt_search(seq, subseq):
172 """Search for a DNA subseq in sequence. 173 174 use ambiguous values (like N = A or T or C or G, R = A or G etc.) 175 searches only on forward strand 176 """ 177 pattern = '' 178 for nt in subseq: 179 value = IUPACData.ambiguous_dna_values[nt] 180 if len(value) == 1: 181 pattern += value 182 else: 183 pattern += '[%s]' % value 184 185 pos = -1 186 result = [pattern] 187 l = len(seq) 188 while True: 189 pos+=1 190 s = seq[pos:] 191 m = re.search(pattern, s) 192 if not m: break 193 pos += int(m.start(0)) 194 result.append(pos) 195 return result
196 197 # }}} 198 199 ###################################### 200 # Protein 201 ###################### 202 # {{{ 203 204
205 -def seq3(seq):
206 """Turn a one letter code protein sequence into one with three letter codes. 207 208 The single input argument 'seq' should be a protein sequence using single 209 letter codes, either as a python string or as a Seq or MutableSeq object. 210 211 This function returns the amino acid sequence as a string using the three 212 letter amino acid codes. Output follows the IUPAC standard (including 213 ambiguous characters B for "Asx", J for "Xle" and X for "Xaa", and also U 214 for "Sel" and O for "Pyl") plus "Ter" for a terminator given as an asterisk. 215 Any unknown character (including possible gap characters), is changed into 216 'Xaa'. 217 218 e.g. 219 >>> from Bio.SeqUtils import seq3 220 >>> seq3("MAIVMGRWKGAR*") 221 'MetAlaIleValMetGlyArgTrpLysGlyAlaArgTer' 222 223 This function was inspired by BioPerl's seq3. 224 """ 225 threecode = {'A':'Ala', 'B':'Asx', 'C':'Cys', 'D':'Asp', 226 'E':'Glu', 'F':'Phe', 'G':'Gly', 'H':'His', 227 'I':'Ile', 'K':'Lys', 'L':'Leu', 'M':'Met', 228 'N':'Asn', 'P':'Pro', 'Q':'Gln', 'R':'Arg', 229 'S':'Ser', 'T':'Thr', 'V':'Val', 'W':'Trp', 230 'Y':'Tyr', 'Z':'Glx', 'X':'Xaa', '*':'Ter', 231 'U':'Sel', 'O':'Pyl', 'J':'Xle', 232 } 233 #We use a default of 'Xaa' for undefined letters 234 #Note this will map '-' to 'Xaa' which may be undesirable! 235 return ''.join([threecode.get(aa,'Xaa') for aa in seq])
236 237 238 # }}} 239 240 ###################################### 241 # Mixed ??? 242 ###################### 243 # {{{ 244 245
246 -def six_frame_translations(seq, genetic_code = 1):
247 """Formatted string showing the 6 frame translations and GC content. 248 249 nice looking 6 frame translation with GC content - code from xbbtools 250 similar to DNA Striders six-frame translation 251 252 e.g. 253 from Bio.SeqUtils import six_frame_translations 254 print six_frame_translations("AUGGCCAUUGUAAUGGGCCGCUGA") 255 """ 256 from Bio.Seq import reverse_complement, translate 257 anti = reverse_complement(seq) 258 comp = anti[::-1] 259 length = len(seq) 260 frames = {} 261 for i in range(0,3): 262 frames[i+1] = translate(seq[i:], genetic_code) 263 frames[-(i+1)] = reverse(translate(anti[i:], genetic_code)) 264 265 # create header 266 if length > 20: 267 short = '%s ... %s' % (seq[:10], seq[-10:]) 268 else: 269 short = seq 270 #TODO? Remove the date as this would spoil any unit test... 271 date = time.strftime('%y %b %d, %X', time.localtime(time.time())) 272 header = 'GC_Frame: %s, ' % date 273 for nt in ['a','t','g','c']: 274 header += '%s:%d ' % (nt, seq.count(nt.upper())) 275 276 header += '\nSequence: %s, %d nt, %0.2f %%GC\n\n\n' % (short.lower(),length, GC(seq)) 277 res = header 278 279 for i in range(0,length,60): 280 subseq = seq[i:i+60] 281 csubseq = comp[i:i+60] 282 p = i/3 283 res = res + '%d/%d\n' % (i+1, i/3+1) 284 res = res + ' ' + ' '.join(map(None,frames[3][p:p+20])) + '\n' 285 res = res + ' ' + ' '.join(map(None,frames[2][p:p+20])) + '\n' 286 res = res + ' '.join(map(None,frames[1][p:p+20])) + '\n' 287 # seq 288 res = res + subseq.lower() + '%5d %%\n' % int(GC(subseq)) 289 res = res + csubseq.lower() + '\n' 290 # - frames 291 res = res + ' '.join(map(None,frames[-2][p:p+20])) +' \n' 292 res = res + ' ' + ' '.join(map(None,frames[-1][p:p+20])) + '\n' 293 res = res + ' ' + ' '.join(map(None,frames[-3][p:p+20])) + '\n\n' 294 return res
295 296 # }}} 297 298 ###################################### 299 # FASTA file utilities 300 ###################### 301 # {{{ 302 303
304 -def quick_FASTA_reader(file):
305 """Simple FASTA reader, returning a list of string tuples. 306 307 The single argument 'file' should be the filename of a FASTA format file. 308 This function will open and read in the entire file, constructing a list 309 of all the records, each held as a tuple of strings (the sequence name or 310 title, and its sequence). 311 312 This function was originally intended for use on large files, where its 313 low overhead makes it very fast. However, because it returns the data as 314 a single in memory list, this can require a lot of RAM on large files. 315 316 You are generally encouraged to use Bio.SeqIO.parse(handle, "fasta") which 317 allows you to iterate over the records one by one (avoiding having all the 318 records in memory at once). Using Bio.SeqIO also makes it easy to switch 319 between different input file formats. However, please note that rather 320 than simple strings, Bio.SeqIO uses SeqRecord objects for each record. 321 """ 322 #Want to split on "\n>" not just ">" in case there are any extra ">" 323 #in the name/description. So, in order to make sure we also split on 324 #the first entry, prepend a "\n" to the start of the file. 325 handle = open(file) 326 txt = "\n" + handle.read() 327 handle.close() 328 entries = [] 329 for entry in txt.split('\n>')[1:]: 330 name,seq= entry.split('\n',1) 331 seq = seq.replace('\n','').replace(' ','').upper() 332 entries.append((name, seq)) 333 return entries
334 335 336 # }}} 337 338
339 -def _test():
340 """Run the Bio.SeqUtils module's doctests (PRIVATE).""" 341 print "Runing doctests..." 342 import doctest 343 doctest.testmod() 344 print "Done"
345 346 if __name__ == "__main__": 347 _test() 348