1
2
3
4
5
6
7
8
9
10 """Miscellaneous functions for dealing with sequences."""
11
12 import re, time
13 from Bio import SeqIO
14 from Bio.Seq import Seq
15 from Bio import Alphabet
16 from Bio.Alphabet import IUPAC
17 from Bio.Data import IUPACData, CodonTable
18
19
20
21
22
23
24
25
27 """Calculates G+C content, returns the percentage (float between 0 and 100).
28
29 Copes mixed case sequences, and with the ambiguous nucleotide S (G or C)
30 when counting the G and C content. The percentage is calculated against
31 the full length, e.g.:
32
33 >>> from Bio.SeqUtils import GC
34 >>> GC("ACTGN")
35 40.0
36
37 Note that this will return zero for an empty sequence.
38 """
39 try:
40 gc = sum(map(seq.count,['G','C','g','c','S','s']))
41 return gc*100.0/len(seq)
42 except ZeroDivisionError:
43 return 0.0
44
45
47 """Calculates total G+C content plus first, second and third positions.
48
49 Returns a tuple of four floats (percentages between 0 and 100) for the
50 entire sequence, and the three codon positions. e.g.
51
52 >>> from Bio.SeqUtils import GC123
53 >>> GC123("ACTGTN")
54 (40.0, 50.0, 50.0, 0.0)
55
56 Copes with mixed case sequences, but does NOT deal with ambiguous
57 nucleotides.
58 """
59 d= {}
60 for nt in ['A','T','G','C']:
61 d[nt] = [0,0,0]
62
63 for i in range(0,len(seq),3):
64 codon = seq[i:i+3]
65 if len(codon) <3: codon += ' '
66 for pos in range(0,3):
67 for nt in ['A','T','G','C']:
68 if codon[pos] == nt or codon[pos] == nt.lower():
69 d[nt][pos] += 1
70 gc = {}
71 gcall = 0
72 nall = 0
73 for i in range(0,3):
74 try:
75 n = d['G'][i] + d['C'][i] +d['T'][i] + d['A'][i]
76 gc[i] = (d['G'][i] + d['C'][i])*100.0/n
77 except:
78 gc[i] = 0
79
80 gcall = gcall + d['G'][i] + d['C'][i]
81 nall = nall + n
82
83 gcall = 100.0*gcall/nall
84 return gcall, gc[0], gc[1], gc[2]
85
87 """Calculates GC skew (G-C)/(G+C) for multuple windows along the sequence.
88
89 Returns a list of ratios (floats), controlled by the length of the sequence
90 and the size of the window.
91
92 Does NOT look at any ambiguous nucleotides.
93 """
94
95 values = []
96 for i in range(0, len(seq), window):
97 s = seq[i: i + window]
98 g = s.count('G') + s.count('g')
99 c = s.count('C') + s.count('c')
100 skew = (g-c)/float(g+c)
101 values.append(skew)
102 return values
103
104 from math import pi, sin, cos, log
105 -def xGC_skew(seq, window = 1000, zoom = 100,
106 r = 300, px = 100, py = 100):
107 """Calculates and plots normal and accumulated GC skew (GRAPHICS !!!)."""
108 from Tkinter import Scrollbar, Canvas, BOTTOM, BOTH, ALL, \
109 VERTICAL, HORIZONTAL, RIGHT, LEFT, X, Y
110 yscroll = Scrollbar(orient = VERTICAL)
111 xscroll = Scrollbar(orient = HORIZONTAL)
112 canvas = Canvas(yscrollcommand = yscroll.set,
113 xscrollcommand = xscroll.set, background = 'white')
114 win = canvas.winfo_toplevel()
115 win.geometry('700x700')
116
117 yscroll.config(command = canvas.yview)
118 xscroll.config(command = canvas.xview)
119 yscroll.pack(side = RIGHT, fill = Y)
120 xscroll.pack(side = BOTTOM, fill = X)
121 canvas.pack(fill=BOTH, side = LEFT, expand = 1)
122 canvas.update()
123
124 X0, Y0 = r + px, r + py
125 x1, x2, y1, y2 = X0 - r, X0 + r, Y0 -r, Y0 + r
126
127 ty = Y0
128 canvas.create_text(X0, ty, text = '%s...%s (%d nt)' % (seq[:7], seq[-7:], len(seq)))
129 ty +=20
130 canvas.create_text(X0, ty, text = 'GC %3.2f%%' % (GC(seq)))
131 ty +=20
132 canvas.create_text(X0, ty, text = 'GC Skew', fill = 'blue')
133 ty +=20
134 canvas.create_text(X0, ty, text = 'Accumulated GC Skew', fill = 'magenta')
135 ty +=20
136 canvas.create_oval(x1,y1, x2, y2)
137
138 acc = 0
139 start = 0
140 for gc in GC_skew(seq, window):
141 r1 = r
142 acc+=gc
143
144 alpha = pi - (2*pi*start)/len(seq)
145 r2 = r1 - gc*zoom
146 x1 = X0 + r1 * sin(alpha)
147 y1 = Y0 + r1 * cos(alpha)
148 x2 = X0 + r2 * sin(alpha)
149 y2 = Y0 + r2 * cos(alpha)
150 canvas.create_line(x1,y1,x2,y2, fill = 'blue')
151
152 r1 = r - 50
153 r2 = r1 - acc
154 x1 = X0 + r1 * sin(alpha)
155 y1 = Y0 + r1 * cos(alpha)
156 x2 = X0 + r2 * sin(alpha)
157 y2 = Y0 + r2 * cos(alpha)
158 canvas.create_line(x1,y1,x2,y2, fill = 'magenta')
159
160 canvas.update()
161 start += window
162
163 canvas.configure(scrollregion = canvas.bbox(ALL))
164
170
172 """Search for a DNA subseq in sequence.
173
174 use ambiguous values (like N = A or T or C or G, R = A or G etc.)
175 searches only on forward strand
176 """
177 pattern = ''
178 for nt in subseq:
179 value = IUPACData.ambiguous_dna_values[nt]
180 if len(value) == 1:
181 pattern += value
182 else:
183 pattern += '[%s]' % value
184
185 pos = -1
186 result = [pattern]
187 l = len(seq)
188 while True:
189 pos+=1
190 s = seq[pos:]
191 m = re.search(pattern, s)
192 if not m: break
193 pos += int(m.start(0))
194 result.append(pos)
195 return result
196
197
198
199
200
201
202
203
204
206 """Turn a one letter code protein sequence into one with three letter codes.
207
208 The single input argument 'seq' should be a protein sequence using single
209 letter codes, either as a python string or as a Seq or MutableSeq object.
210
211 This function returns the amino acid sequence as a string using the three
212 letter amino acid codes. Output follows the IUPAC standard (including
213 ambiguous characters B for "Asx", J for "Xle" and X for "Xaa", and also U
214 for "Sel" and O for "Pyl") plus "Ter" for a terminator given as an asterisk.
215 Any unknown character (including possible gap characters), is changed into
216 'Xaa'.
217
218 e.g.
219 >>> from Bio.SeqUtils import seq3
220 >>> seq3("MAIVMGRWKGAR*")
221 'MetAlaIleValMetGlyArgTrpLysGlyAlaArgTer'
222
223 This function was inspired by BioPerl's seq3.
224 """
225 threecode = {'A':'Ala', 'B':'Asx', 'C':'Cys', 'D':'Asp',
226 'E':'Glu', 'F':'Phe', 'G':'Gly', 'H':'His',
227 'I':'Ile', 'K':'Lys', 'L':'Leu', 'M':'Met',
228 'N':'Asn', 'P':'Pro', 'Q':'Gln', 'R':'Arg',
229 'S':'Ser', 'T':'Thr', 'V':'Val', 'W':'Trp',
230 'Y':'Tyr', 'Z':'Glx', 'X':'Xaa', '*':'Ter',
231 'U':'Sel', 'O':'Pyl', 'J':'Xle',
232 }
233
234
235 return ''.join([threecode.get(aa,'Xaa') for aa in seq])
236
237
238
239
240
241
242
243
244
245
247 """Formatted string showing the 6 frame translations and GC content.
248
249 nice looking 6 frame translation with GC content - code from xbbtools
250 similar to DNA Striders six-frame translation
251
252 e.g.
253 from Bio.SeqUtils import six_frame_translations
254 print six_frame_translations("AUGGCCAUUGUAAUGGGCCGCUGA")
255 """
256 from Bio.Seq import reverse_complement, translate
257 anti = reverse_complement(seq)
258 comp = anti[::-1]
259 length = len(seq)
260 frames = {}
261 for i in range(0,3):
262 frames[i+1] = translate(seq[i:], genetic_code)
263 frames[-(i+1)] = reverse(translate(anti[i:], genetic_code))
264
265
266 if length > 20:
267 short = '%s ... %s' % (seq[:10], seq[-10:])
268 else:
269 short = seq
270
271 date = time.strftime('%y %b %d, %X', time.localtime(time.time()))
272 header = 'GC_Frame: %s, ' % date
273 for nt in ['a','t','g','c']:
274 header += '%s:%d ' % (nt, seq.count(nt.upper()))
275
276 header += '\nSequence: %s, %d nt, %0.2f %%GC\n\n\n' % (short.lower(),length, GC(seq))
277 res = header
278
279 for i in range(0,length,60):
280 subseq = seq[i:i+60]
281 csubseq = comp[i:i+60]
282 p = i/3
283 res = res + '%d/%d\n' % (i+1, i/3+1)
284 res = res + ' ' + ' '.join(map(None,frames[3][p:p+20])) + '\n'
285 res = res + ' ' + ' '.join(map(None,frames[2][p:p+20])) + '\n'
286 res = res + ' '.join(map(None,frames[1][p:p+20])) + '\n'
287
288 res = res + subseq.lower() + '%5d %%\n' % int(GC(subseq))
289 res = res + csubseq.lower() + '\n'
290
291 res = res + ' '.join(map(None,frames[-2][p:p+20])) +' \n'
292 res = res + ' ' + ' '.join(map(None,frames[-1][p:p+20])) + '\n'
293 res = res + ' ' + ' '.join(map(None,frames[-3][p:p+20])) + '\n\n'
294 return res
295
296
297
298
299
300
301
302
303
305 """Simple FASTA reader, returning a list of string tuples.
306
307 The single argument 'file' should be the filename of a FASTA format file.
308 This function will open and read in the entire file, constructing a list
309 of all the records, each held as a tuple of strings (the sequence name or
310 title, and its sequence).
311
312 This function was originally intended for use on large files, where its
313 low overhead makes it very fast. However, because it returns the data as
314 a single in memory list, this can require a lot of RAM on large files.
315
316 You are generally encouraged to use Bio.SeqIO.parse(handle, "fasta") which
317 allows you to iterate over the records one by one (avoiding having all the
318 records in memory at once). Using Bio.SeqIO also makes it easy to switch
319 between different input file formats. However, please note that rather
320 than simple strings, Bio.SeqIO uses SeqRecord objects for each record.
321 """
322
323
324
325 handle = open(file)
326 txt = "\n" + handle.read()
327 handle.close()
328 entries = []
329 for entry in txt.split('\n>')[1:]:
330 name,seq= entry.split('\n',1)
331 seq = seq.replace('\n','').replace(' ','').upper()
332 entries.append((name, seq))
333 return entries
334
335
336
337
338
340 """Run the Bio.SeqUtils module's doctests (PRIVATE)."""
341 print "Runing doctests..."
342 import doctest
343 doctest.testmod()
344 print "Done"
345
346 if __name__ == "__main__":
347 _test()
348