1
2
3
4
5 """Command line wrapper for the multiple alignment programme MAFFT.
6 """
7
8 __docformat__ = "epytext en"
9
10
11
12
13 import os
14 from Bio.Application import _Option, _Switch, _Argument, AbstractCommandline
15
17 """Command line wrapper for the multiple alignment program MAFFT.
18
19 http://align.bmr.kyushu-u.ac.jp/mafft/software/
20
21 Example:
22
23 >>> from Bio.Align.Applications import MafftCommandline
24 >>> mafft_exe = "/opt/local/mafft"
25 >>> in_file = "../Doc/examples/opuntia.fasta"
26 >>> mafft_cline = MafftCommandline(mafft_exe, input=in_file)
27 >>> print mafft_cline
28 /opt/local/mafft ../Doc/examples/opuntia.fasta
29
30 If the mafft binary is on the path (typically the case on a Unix style
31 operating system) then you don't need to supply the executable location:
32
33 >>> from Bio.Align.Applications import MafftCommandline
34 >>> in_file = "../Doc/examples/opuntia.fasta"
35 >>> mafft_cline = MafftCommandline(input=in_file)
36 >>> print mafft_cline
37 mafft ../Doc/examples/opuntia.fasta
38
39 You would typically run the command line with mafft_cline() or via
40 the Python subprocess module, as described in the Biopython tutorial.
41 Note that MAFFT will write the alignment to stdout, which you may
42 want to save to a file and then parse, e.g.
43
44 stdout, stderr = mafft_cline()
45 handle = open("aligned.fasta", "w")
46 handle.write(stdout)
47 handle.close()
48 from Bio import AlignIO
49 align = AlignIO.read("aligned.fasta", "fasta")
50
51 Alternatively, to parse the output with AlignIO directly you can
52 use StringIO to turn the string into a handle:
53
54 stdout, stderr = mafft_cline()
55 from StringIO import StringIO
56 from Bio import AlignIO
57 align = AlignIO.read(StringIO(stdout), "fasta")
58
59 Citations:
60
61 Katoh, Toh (BMC Bioinformatics 9:212, 2008) Improved accuracy of
62 multiple ncRNA alignment by incorporating structural information into
63 a MAFFT-based framework (describes RNA structural alignment methods)
64
65 Katoh, Toh (Briefings in Bioinformatics 9:286-298, 2008) Recent
66 developments in the MAFFT multiple sequence alignment program
67 (outlines version 6)
68
69 Katoh, Toh (Bioinformatics 23:372-374, 2007) Errata PartTree: an
70 algorithm to build an approximate tree from a large number of
71 unaligned sequences (describes the PartTree algorithm)
72
73 Katoh, Kuma, Toh, Miyata (Nucleic Acids Res. 33:511-518, 2005) MAFFT
74 version 5: improvement in accuracy of multiple sequence alignment
75 (describes [ancestral versions of] the G-INS-i, L-INS-i and E-INS-i
76 strategies)
77
78 Katoh, Misawa, Kuma, Miyata (Nucleic Acids Res. 30:3059-3066, 2002)
79
80 Last checked against version: MAFFT v6.717b (2009/12/03)
81 """
82 - def __init__(self, cmd="mafft", **kwargs):
83 BLOSUM_MATRICES = ["30","45","62","80"]
84 self.parameters = \
85 [
86
87
88
89 _Switch(["--auto", "auto"],
90 "Automatically select strategy. Default off."),
91
92 _Switch(["--6merpair", "6merpair", "sixmerpair"],
93 "Distance is calculated based on the number of shared "
94 "6mers. Default: on"),
95
96
97
98
99
100 _Switch(["--globalpair", "globalpair"],
101 "All pairwise alignments are computed with the "
102 "Needleman-Wunsch algorithm. Default: off"),
103
104
105
106
107
108 _Switch(["--localpair", "localpair"],
109 "All pairwise alignments are computed with the "
110 "Smith-Waterman algorithm. Default: off"),
111
112
113
114
115
116
117 _Switch(["--genafpair", "genafpair"],
118 "All pairwise alignments are computed with a local "
119 "algorithm with the generalized affine gap cost "
120 "(Altschul 1998). Default: off"),
121
122
123 _Switch(["--fastapair", "fastapair"],
124 "All pairwise alignments are computed with FASTA "
125 "(Pearson and Lipman 1988). Default: off"),
126
127
128
129 _Option(["--weighti", "weighti"],
130 "Weighting factor for the consistency term calculated "
131 "from pairwise alignments. Default: 2.7",
132 checker_function=lambda x: isinstance(x, float),
133 equate=False),
134
135
136 _Option(["--retree", "retree"],
137 "Guide tree is built number times in the progressive "
138 "stage. Valid with 6mer distance. Default: 2",
139 checker_function=lambda x: isinstance(x, int),
140 equate=False),
141
142 _Option(["--maxiterate", "maxiterate"],
143 "Number cycles of iterative refinement are performed. "
144 "Default: 0",
145 checker_function=lambda x: isinstance(x, int),
146 equate=False),
147
148 _Switch(["--fft", "fft"],
149 "Use FFT approximation in group-to-group alignment. "
150 "Default: on"),
151
152
153 _Switch(["--nofft", "nofft"],
154 "Do not use FFT approximation in group-to-group "
155 "alignment. Default: off"),
156
157
158 _Switch(["--noscore", "noscore"],
159 "Alignment score is not checked in the iterative "
160 "refinement stage. Default: off (score is checked)"),
161
162
163 _Switch(["--memsave", "memsave"],
164 "Use the Myers-Miller (1988) algorithm. Default: "
165 "automatically turned on when the alignment length "
166 "exceeds 10,000 (aa/nt)."),
167
168
169
170 _Switch(["--parttree", "parttree"],
171 "Use a fast tree-building method with the 6mer "
172 "distance. Default: off"),
173
174
175
176 _Switch(["--dpparttree", "dpparttree"],
177 "The PartTree algorithm is used with distances "
178 "based on DP. Default: off"),
179
180
181
182
183 _Switch(["--fastaparttree", "fastaparttree"],
184 "The PartTree algorithm is used with distances based "
185 "on FASTA. Default: off"),
186
187 _Option(["--partsize", "partsize"],
188 "The number of partitions in the PartTree algorithm. "
189 "Default: 50",
190 checker_function=lambda x: isinstance(x, int),
191 equate=False),
192
193
194 _Switch(["--groupsize", "groupsize"],
195 "Do not make alignment larger than number sequences. "
196 "Default: the number of input sequences"),
197
198
199 _Option(["--op", "op"],
200 "Gap opening penalty at group-to-group alignment. "
201 "Default: 1.53",
202 checker_function=lambda x: isinstance(x, float),
203 equate=False),
204
205
206 _Option(["--ep", "ep"],
207 "Offset value, which works like gap extension penalty, "
208 "for group-to- group alignment. Default: 0.123",
209 checker_function=lambda x: isinstance(x, float),
210 equate=False),
211
212
213 _Option(["--lop", "lop"],
214 "Gap opening penalty at local pairwise alignment. "
215 "Default: 0.123",
216 checker_function=lambda x: isinstance(x, float),
217 equate=False),
218
219
220 _Option(["--lep", "lep"],
221 "Offset value at local pairwise alignment. "
222 "Default: 0.1",
223 checker_function=lambda x: isinstance(x, float),
224 equate=False),
225
226
227 _Option(["--lexp", "lexp"],
228 "Gap extension penalty at local pairwise alignment. "
229 "Default: -0.1",
230 checker_function=lambda x: isinstance(x, float),
231 equate=False),
232
233
234 _Option(["--LOP", "LOP"],
235 "Gap opening penalty to skip the alignment. "
236 "Default: -6.00",
237 checker_function=lambda x: isinstance(x, float),
238 equate=False),
239
240
241 _Option(["--LEXP", "LEXP"],
242 "Gap extension penalty to skip the alignment. "
243 "Default: 0.00",
244 checker_function=lambda x: isinstance(x, float),
245 equate=False),
246
247
248
249 _Option(["--bl", "bl"],
250 "BLOSUM number matrix is used. Default: 62",
251 checker_function=lambda x: x in BLOSUM_MATRICES,
252 equate=False),
253
254
255 _Option(["--jtt", "jtt"],
256 "JTT PAM number (Jones et al. 1992) matrix is used. "
257 "number>0. Default: BLOSUM62",
258 equate=False),
259
260
261 _Option(["--tm", "tm"],
262 "Transmembrane PAM number (Jones et al. 1994) "
263 "matrix is used. number>0. Default: BLOSUM62",
264 checker_function=os.path.exists,
265 filename=True,
266 equate=False),
267
268
269
270 _Option(["--aamatrix", "aamatrix"],
271 "Use a user-defined AA scoring matrix. "
272 "Default: BLOSUM62",
273 checker_function=os.path.exists,
274 filename=True,
275 equate=False),
276
277
278 _Switch(["--fmodel", "fmodel"],
279 "Incorporate the AA/nuc composition information into "
280 "the scoring matrix (True) or not (False, default)"),
281
282
283 _Switch(["--clustalout", "clustalout"],
284 "Output format: clustal (True) or fasta (False, default)"),
285
286 _Switch(["--inputorder", "inputorder"],
287 "Output order: same as input (True, default) or alignment "
288 "based (False)"),
289
290 _Switch(["--reorder", "reorder"],
291 "Output order: aligned (True) or in input order (False, "
292 "default)"),
293
294 _Switch(["--treeout", "treeout"],
295 "Guide tree is output to the input.tree file (True) or "
296 "not (False, default)"),
297
298 _Switch(["--quiet", "quiet"],
299 "Do not report progress (True) or not (False, default)."),
300
301
302 _Switch(["--nuc", "nuc"],
303 "Assume the sequences are nucleotide (True/False). "
304 "Default: auto"),
305
306 _Switch(["--amino", "amino"],
307 "Assume the sequences are amino acid (True/False). "
308 "Default: auto"),
309
310
311
312
313
314
315 _Option(["--seed", "seed"],
316 "Seed alignments given in alignment_n (fasta format) "
317 "are aligned with sequences in input.",
318 checker_function=os.path.exists,
319 filename=True,
320 equate=False),
321
322
323
324
325
326
327
328
329
330 _Argument(["input"],
331 "Input file name",
332 checker_function=os.path.exists,
333 filename=True,
334 is_required=True),
335
336
337
338 _Argument(["input1"],
339 "Second input file name for the mafft-profile command",
340 checker_function=os.path.exists,
341 filename=True),
342 ]
343 AbstractCommandline.__init__(self, cmd, **kwargs)
344
346 """Run the module's doctests (PRIVATE).
347
348 This will try and locate the unit tests directory, and run the doctests
349 from there in order that the relative paths used in the examples work.
350 """
351
352 import doctest
353 import os
354 if os.path.isdir(os.path.join("..","Tests")):
355 print "Runing doctests..."
356 cur_dir = os.path.abspath(os.curdir)
357 os.chdir(os.path.join("..","Tests"))
358 doctest.testmod()
359 os.chdir(cur_dir)
360 del cur_dir
361 print "Done"
362 elif os.path.isdir(os.path.join("Tests")) :
363 print "Runing doctests..."
364 cur_dir = os.path.abspath(os.curdir)
365 os.chdir(os.path.join("Tests"))
366 doctest.testmod()
367 os.chdir(cur_dir)
368 del cur_dir
369 print "Done"
370
371 if __name__ == "__main__":
372 _test()
373