1
2
3
4
5 """Command line wrapper for the multiple alignment program MUSCLE.
6 """
7
8 __docformat__ = "epytext en"
9
10 from Bio.Application import _Option, _Switch, AbstractCommandline
11
class MuscleCommandline(AbstractCommandline):
    r"""Command line wrapper for the multiple alignment program MUSCLE.

    http://www.drive5.com/muscle/

    Example:

    >>> from Bio.Align.Applications import MuscleCommandline
    >>> muscle_exe = r"C:\Program Files\Aligments\muscle3.8.31_i86win32.exe"
    >>> in_file = r"C:\My Documents\unaligned.fasta"
    >>> out_file = r"C:\My Documents\aligned.fasta"
    >>> muscle_cline = MuscleCommandline(muscle_exe, input=in_file, out=out_file)
    >>> print(muscle_cline)
    C:\Program Files\Aligments\muscle3.8.31_i86win32.exe -in "C:\My Documents\unaligned.fasta" -out "C:\My Documents\aligned.fasta"

    You would typically run the command line with muscle_cline() or via
    the Python subprocess module, as described in the Biopython tutorial.

    Citations:

    Edgar, Robert C. (2004), MUSCLE: multiple sequence alignment with high
    accuracy and high throughput, Nucleic Acids Research 32(5), 1792-97.

    Edgar, R.C. (2004) MUSCLE: a multiple sequence alignment method with
    reduced time and space complexity. BMC Bioinformatics 5(1): 113.

    Last checked against version: 3.7, briefly against 3.8
    """

    def __init__(self, cmd="muscle", **kwargs):
        """Declare the supported MUSCLE options, then initialise the base class.

        @param cmd: name or path of the MUSCLE executable (default "muscle").
        @param kwargs: passed unchanged to AbstractCommandline.__init__; in
            the class example these are option values keyed by the names
            declared below (e.g. input=..., out=...).
        """
        # Allowed values for the enumerated options; the checker lambdas
        # below close over these lists.
        CLUSTERING_ALGORITHMS = ["upgma", "upgmb", "neighborjoining"]
        DISTANCE_MEASURES_ITER1 = ["kmer6_6", "kmer20_3", "kmer20_4",
                                   "kbit20_3", "kmer4_6"]
        DISTANCE_MEASURES_ITER2 = DISTANCE_MEASURES_ITER1 + \
            ["pctid_kimura", "pctid_log"]
        OBJECTIVE_SCORES = ["sp", "ps", "dp", "xp", "spf", "spm"]
        TREE_ROOT_METHODS = ["pseudo", "midlongestspan", "minavgleafdist"]
        SEQUENCE_TYPES = ["protein", "nucleo", "auto"]
        WEIGHTING_SCHEMES = ["none", "clustalw", "henikoff", "henikoffpb",
                             "gsc", "threeway"]

        # Each entry lists the command line flag first, followed by the
        # keyword name(s) accepted from Python ("input" exists because "in"
        # is a reserved word).  Every value option uses equate=False so the
        # flag and its value are separated by a space, as shown in the class
        # example.  The list order is kept as is.
        self.parameters = [
            # --- Input / output files and basic mode switches ---
            _Option(["-in", "in", "input"],
                    "Input filename",
                    filename=True,
                    equate=False),
            _Option(["-out", "out"],
                    "Output filename",
                    filename=True,
                    equate=False),
            _Switch(["-diags", "diags"],
                    "Find diagonals (faster for similar sequences)"),
            _Switch(["-profile", "profile"],
                    "Perform a profile alignment"),
            _Option(["-in1", "in1"],
                    "First input filename for profile alignment",
                    filename=True,
                    equate=False),
            _Option(["-in2", "in2"],
                    "Second input filename for a profile alignment",
                    filename=True,
                    equate=False),

            # --- Value options ---
            _Option(["-anchorspacing", "anchorspacing"],
                    "Minimum spacing between anchor columns",
                    checker_function=lambda x: isinstance(x, int),
                    equate=False),
            _Option(["-center", "center"],
                    "Center parameter - should be negative",
                    checker_function=lambda x: isinstance(x, float),
                    equate=False),
            _Option(["-cluster1", "cluster1"],
                    "Clustering method used in iteration 1",
                    checker_function=lambda x: x in CLUSTERING_ALGORITHMS,
                    equate=False),
            _Option(["-cluster2", "cluster2"],
                    "Clustering method used in iteration 2",
                    checker_function=lambda x: x in CLUSTERING_ALGORITHMS,
                    equate=False),
            # Fixed: this was the only option declared with equate=True,
            # which would emit "-diaglength=N" instead of the
            # space-separated form used by every other option here.
            _Option(["-diaglength", "diaglength"],
                    "Minimum length of diagonal",
                    checker_function=lambda x: isinstance(x, int),
                    equate=False),
            _Option(["-diagmargin", "diagmargin"],
                    "Discard this many positions at ends of diagonal",
                    checker_function=lambda x: isinstance(x, int),
                    equate=False),
            _Option(["-distance1", "distance1"],
                    "Distance measure for iteration 1",
                    checker_function=lambda x: x in DISTANCE_MEASURES_ITER1,
                    equate=False),
            _Option(["-distance2", "distance2"],
                    "Distance measure for iteration 2",
                    checker_function=lambda x: x in DISTANCE_MEASURES_ITER2,
                    equate=False),
            _Option(["-gapopen", "gapopen"],
                    "Gap open score - negative number",
                    checker_function=lambda x: isinstance(x, float),
                    equate=False),
            _Option(["-hydro", "hydro"],
                    "Window size for hydrophobic region",
                    checker_function=lambda x: isinstance(x, int),
                    equate=False),
            _Option(["-hydrofactor", "hydrofactor"],
                    "Multiplier for gap penalties in hydrophobic regions",
                    checker_function=lambda x: isinstance(x, float),
                    equate=False),
            _Option(["-log", "log"],
                    "Log file name",
                    filename=True,
                    equate=False),
            _Option(["-loga", "loga"],
                    "Log file name (append to existing file)",
                    filename=True,
                    equate=False),
            _Option(["-maxdiagbreak", "maxdiagbreak"],
                    "Maximum distance between two diagonals that allows "
                    "them to merge into one diagonal",
                    checker_function=lambda x: isinstance(x, int),
                    equate=False),
            _Option(["-maxhours", "maxhours"],
                    "Maximum time to run in hours",
                    checker_function=lambda x: isinstance(x, float),
                    equate=False),
            _Option(["-maxiters", "maxiters"],
                    "Maximum number of iterations",
                    checker_function=lambda x: isinstance(x, int),
                    equate=False),
            _Option(["-maxtrees", "maxtrees"],
                    "Maximum number of trees to build in iteration 2",
                    checker_function=lambda x: isinstance(x, int),
                    equate=False),
            _Option(["-minbestcolscore", "minbestcolscore"],
                    "Minimum score a column must have to be an anchor",
                    checker_function=lambda x: isinstance(x, float),
                    equate=False),
            _Option(["-minsmoothscore", "minsmoothscore"],
                    "Minimum smoothed score a column must have to "
                    "be an anchor",
                    checker_function=lambda x: isinstance(x, float),
                    equate=False),
            _Option(["-objscore", "objscore"],
                    "Objective score used by tree dependent refinement",
                    checker_function=lambda x: x in OBJECTIVE_SCORES,
                    equate=False),
            _Option(["-root1", "root1"],
                    "Method used to root tree in iteration 1",
                    checker_function=lambda x: x in TREE_ROOT_METHODS,
                    equate=False),
            _Option(["-root2", "root2"],
                    "Method used to root tree in iteration 2",
                    checker_function=lambda x: x in TREE_ROOT_METHODS,
                    equate=False),
            _Option(["-seqtype", "seqtype"],
                    "Sequence type",
                    checker_function=lambda x: x in SEQUENCE_TYPES,
                    equate=False),
            _Option(["-smoothscoreceil", "smoothscoreceil"],
                    "Maximum value of column score for smoothing",
                    checker_function=lambda x: isinstance(x, float),
                    equate=False),
            _Option(["-smoothwindow", "smoothwindow"],
                    "Window used for anchor column smoothing",
                    checker_function=lambda x: isinstance(x, int),
                    equate=False),
            _Option(["-sueff", "sueff"],
                    "Constant used in UPGMB clustering",
                    checker_function=lambda x: isinstance(x, float),
                    equate=False),
            # Fixed: the tree options name output files, so they are now
            # flagged as filenames like every other file option above.
            _Option(["-tree1", "tree1"],
                    "Save Newick tree from iteration 1",
                    filename=True,
                    equate=False),
            _Option(["-tree2", "tree2"],
                    "Save Newick tree from iteration 2",
                    filename=True,
                    equate=False),
            _Option(["-weight1", "weight1"],
                    "Weighting scheme used in iteration 1",
                    checker_function=lambda x: x in WEIGHTING_SCHEMES,
                    equate=False),
            _Option(["-weight2", "weight2"],
                    "Weighting scheme used in iteration 2",
                    checker_function=lambda x: x in WEIGHTING_SCHEMES,
                    equate=False),

            # --- Output format switches ---
            _Switch(["-clw", "clw"],
                    "Write output in CLUSTALW format (with a MUSCLE header)"),
            _Switch(["-clwstrict", "clwstrict"],
                    "Write output in CLUSTALW format with version "
                    "1.81 header"),
            _Switch(["-fasta", "fasta"],
                    "Write output in FASTA format"),
            _Switch(["-html", "html"],
                    "Write output in HTML format"),
            _Switch(["-msf", "msf"],
                    "Write output in MSF format"),
            _Switch(["-phyi", "phyi"],
                    "Write output in PHYLIP interleaved format"),
            _Switch(["-phys", "phys"],
                    "Write output in PHYLIP sequential format"),

            # --- Additional per-format output files ---
            _Option(["-phyiout", "phyiout"],
                    "Write PHYLIP interleaved output to specified filename",
                    filename=True,
                    equate=False),
            _Option(["-physout", "physout"],
                    "Write PHYLIP sequential format to specified filename",
                    filename=True,
                    equate=False),
            _Option(["-htmlout", "htmlout"],
                    "Write HTML output to specified filename",
                    filename=True,
                    equate=False),
            _Option(["-clwout", "clwout"],
                    "Write CLUSTALW output (with MUSCLE header) to specified "
                    "filename",
                    filename=True,
                    equate=False),
            _Option(["-clwstrictout", "clwstrictout"],
                    "Write CLUSTALW output (with version 1.81 header) to "
                    "specified filename",
                    filename=True,
                    equate=False),
            _Option(["-msfout", "msfout"],
                    "Write MSF format output to specified filename",
                    filename=True,
                    equate=False),
            _Option(["-fastaout", "fastaout"],
                    "Write FASTA format output to specified filename",
                    filename=True,
                    equate=False),

            # --- Behaviour switches ---
            _Switch(["-anchors", "anchors"],
                    "Use anchor optimisation in tree dependent "
                    "refinement iterations"),
            _Switch(["-noanchors", "noanchors"],
                    "Do not use anchor optimisation in tree dependent "
                    "refinement iterations"),
            _Switch(["-group", "group"],
                    "Group similar sequences in output"),
            _Switch(["-stable", "stable"],
                    "Do not group similar sequences in output "
                    "(not supported in v3.8)"),
            _Switch(["-le", "le"],
                    "Use log-expectation profile score (VTML240)"),
            _Switch(["-sv", "sv"],
                    "Use sum-of-pairs profile score (VTML240)"),
            _Switch(["-sp", "sp"],
                    "Use sum-of-pairs protein profile score (PAM200)"),
            _Switch(["-spn", "spn"],
                    "Use sum-of-pairs protein nucleotide profile score"),
            # Fixed: the description was a copy of the -spn text above.
            _Switch(["-quiet", "quiet"],
                    "Do not display progress messages"),
            _Switch(["-refine", "refine"],
                    "Only do tree dependent refinement"),
            _Switch(["-core", "core"],
                    "Catch exceptions"),
            _Switch(["-nocore", "nocore"],
                    "Do not catch exceptions"),
            _Switch(["-verbose", "verbose"],
                    "Write parameter settings and progress"),
            _Switch(["-version", "version"],
                    "Write version string to stdout and exit"),
        ]
        # self.parameters is assigned before the base initialiser runs,
        # which is handed cmd and the caller's keyword arguments.
        AbstractCommandline.__init__(self, cmd, **kwargs)
465
467 """Run the module's doctests (PRIVATE)."""
468 print "Runing MUSCLE doctests..."
469 import doctest
470 doctest.testmod()
471 print "Done"
472
473 if __name__ == "__main__":
474 _test()
475