Package Bio :: Package Align :: Package Applications :: Module _Muscle
[hide private]
[frames] | no frames]

Source Code for Module Bio.Align.Applications._Muscle

  1  # Copyright 2009 by Cymon J. Cox.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  """Command line wrapper for the multiple alignment program MUSCLE. 
  6  """ 
  7   
  8  __docformat__ = "epytext en" #Don't just use plain text in epydoc API pages! 
  9   
 10  from Bio.Application import _Option, _Switch, AbstractCommandline 
 11   
12 -class MuscleCommandline(AbstractCommandline):
13 r"""Command line wrapper for the multiple alignment program MUSCLE. 14 15 http://www.drive5.com/muscle/ 16 17 Example: 18 19 >>> from Bio.Align.Applications import MuscleCommandline 20 >>> muscle_exe = r"C:\Program Files\Aligments\muscle3.8.31_i86win32.exe" 21 >>> in_file = r"C:\My Documents\unaligned.fasta" 22 >>> out_file = r"C:\My Documents\aligned.fasta" 23 >>> muscle_cline = MuscleCommandline(muscle_exe, input=in_file, out=out_file) 24 >>> print muscle_cline 25 C:\Program Files\Aligments\muscle3.8.31_i86win32.exe -in "C:\My Documents\unaligned.fasta" -out "C:\My Documents\aligned.fasta" 26 27 You would typically run the command line with muscle_cline() or via 28 the Python subprocess module, as described in the Biopython tutorial. 29 30 Citations: 31 32 Edgar, Robert C. (2004), MUSCLE: multiple sequence alignment with high 33 accuracy and high throughput, Nucleic Acids Research 32(5), 1792-97. 34 35 Edgar, R.C. (2004) MUSCLE: a multiple sequence alignment method with 36 reduced time and space complexity. BMC Bioinformatics 5(1): 113. 37 38 Last checked against version: 3.7, briefly against 3.8 39 """
40 - def __init__(self, cmd="muscle", **kwargs):
41 CLUSTERING_ALGORITHMS = ["upgma", "upgmb", "neighborjoining"] 42 DISTANCE_MEASURES_ITER1 = ["kmer6_6", "kmer20_3", "kmer20_4", "kbit20_3", 43 "kmer4_6"] 44 DISTANCE_MEASURES_ITER2 = DISTANCE_MEASURES_ITER1 + \ 45 ["pctid_kimura", "pctid_log"] 46 OBJECTIVE_SCORES = ["sp", "ps", "dp", "xp", "spf", "spm"] 47 TREE_ROOT_METHODS = ["pseudo", "midlongestspan", "minavgleafdist"] 48 SEQUENCE_TYPES = ["protein", "nucleo", "auto"] 49 WEIGHTING_SCHEMES = ["none", "clustalw", "henikoff", "henikoffpb", 50 "gsc", "threeway"] 51 self.parameters = \ 52 [ 53 #Can't use "in" as the final alias as this is a reserved word in python: 54 _Option(["-in", "in", "input"], 55 "Input filename", 56 filename=True, 57 equate=False), 58 _Option(["-out", "out"], 59 "Output filename", 60 filename=True, 61 equate=False), 62 _Switch(["-diags", "diags"], 63 "Find diagonals (faster for similar sequences)"), 64 _Switch(["-profile", "profile"], 65 "Perform a profile alignment"), 66 _Option(["-in1", "in1"], 67 "First input filename for profile alignment", 68 filename=True, 69 equate=False), 70 _Option(["-in2", "in2"], 71 "Second input filename for a profile alignment", 72 filename=True, 73 equate=False), 74 #anchorspacing Integer 32 Minimum spacing between 75 _Option(["-anchorspacing", "anchorspacing"], 76 "Minimum spacing between anchor columns", 77 checker_function=lambda x: isinstance(x, int), 78 equate=False), 79 #center Floating point [1] Center parameter. 80 # Should be negative. 81 _Option(["-center", "center"], 82 "Center parameter - should be negative", 83 checker_function=lambda x: isinstance(x, float), 84 equate=False), 85 #cluster1 upgma upgmb Clustering method. 86 _Option(["-cluster1", "cluster1"], 87 "Clustering method used in iteration 1", 88 checker_function=lambda x: x in CLUSTERING_ALGORITHMS, 89 equate=False), 90 #cluster2 upgmb cluster1 is used in 91 # neighborjoining iteration 1 and 2, 92 # cluster2 in later 93 # iterations. 94 _Option(["-cluster2", "cluster2"], 95 "Clustering method used in iteration 2", 96 checker_function=lambda x: x in CLUSTERING_ALGORITHMS, 97 equate=False), 98 #diaglength Integer 24 Minimum length of 99 # diagonal. 100 _Option(["-diaglength", "diaglength"], 101 "Minimum length of diagonal", 102 checker_function=lambda x: isinstance(x, int), 103 equate=True), 104 #diagmargin Integer 5 Discard this many 105 # positions at ends of 106 # diagonal. 107 _Option(["-diagmargin", "diagmargin"], 108 "Discard this many positions at ends of diagonal", 109 checker_function=lambda x: isinstance(x, int), 110 equate=False), 111 #distance1 kmer6_6 Kmer6_6 (amino) or Distance measure for 112 # kmer20_3 Kmer4_6 (nucleo) iteration 1. 113 # kmer20_4 114 # kbit20_3 115 # kmer4_6 116 _Option(["-distance1", "distance1"], 117 "Distance measure for iteration 1", 118 checker_function=lambda x: x in DISTANCE_MEASURES_ITER1, 119 equate=False), 120 #distance2 kmer6_6 pctid_kimura Distance measure for 121 # kmer20_3 iterations 2, 3 ... 122 # kmer20_4 123 # kbit20_3 124 # pctid_kimura 125 # pctid_log 126 _Option(["-distance2", "distance2"], 127 "Distance measure for iteration 2", 128 checker_function=lambda x: x in DISTANCE_MEASURES_ITER2, 129 equate=False), 130 #gapopen Floating point [1] The gap open score. 131 # Must be negative. 132 _Option(["-gapopen", "gapopen"], 133 "Gap open score - negative number", 134 checker_function=lambda x: isinstance(x, float), 135 equate=False), 136 #hydro Integer 5 Window size for 137 # determining whether a 138 # region is hydrophobic. 139 _Option(["-hydro", "hydro"], 140 "Window size for hydrophobic region", 141 checker_function=lambda x: isinstance(x, int), 142 equate=False), 143 #hydrofactor Floating point 1.2 Multiplier for gap 144 # open/close penalties in 145 # hydrophobic regions. 146 _Option(["-hydrofactor", "hydrofactor"], 147 "Multiplier for gap penalties in hydrophobic regions", 148 checker_function=lambda x: isinstance(x, float), 149 equate=False), 150 #log File name None. Log file name (delete 151 # existing file). 152 _Option(["-log", "log"], 153 "Log file name", 154 filename=True, 155 equate=False), 156 #loga File name None. Log file name (append 157 # to existing file). 158 _Option(["-loga", "loga"], 159 "Log file name (append to existing file)", 160 filename=True, 161 equate=False), 162 #maxdiagbreak Integer 1 Maximum distance 163 # between two diagonals 164 # that allows them to 165 # merge into one 166 # diagonal. 167 _Option(["-maxdiagbreak", "maxdiagbreak"], 168 "Maximum distance between two diagonals that allows " 169 "them to merge into one diagonal", 170 checker_function=lambda x: isinstance(x, int), 171 equate=False), 172 #maxhours Floating point None. Maximum time to run in 173 # hours. The actual time 174 # may exceed the 175 # requested limit by a 176 # few minutes. Decimals 177 # are allowed, so 1.5 178 # means one hour and 30 179 # minutes. 180 _Option(["-maxhours", "maxhours"], 181 "Maximum time to run in hours", 182 checker_function=lambda x: isinstance(x, float), 183 equate=False), 184 #maxiters Integer 1, 2 ... 16 Maximum number of 185 # iterations. 186 _Option(["-maxiters", "maxiters"], 187 "Maximum number of iterations", 188 checker_function=lambda x: isinstance(x, int), 189 equate=False), 190 #maxtrees Integer 1 Maximum number of new 191 # trees to build in 192 # iteration 2. 193 _Option(["-maxtrees", "maxtrees"], 194 "Maximum number of trees to build in iteration 2", 195 checker_function=lambda x: isinstance(x, int), 196 equate=False), 197 #minbestcolscore Floating point [1] Minimum score a column 198 # must have to be an 199 # anchor. 200 _Option(["-minbestcolscore", "minbestcolscore"], 201 "Minimum score a column must have to be an anchor", 202 checker_function=lambda x: isinstance(x, float), 203 equate=False), 204 #minsmoothscore Floating point [1] Minimum smoothed score 205 # a column must have to 206 # be an anchor. 207 _Option(["-minsmoothscore", "minsmoothscore"], 208 "Minimum smoothed score a column must have to " 209 "be an anchor", 210 checker_function=lambda x: isinstance(x, float), 211 equate=False), 212 #objscore sp spm Objective score used by 213 # ps tree dependent 214 # dp refinement. 215 # xp sp=sum-of-pairs score. 216 # spf spf=sum-of-pairs score 217 # spm (dimer approximation) 218 # spm=sp for < 100 seqs, 219 # otherwise spf 220 # dp=dynamic programming 221 # score. 222 # ps=average profile- 223 # sequence score. 224 # xp=cross profile score. 225 _Option(["-objscore", "objscore"], 226 "Objective score used by tree dependent refinement", 227 checker_function=lambda x: x in OBJECTIVE_SCORES, 228 equate=False), 229 #root1 pseudo psuedo Method used to root 230 _Option(["-root1", "root1"], 231 "Method used to root tree in iteration 1", 232 checker_function=lambda x: x in TREE_ROOT_METHODS, 233 equate=False), 234 #root2 midlongestspan tree; root1 is used in 235 # minavgleafdist iteration 1 and 2, 236 # root2 in later 237 # iterations. 238 _Option(["-root2", "root2"], 239 "Method used to root tree in iteration 2", 240 checker_function=lambda x: x in TREE_ROOT_METHODS, 241 equate=False), 242 #seqtype protein auto Sequence type. 243 # nucleo 244 # auto 245 _Option(["-seqtype", "seqtype"], 246 "Sequence type", 247 checker_function=lambda x: x in SEQUENCE_TYPES, 248 equate=False), 249 #smoothscoreceil Floating point [1] Maximum value of column 250 # score for smoothing 251 # purposes. 252 _Option(["-smoothscoreceil", "smoothscoreceil"], 253 "Maximum value of column score for smoothing", 254 checker_function=lambda x: isinstance(x, float), 255 equate=False), 256 #smoothwindow Integer 7 Window used for anchor 257 # column smoothing. 258 _Option(["-smoothwindow", "smoothwindow"], 259 "Window used for anchor column smoothing", 260 checker_function=lambda x: isinstance(x, int), 261 equate=False), 262 #SUEFF Floating point value 0.1 Constant used in UPGMB 263 # between 0 and 1. clustering. Determines 264 # the relative fraction 265 # of average linkage 266 # (SUEFF) vs. nearest- 267 # neighbor linkage (1 268 # SUEFF). 269 _Option(["-sueff", "sueff"], 270 "Constant used in UPGMB clustering", 271 checker_function=lambda x: isinstance(x, float), 272 equate=False), 273 #tree1 File name None Save tree produced in 274 _Option(["-tree1", "tree1"], 275 "Save Newick tree from iteration 1", 276 equate=False), 277 #tree2 first or second 278 # iteration to given file 279 # in Newick (Phylip- 280 # compatible) format. 281 _Option(["-tree2", "tree2"], 282 "Save Newick tree from iteration 2", 283 equate=False), 284 #weight1 none clustalw Sequence weighting 285 _Option(["-weight1", "weight1"], 286 "Weighting scheme used in iteration 1", 287 checker_function=lambda x: x in WEIGHTING_SCHEMES, 288 equate=False), 289 #weight2 henikoff scheme. 290 # henikoffpb weight1 is used in 291 # gsc iterations 1 and 2. 292 # clustalw weight2 is used for 293 # threeway tree-dependent 294 # refinement. 295 # none=all sequences have 296 # equal weight. 297 # henikoff=Henikoff & 298 # Henikoff weighting 299 # scheme. 300 # henikoffpb=Modified 301 # Henikoff scheme as used 302 # in PSI-BLAST. 303 # clustalw=CLUSTALW 304 # method. 305 # threeway=Gotoh three- 306 # way method. 307 _Option(["-weight2", "weight2"], 308 "Weighting scheme used in iteration 2", 309 checker_function=lambda x: x in WEIGHTING_SCHEMES, 310 equate=False), 311 #################### FORMATS ####################################### 312 # Multiple formats can be specified on the command line 313 # If -msf appears it will be used regardless of other formats 314 # specified. If -clw appears (and not -msf), clustalw format will be 315 # used regardless of other formats specified. If both -clw and 316 # -clwstrict are specified -clwstrict will be used regardless of 317 # other formats specified. If -fasta is specified and not -msf, 318 # -clw, or clwstrict, fasta will be used. If -fasta and -html are 319 # specified -fasta will be used. Only if -html is specified alone 320 # will html be used. I kid ye not. 321 #clw no Write output in CLUSTALW format (default is 322 # FASTA). 323 _Switch(["-clw", "clw"], 324 "Write output in CLUSTALW format (with a MUSCLE header)"), 325 #clwstrict no Write output in CLUSTALW format with the 326 # "CLUSTAL W (1.81)" header rather than the 327 # MUSCLE version. This is useful when a post- 328 # processing step is picky about the file 329 # header. 330 _Switch(["-clwstrict", "clwstrict"], 331 "Write output in CLUSTALW format with version 1.81 header"), 332 #fasta yes Write output in FASTA format. Alternatives 333 # include clw, 334 # clwstrict, msf and html. 335 _Switch(["-fasta", "fasta"], 336 "Write output in FASTA format"), 337 #html no Write output in HTML format (default is 338 # FASTA). 339 _Switch(["-html", "html"], 340 "Write output in HTML format"), 341 #msf no Write output in MSF format (default is 342 # FASTA). 343 _Switch(["-msf", "msf"], 344 "Write output in MSF format"), 345 #Phylip interleaved - undocumented as of 3.7 346 _Switch(["-phyi", "phyi"], 347 "Write output in PHYLIP interleaved format"), 348 #Phylip sequential - undocumented as of 3.7 349 _Switch(["-phys", "phys"], 350 "Write output in PHYLIP sequential format"), 351 ################## Additional specified output files ######### 352 _Option(["-phyiout", "phyiout"], 353 "Write PHYLIP interleaved output to specified filename", 354 filename=True, 355 equate=False), 356 _Option(["-physout", "physout"],"Write PHYLIP sequential format to specified filename", 357 filename=True, 358 equate=False), 359 _Option(["-htmlout", "htmlout"],"Write HTML output to specified filename", 360 filename=True, 361 equate=False), 362 _Option(["-clwout", "clwout"], 363 "Write CLUSTALW output (with MUSCLE header) to specified " 364 "filename", 365 filename=True, 366 equate=False), 367 _Option(["-clwstrictout", "clwstrictout"], 368 "Write CLUSTALW output (with version 1.81 header) to " 369 "specified filename", 370 filename=True, 371 equate=False), 372 _Option(["-msfout", "msfout"], 373 "Write MSF format output to specified filename", 374 filename=True, 375 equate=False), 376 _Option(["-fastaout", "fastaout"], 377 "Write FASTA format output to specified filename", 378 filename=True, 379 equate=False), 380 ############## END FORMATS ################################### 381 #anchors yes Use anchor optimization in tree dependent 382 # refinement iterations. 383 _Switch(["-anchors", "anchors"], 384 "Use anchor optimisation in tree dependent " 385 "refinement iterations"), 386 #noanchors no Disable anchor optimization. Default is 387 # anchors. 388 _Switch(["-noanchors", "noanchors"], 389 "Do not use anchor optimisation in tree dependent " 390 "refinement iterations"), 391 #group yes Group similar sequences together in the 392 # output. This is the default. See also 393 # stable. 394 _Switch(["-group", "group"], 395 "Group similar sequences in output"), 396 #stable no Preserve input order of sequences in output 397 # file. Default is to group sequences by 398 # similarity (group). 399 _Switch(["-stable", "stable"], 400 "Do not group similar sequences in output (not supported in v3.8)"), 401 ############## log-expectation profile score ###################### 402 # One of either -le, -sp, or -sv 403 # 404 # According to the doc, spn is default and the only option for 405 # nucleotides: this doesnt appear to be true. -le, -sp, and -sv can 406 # be used and produce numerically different logs (what is going on?) 407 # 408 #spn fails on proteins 409 #le maybe Use log-expectation profile score (VTML240). 410 # Alternatives are to use sp or sv. This is 411 # the default for amino acid sequences. 412 _Switch(["-le", "le"], 413 "Use log-expectation profile score (VTML240)"), 414 #sv no Use sum-of-pairs profile score (VTML240). 415 # Default is le. 416 _Switch(["-sv", "sv"], 417 "Use sum-of-pairs profile score (VTML240)"), 418 #sp no Use sum-of-pairs protein profile score 419 # (PAM200). Default is le. 420 _Switch(["-sp", "sp"], 421 "Use sum-of-pairs protein profile score (PAM200)"), 422 #spn maybe Use sum-of-pairs nucleotide profile score 423 # (BLASTZ parameters). This is the only option 424 # for nucleotides, and is therefore the 425 # default. 426 _Switch(["-spn", "spn"], 427 "Use sum-of-pairs protein nucleotide profile score"), 428 ############## END log-expectation profile score ###################### 429 #quiet no Do not display progress messages. 430 _Switch(["-quiet", "quiet"], 431 "Use sum-of-pairs protein nucleotide profile score"), 432 #refine no Input file is already aligned, skip first 433 # two iterations and begin tree dependent 434 # refinement. 435 _Switch(["-refine", "refine"], 436 "Only do tree dependent refinement"), 437 #core yes in muscle, Do not catch exceptions. 438 # no in muscled. 439 _Switch(["-core", "core"], 440 "Catch exceptions"), 441 #nocore no in muscle, Catch exceptions and give an error message 442 # yes in muscled. if possible. 443 _Switch(["-nocore", "nocore"], 444 "Do not catch exceptions"), 445 #termgapsfull no Terminal gaps penalized with full penalty. 446 # [1] Not fully supported in this version. 447 # 448 #termgapshalf yes Terminal gaps penalized with half penalty. 449 # [1] Not fully supported in this version. 450 # 451 #termgapshalflonger no Terminal gaps penalized with half penalty if 452 # gap relative to 453 # longer sequence, otherwise with full 454 # penalty. 455 # [1] Not fully supported in this version. 456 #verbose no Write parameter settings and progress 457 # messages to log file. 458 _Switch(["-verbose", "verbose"], 459 "Write parameter settings and progress"), 460 #version no Write version string to stdout and exit. 461 _Switch(["-version", "version"], 462 "Write version string to stdout and exit"), 463 ] 464 AbstractCommandline.__init__(self, cmd, **kwargs)
465
466 -def _test():
467 """Run the module's doctests (PRIVATE).""" 468 print "Runing MUSCLE doctests..." 469 import doctest 470 doctest.testmod() 471 print "Done"
472 473 if __name__ == "__main__": 474 _test() 475