Package Bio :: Package Align :: Package Applications :: Module _Mafft
[hide private]
[frames] | no frames]

Source Code for Module Bio.Align.Applications._Mafft

  1  # Copyright 2009 by Cymon J. Cox.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  """Command line wrapper for the multiple alignment programme MAFFT. 
  6  """ 
  7   
  8  __docformat__ = "epytext en" #Don't just use plain text in epydoc API pages! 
  9   
 10  #TODO - Remove file exist checks? Other wrappers don't do this, and 
 11  #it prevent things like preparing commands to run on a cluster. 
 12   
 13  import os 
 14  from Bio.Application import _Option, _Switch, _Argument, AbstractCommandline 
 15   
16 -class MafftCommandline(AbstractCommandline):
17 """Command line wrapper for the multiple alignment program MAFFT. 18 19 http://align.bmr.kyushu-u.ac.jp/mafft/software/ 20 21 Example: 22 23 >>> from Bio.Align.Applications import MafftCommandline 24 >>> mafft_exe = "/opt/local/mafft" 25 >>> in_file = "../Doc/examples/opuntia.fasta" 26 >>> mafft_cline = MafftCommandline(mafft_exe, input=in_file) 27 >>> print mafft_cline 28 /opt/local/mafft ../Doc/examples/opuntia.fasta 29 30 If the mafft binary is on the path (typically the case on a Unix style 31 operating system) then you don't need to supply the executable location: 32 33 >>> from Bio.Align.Applications import MafftCommandline 34 >>> in_file = "../Doc/examples/opuntia.fasta" 35 >>> mafft_cline = MafftCommandline(input=in_file) 36 >>> print mafft_cline 37 mafft ../Doc/examples/opuntia.fasta 38 39 You would typically run the command line with mafft_cline() or via 40 the Python subprocess module, as described in the Biopython tutorial. 41 Note that MAFFT will write the alignment to stdout, which you may 42 want to save to a file and then parse, e.g. 43 44 stdout, stderr = mafft_cline() 45 handle = open("aligned.fasta", "w") 46 handle.write(stdout) 47 handle.close() 48 from Bio import AlignIO 49 align = AlignIO.read("aligned.fasta", "fasta") 50 51 Alternatively, to parse the output with AlignIO directly you can 52 use StringIO to turn the string into a handle: 53 54 stdout, stderr = mafft_cline() 55 from StringIO import StringIO 56 from Bio import AlignIO 57 align = AlignIO.read(StringIO(stdout), "fasta") 58 59 Citations: 60 61 Katoh, Toh (BMC Bioinformatics 9:212, 2008) Improved accuracy of 62 multiple ncRNA alignment by incorporating structural information into 63 a MAFFT-based framework (describes RNA structural alignment methods) 64 65 Katoh, Toh (Briefings in Bioinformatics 9:286-298, 2008) Recent 66 developments in the MAFFT multiple sequence alignment program 67 (outlines version 6) 68 69 Katoh, Toh (Bioinformatics 23:372-374, 2007) Errata PartTree: an 70 algorithm to build an approximate tree from a large number of 71 unaligned sequences (describes the PartTree algorithm) 72 73 Katoh, Kuma, Toh, Miyata (Nucleic Acids Res. 33:511-518, 2005) MAFFT 74 version 5: improvement in accuracy of multiple sequence alignment 75 (describes [ancestral versions of] the G-INS-i, L-INS-i and E-INS-i 76 strategies) 77 78 Katoh, Misawa, Kuma, Miyata (Nucleic Acids Res. 30:3059-3066, 2002) 79 80 Last checked against version: MAFFT v6.717b (2009/12/03) 81 """
82 - def __init__(self, cmd="mafft", **kwargs):
83 BLOSUM_MATRICES = ["30","45","62","80"] 84 self.parameters = \ 85 [ 86 #**** Algorithm **** 87 #Automatically selects an appropriate strategy from L-INS-i, FFT-NS- 88 #i and FFT-NS-2, according to data size. Default: off (always FFT-NS-2) 89 _Switch(["--auto", "auto"], 90 "Automatically select strategy. Default off."), 91 #Distance is calculated based on the number of shared 6mers. Default: on 92 _Switch(["--6merpair", "6merpair", "sixmerpair"], 93 "Distance is calculated based on the number of shared " 94 "6mers. Default: on"), 95 #All pairwise alignments are computed with the Needleman-Wunsch 96 #algorithm. More accurate but slower than --6merpair. Suitable for a 97 #set of globally alignable sequences. Applicable to up to ~200 98 #sequences. A combination with --maxiterate 1000 is recommended (G- 99 #INS-i). Default: off (6mer distance is used) 100 _Switch(["--globalpair", "globalpair"], 101 "All pairwise alignments are computed with the " 102 "Needleman-Wunsch algorithm. Default: off"), 103 #All pairwise alignments are computed with the Smith-Waterman 104 #algorithm. More accurate but slower than --6merpair. Suitable for a 105 #set of locally alignable sequences. Applicable to up to ~200 106 #sequences. A combination with --maxiterate 1000 is recommended (L- 107 #INS-i). Default: off (6mer distance is used) 108 _Switch(["--localpair", "localpair"], 109 "All pairwise alignments are computed with the " 110 "Smith-Waterman algorithm. Default: off"), 111 #All pairwise alignments are computed with a local algorithm with 112 #the generalized affine gap cost (Altschul 1998). More accurate but 113 #slower than --6merpair. Suitable when large internal gaps are 114 #expected. Applicable to up to ~200 sequences. A combination with -- 115 #maxiterate 1000 is recommended (E-INS-i). Default: off (6mer 116 #distance is used) 117 _Switch(["--genafpair", "genafpair"], 118 "All pairwise alignments are computed with a local " 119 "algorithm with the generalized affine gap cost " 120 "(Altschul 1998). Default: off"), 121 #All pairwise alignments are computed with FASTA (Pearson and Lipman 122 #1988). FASTA is required. Default: off (6mer distance is used) 123 _Switch(["--fastapair", "fastapair"], 124 "All pairwise alignments are computed with FASTA " 125 "(Pearson and Lipman 1988). Default: off"), 126 #Weighting factor for the consistency term calculated from pairwise 127 #alignments. Valid when either of --blobalpair, --localpair, -- 128 #genafpair, --fastapair or --blastpair is selected. Default: 2.7 129 _Option(["--weighti", "weighti"], 130 "Weighting factor for the consistency term calculated " 131 "from pairwise alignments. Default: 2.7", 132 checker_function=lambda x: isinstance(x, float), 133 equate=False), 134 #Guide tree is built number times in the progressive stage. Valid 135 #with 6mer distance. Default: 2 136 _Option(["--retree", "retree"], 137 "Guide tree is built number times in the progressive " 138 "stage. Valid with 6mer distance. Default: 2", 139 checker_function=lambda x: isinstance(x, int), 140 equate=False), 141 #Number cycles of iterative refinement are performed. Default: 0 142 _Option(["--maxiterate", "maxiterate"], 143 "Number cycles of iterative refinement are performed. " 144 "Default: 0", 145 checker_function=lambda x: isinstance(x, int), 146 equate=False), 147 #Use FFT approximation in group-to-group alignment. Default: on 148 _Switch(["--fft", "fft"], 149 "Use FFT approximation in group-to-group alignment. " 150 "Default: on"), 151 #Do not use FFT approximation in group-to-group alignment. Default: 152 #off 153 _Switch(["--nofft", "nofft"], 154 "Do not use FFT approximation in group-to-group " 155 "alignment. Default: off"), 156 #Alignment score is not checked in the iterative refinement stage. 157 #Default: off (score is checked) 158 _Switch(["--noscore", "noscore"], 159 "Alignment score is not checked in the iterative " 160 "refinement stage. Default: off (score is checked)"), 161 #Use the Myers-Miller (1988) algorithm. Default: automatically 162 #turned on when the alignment length exceeds 10,000 (aa/nt). 163 _Switch(["--memsave", "memsave"], 164 "Use the Myers-Miller (1988) algorithm. Default: " 165 "automatically turned on when the alignment length " 166 "exceeds 10,000 (aa/nt)."), 167 #Use a fast tree-building method (PartTree, Katoh and Toh 2007) with 168 #the 6mer distance. Recommended for a large number (> ~10,000) of 169 #sequences are input. Default: off 170 _Switch(["--parttree", "parttree"], 171 "Use a fast tree-building method with the 6mer " 172 "distance. Default: off"), 173 #The PartTree algorithm is used with distances based on DP. Slightly 174 #more accurate and slower than --parttree. Recommended for a large 175 #number (> ~10,000) of sequences are input. Default: off 176 _Switch(["--dpparttree", "dpparttree"], 177 "The PartTree algorithm is used with distances " 178 "based on DP. Default: off"), 179 #The PartTree algorithm is used with distances based on FASTA. 180 #Slightly more accurate and slower than --parttree. Recommended for 181 #a large number (> ~10,000) of sequences are input. FASTA is 182 #required. Default: off 183 _Switch(["--fastaparttree", "fastaparttree"], 184 "The PartTree algorithm is used with distances based " 185 "on FASTA. Default: off"), 186 #The number of partitions in the PartTree algorithm. Default: 50 187 _Option(["--partsize", "partsize"], 188 "The number of partitions in the PartTree algorithm. " 189 "Default: 50", 190 checker_function=lambda x: isinstance(x, int), 191 equate=False), 192 #Do not make alignment larger than number sequences. Valid only with 193 #the --*parttree options. Default: the number of input sequences 194 _Switch(["--groupsize", "groupsize"], 195 "Do not make alignment larger than number sequences. " 196 "Default: the number of input sequences"), 197 #**** Parameter **** 198 #Gap opening penalty at group-to-group alignment. Default: 1.53 199 _Option(["--op", "op"], 200 "Gap opening penalty at group-to-group alignment. " 201 "Default: 1.53", 202 checker_function=lambda x: isinstance(x, float), 203 equate=False), 204 #Offset value, which works like gap extension penalty, for group-to- 205 #group alignment. Deafult: 0.123 206 _Option(["--ep", "ep"], 207 "Offset value, which works like gap extension penalty, " 208 "for group-to- group alignment. Default: 0.123", 209 checker_function=lambda x: isinstance(x, float), 210 equate=False), 211 #Gap opening penalty at local pairwise alignment. Valid when the -- 212 #localpair or --genafpair option is selected. Default: -2.00 213 _Option(["--lop", "lop"], 214 "Gap opening penalty at local pairwise alignment. " 215 "Default: 0.123", 216 checker_function=lambda x: isinstance(x, float), 217 equate=False), 218 #Offset value at local pairwise alignment. Valid when the -- 219 #localpair or --genafpair option is selected. Default: 0.1 220 _Option(["--lep", "lep"], 221 "Offset value at local pairwise alignment. " 222 "Default: 0.1", 223 checker_function=lambda x: isinstance(x, float), 224 equate=False), 225 #Gap extension penalty at local pairwise alignment. Valid when the - 226 #-localpair or --genafpair option is selected. Default: -0.1 227 _Option(["--lexp", "lexp"], 228 "Gap extension penalty at local pairwise alignment. " 229 "Default: -0.1", 230 checker_function=lambda x: isinstance(x, float), 231 equate=False), 232 #Gap opening penalty to skip the alignment. Valid when the -- 233 #genafpair option is selected. Default: -6.00 234 _Option(["--LOP", "LOP"], 235 "Gap opening penalty to skip the alignment. " 236 "Default: -6.00", 237 checker_function=lambda x: isinstance(x, float), 238 equate=False), 239 #Gap extension penalty to skip the alignment. Valid when the -- 240 #genafpair option is selected. Default: 0.00 241 _Option(["--LEXP", "LEXP"], 242 "Gap extension penalty to skip the alignment. " 243 "Default: 0.00", 244 checker_function=lambda x: isinstance(x, float), 245 equate=False), 246 247 #BLOSUM number matrix (Henikoff and Henikoff 1992) is used. 248 #number=30, 45, 62 or 80. Default: 62 249 _Option(["--bl", "bl"], 250 "BLOSUM number matrix is used. Default: 62", 251 checker_function=lambda x: x in BLOSUM_MATRICES, 252 equate=False), 253 #JTT PAM number (Jones et al. 1992) matrix is used. number>0. 254 #Default: BLOSUM62 255 _Option(["--jtt", "jtt"], 256 "JTT PAM number (Jones et al. 1992) matrix is used. " 257 "number>0. Default: BLOSUM62", 258 equate=False), 259 #Transmembrane PAM number (Jones et al. 1994) matrix is used. 260 #number>0. Default: BLOSUM62 261 _Option(["--tm", "tm"], 262 "Transmembrane PAM number (Jones et al. 1994) " 263 "matrix is used. number>0. Default: BLOSUM62", 264 checker_function=os.path.exists, 265 filename=True, 266 equate=False), 267 #Use a user-defined AA scoring matrix. The format of matrixfile is 268 #the same to that of BLAST. Ignored when nucleotide sequences are 269 #input. Default: BLOSUM62 270 _Option(["--aamatrix", "aamatrix"], 271 "Use a user-defined AA scoring matrix. " 272 "Default: BLOSUM62", 273 checker_function=os.path.exists, 274 filename=True, 275 equate=False), 276 #Incorporate the AA/nuc composition information into the scoring 277 #matrix. Default: off 278 _Switch(["--fmodel", "fmodel"], 279 "Incorporate the AA/nuc composition information into " 280 "the scoring matrix (True) or not (False, default)"), 281 #**** Output **** 282 #Output format: clustal format. Default: off (fasta format) 283 _Switch(["--clustalout", "clustalout"], 284 "Output format: clustal (True) or fasta (False, default)"), 285 #Output order: same as input. Default: on 286 _Switch(["--inputorder", "inputorder"], 287 "Output order: same as input (True, default) or alignment " 288 "based (False)"), 289 #Output order: aligned. Default: off (inputorder) 290 _Switch(["--reorder", "reorder"], 291 "Output order: aligned (True) or in input order (False, " 292 "default)"), 293 #Guide tree is output to the input.tree file. Default: off 294 _Switch(["--treeout", "treeout"], 295 "Guide tree is output to the input.tree file (True) or " 296 "not (False, default)"), 297 #Do not report progress. Default: off 298 _Switch(["--quiet", "quiet"], 299 "Do not report progress (True) or not (False, default)."), 300 #**** Input **** 301 #Assume the sequences are nucleotide. Deafult: auto 302 _Switch(["--nuc", "nuc"], 303 "Assume the sequences are nucleotide (True/False). " 304 "Default: auto"), 305 #Assume the sequences are amino acid. Deafult: auto 306 _Switch(["--amino", "amino"], 307 "Assume the sequences are amino acid (True/False). " 308 "Default: auto"), 309 ###################### SEEDS ##################################### 310 # MAFFT has multiple --seed commands where the unaligned input is 311 # aligned to the seed alignment. There can be multiple seeds in the 312 # form: "mafft --seed align1 --seed align2 [etc] input" 313 # Effectively for n number of seed alignments. Here we're going to 314 # assume 6 extra are enough 315 _Option(["--seed", "seed"], 316 "Seed alignments given in alignment_n (fasta format) " 317 "are aligned with sequences in input.", 318 checker_function=os.path.exists, 319 filename=True, 320 equate=False), 321 #The old solution of also defining extra parameters with 322 #["--seed", "seed1"] etc worked, but clashes with the recent 323 #code in the base class to look for duplicate paramters and raise 324 #an error. Perhaps that check should be ignored here, or maybe 325 #we can handle this more elegantly... 326 #TODO - Create an _OptionList parameter which allows a list to be 327 #assigned to the value? 328 ####################### END SEEDS ################################ 329 #The input (must be FASTA format) 330 _Argument(["input"], 331 "Input file name", 332 checker_function=os.path.exists, 333 filename=True, 334 is_required=True), 335 ################################################################### 336 #mafft-profile takes a second alignment input as an argument: 337 #mafft-profile align1 align2 338 _Argument(["input1"], 339 "Second input file name for the mafft-profile command", 340 checker_function=os.path.exists, 341 filename=True), 342 ] 343 AbstractCommandline.__init__(self, cmd, **kwargs)
344
345 -def _test():
346 """Run the module's doctests (PRIVATE). 347 348 This will try and locate the unit tests directory, and run the doctests 349 from there in order that the relative paths used in the examples work. 350 """ 351 #TODO - Remove os.path checks on input filenames? 352 import doctest 353 import os 354 if os.path.isdir(os.path.join("..","Tests")): 355 print "Runing doctests..." 356 cur_dir = os.path.abspath(os.curdir) 357 os.chdir(os.path.join("..","Tests")) 358 doctest.testmod() 359 os.chdir(cur_dir) 360 del cur_dir 361 print "Done" 362 elif os.path.isdir(os.path.join("Tests")) : 363 print "Runing doctests..." 364 cur_dir = os.path.abspath(os.curdir) 365 os.chdir(os.path.join("Tests")) 366 doctest.testmod() 367 os.chdir(cur_dir) 368 del cur_dir 369 print "Done"
370 371 if __name__ == "__main__": 372 _test() 373