1
2
3
4
5
6 """Bio.SeqIO support for the "tab" (simple tab separated) file format.
7
8 You are expected to use this module via the Bio.SeqIO functions.
9
10 The "tab" format is an ad-hoc plain text file format where each sequence is
11 on one (long) line. Each line contains the identifier/description, followed
12 by a tab, followed by the sequence. For example, consider the following
13 short FASTA format file:
14
15 >ID123456 possible binding site?
16 CATCNAGATGACACTACGACTACGACTCAGACTAC
17 >ID123457 random sequence
18 ACACTACGACTACGACTCAGACTACAAN
19
20 Apart from the descriptions, this can be represented in the simple two column
21 tab separated format as follows:
22
23 ID123456(tab)CATCNAGATGACACTACGACTACGACTCAGACTAC
24 ID123457(tab)ACACTACGACTACGACTCAGACTACAAN
25
26 When reading this file, "ID123456" or "ID123457" will be taken as the record's
27 .id and .name property. There is no other information to record.
28
29 Similarly, when writing to this format, Biopython will ONLY record the record's
30 .id and .seq (and not the description or any other information) as in the
31 example above.
32 """
33
34 from Bio.Alphabet import single_letter_alphabet
35 from Bio.Seq import Seq
36 from Bio.SeqRecord import SeqRecord
37 from Bio.SeqIO.Interfaces import SequentialSequenceWriter
38
39
def TabIterator(handle, alphabet=single_letter_alphabet):
    """Iterate over tab separated lines (as SeqRecord objects).

    Each line of the file should contain one tab only, dividing the line
    into an identifier and the full sequence.

    Arguments:
     - handle - input file (iterated over line by line)
     - alphabet - optional alphabet, passed on to each Seq object

    The first field is taken as the record's .id and .name (regardless of
    any spaces within the text) and the second field is the sequence.
    Both fields have leading and trailing whitespace removed.

    Any blank lines are ignored.

    Raises ValueError for any non-blank line which does not contain
    exactly one tab character.
    """
    for line in handle:
        try:
            # Unpacking into two names fails unless there is exactly one tab.
            title, seq = line.split("\t")
        except ValueError:
            # Only the unpacking error is expected here; anything else
            # (e.g. KeyboardInterrupt) must propagate untouched.
            if line.strip() == "":
                # It's a blank line, ignore it
                continue
            raise ValueError("Each line should have one tab separating the"
                             " title and sequence, this line has %i tabs: %s"
                             % (line.count("\t"), repr(line)))
        title = title.strip()
        seq = seq.strip()  # also removes the trailing new line
        yield SeqRecord(Seq(seq, alphabet),
                        id=title, name=title,
                        description="")
69
class TabWriter(SequentialSequenceWriter):
    """Class to write simple tab separated format files.

    Each line consists of "id(tab)sequence" only.

    Any description, name or other annotation is not recorded.
    """

    def write_record(self, record):
        """Write a single tab line to the file.

        Arguments:
         - record - object whose .id is used as the first column; the
           second column is whatever self._get_seq_string(record) returns.

        The header must already have been written and the footer must not
        have been written yet. Raises AssertionError if that is not the
        case, or if either output field still contains a tab, new line or
        carriage return (which would corrupt the one-record-per-line
        layout).
        """
        assert self._header_written
        assert not self._footer_written
        self._record_written = True

        # NOTE(review): clean() and _get_seq_string() are not defined in
        # this file; presumably inherited from SequentialSequenceWriter.
        title = self.clean(record.id)
        seq = self._get_seq_string(record)

        # Same checks, in the same order, as one assert per character;
        # the message now says which field and character were at fault.
        for label, text in (("identifier", title), ("sequence", seq)):
            for bad in ("\t", "\n", "\r"):
                assert bad not in text, \
                    "Record %s contains %r: %r" % (label, bad, text)
        self.handle.write("%s\t%s\n" % (title, seq))
92
93
if __name__ == "__main__":
    # Quick self test: one valid input with a blank line, one invalid input.
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print("Running quick self test")
    try:
        from StringIO import StringIO  # Python 2
    except ImportError:
        from io import StringIO  # Python 3

    # This example has a trailing blank line which should be ignored
    handle = StringIO("Alpha\tAAAAAAA\nBeta\tCCCCCCC\n\n")
    records = list(TabIterator(handle))
    assert len(records) == 2

    # This example has a line with two tabs, which must be rejected
    handle = StringIO("Alpha\tAAAAAAA\tExtra\nBeta\tCCCCCCC\n")
    try:
        records = list(TabIterator(handle))
    except ValueError:
        # Good - the malformed line was detected
        pass
    else:
        # Raised explicitly (rather than "assert False") so the check
        # still fires when Python runs with -O.
        raise AssertionError("Should have reject this invalid example!")

    print("Done")
112