"""Find and deal with motifs in biological sequence data.

Representing DNA (or RNA or proteins) in a neural network can be difficult
since input sequences can have different lengths. One way to get around
this problem is to deal with sequences by finding common motifs, and counting
the number of times those motifs occur in a sequence. This information can
then be used for creating the neural networks, with occurrences of motifs
going into the network instead of raw sequence data.
"""
10
from Bio.Alphabet import _verify_alphabet
from Bio.Seq import Seq

from Pattern import PatternRepository
16
class MotifFinder(object):
    """Find motifs in a set of Sequence Records.

    A motif is every window of a fixed size taken from a record's sequence;
    the finder counts how many times each distinct window occurs.
    """

    # NOTE(review): the ``class`` line and the ``def`` lines of
    # _get_motif_dict, find_differences and _add_motif were missing from the
    # reviewed text. The signatures of _get_motif_dict and _add_motif are
    # restored from their call sites in this class. The class name and the
    # name ``find_differences`` are presumed (the only in-file mention calls
    # it ``find_motif_differences``) -- confirm against callers.

    def __init__(self, alphabet_strict=1):
        """Initialize a finder to get motifs.

        Arguments:

        o alphabet_strict - Whether or not motifs should be
        restricted to having all of their elements within the alphabet
        of the sequences. This requires that the Sequences have a real
        alphabet, and that all sequences have the same alphabet.
        """
        self.alphabet_strict = alphabet_strict

    def find(self, seq_records, motif_size):
        """Find all motifs of the given size in the passed SeqRecords.

        Arguments:

        o seq_records - A list of SeqRecord objects which the motifs
        will be found from.

        o motif_size - The size of the motifs we want to look for.

        Returns:
        A PatternRepository object built from the dictionary of motifs
        (and their counts) found in the passed records.
        """
        motif_info = self._get_motif_dict(seq_records, motif_size)

        return PatternRepository(motif_info)

    def _get_motif_dict(self, seq_records, motif_size):
        """Return a dictionary with information on motifs.

        This internal function essentially does all of the hard work for
        finding motifs, and returns a dictionary mapping each motif string
        to the number of times it was seen. It is internal so it can be
        shared by find and find_differences.

        Arguments:

        o seq_records - A list of objects with a seq attribute (SeqRecords).

        o motif_size - The size of the window to slide over each sequence.

        When alphabet_strict is set, the alphabet of the first record is
        used as the reference: every record must carry that same alphabet,
        and a motif is only counted when _verify_alphabet accepts it.

        Raises:
        AssertionError if strict checking is on and a record has a different
        alphabet than the first record.
        """
        # An empty record list has no reference alphabet to read; without
        # the extra check strict mode failed with an IndexError instead of
        # returning an empty result.
        alphabet = None
        if self.alphabet_strict and seq_records:
            alphabet = seq_records[0].seq.alphabet

        all_motifs = {}
        for seq_record in seq_records:
            seq = seq_record.seq

            # Kept as an assert so callers see the same exception type as
            # before (note that it is skipped when running with -O).
            if alphabet is not None:
                assert seq.alphabet == alphabet, \
                    "Working with alphabet %s and got %s" % \
                    (alphabet, seq.alphabet)

            # Slide a motif_size window over the sequence; sequences shorter
            # than motif_size give an empty range and contribute nothing.
            for start in range(len(seq) - (motif_size - 1)):
                # str() replaces the deprecated Seq.tostring().
                motif = str(seq[start:start + motif_size])

                # In strict mode skip motifs the alphabet check rejects.
                if alphabet is not None:
                    if not _verify_alphabet(Seq(motif, alphabet)):
                        continue

                all_motifs = self._add_motif(all_motifs, motif)

        return all_motifs

    def find_differences(self, first_records, second_records, motif_size):
        """Find motifs in two sets of records and return the differences.

        This is used for finding motifs, but instead of just counting up all
        of the motifs in a set of records, this returns the differences
        between two listings of seq_records.

        Arguments:

        o first_records, second_records - Two listings of SeqRecord objects
        to have their motifs compared.

        o motif_size - The size of the motifs we are looking for.

        Returns:
        A PatternRepository object that has motifs, but instead of their
        raw counts, this has the counts in the second set of records
        subtracted from the counts in the first set. A motif missing from
        one of the sets counts as zero there.
        """
        first_motifs = self._get_motif_dict(first_records, motif_size)
        second_motifs = self._get_motif_dict(second_records, motif_size)

        motif_diffs = {}

        # Everything seen in the first set: first count minus second count.
        for cur_key in first_motifs:
            motif_diffs[cur_key] = (first_motifs[cur_key]
                                    - second_motifs.get(cur_key, 0))

        # Motifs only present in the second set end up negative.
        for cur_key in second_motifs:
            if cur_key not in first_motifs:
                motif_diffs[cur_key] = 0 - second_motifs[cur_key]

        return PatternRepository(motif_diffs)

    def _add_motif(self, motif_dict, motif_to_add):
        """Add a motif to the given dictionary.

        The count stored under motif_to_add is increased by one (starting
        from zero for a new motif). The dictionary is changed in place and
        also returned.
        """
        motif_dict[motif_to_add] = motif_dict.get(motif_to_add, 0) + 1

        return motif_dict
145
class MotifCoder(object):
    """Convert motifs and a sequence into neural network representations.

    This is designed to convert a sequence into a representation that
    can be fed as an input into a neural network. It does this by
    representing a sequence based on the motifs present.
    """

    # NOTE(review): the ``class`` line and both ``def`` lines were missing
    # from the reviewed text. ``__init__(self, motifs)`` follows from the
    # initializer docstring and body; the class name and the method name
    # ``representation`` are presumed -- confirm against callers.

    def __init__(self, motifs):
        """Initialize an input producer with motifs to look for.

        Arguments:

        o motifs - A complete list of motifs, in order, that are to be
        searched for in a sequence.

        Raises:
        ValueError if the motifs do not all have the same length.
        IndexError if motifs is empty (there is no first motif to take
        the expected size from).
        """
        self._motifs = motifs

        # The first motif fixes the window size; all others must match it.
        self._motif_size = len(self._motifs[0])
        for motif in self._motifs:
            if len(motif) != self._motif_size:
                raise ValueError("Motif %s given, expected motif size %s"
                                 % (motif, self._motif_size))

    def representation(self, sequence):
        """Represent a sequence as a set of motifs.

        Arguments:

        o sequence - A Bio.Seq object to represent as a motif. Anything
        that can be sliced and whose slices turn into the motif text with
        str() works as well.

        This converts a sequence into a representation based on the motifs.
        Every window of the motif size is compared with the known motifs and
        the matches are counted. The representation is returned as a list
        with one float per motif, computed as
        (count - smallest count) / largest count, or 0.0 for every motif
        when none of them occurs. The values in the list correspond to the
        input order of the motifs specified in the initializer.
        """
        # Start every known motif at zero so absent motifs are reported too.
        seq_motifs = dict.fromkeys(self._motifs, 0)

        # Count the windows that match a known motif; other windows are
        # ignored. str() replaces the deprecated Seq.tostring().
        for start in range(len(sequence) - (self._motif_size - 1)):
            motif = str(sequence[start:start + self._motif_size])

            if motif in seq_motifs:
                seq_motifs[motif] += 1

        min_count = min(seq_motifs.values())
        max_count = max(seq_motifs.values())

        # Nothing matched: report zeros (as floats, like the scaled case).
        if max_count <= 0:
            return [0.0] * len(self._motifs)

        # NOTE(review): the divisor is max_count, not (max - min), so the
        # top value is below 1.0 whenever min_count > 0. Left unchanged
        # because consumers may depend on these exact values -- confirm.
        scale = float(max_count)
        return [(seq_motifs[motif] - min_count) / scale
                for motif in self._motifs]
212