1
2
3
4
5
6
7
8 """Substitution matrices, log odds matrices, and operations on them.
9
10 General:
11 -------
12
13 This module provides a class and a few routines for generating
14 substitution matrices, similar ot BLOSUM or PAM matrices, but based on
15 user-provided data.
16 The class used for these matrices is SeqMat
17
18 Matrices are implemented as a dictionary. Each index contains a 2-tuple,
19 which are the two residue/nucleotide types replaced. The value differs
20 according to the matrix's purpose: e.g in a log-odds frequency matrix, the
21 value would be log(Pij/(Pi*Pj)) where:
22 Pij: frequency of substitution of letter (residue/nucletide) i by j
23 Pi, Pj: expected frequencies of i and j, respectively.
24
25 Usage:
26 -----
27 The following section is layed out in the order by which most people wish
28 to generate a log-odds matrix. Of course, interim matrices can be
29 generated and investigated. Most people just want a log-odds matrix,
30 that's all.
31
32 Generating an Accepted Replacement Matrix:
33 -----------------------------------------
34 Initially, you should generate an accepted replacement matrix (ARM)
35 from your data. The values in ARM are the _counted_ number of
36 replacements according to your data. The data could be a set of pairs
37 or multiple alignments. So for instance if Alanine was replaced by
38 Cysteine 10 times, and Cysteine by Alanine 12 times, the corresponding
39 ARM entries would be:
40 ['A','C']: 10,
41 ['C','A'] 12
42 As order doesn't matter, user can already provide only one entry:
43 ['A','C']: 22
44 A SeqMat instance may be initialized with either a full (first
45 method of counting: 10, 12) or half (the latter method, 22) matrices. A
46 Full protein alphabet matrix would be of the size 20x20 = 400. A Half
47 matrix of that alphabet would be 20x20/2 + 20/2 = 210. That is because
48 same-letter entries don't change. (The matrix diagonal). Given an
49 alphabet size of N:
50 Full matrix size:N*N
51 Half matrix size: N(N+1)/2
52
53 If you provide a full matrix, the constructore will create a half-matrix
54 automatically.
55 If you provide a half-matrix, make sure of a (low, high) sorted order in
56 the keys: there should only be
57 a ('A','C') not a ('C','A').
58
59 Internal functions:
60
61 Generating the observed frequency matrix (OFM):
62 ----------------------------------------------
63 Use: OFM = _build_obs_freq_mat(ARM)
64 The OFM is generated from the ARM, only instead of replacement counts, it
65 contains replacement frequencies.
66
67 Generating an expected frequency matrix (EFM):
68 ---------------------------------------------
69 Use: EFM = _build_exp_freq_mat(OFM,exp_freq_table)
70 exp_freq_table: should be a freqTableC instantiation. See freqTable.py for
71 detailed information. Briefly, the expected frequency table has the
72 frequencies of appearance for each member of the alphabet
73
74 Generating a substitution frequency matrix (SFM):
75 ------------------------------------------------
76 Use: SFM = _build_subs_mat(OFM,EFM)
77 Accepts an OFM, EFM. Provides the division product of the corresponding
78 values.
79
80 Generating a log-odds matrix (LOM):
81 ----------------------------------
82 Use: LOM=_build_log_odds_mat(SFM[,logbase=10,factor=10.0,roundit=1])
83 Accepts an SFM. logbase: base of the logarithm used to generate the
84 log-odds values. factor: factor used to multiply the log-odds values.
85 roundit: default - true. Whether to round the values.
86 Each entry is generated by log(LOM[key])*factor
87 And rounded if required.
88
89 External:
90 ---------
91 In most cases, users will want to generate a log-odds matrix only, without
92 explicitly calling the OFM --> EFM --> SFM stages. The function
93 build_log_odds_matrix does that. User provides an ARM and an expected
94 frequency table. The function returns the log-odds matrix.
95
96 Methods for subtraction, addition and multiplication of matrices:
97 -----------------------------------------------------------------
98 * Generation of an expected frequency table from an observed frequency
99 matrix.
100 * Calculation of linear correlation coefficient between two matrices.
101 * Calculation of relative entropy is now done using the
102 _make_relative_entropy method and is stored in the member
103 self.relative_entropy
104 * Calculation of entropy is now done using the _make_entropy method and
105 is stored in the member self.entropy.
106 * Jensen-Shannon distance between the distributions from which the
107 matrices are derived. This is a distance function based on the
108 distribution's entropies.
109 """
110
111
112 import re
113 import sys
114 import copy
115 import math
116 import warnings
117
118
119 import Bio
120 from Bio import Alphabet
121 from Bio.SubsMat import FreqTable
122
123 log = math.log
124
125 NOTYPE = 0
126 ACCREP = 1
127 OBSFREQ = 2
128 SUBS = 3
129 EXPFREQ = 4
130 LO = 5
131 EPSILON = 0.00000000000001
132
133
135 """A Generic sequence matrix class
136 The key is a 2-tuple containing the letter indices of the matrix. Those
137 should be sorted in the tuple (low, high). Because each matrix is dealt
138 with as a half-matrix."""
139
141 ab_dict = {}
142 s = ''
143 for i in self:
144 ab_dict[i[0]] = 1
145 ab_dict[i[1]] = 1
146 for i in sorted(ab_dict):
147 s += i
148 self.alphabet.letters = s
149
150 - def __init__(self, data=None, alphabet=None, mat_name='', build_later=0):
151
152
153
154
155
156
157
158
159
160 assert type(mat_name) == type('')
161
162
163
164
165
166
167
168
169 if data:
170 try:
171 self.update(data)
172 except ValueError:
173 raise ValueError, "Failed to store data in a dictionary"
174 if alphabet == None:
175 alphabet = Alphabet.Alphabet()
176 assert Alphabet.generic_alphabet.contains(alphabet)
177 self.alphabet = alphabet
178
179
180 if not self.alphabet.letters:
181 self._alphabet_from_matrix()
182
183 if not build_later:
184 N = len(self.alphabet.letters)
185 assert len(self) == N**2 or len(self) == N*(N+1)/2
186 self.ab_list = list(self.alphabet.letters)
187 self.ab_list.sort()
188
189 self.mat_name = mat_name
190 if build_later:
191 self._init_zero()
192 else:
193
194 self._full_to_half()
195 self._correct_matrix()
196 self.sum_letters = {}
197 self.relative_entropy = 0
198
200 keylist = self.keys()
201 for key in keylist:
202 if key[0] > key[1]:
203 self[(key[1],key[0])] = self[key]
204 del self[key]
205
207 """
208 Convert a full-matrix to a half-matrix
209 """
210
211
212
213
214 N = len(self.alphabet.letters)
215
216 if len(self) == N*(N+1)/2:
217 return
218 for i in self.ab_list:
219 for j in self.ab_list[:self.ab_list.index(i)+1]:
220 if i != j:
221 self[j,i] = self[j,i] + self[i,j]
222 del self[i,j]
223
225 for i in self.ab_list:
226 for j in self.ab_list[:self.ab_list.index(i)+1]:
227 self[j,i] = 0.
228
230 self.entropy = 0
231 for i in self:
232 if self[i] > EPSILON:
233 self.entropy += self[i]*log(self[i])/log(2)
234 self.entropy = -self.entropy
235
237 result = {}
238 for letter in self.alphabet.letters:
239 result[letter] = 0.0
240 for pair, value in self.iteritems():
241 i1, i2 = pair
242 if i1==i2:
243 result[i1] += value
244 else:
245 result[i1] += value / 2
246 result[i2] += value / 2
247 return result
248
249 - def print_full_mat(self,f=None,format="%4d",topformat="%4s",
250 alphabet=None,factor=1,non_sym=None):
251 f = f or sys.stdout
252
253
254 assert non_sym == None or type(non_sym) == type(1.) or \
255 type(non_sym) == type(1)
256 full_mat = copy.copy(self)
257 for i in self:
258 if i[0] != i[1]:
259 full_mat[(i[1],i[0])] = full_mat[i]
260 if not alphabet:
261 alphabet = self.ab_list
262 topline = ''
263 for i in alphabet:
264 topline = topline + topformat % i
265 topline = topline + '\n'
266 f.write(topline)
267 for i in alphabet:
268 outline = i
269 for j in alphabet:
270 if alphabet.index(j) > alphabet.index(i) and non_sym is not None:
271 val = non_sym
272 else:
273 val = full_mat[i,j]
274 val *= factor
275 if val <= -999:
276 cur_str = ' ND'
277 else:
278 cur_str = format % val
279
280 outline = outline+cur_str
281 outline = outline+'\n'
282 f.write(outline)
283
284 - def print_mat(self,f=None,format="%4d",bottomformat="%4s",
285 alphabet=None,factor=1):
286 """Print a nice half-matrix. f=sys.stdout to see on the screen
287 User may pass own alphabet, which should contain all letters in the
288 alphabet of the matrix, but may be in a different order. This
289 order will be the order of the letters on the axes"""
290
291 f = f or sys.stdout
292 if not alphabet:
293 alphabet = self.ab_list
294 bottomline = ''
295 for i in alphabet:
296 bottomline = bottomline + bottomformat % i
297 bottomline = bottomline + '\n'
298 for i in alphabet:
299 outline = i
300 for j in alphabet[:alphabet.index(i)+1]:
301 try:
302 val = self[j,i]
303 except KeyError:
304 val = self[i,j]
305 val *= factor
306 if val == -999:
307 cur_str = ' ND'
308 else:
309 cur_str = format % val
310
311 outline = outline+cur_str
312 outline = outline+'\n'
313 f.write(outline)
314 f.write(bottomline)
315
317 """Print a nice half-matrix."""
318 output = ""
319 alphabet = self.ab_list
320 n = len(alphabet)
321 for i in range(n):
322 c1 = alphabet[i]
323 output += c1
324 for j in range(i+1):
325 c2 = alphabet[j]
326 try:
327 val = self[c2,c1]
328 except KeyError:
329 val = self[c1,c2]
330 if val == -999:
331 output += ' ND'
332 else:
333 output += "%4d" % val
334 output += '\n'
335 output += '%4s' * n % tuple(alphabet) + "\n"
336 return output
337
339 """ returns a number which is the subtraction product of the two matrices"""
340 mat_diff = 0
341 for i in self:
342 mat_diff += (self[i] - other[i])
343 return mat_diff
344
346 """ returns a matrix for which each entry is the multiplication product of the
347 two matrices passed"""
348 new_mat = copy.copy(self)
349 for i in self:
350 new_mat[i] *= other[i]
351 return new_mat
352
354 new_mat = copy.copy(self)
355 for i in self:
356 new_mat[i] += other[i]
357 return new_mat
358
360 """Accepted replacements matrix"""
361
363 """Observed frequency matrix"""
364
366 """Expected frequency matrix"""
367
368
370 """Substitution matrix"""
371
373 """Calculate and return the relative entropy with respect to an
374 observed frequency matrix"""
375 relative_entropy = 0.
376 for key, value in self.iteritems():
377 if value > EPSILON:
378 relative_entropy += obs_freq_mat[key]*log(value)
379 relative_entropy /= log(2)
380 return relative_entropy
381
382
384 """Log odds matrix"""
385
387 """Calculate and return the relative entropy with respect to an
388 observed frequency matrix"""
389 relative_entropy = 0.
390 for key, value in self.iteritems():
391 relative_entropy += obs_freq_mat[key]*value/log(2)
392 return relative_entropy
393
394
396 """
397 build_obs_freq_mat(acc_rep_mat):
398 Build the observed frequency matrix, from an accepted replacements matrix
399 The acc_rep_mat matrix should be generated by the user.
400 """
401
402 total = float(sum(acc_rep_mat.values()))
403 obs_freq_mat = ObservedFrequencyMatrix(alphabet=acc_rep_mat.alphabet,
404 build_later=1)
405 for i in acc_rep_mat:
406 obs_freq_mat[i] = acc_rep_mat[i]/total
407 return obs_freq_mat
408
410 exp_freq_table = {}
411 for i in obs_freq_mat.alphabet.letters:
412 exp_freq_table[i] = 0.
413 for i in obs_freq_mat:
414 if i[0] == i[1]:
415 exp_freq_table[i[0]] += obs_freq_mat[i]
416 else:
417 exp_freq_table[i[0]] += obs_freq_mat[i] / 2.
418 exp_freq_table[i[1]] += obs_freq_mat[i] / 2.
419 return FreqTable.FreqTable(exp_freq_table,FreqTable.FREQ)
420
422 """Build an expected frequency matrix
423 exp_freq_table: should be a FreqTable instance
424 """
425 exp_freq_mat = ExpectedFrequencyMatrix(alphabet=exp_freq_table.alphabet,
426 build_later=1)
427 for i in exp_freq_mat:
428 if i[0] == i[1]:
429 exp_freq_mat[i] = exp_freq_table[i[0]]**2
430 else:
431 exp_freq_mat[i] = 2.0*exp_freq_table[i[0]]*exp_freq_table[i[1]]
432 return exp_freq_mat
433
434
435
437 """ Build the substitution matrix """
438 if obs_freq_mat.ab_list != exp_freq_mat.ab_list:
439 raise ValueError("Alphabet mismatch in passed matrices")
440 subs_mat = SubstitutionMatrix(obs_freq_mat)
441 for i in obs_freq_mat:
442 subs_mat[i] = obs_freq_mat[i]/exp_freq_mat[i]
443 return subs_mat
444
445
446
447
449 """_build_log_odds_mat(subs_mat,logbase=10,factor=10.0,round_digit=1):
450 Build a log-odds matrix
451 logbase=2: base of logarithm used to build (default 2)
452 factor=10.: a factor by which each matrix entry is multiplied
453 round_digit: roundoff place after decimal point
454 keep_nd: if true, keeps the -999 value for non-determined values (for which there
455 are no substitutions in the frequency substitutions matrix). If false, plants the
456 minimum log-odds value of the matrix in entries containing -999
457 """
458 lo_mat = LogOddsMatrix(subs_mat)
459 for key, value in subs_mat.iteritems():
460 if value < EPSILON:
461 lo_mat[key] = -999
462 else:
463 lo_mat[key] = round(factor*log(value)/log(logbase),round_digit)
464 mat_min = min(lo_mat.values())
465 if not keep_nd:
466 for i in lo_mat:
467 if lo_mat[i] <= -999:
468 lo_mat[i] = mat_min
469 return lo_mat
470
471
472
473
474
475
476
477 -def make_log_odds_matrix(acc_rep_mat,exp_freq_table=None,logbase=2,
478 factor=1.,round_digit=9,keep_nd=0):
486
492
493 -def read_text_matrix(data_file):
494 matrix = {}
495 tmp = data_file.read().split("\n")
496 table=[]
497 for i in tmp:
498 table.append(i.split())
499
500 for rec in table[:]:
501 if (rec.count('#') > 0):
502 table.remove(rec)
503
504
505 while (table.count([]) > 0):
506 table.remove([])
507
508 alphabet = table[0]
509 j = 0
510 for rec in table[1:]:
511
512 row = alphabet[j]
513
514 if re.compile('[A-z\*]').match(rec[0]):
515 first_col = 1
516 else:
517 first_col = 0
518 i = 0
519 for field in rec[first_col:]:
520 col = alphabet[i]
521 matrix[(row,col)] = float(field)
522 i += 1
523 j += 1
524
525 for i in matrix.keys():
526 if '*' in i: del(matrix[i])
527 ret_mat = SeqMat(matrix)
528 return ret_mat
529
530 diagNO = 1
531 diagONLY = 2
532 diagALL = 3
533
535 rel_ent = 0.
536 key_list_1 = sorted(mat_1)
537 key_list_2 = sorted(mat_2)
538 key_list = []
539 sum_ent_1 = 0.; sum_ent_2 = 0.
540 for i in key_list_1:
541 if i in key_list_2:
542 key_list.append(i)
543 if len(key_list_1) != len(key_list_2):
544 sys.stderr.write("Warning:first matrix has more entries than the second\n")
545 if key_list_1 != key_list_2:
546 sys.stderr.write("Warning: indices not the same between matrices\n")
547 for key in key_list:
548 if diag == diagNO and key[0] == key[1]:
549 continue
550 if diag == diagONLY and key[0] != key[1]:
551 continue
552 if mat_1[key] > EPSILON and mat_2[key] > EPSILON:
553 sum_ent_1 += mat_1[key]
554 sum_ent_2 += mat_2[key]
555
556 for key in key_list:
557 if diag == diagNO and key[0] == key[1]:
558 continue
559 if diag == diagONLY and key[0] != key[1]:
560 continue
561 if mat_1[key] > EPSILON and mat_2[key] > EPSILON:
562 val_1 = mat_1[key] / sum_ent_1
563 val_2 = mat_2[key] / sum_ent_2
564
565 rel_ent += val_1 * log(val_1/val_2)/log(logbase)
566 return rel_ent
567
568
570 try:
571 import numpy
572 except ImportError:
573 raise ImportError, "Please install Numerical Python (numpy) if you want to use this function"
574 values = []
575 assert mat_1.ab_list == mat_2.ab_list
576 for ab_pair in mat_1:
577 try:
578 values.append((mat_1[ab_pair], mat_2[ab_pair]))
579 except KeyError:
580 raise ValueError, "%s is not a common key" % ab_pair
581 correlation_matrix = numpy.corrcoef(values, rowvar=0)
582 correlation = correlation_matrix[0,1]
583 return correlation
584
585
586
588 assert mat_1.ab_list == mat_2.ab_list
589 assert pi_1 > 0 and pi_2 > 0 and pi_1< 1 and pi_2 <1
590 assert not (pi_1 + pi_2 - 1.0 > EPSILON)
591 sum_mat = SeqMat(build_later=1)
592 sum_mat.ab_list = mat_1.ab_list
593 for i in mat_1:
594 sum_mat[i] = pi_1 * mat_1[i] + pi_2 * mat_2[i]
595 sum_mat.make_entropy()
596 mat_1.make_entropy()
597 mat_2.make_entropy()
598
599 dJS = sum_mat.entropy - pi_1 * mat_1.entropy - pi_2 *mat_2.entropy
600 return dJS
601
602 """
603 This isn't working yet. Boo hoo!
604 def two_mat_print(mat_1, mat_2, f=None,alphabet=None,factor_1=1, factor_2=1,
605 format="%4d",bottomformat="%4s",topformat="%4s",
606 topindent=7*" ", bottomindent=1*" "):
607 f = f or sys.stdout
608 if not alphabet:
609 assert mat_1.ab_list == mat_2.ab_list
610 alphabet = mat_1.ab_list
611 len_alphabet = len(alphabet)
612 print_mat = {}
613 topline = topindent
614 bottomline = bottomindent
615 for i in alphabet:
616 bottomline += bottomformat % i
617 topline += topformat % alphabet[len_alphabet-alphabet.index(i)-1]
618 topline += '\n'
619 bottomline += '\n'
620 f.write(topline)
621 for i in alphabet:
622 for j in alphabet:
623 print_mat[i,j] = -999
624 diag_1 = {}; diag_2 = {}
625 for i in alphabet:
626 for j in alphabet[:alphabet.index(i)+1]:
627 if i == j:
628 diag_1[i] = mat_1[(i,i)]
629 diag_2[i] = mat_2[(alphabet[len_alphabet-alphabet.index(i)-1],
630 alphabet[len_alphabet-alphabet.index(i)-1])]
631 else:
632 if i > j:
633 key = (j,i)
634 else:
635 key = (i,j)
636 mat_2_key = [alphabet[len_alphabet-alphabet.index(key[0])-1],
637 alphabet[len_alphabet-alphabet.index(key[1])-1]]
638 # print mat_2_key
639 mat_2_key.sort(); mat_2_key = tuple(mat_2_key)
640 # print key ,"||", mat_2_key
641 print_mat[key] = mat_2[mat_2_key]
642 print_mat[(key[1],key[0])] = mat_1[key]
643 for i in alphabet:
644 outline = i
645 for j in alphabet:
646 if i == j:
647 if diag_1[i] == -999:
648 val_1 = ' ND'
649 else:
650 val_1 = format % (diag_1[i]*factor_1)
651 if diag_2[i] == -999:
652 val_2 = ' ND'
653 else:
654 val_2 = format % (diag_2[i]*factor_2)
655 cur_str = val_1 + " " + val_2
656 else:
657 if print_mat[(i,j)] == -999:
658 val = ' ND'
659 elif alphabet.index(i) > alphabet.index(j):
660 val = format % (print_mat[(i,j)]*factor_1)
661 else:
662 val = format % (print_mat[(i,j)]*factor_2)
663 cur_str = val
664 outline += cur_str
665 outline += bottomformat % (alphabet[len_alphabet-alphabet.index(i)-1] +
666 '\n')
667 f.write(outline)
668 f.write(bottomline)
669 """
670