1 """Parser for FSSP files, used in a database of protein fold classifications.
2
3 This is a module to handle FSSP files. For now it parses only the header,
4 summary and alignment sections.
5
6 See: Holm and Sander (1996) The FSSP database: fold classification based on
7 structure-structure alignment of proteins.
8
9 Functions: read_fssp(file_handle): reads an FSSP file into records and returns a
10 tuple of two instances.
11 mult_align: returns a Biopython alignment object.
12 """
13 import re
14 import fssp_rec
15 from Bio.Align import Generic
16 from Bio import Alphabet
17 fff_rec = fssp_rec.fff_rec
# Header keywords recognised at the start of a line.  Each key is the
# lower-case field name and its pattern is the upper-cased key anchored to
# the beginning of the text.
header_records = dict(
    (field, re.compile('^' + field.upper()))
    for field in ('database', 'pdbid', 'header', 'compnd',
                  'author', 'source', 'seqlength', 'nalign')
)
28
# Section banners: '##', one or more spaces, then the section name.
summary_title = re.compile(r'## +SUMMARY')
alignments_title = re.compile(r'## +ALIGNMENTS')
equiv_title = re.compile(r'## +EQUIVALENCES')

# Summary row: optional spaces, a number followed by ':', spaces, then a
# four-character identifier (one of 1-9 followed by three of 0-9 / a-z).
summary_rec = re.compile(r' *[0-9]+: +[1-9][0-9a-z]{3,3}')

# Alignment row: optional spaces, a number, spaces, then a second number
# that may carry a leading minus sign.
alignments_rec = re.compile(r' *[0-9]+ +-{0,1}[0-9]+')
34
# Give every header field its empty/unset starting value.  Attributes are
# created in the same order as before; the immutable defaults are shared
# through chained assignment, the list is a fresh object.
self.database = None
self.pdbid = self.header = self.compnd = self.source = ''
self.author = []
self.seqlength = self.nalign = 0
# Store the value carried by `inline` on the attribute named after each
# header keyword whose pattern matches the line.
for field, pattern in header_records.items():
    if not pattern.match(inline):
        continue
    if field in ('database', 'seqlength', 'nalign'):
        # A single integer follows the keyword.
        value = int(inline.split()[1])
    elif field in ('compnd', 'author'):
        # Every whitespace-separated word after the keyword, kept as a list.
        value = inline.split()[1:]
    elif field in ('source', 'header'):
        # Free text: everything after the first space, trimmed.
        value = inline[inline.find(' ') + 1:].strip()
    else:
        # Any other keyword keeps only the first word after it.
        value = inline.split()[1]
    setattr(self, field, value)
57
# Parse one alignment cell: either the gap marker '..' or a residue
# character optionally followed by a second character stored (upper-cased)
# as `ss`.
#
# Raises:
#     ValueError: if the stripped text is not one or two characters long.
inStr = inStr.strip()
if len(inStr) not in (1, 2):
    # The previous message claimed "not 2 chars" even though a single
    # character is accepted, and appended the input with no separator.
    raise ValueError('PosAlign: expected 1 or 2 chars, got %r' % (inStr,))
if inStr == '..':
    self.aa = '-'
    self.gap = 1
    # Gap cells previously had no `ss` attribute at all, so reading it
    # raised AttributeError.  Use the same '0' placeholder that a cell
    # without a second character gets.
    self.ss = '0'
else:
    self.gap = 0
    self.aa = inStr[0]
    # Any character that equals its own lower-case form is stored as 'C'.
    if self.aa == self.aa.lower():
        self.aa = 'C'
    if len(inStr) == 2:
        self.ss = inStr[1].upper()
    else:
        # No second character supplied.
        self.ss = '0'
75
77 if self.gap:
78 outstring = '..'
79 else:
80 outstring = self.aa+self.ss.lower()
81 return outstring
82
# Bind __str__ to the same callable as __repr__, so str() and repr() give
# identical text.
83 __str__ = __repr__
84
85
86
87
89 """ Contains info from an FSSP summary record"""
# Keep the untouched input text and split it into whitespace-separated
# columns.
self.raw = in_str
columns = in_str.strip().split()

# Column 0 is the record number followed by one trailing character, which
# is dropped before conversion.
self.nr = int(columns[0][:-1])

# Columns 1 and 2 each hold a four-character identifier with an optional
# fifth character stored as the chain; '0' is used when it is absent.
for slot, pdb_attr, chain_attr, complaint in (
        (1, 'pdb1', 'chain1', 'Bad PDB ID 1'),
        (2, 'pdb2', 'chain2', 'Bad PDB ID 2')):
    identifier = columns[slot]
    setattr(self, pdb_attr, identifier[:4])
    if len(identifier) == 4:
        chain = '0'
    elif len(identifier) == 5:
        chain = identifier[4]
    else:
        raise ValueError(complaint)
    setattr(self, chain_attr, chain)

# Columns 3-10 are numeric; convert them one by one in column order.
for attr, index, convert in (
        ('zscore', 3, float), ('rmsd', 4, float), ('lali', 5, float),
        ('lseq2', 6, float), ('pID', 7, float), ('revers', 8, int),
        ('permut', 9, int), ('nfrag', 10, int)):
    setattr(self, attr, convert(columns[index]))

self.topo = columns[11]

# Everything after column 11 is free text: re-joined with single spaces
# and terminated by a newline.
self.doc = ' '.join(columns[12:]) + '\n'
122
# Bind __str__ to the same callable as __repr__, so str() and repr() give
# identical text.
125 __str__ = __repr__
126
# Index the entries of PosAlignList by their 1-based position.
for offset, entry in enumerate(self.PosAlignList):
    self.pos_align_dict[offset + 1] = entry
152
153
156
157
158
# Two lookup tables that map a number to the key under which the matching
# record is stored in this mapping, plus an initially empty `data` dict.
# Each name receives its own new dictionary.
self.pdb_res_dict, self.abs_res_dict, self.data = {}, {}, {}
166
167
169 return self[self.abs_res_dict[num]]
170
171
173 return self[self.pdb_res_dict[num]]
174
175
177 s = ''
178 sorted_pos_nums = self.abs_res_dict.keys()
179 sorted_pos_nums.sort()
180 for i in sorted_pos_nums:
181 s += self.abs(i).pos_align_dict[num].aa
182 return s
183
185 mult_align_dict = {}
186 for j in self.abs(1).pos_align_dict:
187 mult_align_dict[j] = ''
188 for fssp_rec in self.itervalues():
189 for j in fssp_rec.pos_align_dict:
190 mult_align_dict[j] += fssp_rec.pos_align_dict[j].aa
191 seq_order = mult_align_dict.keys()
192 seq_order.sort()
193 out_str = ''
194 for i in seq_order:
195 out_str += '> %d\n' % i
196 k = 0
197 for j in mult_align_dict[i]:
198 k += 1
199 if k % 72 == 0:
200 out_str += '\n'
201 out_str += j
202 out_str += '\n'
203 return out_str
204
207
208
209
210
211
267