Package Bio :: Package ExPASy :: Module Prosite
[hide private]
[frames] | [no frames]

Source Code for Module Bio.ExPASy.Prosite

  1  # Copyright 1999 by Jeffrey Chang.  All rights reserved. 
  2  # Copyright 2000 by Jeffrey Chang.  All rights reserved. 
  3  # Revisions Copyright 2007 by Peter Cock.  All rights reserved. 
  4  # Revisions Copyright 2009 by Michiel de Hoon.  All rights reserved. 
  5  # This code is part of the Biopython distribution and governed by its 
  6  # license.  Please see the LICENSE file that should have been included 
  7  # as part of this package. 
  8  """ 
  9  This module provides code to work with the prosite dat file from 
 10  Prosite. 
 11  http://www.expasy.ch/prosite/ 
 12   
 13  Tested with: 
 14  Release 20.43, 10-Feb-2009 
 15   
 16   
 17  Functions: 
 18  read                  Reads a Prosite file containing one Prosite record 
 19  parse                 Iterates over records in a Prosite file. 
 20   
 21  Classes: 
 22  Record                Holds Prosite data. 
 23  """ 
 24   
def parse(handle):
    """Iterate over the records in a Prosite file.

    Use this function for Prosite files that contain multiple records.

    handle - handle to the file.

    Yields each record produced by __read, in file order, and stops as
    soon as __read returns a false value.
    """
    entry = __read(handle)
    while entry:
        yield entry
        entry = __read(handle)
37
def read(handle):
    """Read a single Prosite record.

    Use this function for Prosite files that contain exactly one record.

    handle - handle to the file.

    Returns the result of __read for the first record on the handle.
    Raises ValueError if any data is left on the handle afterwards.
    """
    record = __read(handle)
    # Anything still unread at this point belongs to a further record.
    if handle.read():
        raise ValueError("More than one Prosite record found")
    return record
52
class Record(object):
    """Holds information from a Prosite record.

    Members:
    name           ID of the record.  e.g. ADH_ZINC
    type           Type of entry.  e.g. PATTERN, MATRIX, or RULE
    accession      e.g. PS00387
    created        Date the entry was created.  (MMM-YYYY)
    data_update    Date the 'primary' data was last updated.
    info_update    Date data other than 'primary' data was last updated.
    pdoc           ID of the PROSITE DOCumentation.

    description    Free-format description.
    pattern        The PROSITE pattern.  See docs.
    matrix         List of strings that describes a matrix entry.
    rules          List of rule definitions (from RU lines).  (strings)
    prorules       List of prorules (from PR lines).  (strings)
    postprocessing List of entries from PP lines.  (strings)

    NUMERICAL RESULTS
    nr_sp_release  SwissProt release.
    nr_sp_seqs     Number of seqs in that release of Swiss-Prot.  (int)
    nr_total       Number of hits in Swiss-Prot.  tuple of (hits, seqs)
    nr_positive    True positives.  tuple of (hits, seqs)
    nr_unknown     Could be positives.  tuple of (hits, seqs)
    nr_false_pos   False positives.  tuple of (hits, seqs)
    nr_false_neg   False negatives.  (int)
    nr_partial     False negatives, because they are fragments.  (int)

    COMMENTS
    cc_taxo_range  Taxonomic range.  See docs for format
    cc_max_repeat  Maximum number of repetitions in a protein
    cc_site        Interesting site.  list of tuples (pattern pos, desc.)
    cc_skip_flag   Can this entry be ignored?
    cc_matrix_type Value of the /MATRIX_TYPE qualifier.
    cc_scaling_db  Value of the /SCALING_DB qualifier.
    cc_author      Value of the /AUTHOR qualifier.
    cc_ft_key      Value of the /FT_KEY qualifier.
    cc_ft_desc     Value of the /FT_DESC qualifier.
    cc_version     version number (introduced in release 19.0)

    DATA BANK REFERENCES - The following are all
                           lists of tuples (swiss-prot accession,
                                            swiss-prot name)
    dr_positive
    dr_false_neg
    dr_false_pos
    dr_potential   Potential hits, but fingerprint region not yet available.
    dr_unknown     Could possibly belong

    pdb_structs    List of PDB entries.

    """

    def __init__(self):
        """Create an empty record with every documented member set."""
        self.name = ''
        self.type = ''
        self.accession = ''
        self.created = ''
        self.data_update = ''
        self.info_update = ''
        self.pdoc = ''

        self.description = ''
        self.pattern = ''
        self.matrix = []
        self.rules = []
        self.prorules = []
        self.postprocessing = []

        self.nr_sp_release = ''
        self.nr_sp_seqs = ''
        self.nr_total = (None, None)
        self.nr_positive = (None, None)
        self.nr_unknown = (None, None)
        self.nr_false_pos = (None, None)
        self.nr_false_neg = None
        self.nr_partial = None

        self.cc_taxo_range = ''
        self.cc_max_repeat = ''
        self.cc_site = []
        self.cc_skip_flag = ''
        # These are only filled in when the matching CC qualifier is present;
        # give them defaults so the documented members always exist.
        self.cc_matrix_type = ''
        self.cc_scaling_db = ''
        self.cc_author = ''
        self.cc_ft_key = ''
        self.cc_ft_desc = ''
        self.cc_version = ''

        self.dr_positive = []
        self.dr_false_neg = []
        self.dr_false_pos = []
        self.dr_potential = []
        self.dr_unknown = []

        self.pdb_structs = []
# Everything below this line is a private function.
def __read(handle):
    """Read the next Prosite record from handle and return it as a Record.

    Lines are consumed one at a time.  The first two characters of a line
    are its keyword and everything from the sixth character onwards is its
    value.  An ID line starts a new Record and a '//' line ends it; a '//'
    line seen before any ID line (the copyright statement) is skipped.

    handle - iterable of lines, positioned at or before the next record.

    Returns the populated Record, or None if the handle is exhausted
    before an ID line is found.

    Raises ValueError for an unknown keyword, for ID, DT or NR lines that
    cannot be understood, for an unknown NR/CC qualifier or DR type flag,
    and when the handle ends in the middle of a record (after an ID line
    but before the closing '//').
    """
    import re

    hits_re = re.compile(r'(\d+)\((\d+)\)')
    # Suffixes of the three DT fields, in the order they must appear.
    date_suffixes = ('(CREATED)', '(DATA UPDATE)', '(INFO UPDATE)')
    # NR qualifiers whose data looks like "hits(seqs)" -> Record attribute.
    nr_hit_attrs = {
        '/TOTAL': 'nr_total',
        '/POSITIVE': 'nr_positive',
        '/UNKNOWN': 'nr_unknown',
        '/FALSE_POS': 'nr_false_pos',
    }
    # CC qualifiers whose data is stored unchanged -> Record attribute.
    cc_attrs = {
        '/TAXO-RANGE': 'cc_taxo_range',
        '/MAX-REPEAT': 'cc_max_repeat',
        '/SKIP-FLAG': 'cc_skip_flag',
        '/MATRIX_TYPE': 'cc_matrix_type',
        '/SCALING_DB': 'cc_scaling_db',
        '/AUTHOR': 'cc_author',
        '/FT_KEY': 'cc_ft_key',
        '/FT_DESC': 'cc_ft_desc',
        '/VERSION': 'cc_version',
    }

    record = None
    for line in handle:
        keyword, value = line[:2], line[5:].rstrip()
        if keyword == 'ID':
            record = Record()
            cols = value.split("; ")
            if len(cols) != 2:
                raise ValueError("I don't understand identification line\n%s"
                                 % line)
            record.name = cols[0]
            record.type = cols[1].rstrip('.')  # don't want '.'
        elif keyword == 'AC':
            record.accession = value.rstrip(';')
        elif keyword == 'DT':
            dates = value.rstrip('.').split("; ")
            if len(dates) < 3:
                raise ValueError("I don't understand date line\n%s" % line)
            stripped = []
            for date, suffix in zip(dates, date_suffixes):
                if not date.endswith(suffix):
                    raise ValueError("I don't understand date line\n%s"
                                     % line)
                # Slice the suffix off; str.rstrip(chars) would strip a
                # character set and could eat into the date itself.
                stripped.append(date[:-len(suffix)].rstrip())
            record.created, record.data_update, record.info_update = stripped
        elif keyword == 'DE':
            record.description = value
        elif keyword == 'PA':
            record.pattern += value
        elif keyword == 'MA':
            record.matrix.append(value)
        elif keyword == 'PP':
            record.postprocessing.extend(value.split(";"))
        elif keyword == 'RU':
            record.rules.append(value)
        elif keyword == 'NR':
            for col in value.split(";"):
                if not col:
                    continue
                qual, data = [word.lstrip() for word in col.split("=")]
                if qual == '/RELEASE':
                    release, seqs = data.split(",")
                    record.nr_sp_release = release
                    record.nr_sp_seqs = int(seqs)
                elif qual == '/FALSE_NEG':
                    record.nr_false_neg = int(data)
                elif qual == '/PARTIAL':
                    record.nr_partial = int(data)
                elif qual in nr_hit_attrs:
                    m = hits_re.match(data)
                    if not m:
                        raise ValueError("Broken data %s in comment line\n%s"
                                         % (repr(data), line))
                    hits = tuple(map(int, m.groups()))
                    setattr(record, nr_hit_attrs[qual], hits)
                else:
                    raise ValueError("Unknown qual %s in comment line\n%s"
                                     % (repr(qual), line))
        elif keyword == 'CC':
            # Expect CC lines like this:
            # CC   /TAXO-RANGE=??EPV; /MAX-REPEAT=2;
            # Can (normally) split on ";" and then on "="
            for col in value.split(";"):
                if not col or col[:17] == 'Automatic scaling':
                    # DNAJ_2 in Release 15 has a non-standard comment line:
                    # CC   Automatic scaling using reversed database
                    # Throw it away.
                    continue
                if col.count("=") == 0:
                    # Missing qualifier, e.g. from Bug 2403, PS50293 has:
                    # CC /AUTHOR=K_Hofmann; N_Hulo
                    # Skip the piece that has no qualifier.
                    continue
                qual, data = [word.lstrip() for word in col.split("=")]
                if qual == '/SITE':
                    pos, desc = data.split(",")
                    record.cc_site.append((int(pos), desc))
                elif qual in cc_attrs:
                    setattr(record, cc_attrs[qual], data)
                else:
                    raise ValueError("Unknown qual %s in comment line\n%s"
                                     % (repr(qual), line))
        elif keyword == 'DR':
            for ref in value.split(";"):
                if not ref:
                    continue
                acc, name, flag = [word.strip() for word in ref.split(",")]
                if flag == 'T':
                    record.dr_positive.append((acc, name))
                elif flag == 'F':
                    record.dr_false_pos.append((acc, name))
                elif flag == 'N':
                    record.dr_false_neg.append((acc, name))
                elif flag == 'P':
                    record.dr_potential.append((acc, name))
                elif flag == '?':
                    record.dr_unknown.append((acc, name))
                else:
                    raise ValueError("I don't understand type flag %s" % flag)
        elif keyword == '3D':
            record.pdb_structs.extend(
                pdb_id.rstrip(';') for pdb_id in value.split())
        elif keyword == 'PR':
            record.prorules.extend(value.split(";"))
        elif keyword == 'DO':
            record.pdoc = value.rstrip(';')
        elif keyword == '//':
            if not record:
                # Then this was the copyright statement
                continue
            return record
        else:
            raise ValueError("Unknown keyword %s found" % keyword)
    # The handle ran out without reaching a '//' terminator.
    if record:
        # An ID line was seen, so the record is truncated; do not drop it
        # silently.
        raise ValueError("Unexpected end of stream.")
    return None
293