Package Bio :: Package SwissProt
[hide private]
[frames] | no frames]

Source Code for Package Bio.SwissProt

  1  # Copyright 2007 by Michiel de Hoon.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  """ 
  6  This module provides code to work with the sprotXX.dat file from 
  7  SwissProt. 
  8  http://www.expasy.ch/sprot/sprot-top.html 
  9   
 10  Tested with: 
 11  Release 56.9, 03-March-2009. 
 12   
 13   
 14  Classes: 
 15  Record             Holds SwissProt data. 
 16  Reference          Holds reference data from a SwissProt record. 
 17   
 18  Functions: 
 19  read               Read one SwissProt record 
 20  parse              Read multiple SwissProt records 
 21   
 22  """ 
 23   
 24  from Bio._py3k import _as_string 
 25   
26 -class Record(object):
27 """Holds information from a SwissProt record. 28 29 Members: 30 entry_name Name of this entry, e.g. RL1_ECOLI. 31 data_class Either 'STANDARD' or 'PRELIMINARY'. 32 molecule_type Type of molecule, 'PRT', 33 sequence_length Number of residues. 34 35 accessions List of the accession numbers, e.g. ['P00321'] 36 created A tuple of (date, release). 37 sequence_update A tuple of (date, release). 38 annotation_update A tuple of (date, release). 39 40 description Free-format description. 41 gene_name Gene name. See userman.txt for description. 42 organism The source of the sequence. 43 organelle The origin of the sequence. 44 organism_classification The taxonomy classification. List of strings. 45 (http://www.ncbi.nlm.nih.gov/Taxonomy/) 46 taxonomy_id A list of NCBI taxonomy id's. 47 host_organism A list of names of the hosts of a virus, if any. 48 host_taxonomy_id A list of NCBI taxonomy id's of the hosts, if any. 49 references List of Reference objects. 50 comments List of strings. 51 cross_references List of tuples (db, id1[, id2][, id3]). See the docs. 52 keywords List of the keywords. 53 features List of tuples (key name, from, to, description). 54 from and to can be either integers for the residue 55 numbers, '<', '>', or '?' 56 57 seqinfo tuple of (length, molecular weight, CRC32 value) 58 sequence The sequence. 59 60 """
61 - def __init__(self):
62 self.entry_name = None 63 self.data_class = None 64 self.molecule_type = None 65 self.sequence_length = None 66 67 self.accessions = [] 68 self.created = None 69 self.sequence_update = None 70 self.annotation_update = None 71 72 self.description = [] 73 self.gene_name = '' 74 self.organism = [] 75 self.organelle = '' 76 self.organism_classification = [] 77 self.taxonomy_id = [] 78 self.host_organism = [] 79 self.host_taxonomy_id = [] 80 self.references = [] 81 self.comments = [] 82 self.cross_references = [] 83 self.keywords = [] 84 self.features = [] 85 86 self.seqinfo = None 87 self.sequence = ''
88 89
90 -class Reference(object):
91 """Holds information from one reference in a SwissProt entry. 92 93 Members: 94 number Number of reference in an entry. 95 positions Describes extent of work. list of strings. 96 comments Comments. List of (token, text). 97 references References. List of (dbname, identifier) 98 authors The authors of the work. 99 title Title of the work. 100 location A citation for the work. 101 102 """
103 - def __init__(self):
104 self.number = None 105 self.positions = [] 106 self.comments = [] 107 self.references = [] 108 self.authors = [] 109 self.title = [] 110 self.location = []
111 112
113 -def parse(handle):
114 while True: 115 record = _read(handle) 116 if not record: 117 return 118 yield record
119 120
121 -def read(handle):
122 record = _read(handle) 123 if not record: 124 raise ValueError("No SwissProt record found") 125 # We should have reached the end of the record by now 126 remainder = handle.read() 127 if remainder: 128 raise ValueError("More than one SwissProt record found") 129 return record
130 131 132 # Everything below is considered private 133 134
135 -def _read(handle):
136 record = None 137 unread = "" 138 for line in handle: 139 #This is for Python 3 to cope with a binary handle (byte strings), 140 #or a text handle (unicode strings): 141 line = _as_string(line) 142 key, value = line[:2], line[5:].rstrip() 143 if unread: 144 value = unread + " " + value 145 unread = "" 146 if key=='**': 147 #See Bug 2353, some files from the EBI have extra lines 148 #starting "**" (two asterisks/stars). They appear 149 #to be unofficial automated annotations. e.g. 150 #** 151 #** ################# INTERNAL SECTION ################## 152 #**HA SAM; Annotated by PicoHamap 1.88; MF_01138.1; 09-NOV-2003. 153 pass 154 elif key=='ID': 155 record = Record() 156 _read_id(record, line) 157 _sequence_lines = [] 158 elif key=='AC': 159 accessions = [word for word in value.rstrip(";").split("; ")] 160 record.accessions.extend(accessions) 161 elif key=='DT': 162 _read_dt(record, line) 163 elif key=='DE': 164 record.description.append(value.strip()) 165 elif key=='GN': 166 if record.gene_name: 167 record.gene_name += " " 168 record.gene_name += value 169 elif key=='OS': 170 record.organism.append(value) 171 elif key=='OG': 172 record.organelle += line[5:] 173 elif key=='OC': 174 cols = [col for col in value.rstrip(";.").split("; ")] 175 record.organism_classification.extend(cols) 176 elif key=='OX': 177 _read_ox(record, line) 178 elif key=='OH': 179 _read_oh(record, line) 180 elif key=='RN': 181 reference = Reference() 182 _read_rn(reference, value) 183 record.references.append(reference) 184 elif key=='RP': 185 assert record.references, "RP: missing RN" 186 record.references[-1].positions.append(value) 187 elif key=='RC': 188 assert record.references, "RC: missing RN" 189 reference = record.references[-1] 190 unread = _read_rc(reference, value) 191 elif key=='RX': 192 assert record.references, "RX: missing RN" 193 reference = record.references[-1] 194 _read_rx(reference, value) 195 elif key=='RL': 196 assert record.references, "RL: missing RN" 197 reference = record.references[-1] 198 reference.location.append(value) 199 # In UniProt release 1.12 of 6/21/04, there is a new RG 200 # (Reference Group) line, which references a group instead of 201 # an author. Each block must have at least 1 RA or RG line. 202 elif key=='RA': 203 assert record.references, "RA: missing RN" 204 reference = record.references[-1] 205 reference.authors.append(value) 206 elif key=='RG': 207 assert record.references, "RG: missing RN" 208 reference = record.references[-1] 209 reference.authors.append(value) 210 elif key=="RT": 211 assert record.references, "RT: missing RN" 212 reference = record.references[-1] 213 reference.title.append(value) 214 elif key=='CC': 215 _read_cc(record, line) 216 elif key=='DR': 217 _read_dr(record, value) 218 elif key=='PE': 219 #TODO - Record this information? 220 pass 221 elif key=='KW': 222 cols = value.rstrip(";.").split('; ') 223 record.keywords.extend(cols) 224 elif key=='FT': 225 _read_ft(record, line) 226 elif key=='SQ': 227 cols = value.split() 228 assert len(cols) == 7, "I don't understand SQ line %s" % line 229 # Do more checking here? 230 record.seqinfo = int(cols[1]), int(cols[3]), cols[5] 231 elif key==' ': 232 _sequence_lines.append(value.replace(" ", "").rstrip()) 233 elif key=='//': 234 # Join multiline data into one string 235 record.description = " ".join(record.description) 236 record.organism = " ".join(record.organism) 237 record.organelle = record.organelle.rstrip() 238 for reference in record.references: 239 reference.authors = " ".join(reference.authors).rstrip(";") 240 reference.title = " ".join(reference.title).rstrip(";") 241 if reference.title.startswith('"') and reference.title.endswith('"'): 242 reference.title = reference.title[1:-1] #remove quotes 243 reference.location = " ".join(reference.location) 244 record.sequence = "".join(_sequence_lines) 245 return record 246 else: 247 raise ValueError("Unknown keyword '%s' found" % key) 248 if record: 249 raise ValueError("Unexpected end of stream.")
250 251
252 -def _read_id(record, line):
253 cols = line[5:].split() 254 #Prior to release 51, included with MoleculeType: 255 #ID EntryName DataClass; MoleculeType; SequenceLength AA. 256 # 257 #Newer files lack the MoleculeType: 258 #ID EntryName DataClass; SequenceLength AA. 259 if len(cols) == 5: 260 record.entry_name = cols[0] 261 record.data_class = cols[1].rstrip(";") 262 record.molecule_type = cols[2].rstrip(";") 263 record.sequence_length = int(cols[3]) 264 elif len(cols) == 4: 265 record.entry_name = cols[0] 266 record.data_class = cols[1].rstrip(";") 267 record.molecule_type = None 268 record.sequence_length = int(cols[2]) 269 else: 270 raise ValueError("ID line has unrecognised format:\n"+line) 271 # check if the data class is one of the allowed values 272 allowed = ('STANDARD', 'PRELIMINARY', 'IPI', 'Reviewed', 'Unreviewed') 273 if record.data_class not in allowed: 274 raise ValueError("Unrecognized data class %s in line\n%s" % \ 275 (record.data_class, line)) 276 # molecule_type should be 'PRT' for PRoTein 277 # Note that has been removed in recent releases (set to None) 278 if record.molecule_type not in (None, 'PRT'): 279 raise ValueError("Unrecognized molecule type %s in line\n%s" % \ 280 (record.molecule_type, line))
281 282
283 -def _read_dt(record, line):
284 value = line[5:] 285 uprline = value.upper() 286 cols = value.rstrip().split() 287 if 'CREATED' in uprline \ 288 or 'LAST SEQUENCE UPDATE' in uprline \ 289 or 'LAST ANNOTATION UPDATE' in uprline: 290 # Old style DT line 291 # ================= 292 # e.g. 293 # DT 01-FEB-1995 (Rel. 31, Created) 294 # DT 01-FEB-1995 (Rel. 31, Last sequence update) 295 # DT 01-OCT-2000 (Rel. 40, Last annotation update) 296 # 297 # or: 298 # DT 08-JAN-2002 (IPI Human rel. 2.3, Created) 299 # ... 300 301 # find where the version information will be located 302 # This is needed for when you have cases like IPI where 303 # the release verison is in a different spot: 304 # DT 08-JAN-2002 (IPI Human rel. 2.3, Created) 305 uprcols = uprline.split() 306 rel_index = -1 307 for index in range(len(uprcols)): 308 if uprcols[index].find("REL.") >= 0: 309 rel_index = index 310 assert rel_index >= 0, \ 311 "Could not find Rel. in DT line: %s" % line 312 version_index = rel_index + 1 313 # get the version information 314 str_version = cols[version_index].rstrip(",") 315 # no version number 316 if str_version == '': 317 version = 0 318 # dot versioned 319 elif str_version.find(".") >= 0: 320 version = str_version 321 # integer versioned 322 else: 323 version = int(str_version) 324 date = cols[0] 325 326 if 'CREATED' in uprline: 327 record.created = date, version 328 elif 'LAST SEQUENCE UPDATE' in uprline: 329 record.sequence_update = date, version 330 elif 'LAST ANNOTATION UPDATE' in uprline: 331 record.annotation_update = date, version 332 else: 333 assert False, "Shouldn't reach this line!" 334 elif 'INTEGRATED INTO' in uprline \ 335 or 'SEQUENCE VERSION' in uprline \ 336 or 'ENTRY VERSION' in uprline: 337 # New style DT line 338 # ================= 339 # As of UniProt Knowledgebase release 7.0 (including 340 # Swiss-Prot release 49.0 and TrEMBL release 32.0) the 341 # format of the DT lines and the version information 342 # in them was changed - the release number was dropped. 343 # 344 # For more information see bug 1948 and 345 # http://ca.expasy.org/sprot/relnotes/sp_news.html#rel7.0 346 # 347 # e.g. 348 # DT 01-JAN-1998, integrated into UniProtKB/Swiss-Prot. 349 # DT 15-OCT-2001, sequence version 3. 350 # DT 01-APR-2004, entry version 14. 351 # 352 #This is a new style DT line... 353 354 # The date should be in string cols[1] 355 # Get the version number if there is one. 356 # For the three DT lines above: 0, 3, 14 357 try: 358 version = int(cols[-1]) 359 except ValueError: 360 version = 0 361 date = cols[0].rstrip(",") 362 363 # Re-use the historical property names, even though 364 # the meaning has changed slighty: 365 if "INTEGRATED" in uprline: 366 record.created = date, version 367 elif 'SEQUENCE VERSION' in uprline: 368 record.sequence_update = date, version 369 elif 'ENTRY VERSION' in uprline: 370 record.annotation_update = date, version 371 else: 372 assert False, "Shouldn't reach this line!" 373 else: 374 raise ValueError("I don't understand the date line %s" % line)
375 376
377 -def _read_ox(record, line):
378 # The OX line is in the format: 379 # OX DESCRIPTION=ID[, ID]...; 380 # If there are too many id's to fit onto a line, then the ID's 381 # continue directly onto the next line, e.g. 382 # OX DESCRIPTION=ID[, ID]... 383 # OX ID[, ID]...; 384 # Currently, the description is always "NCBI_TaxID". 385 # To parse this, I need to check to see whether I'm at the 386 # first line. If I am, grab the description and make sure 387 # it's an NCBI ID. Then, grab all the id's. 388 if record.taxonomy_id: 389 ids = line[5:].rstrip().rstrip(";") 390 else: 391 descr, ids = line[5:].rstrip().rstrip(";").split("=") 392 assert descr == "NCBI_TaxID", "Unexpected taxonomy type %s" % descr 393 record.taxonomy_id.extend(ids.split(', '))
394 395
396 -def _read_oh(record, line):
397 # Line type OH (Organism Host) for viral hosts 398 assert line[5:].startswith("NCBI_TaxID="), "Unexpected %s" % line 399 line = line[16:].rstrip() 400 assert line[-1]=="." and line.count(";")==1, line 401 taxid, name = line[:-1].split(";") 402 record.host_taxonomy_id.append(taxid.strip()) 403 record.host_organism.append(name.strip())
404 405
406 -def _read_rn(reference, rn):
407 assert rn[0] == '[' and rn[-1] == ']', "Missing brackets %s" % rn 408 reference.number = int(rn[1:-1])
409 410
411 -def _read_rc(reference, value):
412 cols = value.split(';') 413 if value[-1]==';': 414 unread = "" 415 else: 416 cols, unread = cols[:-1], cols[-1] 417 for col in cols: 418 if not col: # last column will be the empty string 419 return 420 # The token is everything before the first '=' character. 421 i = col.find("=") 422 if i>=0: 423 token, text = col[:i], col[i+1:] 424 comment = token.lstrip(), text 425 reference.comments.append(comment) 426 else: 427 comment = reference.comments[-1] 428 comment = "%s %s" % (comment, col) 429 reference.comments[-1] = comment 430 return unread
431 432
433 -def _read_rx(reference, value):
434 # The basic (older?) RX line is of the form: 435 # RX MEDLINE; 85132727. 436 # but there are variants of this that need to be dealt with (see below) 437 438 # CLD1_HUMAN in Release 39 and DADR_DIDMA in Release 33 439 # have extraneous information in the RX line. Check for 440 # this and chop it out of the line. 441 # (noticed by katel@worldpath.net) 442 value = value.replace(' [NCBI, ExPASy, Israel, Japan]','') 443 444 # RX lines can also be used of the form 445 # RX PubMed=9603189; 446 # reported by edvard@farmasi.uit.no 447 # and these can be more complicated like: 448 # RX MEDLINE=95385798; PubMed=7656980; 449 # RX PubMed=15060122; DOI=10.1136/jmg 2003.012781; 450 # We look for these cases first and deal with them 451 if "=" in value: 452 cols = value.split("; ") 453 cols = [x.strip() for x in cols] 454 cols = [x for x in cols if x] 455 for col in cols: 456 x = col.split("=") 457 assert len(x) == 2, "I don't understand RX line %s" % value 458 key, value = x[0], x[1].rstrip(";") 459 reference.references.append((key, value)) 460 # otherwise we assume we have the type 'RX MEDLINE; 85132727.' 461 else: 462 cols = value.split("; ") 463 # normally we split into the three parts 464 assert len(cols) == 2, "I don't understand RX line %s" % value 465 reference.references.append((cols[0].rstrip(";"), cols[1].rstrip(".")))
466 467
468 -def _read_cc(record, line):
469 key, value = line[5:8], line[9:].rstrip() 470 if key=='-!-': # Make a new comment 471 record.comments.append(value) 472 elif key==' ': # add to the previous comment 473 if not record.comments: 474 # TCMO_STRGA in Release 37 has comment with no topic 475 record.comments.append(value) 476 else: 477 record.comments[-1] += " " + value
478 479
480 -def _read_dr(record, value):
481 # Remove the comments at the end of the line 482 i = value.find(' [') 483 if i >= 0: 484 value = value[:i] 485 cols = value.rstrip(".").split('; ') 486 record.cross_references.append(tuple(cols))
487 488
489 -def _read_ft(record, line):
490 line = line[5:] # get rid of junk in front 491 name = line[0:8].rstrip() 492 try: 493 from_res = int(line[9:15]) 494 except ValueError: 495 from_res = line[9:15].lstrip() 496 try: 497 to_res = int(line[16:22]) 498 except ValueError: 499 to_res = line[16:22].lstrip() 500 #if there is a feature_id (FTId), store it away 501 if line[29:35]==r"/FTId=": 502 ft_id = line[35:70].rstrip()[:-1] 503 description = "" 504 else: 505 ft_id ="" 506 description = line[29:70].rstrip() 507 if not name: # is continuation of last one 508 assert not from_res and not to_res 509 name, from_res, to_res, old_description,old_ft_id = record.features[-1] 510 del record.features[-1] 511 description = ("%s %s" % (old_description, description)).strip() 512 513 # special case -- VARSPLIC, reported by edvard@farmasi.uit.no 514 if name == "VARSPLIC": 515 # Remove unwanted spaces in sequences. 516 # During line carryover, the sequences in VARSPLIC can get mangled 517 # with unwanted spaces like: 518 # 'DISSTKLQALPSHGLESIQT -> PCRATGWSPFRRSSPC LPTH' 519 # We want to check for this case and correct it as it happens. 520 descr_cols = description.split(" -> ") 521 if len(descr_cols) == 2: 522 first_seq, second_seq = descr_cols 523 extra_info = '' 524 # we might have more information at the end of the 525 # second sequence, which should be in parenthesis 526 extra_info_pos = second_seq.find(" (") 527 if extra_info_pos != -1: 528 extra_info = second_seq[extra_info_pos:] 529 second_seq = second_seq[:extra_info_pos] 530 # now clean spaces out of the first and second string 531 first_seq = first_seq.replace(" ", "") 532 second_seq = second_seq.replace(" ", "") 533 # reassemble the description 534 description = first_seq + " -> " + second_seq + extra_info 535 record.features.append((name, from_res, to_res, description,ft_id))
536 537 538 if __name__ == "__main__": 539 print "Quick self test..." 540 541 example_filename = "../../Tests/SwissProt/sp008" 542 543 import os 544 if not os.path.isfile(example_filename): 545 print "Missing test file %s" % example_filename 546 else: 547 #Try parsing it! 548 549 handle = open(example_filename) 550 records = parse(handle) 551 for record in records: 552 print record.entry_name 553 print ",".join(record.accessions) 554 print record.keywords 555 print repr(record.organism) 556 print record.sequence[:20] + "..." 557 handle.close() 558