Package Bio :: Package SCOP
[hide private]
[frames] | no frames]

Source Code for Package Bio.SCOP

  1  # Copyright 2001 by Gavin E. Crooks.  All rights reserved. 
  2  # Modifications Copyright 2004/2005 James Casbon. All rights Reserved. 
  3  # Modifications Copyright 2010 Jeffrey Finkelstein. All rights reserved. 
  4  # 
  5  # This code is part of the Biopython distribution and governed by its 
  6  # license.  Please see the LICENSE file that should have been included 
  7  # as part of this package. 
  8  # 
  9  # Changes made by James Casbon: 
 10  # - New Astral class 
 11  # - SQL functionality for both Scop and Astral classes 
 12  # - All sunids are int not strings 
 13  # 
 14  # Code written by Jeffrey Chang to access SCOP over the internet, which 
 15  # was previously in Bio.WWW.SCOP, has now been merged into this module. 
 16   
 17  """ SCOP: Structural Classification of Proteins. 
 18   
 19  The SCOP database aims to provide a manually constructed classification of 
 20  all know protein structures into a hierarchy, the main levels of which 
 21  are family, superfamily and fold. 
 22   
 23  * "SCOP":http://scop.mrc-lmb.cam.ac.uk/scop/ 
 24   
 25  * "Introduction":http://scop.mrc-lmb.cam.ac.uk/scop/intro.html 
 26   
 27  * "SCOP parsable files":http://scop.mrc-lmb.cam.ac.uk/scop/parse/ 
 28   
 29  The Scop object in this module represents the entire SCOP classification. It 
 30  can be built from the three SCOP parsable files, modified is so desired, and 
 31  converted back to the same file formats. A single SCOP domain (represented 
 32  by the Domain class) can be obtained from Scop using the domain's SCOP 
 33  identifier (sid). 
 34   
 35   
 36  nodeCodeDict  -- A mapping between known 2 letter node codes and a longer 
 37                    description. The known node types are 'cl' (class), 'cf' 
 38                    (fold), 'sf' (superfamily), 'fa' (family), 'dm' (domain),  
 39                    'sp' (species), 'px' (domain). Additional node types may 
 40                    be added in the future. 
 41   
 42  This module also provides code to access SCOP over the WWW. 
 43   
 44  Functions: 
 45  search        -- Access the main CGI script. 
 46  _open         -- Internally used function. 
 47   
 48  """ 
 49   
 50  from types import * 
 51  import os 
 52   
 53  import Des 
 54  import Cla 
 55  import Hie 
 56  from Residues import * 
 57  from Bio import SeqIO 
 58  from Bio.Seq import Seq 
 59   
 60   
 61  nodeCodeDict = { 'cl':'class', 'cf':'fold', 'sf':'superfamily', 
 62                   'fa':'family', 'dm':'protein', 'sp':'species', 'px':'domain'} 
 63   
 64   
 65  _nodetype_to_code= { 'class': 'cl', 'fold': 'cf', 'superfamily': 'sf', 
 66                       'family': 'fa', 'protein': 'dm', 'species': 'sp', 'domain': 'px'} 
 67   
 68  nodeCodeOrder = [ 'ro', 'cl', 'cf', 'sf', 'fa', 'dm', 'sp', 'px' ]  
 69   
 70  astralBibIds = [10,20,25,30,35,40,50,70,90,95,100] 
 71   
 72  astralEvs = [10, 5, 1, 0.5, 0.1, 0.05, 0.01, 0.005, 0.001, 1e-4, 1e-5, 1e-10, 1e-15, 
 73               1e-20, 1e-25, 1e-50] 
 74   
 75  astralEv_to_file = { 10: 'e+1', 5: 'e+0,7', 1: 'e+0', 0.5: 'e-0,3', 0.1: 'e-1', 
 76                       0.05: 'e-1,3', 0.01: 'e-2', 0.005: 'e-2,3', 0.001: 'e-3', 
 77                       1e-4: 'e-4',  1e-5: 'e-5', 1e-10: 'e-10', 1e-15: 'e-15', 
 78                       1e-20: 'e-20', 1e-25: 'e-25', 1e-50: 'e-50' } 
 79   
 80  astralEv_to_sql = { 10: 'e1', 5: 'e0_7', 1: 'e0', 0.5: 'e_0_3', 0.1: 'e_1', 
 81                       0.05: 'e_1_3', 0.01: 'e_2', 0.005: 'e_2_3', 0.001: 'e_3', 
 82                       1e-4: 'e_4',  1e-5: 'e_5', 1e-10: 'e_10', 1e-15: 'e_15', 
 83                       1e-20: 'e_20', 1e-25: 'e_25', 1e-50: 'e_50' } 
 84   
 85  try: 
 86      #See if the cmp function exists (will on Python 2) 
 87      _cmp = cmp 
 88  except NameError: 
89 - def _cmp(a,b):
90 """Implementation of cmp(x,y) for Python 3 (PRIVATE). 91 92 Based on Python 3 docs which say if you really need the cmp() 93 functionality, you could use the expression (a > b) - (a < b) 94 as the equivalent for cmp(a, b) 95 """ 96 return (a > b) - (a < b)
97
98 -def cmp_sccs(sccs1, sccs2):
99 """Order SCOP concise classification strings (sccs). 100 101 a.4.5.1 < a.4.5.11 < b.1.1.1 102 103 A sccs (e.g. a.4.5.11) compactly represents a domain's classification. 104 The letter represents the class, and the numbers are the fold, 105 superfamily, and family, respectively. 106 107 """ 108 109 s1 = sccs1.split(".") 110 s2 = sccs2.split(".") 111 112 if s1[0] != s2[0]: return _cmp(s1[0], s2[0]) 113 114 s1 = list(map(int, s1[1:])) 115 s2 = list(map(int, s2[1:])) 116 117 return _cmp(s1,s2)
118 119 120 _domain_re = re.compile(r">?([\w_\.]*)\s+([\w\.]*)\s+\(([^)]*)\) (.*)") 121
122 -def parse_domain(str):
123 """Convert an ASTRAL header string into a Scop domain. 124 125 An ASTRAL (http://astral.stanford.edu/) header contains a concise 126 description of a SCOP domain. A very similar format is used when a 127 Domain object is converted into a string. The Domain returned by this 128 method contains most of the SCOP information, but it will not be located 129 within the SCOP hierarchy (i.e. The parent node will be None). The 130 description is composed of the SCOP protein and species descriptions. 131 132 A typical ASTRAL header looks like -- 133 >d1tpt_1 a.46.2.1 (1-70) Thymidine phosphorylase {Escherichia coli} 134 """ 135 136 m = _domain_re.match(str) 137 if (not m) : raise ValueError("Domain: "+ str) 138 139 dom = Domain() 140 dom.sid = m.group(1) 141 dom.sccs = m.group(2) 142 dom.residues = Residues(m.group(3)) 143 if not dom.residues.pdbid: 144 dom.residues.pdbid= dom.sid[1:5] 145 dom.description = m.group(4).strip() 146 147 return dom
148
149 -def _open_scop_file(scop_dir_path, version, filetype):
150 filename = "dir.%s.scop.txt_%s" % (filetype,version) 151 handle = open(os.path.join( scop_dir_path, filename)) 152 return handle
153 154
155 -class Scop(object):
156 """The entire SCOP hierarchy. 157 158 root -- The root node of the hierarchy 159 """
160 - def __init__(self, cla_handle=None, des_handle=None, hie_handle=None, 161 dir_path=None, db_handle=None, version=None):
162 """Build the SCOP hierarchy from the SCOP parsable files, or a sql backend. 163 164 If no file handles are given, then a Scop object with a single 165 empty root node is returned. 166 167 If a directory and version are given (with dir_path=.., version=...) or 168 file handles for each file, the whole scop tree will be built in memory. 169 170 If a MySQLdb database handle is given, the tree will be built as needed, 171 minimising construction times. To build the SQL database to the methods 172 write_xxx_sql to create the tables. 173 174 """ 175 self._sidDict = {} 176 self._sunidDict = {} 177 178 if cla_handle==des_handle==hie_handle==dir_path==db_handle==None: return 179 180 if dir_path is None and db_handle is None: 181 if cla_handle == None or des_handle==None or hie_handle==None: 182 raise RuntimeError("Need CLA, DES and HIE files to build SCOP") 183 184 sunidDict = {} 185 186 self.db_handle = db_handle 187 try: 188 189 if db_handle: 190 # do nothing if we have a db handle, we'll do it all on the fly 191 pass 192 193 else: 194 # open SCOP parseable files 195 if dir_path: 196 if not version: 197 raise RuntimeError("Need SCOP version to find parsable files in directory") 198 if cla_handle or des_handle or hie_handle: 199 raise RuntimeError("Cannot specify SCOP directory and specific files") 200 201 cla_handle = _open_scop_file( dir_path, version, 'cla') 202 des_handle = _open_scop_file( dir_path, version, 'des') 203 hie_handle = _open_scop_file( dir_path, version, 'hie') 204 205 root = Node() 206 domains = [] 207 root.sunid=0 208 root.type='ro' 209 sunidDict[root.sunid] = root 210 self.root = root 211 root.description = 'SCOP Root' 212 213 # Build the rest of the nodes using the DES file 214 records = Des.parse(des_handle) 215 for record in records: 216 if record.nodetype =='px': 217 n = Domain() 218 n.sid = record.name 219 domains.append(n) 220 else : 221 n = Node() 222 n.sunid = record.sunid 223 n.type = record.nodetype 224 n.sccs = record.sccs 225 n.description = record.description 226 227 sunidDict[n.sunid] = n 228 229 # Glue all of the Nodes together using the HIE file 230 records = Hie.parse(hie_handle) 231 for record in records: 232 if record.sunid not in sunidDict: 233 print record.sunid 234 235 n = sunidDict[record.sunid] 236 237 if record.parent != '' : # Not root node 238 239 if record.parent not in sunidDict: 240 raise ValueError("Incomplete data?") 241 242 n.parent = sunidDict[record.parent] 243 244 for c in record.children: 245 if c not in sunidDict: 246 raise ValueError("Incomplete data?") 247 n.children.append(sunidDict[c]) 248 249 250 # Fill in the gaps with information from the CLA file 251 sidDict = {} 252 records = Cla.parse(cla_handle) 253 for record in records: 254 n = sunidDict[record.sunid] 255 assert n.sccs == record.sccs 256 assert n.sid == record.sid 257 n.residues = record.residues 258 sidDict[n.sid] = n 259 260 # Clean up 261 self._sunidDict = sunidDict 262 self._sidDict = sidDict 263 self._domains = tuple(domains) 264 265 finally: 266 if dir_path: 267 # If we opened the files, we close the files 268 if cla_handle : cla_handle.close() 269 if des_handle : des_handle.close() 270 if hie_handle : hie_handle.close()
271 272
273 - def getRoot(self):
274 return self.getNodeBySunid(0)
275 276
277 - def getDomainBySid(self, sid):
278 """Return a domain from its sid""" 279 if sid in self._sidDict: 280 return self._sidDict[sid] 281 if self.db_handle: 282 self.getDomainFromSQL(sid=sid) 283 if sid in self._sidDict: 284 return self._sidDict[sid] 285 else: 286 return None
287 288
289 - def getNodeBySunid(self, sunid):
290 """Return a node from its sunid""" 291 if sunid in self._sunidDict: 292 return self._sunidDict[sunid] 293 if self.db_handle: 294 self.getDomainFromSQL(sunid=sunid) 295 if sunid in self._sunidDict: 296 return self._sunidDict[sunid] 297 else: 298 return None
299 300
301 - def getDomains(self):
302 """Returns an ordered tuple of all SCOP Domains""" 303 if self.db_handle: 304 return self.getRoot().getDescendents('px') 305 else: 306 return self._domains
307 308 309
310 - def write_hie(self, handle):
311 """Build an HIE SCOP parsable file from this object""" 312 nodes = self._sunidDict.values() 313 # We order nodes to ease comparison with original file 314 nodes.sort(key = lambda n: n.sunid) 315 for n in nodes: 316 handle.write(str(n.toHieRecord()))
317 318
319 - def write_des(self, handle):
320 """Build a DES SCOP parsable file from this object""" 321 nodes = self._sunidDict.values() 322 # Origional SCOP file is not ordered? 323 nodes.sort(key = lambda n: n.sunid) 324 for n in nodes: 325 if n != self.root: 326 handle.write(str(n.toDesRecord()))
327 328
329 - def write_cla(self, handle):
330 """Build a CLA SCOP parsable file from this object""" 331 nodes = self._sidDict.values() 332 # We order nodes to ease comparison with original file 333 nodes.sort(key = lambda n: n.sunid) 334 for n in nodes: 335 handle.write(str(n.toClaRecord()))
336 337
338 - def getDomainFromSQL(self, sunid=None, sid=None):
339 """Load a node from the SQL backend using sunid or sid""" 340 if sunid==sid==None: return None 341 342 cur = self.db_handle.cursor() 343 344 if sid: 345 cur.execute("SELECT sunid FROM cla WHERE sid=%s", sid) 346 res = cur.fetchone() 347 if res is None: 348 return None 349 sunid = res[0] 350 351 cur.execute("SELECT * FROM des WHERE sunid=%s", sunid) 352 data = cur.fetchone() 353 354 if data is not None: 355 n = None 356 357 #determine if Node or Domain 358 if data[1] != "px": 359 n = Node(scop=self) 360 361 cur.execute("SELECT child FROM hie WHERE parent=%s", sunid) 362 children = [] 363 for c in cur.fetchall(): 364 children.append(c[0]) 365 n.children = children 366 367 368 else: 369 n = Domain(scop=self) 370 cur.execute("select sid, residues, pdbid from cla where sunid=%s", 371 sunid) 372 373 [n.sid,n.residues,pdbid] = cur.fetchone() 374 n.residues = Residues(n.residues) 375 n.residues.pdbid=pdbid 376 self._sidDict[n.sid] = n 377 378 [n.sunid,n.type,n.sccs,n.description] = data 379 380 if data[1] != 'ro': 381 cur.execute("SELECT parent FROM hie WHERE child=%s", sunid) 382 n.parent = cur.fetchone()[0] 383 384 n.sunid = int(n.sunid) 385 386 self._sunidDict[n.sunid] = n
387 388
389 - def getAscendentFromSQL(self, node, type):
390 """Get ascendents using SQL backend""" 391 if nodeCodeOrder.index(type) >= nodeCodeOrder.index(node.type): return None 392 393 cur = self.db_handle.cursor() 394 cur.execute("SELECT "+type+" from cla WHERE "+node.type+"=%s", (node.sunid)) 395 result = cur.fetchone() 396 if result is not None: 397 return self.getNodeBySunid(result[0]) 398 else: 399 return None
400 401
402 - def getDescendentsFromSQL(self, node, type):
403 """Get descendents of a node using the database backend. This avoids 404 repeated iteration of SQL calls and is therefore much quicker than 405 repeatedly calling node.getChildren(). 406 """ 407 if nodeCodeOrder.index(type) <= nodeCodeOrder.index(node.type): return [] 408 409 des_list = [] 410 411 412 # SQL cla table knows nothing about 'ro' 413 if node.type == 'ro': 414 for c in node.getChildren(): 415 for d in self.getDescendentsFromSQL(c,type): 416 des_list.append(d) 417 return des_list 418 419 cur = self.db_handle.cursor() 420 421 422 if type != 'px': 423 cur.execute("SELECT DISTINCT des.sunid,des.type,des.sccs,description FROM \ 424 cla,des WHERE cla."+node.type+"=%s AND cla."+type+"=des.sunid", (node.sunid)) 425 data = cur.fetchall() 426 for d in data: 427 if int(d[0]) not in self._sunidDict: 428 n = Node(scop=self) 429 [n.sunid,n.type,n.sccs,n.description] = d 430 n.sunid=int(n.sunid) 431 self._sunidDict[n.sunid] = n 432 433 cur.execute("SELECT parent FROM hie WHERE child=%s", n.sunid) 434 n.parent = cur.fetchone()[0] 435 436 cur.execute("SELECT child FROM hie WHERE parent=%s", n.sunid) 437 children = [] 438 for c in cur.fetchall(): 439 children.append(c[0]) 440 n.children = children 441 442 443 des_list.append( self._sunidDict[int(d[0])] ) 444 445 else: 446 cur.execute("SELECT cla.sunid,sid,pdbid,residues,cla.sccs,type,description,sp\ 447 FROM cla,des where cla.sunid=des.sunid and cla."+node.type+"=%s", 448 node.sunid) 449 450 data = cur.fetchall() 451 for d in data: 452 if int(d[0]) not in self._sunidDict: 453 n = Domain(scop=self) 454 #[n.sunid, n.sid, n.pdbid, n.residues, n.sccs, n.type, 455 #n.description,n.parent] = data 456 [n.sunid,n.sid, pdbid,n.residues,n.sccs,n.type,n.description, 457 n.parent] = d[0:8] 458 n.residues = Residues(n.residues) 459 n.residues.pdbid = pdbid 460 n.sunid = int(n.sunid) 461 self._sunidDict[n.sunid] = n 462 self._sidDict[n.sid] = n 463 464 465 des_list.append( self._sunidDict[int(d[0])] ) 466 467 return des_list
468 469 470 471
472 - def write_hie_sql(self, handle):
473 """Write HIE data to SQL database""" 474 cur = handle.cursor() 475 476 cur.execute("DROP TABLE IF EXISTS hie") 477 cur.execute("CREATE TABLE hie (parent INT, child INT, PRIMARY KEY (child),\ 478 INDEX (parent) )") 479 480 for p in self._sunidDict.itervalues(): 481 for c in p.children: 482 cur.execute("INSERT INTO hie VALUES (%s,%s)" % (p.sunid, c.sunid))
483 484
485 - def write_cla_sql(self, handle):
486 """Write CLA data to SQL database""" 487 cur = handle.cursor() 488 489 cur.execute("DROP TABLE IF EXISTS cla") 490 cur.execute("CREATE TABLE cla (sunid INT, sid CHAR(8), pdbid CHAR(4),\ 491 residues VARCHAR(50), sccs CHAR(10), cl INT, cf INT, sf INT, fa INT,\ 492 dm INT, sp INT, px INT, PRIMARY KEY (sunid), INDEX (SID) )") 493 494 for n in self._sidDict.itervalues(): 495 c = n.toClaRecord() 496 cur.execute( "INSERT INTO cla VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)", 497 (n.sunid, n.sid, c.residues.pdbid, c.residues, n.sccs, 498 n.getAscendent('cl').sunid, n.getAscendent('cf').sunid, 499 n.getAscendent('sf').sunid, n.getAscendent('fa').sunid, 500 n.getAscendent('dm').sunid, n.getAscendent('sp').sunid, 501 n.sunid ))
502 503
504 - def write_des_sql(self, handle):
505 """Write DES data to SQL database""" 506 cur = handle.cursor() 507 508 cur.execute("DROP TABLE IF EXISTS des") 509 cur.execute("CREATE TABLE des (sunid INT, type CHAR(2), sccs CHAR(10),\ 510 description VARCHAR(255),\ 511 PRIMARY KEY (sunid) )") 512 513 for n in self._sunidDict.itervalues(): 514 cur.execute( "INSERT INTO des VALUES (%s,%s,%s,%s)", 515 ( n.sunid, n.type, n.sccs, n.description ) )
516 517 518
519 -class Node(object):
520 """ A node in the Scop hierarchy 521 522 sunid -- SCOP unique identifiers. e.g. '14986' 523 524 parent -- The parent node 525 526 children -- A list of child nodes 527 528 sccs -- SCOP concise classification string. e.g. 'a.1.1.2' 529 530 type -- A 2 letter node type code. e.g. 'px' for domains 531 532 description -- 533 534 """
535 - def __init__(self, scop=None):
536 """Create a Node in the scop hierarchy. If a Scop instance is provided to the 537 constructor, this will be used to lookup related references using the SQL 538 methods. If no instance is provided, it is assumed the whole tree exists 539 and is connected.""" 540 self.sunid='' 541 self.parent = None 542 self.children=[] 543 self.sccs = '' 544 self.type ='' 545 self.description ='' 546 self.scop=scop
547
548 - def __str__(self):
549 s = [] 550 s.append(str(self.sunid)) 551 s.append(self.sccs) 552 s.append(self.type) 553 s.append(self.description) 554 555 return " ".join(s)
556
557 - def toHieRecord(self):
558 """Return an Hie.Record""" 559 rec = Hie.Record() 560 rec.sunid = str(self.sunid) 561 if self.getParent() : #Not root node 562 rec.parent = str(self.getParent().sunid) 563 else: 564 rec.parent = '-' 565 for c in self.getChildren(): 566 rec.children.append(str(c.sunid)) 567 return rec
568
569 - def toDesRecord(self):
570 """Return a Des.Record""" 571 rec = Des.Record() 572 rec.sunid = str(self.sunid) 573 rec.nodetype = self.type 574 rec.sccs = self.sccs 575 rec.description = self.description 576 return rec
577
578 - def getChildren(self):
579 """Return a list of children of this Node""" 580 if self.scop is None: 581 return self.children 582 else: 583 return map ( self.scop.getNodeBySunid, self.children )
584
585 - def getParent(self):
586 """Return the parent of this Node""" 587 if self.scop is None: 588 return self.parent 589 else: 590 return self.scop.getNodeBySunid( self.parent )
591
592 - def getDescendents( self, node_type):
593 """ Return a list of all decendent nodes of the given type. Node type can a 594 two letter code or longer description. e.g. 'fa' or 'family' 595 """ 596 if node_type in _nodetype_to_code: 597 node_type = _nodetype_to_code[node_type] 598 599 nodes = [self] 600 if self.scop: 601 return self.scop.getDescendentsFromSQL(self,node_type) 602 while nodes[0].type != node_type: 603 if nodes[0].type == 'px' : return [] # Fell of the bottom of the hierarchy 604 child_list = [] 605 for n in nodes: 606 for child in n.getChildren(): 607 child_list.append( child ) 608 nodes = child_list 609 610 return nodes
611 612
613 - def getAscendent( self, node_type):
614 """ Return the ancenstor node of the given type, or None.Node type can a 615 two letter code or longer description. e.g. 'fa' or 'family'""" 616 if node_type in _nodetype_to_code: 617 node_type = _nodetype_to_code[node_type] 618 619 if self.scop: 620 return self.scop.getAscendentFromSQL(self,node_type) 621 else: 622 n = self 623 if n.type == node_type: return None 624 while n.type != node_type: 625 if n.type == 'ro': return None # Fell of the top of the hierarchy 626 n = n.getParent() 627 628 return n
629 630 631
632 -class Domain(Node):
633 """ A SCOP domain. A leaf node in the Scop hierarchy. 634 635 sid -- The SCOP domain identifier. e.g. 'd5hbib_' 636 637 residues -- A Residue object. It defines the collection 638 of PDB atoms that make up this domain. 639 """
640 - def __init__(self,scop=None):
641 Node.__init__(self,scop=scop) 642 self.sid = '' 643 self.residues = None
644
645 - def __str__(self):
646 s = [] 647 s.append(self.sid) 648 s.append(self.sccs) 649 s.append("("+str(self.residues)+")") 650 651 if not self.getParent(): 652 s.append(self.description) 653 else: 654 sp = self.getParent() 655 dm = sp.getParent() 656 s.append(dm.description) 657 s.append("{"+sp.description+"}") 658 659 return " ".join(s)
660
661 - def toDesRecord(self):
662 """Return a Des.Record""" 663 rec = Node.toDesRecord(self) 664 rec.name = self.sid 665 return rec
666
667 - def toClaRecord(self):
668 """Return a Cla.Record""" 669 rec = Cla.Record() 670 rec.sid = self.sid 671 rec.residues = self.residues 672 rec.sccs = self.sccs 673 rec.sunid = self.sunid 674 675 n = self 676 while n.sunid != 0: #Not root node 677 rec.hierarchy[n.type] = str(n.sunid) 678 n = n.getParent() 679 680 # Order does not matter in the hierarchy field. For more info, see 681 # http://scop.mrc-lmb.cam.ac.uk/scop/release-notes.html 682 #rec.hierarchy.reverse() 683 684 return rec
685
686 -class Astral(object):
687 """Abstraction of the ASTRAL database, which has sequences for all the SCOP domains, 688 as well as clusterings by percent id or evalue. 689 """ 690
691 - def __init__( self, dir_path=None, version=None, scop=None, 692 astral_file=None, db_handle=None):
693 """ 694 Initialise the astral database. 695 696 You must provide either a directory of SCOP files: 697 698 dir_path - string, the path to location of the scopseq-x.xx directory 699 (not the directory itself), and 700 version -a version number. 701 702 or, a FASTA file: 703 704 astral_file - string, a path to a fasta file (which will be loaded in memory) 705 706 or, a MYSQL database: 707 708 db_handle - a database handle for a MYSQL database containing a table 709 'astral' with the astral data in it. This can be created 710 using writeToSQL. 711 """ 712 713 if astral_file==dir_path==db_handle==None: 714 raise RuntimeError("Need either file handle, or (dir_path + "\ 715 + "version) or database handle to construct Astral") 716 if not scop: 717 raise RuntimeError("Must provide a Scop instance to construct") 718 719 self.scop = scop 720 self.db_handle = db_handle 721 722 723 if not astral_file and not db_handle: 724 if dir_path == None or version == None: 725 raise RuntimeError("must provide dir_path and version") 726 727 self.version = version 728 self.path = os.path.join( dir_path, "scopseq-%s" % version) 729 astral_file = "astral-scopdom-seqres-all-%s.fa" % self.version 730 astral_file = os.path.join (self.path, astral_file) 731 732 if astral_file: 733 #Build a dictionary of SeqRecord objects in the FASTA file, IN MEMORY 734 self.fasta_dict = SeqIO.to_dict(SeqIO.parse(open(astral_file), "fasta")) 735 736 self.astral_file = astral_file 737 self.EvDatasets = {} 738 self.EvDatahash = {} 739 self.IdDatasets = {} 740 self.IdDatahash = {}
741 742
743 - def domainsClusteredByEv(self,id):
744 """get domains clustered by evalue""" 745 if id not in self.EvDatasets: 746 if self.db_handle: 747 self.EvDatasets[id] = self.getAstralDomainsFromSQL(astralEv_to_sql[id]) 748 749 else: 750 if not self.path: 751 raise RuntimeError("No scopseq directory specified") 752 753 file_prefix = "astral-scopdom-seqres-sel-gs" 754 filename = "%s-e100m-%s-%s.id" % (file_prefix, astralEv_to_file[id] , 755 self.version) 756 filename = os.path.join(self.path,filename) 757 self.EvDatasets[id] = self.getAstralDomainsFromFile(filename) 758 return self.EvDatasets[id]
759 760
761 - def domainsClusteredById(self,id):
762 """get domains clustered by percent id""" 763 if id not in self.IdDatasets: 764 if self.db_handle: 765 self.IdDatasets[id] = self.getAstralDomainsFromSQL("id"+str(id)) 766 767 else: 768 if not self.path: 769 raise RuntimeError("No scopseq directory specified") 770 771 file_prefix = "astral-scopdom-seqres-sel-gs" 772 filename = "%s-bib-%s-%s.id" % (file_prefix, id, self.version) 773 filename = os.path.join(self.path,filename) 774 self.IdDatasets[id] = self.getAstralDomainsFromFile(filename) 775 return self.IdDatasets[id]
776 777
778 - def getAstralDomainsFromFile(self,filename=None,file_handle=None):
779 """Get the scop domains from a file containing a list of sids""" 780 if file_handle == filename == None: 781 raise RuntimeError("You must provide a filename or handle") 782 if not file_handle: 783 file_handle = open(filename) 784 doms = [] 785 while 1: 786 line = file_handle.readline() 787 if not line: 788 break 789 line = line.rstrip() 790 doms.append(line) 791 if filename: 792 file_handle.close() 793 794 doms = filter( lambda a: a[0]=='d', doms ) 795 doms = map( self.scop.getDomainBySid, doms ) 796 return doms
797
798 - def getAstralDomainsFromSQL(self, column):
799 """Load a set of astral domains from a column in the astral table of a MYSQL 800 database (which can be created with writeToSQL(...)""" 801 cur = self.db_handle.cursor() 802 cur.execute("SELECT sid FROM astral WHERE "+column+"=1") 803 data = cur.fetchall() 804 data = map( lambda x: self.scop.getDomainBySid(x[0]), data) 805 806 return data
807 808
809 - def getSeqBySid(self,domain):
810 """get the seq record of a given domain from its sid""" 811 if self.db_handle is None: 812 return self.fasta_dict[domain].seq 813 814 else: 815 cur = self.db_handle.cursor() 816 cur.execute("SELECT seq FROM astral WHERE sid=%s", domain) 817 return Seq(cur.fetchone()[0])
818
819 - def getSeq(self,domain):
820 """Return seq associated with domain""" 821 return self.getSeqBySid(domain.sid)
822 823
824 - def hashedDomainsById(self,id):
825 """Get domains clustered by sequence identity in a dict""" 826 if id not in self.IdDatahash: 827 self.IdDatahash[id] = {} 828 for d in self.domainsClusteredById(id): 829 self.IdDatahash[id][d] = 1 830 return self.IdDatahash[id]
831
832 - def hashedDomainsByEv(self,id):
833 """Get domains clustered by evalue in a dict""" 834 if id not in self.EvDatahash: 835 self.EvDatahash[id] = {} 836 for d in self.domainsClusteredByEv(id): 837 self.EvDatahash[id][d] = 1 838 return self.EvDatahash[id]
839 840
841 - def isDomainInId(self,dom,id):
842 """Returns true if the domain is in the astral clusters for percent ID""" 843 return dom in self.hashedDomainsById(id)
844
845 - def isDomainInEv(self,dom,id):
846 """Returns true if the domain is in the ASTRAL clusters for evalues""" 847 return dom in self.hashedDomainsByEv(id)
848 849
850 - def writeToSQL(self, db_handle):
851 """Write the ASTRAL database to a MYSQL database""" 852 cur = db_handle.cursor() 853 854 cur.execute("DROP TABLE IF EXISTS astral") 855 cur.execute("CREATE TABLE astral (sid CHAR(8), seq TEXT, PRIMARY KEY (sid))") 856 857 for dom in self.fasta_dict: 858 cur.execute("INSERT INTO astral (sid,seq) values (%s,%s)", 859 (dom, self.fasta_dict[dom].seq.data)) 860 861 for i in astralBibIds: 862 cur.execute("ALTER TABLE astral ADD (id"+str(i)+" TINYINT)") 863 864 for d in self.domainsClusteredById(i): 865 cur.execute("UPDATE astral SET id"+str(i)+"=1 WHERE sid=%s", 866 d.sid) 867 868 for ev in astralEvs: 869 cur.execute("ALTER TABLE astral ADD ("+astralEv_to_sql[ev]+" TINYINT)") 870 871 for d in self.domainsClusteredByEv(ev): 872 873 cur.execute("UPDATE astral SET "+astralEv_to_sql[ev]+"=1 WHERE sid=%s", 874 d.sid)
875
876 -def search(pdb=None, key=None, sid=None, disp=None, dir=None, loc=None, 877 cgi='http://scop.mrc-lmb.cam.ac.uk/scop/search.cgi', **keywds):
878 """search(pdb=None, key=None, sid=None, disp=None, dir=None, loc=None, 879 cgi='http://scop.mrc-lmb.cam.ac.uk/scop/search.cgi', **keywds) 880 881 Access search.cgi and return a handle to the results. See the 882 online help file for an explanation of the parameters: 883 http://scop.mrc-lmb.cam.ac.uk/scop/help.html 884 885 Raises an IOError if there's a network error. 886 887 """ 888 params = {'pdb' : pdb, 'key' : key, 'sid' : sid, 'disp' : disp, 889 'dir' : dir, 'loc' : loc} 890 variables = {} 891 for k, v in params.iteritmes(): 892 if v is not None: 893 variables[k] = v 894 variables.update(keywds) 895 return _open(cgi, variables)
896
897 -def _open(cgi, params={}, get=1):
898 """_open(cgi, params={}, get=1) -> UndoHandle 899 900 Open a handle to SCOP. cgi is the URL for the cgi script to access. 901 params is a dictionary with the options to pass to it. get is a boolean 902 that describes whether a GET should be used. Does some 903 simple error checking, and will raise an IOError if it encounters one. 904 905 """ 906 import urllib 907 from Bio import File 908 # Open a handle to SCOP. 909 options = urllib.urlencode(params) 910 if get: # do a GET 911 fullcgi = cgi 912 if options: 913 fullcgi = "%s?%s" % (cgi, options) 914 handle = urllib.urlopen(fullcgi) 915 else: # do a POST 916 handle = urllib.urlopen(cgi, options) 917 918 # Wrap the handle inside an UndoHandle. 919 uhandle = File.UndoHandle(handle) 920 # Should I check for 404? timeout? etc? 921 return uhandle
922