1
2
3
4
5
6
7
8 """Functions to calculate assorted sequence checksums."""
9
10
11
12
13 from binascii import crc32 as _crc32
14 from Bio._py3k import _as_bytes
15
def crc32(seq):
    """Return the crc32 checksum for a sequence (string or Seq object).

    If ``seq`` has a ``tostring()`` method, its result is checksummed;
    otherwise ``seq`` itself is used. The value is passed through
    ``_as_bytes`` and then to ``binascii.crc32``, whose integer result is
    returned unchanged.
    """
    # NOTE: binascii.crc32 returns a signed int on Python 2 and an unsigned
    # int on Python 3; the value is deliberately not masked here so existing
    # callers keep seeing what they saw before.
    try:
        # Only the attribute lookup/call is guarded, so an AttributeError
        # raised further down is not mistaken for "not a Seq object".
        seq = seq.tostring()
    except AttributeError:
        # No tostring(): treat seq as a plain string.
        pass
    return _crc32(_as_bytes(seq))
27
def _init_table_h():
    """Build the 256-entry table of high words used by ``crc64``.

    For every byte value 0..255 the byte is consumed one bit at a time,
    least-significant bit first. On each step the running high word is
    shifted right by one and, when the consumed bit was set, XORed with
    0xd8000000.

    Returns:
        A list of 256 ints, indexed by byte value.
    """
    table = []
    for i in range(256):
        low = i
        part_h = 0
        for _ in range(8):
            rflag = low & 1
            low >>= 1
            if part_h & 1:
                # Carry the bit about to fall off the high word into the
                # top bit of the low word.
                low |= (1 << 31)
            part_h >>= 1
            if rflag:
                part_h ^= 0xd8000000
        table.append(part_h)
    return table


# Computed once at import time; crc64() indexes into this table.
_table_h = _init_table_h()
44
def crc64(s):
    """Return the crc64 checksum for a sequence (string or Seq object).

    The checksum is kept as two integers, a high word and a low word.
    Each item of ``s`` is passed to ``ord()``, so ``s`` must yield
    single-character strings when iterated.

    Returns:
        A string of the form ``"CRC-"`` followed by the high word and then
        the low word, each formatted as at least eight upper-case hex
        digits.
    """
    crcl = 0
    crch = 0
    for c in s:
        # The low byte of the high word moves into the top byte of the
        # low word as both words shift right by eight bits.
        shr = (crch & 0xFF) << 24
        temp1h = crch >> 8
        temp1l = (crcl >> 8) | shr
        # Table index: low byte of the low word combined with the character.
        idx = (crcl ^ ord(c)) & 0xFF
        crch = temp1h ^ _table_h[idx]
        crcl = temp1l

    return "CRC-%08X%08X" % (crch, crcl)
58
59
def gcg(seq):
    """Return the GCG checksum (int) for a sequence (string or Seq object).

    Given a nucleotide or amino-acid sequence (or any string),
    returns the GCG checksum (int). Checksum used by GCG program.
    seq type = str.
    Based on BioPerl GCG_checksum. Adapted by Sebastian Bassi
    with the help of John Lenton, Pablo Ziliani, and Gabriel Genellina.
    All sequences are converted to uppercase.

    Each character's code point (after upper-casing) is multiplied by a
    position weight that counts 1..57 and then starts over at 1; the sum
    of these products modulo 10000 is returned.
    """
    try:
        # Seq-like objects expose tostring(); use the plain string.
        seq = seq.tostring()
    except AttributeError:
        # Already a plain string.
        pass
    index = checksum = 0
    for char in seq:
        index += 1
        checksum += index * ord(char.upper())
        if index == 57:
            # Weights cycle with period 57.
            index = 0
    return checksum % 10000
81
def seguid(seq):
    """Return the SEGUID (string) for a sequence (string or Seq object).

    Given a nucleotide or amino-acid sequence (or any string),
    returns the SEGUID string (A SEquence Globally Unique IDentifier).
    seq type = str.
    For more information about SEGUID, see:
    http://bioinformatics.anl.gov/seguid/
    DOI: 10.1002/pmic.200600032

    The upper-cased sequence is hashed with SHA-1, the digest is base64
    encoded, and trailing "=" padding is removed.
    """
    try:
        import hashlib
        m = hashlib.sha1()
    except ImportError:
        # Interpreters without hashlib: fall back to the legacy sha module.
        import sha
        m = sha.new()
    import base64
    try:
        # Seq-like objects expose tostring(); use the plain string.
        seq = seq.tostring()
    except AttributeError:
        # Already a plain string.
        pass
    m.update(_as_bytes(seq.upper()))
    # b64encode emits no line breaks, so no newline stripping is needed.
    encoded = base64.b64encode(m.digest())
    if not isinstance(encoded, str):
        # Python 3 returns bytes; callers expect a text string.
        encoded = encoded.decode("ascii")
    return encoded.rstrip("=")
122
if __name__ == "__main__":
    # Minimal self test, run only when the module is executed as a script.
    print("Quick self test")

    str_light_chain_one = "QSALTQPASVSGSPGQSITISCTGTSSDVGSYNLVSWYQQHPGK" \
                    + "APKLMIYEGSKRPSGVSNRFSGSKSGNTASLTISGLQAEDEADY" \
                    + "YCSSYAGSSTLVFGGGTKLTVL"

    str_light_chain_two = "QSALTQPASVSGSPGQSITISCTGTSSDVGSYNLVSWYQQHPGK" \
                    + "APKLMIYEGSKRPSGVSNRFSGSKSGNTASLTISGLQAEDEADY" \
                    + "YCCSYAGSSTWVFGGGTKLTVL"

    # The two chains differ, yet they are expected to share one CRC64
    # value while their SEGUIDs differ.
    assert crc64(str_light_chain_one) == crc64(str_light_chain_two)
    assert 'CRC-44CAAD88706CC153' == crc64(str_light_chain_one)

    assert 'BpBeDdcNUYNsdk46JoJdw7Pd3BI' == seguid(str_light_chain_one)
    assert 'X5XEaayob1nZLOc7eVT9qyczarY' == seguid(str_light_chain_two)

    print("Done")
141