Package Bio :: Package SeqUtils :: Module CheckSum
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqUtils.CheckSum

# Checksum functions for biological sequences: crc32, crc64, gcg, and seguid.
# The crc64 implementation is adapted from BioPerl.
  3   
  4  from binascii import crc32 as _crc32 
  5   
def crc32(seq):
    """Returns the crc32 checksum (int) for a sequence (string or Seq object)

    seq may be a plain string or any object exposing a tostring() method
    (such as a Seq object).  Text strings are encoded as UTF-8 before being
    checksummed, because binascii.crc32 works on bytes.

    NOTE: binascii.crc32 returns a signed value on Python 2 and an unsigned
    value on Python 3; apply "& 0xffffffff" to the result if the same number
    is needed across interpreter versions.
    """
    try:
        # Assume it is a Seq object.  Only the attribute lookup/call is
        # guarded so an AttributeError raised elsewhere is not masked.
        data = seq.tostring()
    except AttributeError:
        # Assume it is already a string
        data = seq
    if isinstance(data, type(u"")):
        # Text (unicode on Python 2, str on Python 3) must become bytes.
        data = data.encode("utf-8")
    return _crc32(data)
def _init_table_h():
    """Builds the 256 entry lookup table used by crc64.

    For every possible byte value a 64 bit register, held as two 32 bit
    halves, is shifted right eight times; whenever a set bit falls off the
    low half, 0xD8000000 is XORed into the high half.  Only the high half
    is stored in the table.

    Returns a list of 256 integers.
    """
    table = []
    for byte in range(256):
        low = byte
        high = 0
        for _ in range(8):
            shifted_out = low & 1
            low >>= 1
            if high & 1:
                # Carry the bit leaving the high half into the low half.
                low |= 1 << 31
            high >>= 1
            if shifted_out:
                high ^= 0xD8000000
        table.append(high)
    return table


# Initialisation
_table_h = _init_table_h()

def crc64(s):
    """Returns the crc64 checksum for a sequence (string or Seq object)

    The result is a string made of the prefix "CRC-" followed by the high
    and low 32 bit halves of the checksum, each as eight upper case
    hexadecimal digits.
    """
    low = high = 0
    for letter in s:
        # Low byte of the high half moves into the top of the low half.
        carried = (high & 0xFF) << 24
        # The table index must be taken from the low half before it shifts.
        lookup = (low ^ ord(letter)) & 0xFF
        low = (low >> 8) | carried
        high = (high >> 8) ^ _table_h[lookup]
    return "CRC-%08X%08X" % (high, low)


def gcg(seq):
    """Returns the GCG checksum (int) for a sequence (string or Seq object)

    Given a nucleotide or amino-acid sequence (or any string),
    returns the GCG checksum (int). Checksum used by GCG program.
    seq may be a string or any object exposing a tostring() method.
    Based on BioPerl GCG_checksum. Adapted by Sebastian Bassi
    with the help of John Lenton, Pablo Ziliani, and Gabriel Genellina.
    All sequences are converted to uppercase.

    Each letter's ordinal is multiplied by a weight that cycles through
    1..57 along the sequence; the sum is returned modulo 10000.
    """
    try:
        # Seq-like objects hand over their letters through tostring().
        seq = seq.tostring()
    except AttributeError:
        # Anything else is treated as a string (including unicode and
        # str subclasses, which an exact type comparison would reject).
        pass
    checksum = 0
    for position, char in enumerate(seq.upper()):
        # position % 57 + 1 yields the weights 1, 2, ..., 57, 1, 2, ...
        checksum += (position % 57 + 1) * ord(char)
    return checksum % 10000

def seguid(seq):
    """Returns the SEGUID (string) for a sequence (string or Seq object)

    Given a nucleotide or amino-acid sequence (or any string),
    returns the SEGUID string (A SEquence Globally Unique IDentifier).
    seq may be a string or any object exposing a tostring() method.

    The sequence is converted to uppercase, hashed with SHA-1, and the
    digest is base64 encoded with the trailing "=" padding removed.

    For more information about SEGUID, see:
    http://bioinformatics.anl.gov/seguid/
    DOI: 10.1002/pmic.200600032
    """
    import base64
    try:
        # Python 2.5 onwards: sha1 is in hashlib
        import hashlib
        sha1 = hashlib.sha1()
    except ImportError:
        # For older versions
        import sha
        sha1 = sha.new()

    try:
        # Seq-like objects hand over their letters through tostring().
        seq = seq.tostring()
    except AttributeError:
        # Anything else is treated as a string.
        pass
    seq = seq.upper()
    if isinstance(seq, type(u"")):
        # The hash object needs bytes, so text must be encoded first.
        seq = seq.encode("utf-8")
    sha1.update(seq)
    raw_digest = sha1.digest()

    try:
        encoded = base64.b64encode(raw_digest)
    except AttributeError:
        # For older versions without base64.b64encode.
        # Note: the encoded string contains "\n" and never os.linesep
        # ("\r\n" on Windows), so the literal "\n" is what gets removed.
        encoded = base64.encodestring(raw_digest).replace("\n", "")
    if not isinstance(encoded, str):
        # Python 3 returns bytes from b64encode; callers expect a string.
        encoded = encoded.decode("ascii")
    return encoded.rstrip("=")
if __name__ == "__main__":
    # Minimal self test, run only when the module is executed as a script.
    # print is called with a single parenthesised argument so the output is
    # the same whether it is parsed as a statement or as a function call.
    print("Quick self test")

    str_light_chain_one = (
        "QSALTQPASVSGSPGQSITISCTGTSSDVGSYNLVSWYQQHPGK"
        "APKLMIYEGSKRPSGVSNRFSGSKSGNTASLTISGLQAEDEADY"
        "YCSSYAGSSTLVFGGGTKLTVL"
    )

    str_light_chain_two = (
        "QSALTQPASVSGSPGQSITISCTGTSSDVGSYNLVSWYQQHPGK"
        "APKLMIYEGSKRPSGVSNRFSGSKSGNTASLTISGLQAEDEADY"
        "YCCSYAGSSTWVFGGGTKLTVL"
    )

    # The two chains differ in their final segment, yet are expected to
    # produce the same CRC64 value ...
    assert crc64(str_light_chain_one) == crc64(str_light_chain_two)
    assert 'CRC-44CAAD88706CC153' == crc64(str_light_chain_one)

    # ... while their SEGUID values are expected to differ.
    assert 'BpBeDdcNUYNsdk46JoJdw7Pd3BI' == seguid(str_light_chain_one)
    assert 'X5XEaayob1nZLOc7eVT9qyczarY' == seguid(str_light_chain_two)

    print("Done")