Package Bio :: Package SeqUtils :: Module CheckSum
[hide private]
[frames] | [no frames]

Source Code for Module Bio.SeqUtils.CheckSum

# Copyright 2002 by Yves Bastide and Brad Chapman.
# All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.

"""Functions to calculate assorted sequence checksums."""

# crc32, crc64, gcg, and seguid
# crc64 is adapted from BioPerl

 12  from binascii import crc32 as _crc32 
 13   
def crc32(seq):
    """Return the CRC-32 checksum for a sequence (string or Seq object).

    ``seq`` may be a plain string, a bytes-like object, or an object that
    exposes ``tostring()`` (legacy Seq API) or a meaningful ``str()`` form.
    Text is encoded to bytes before hashing, because ``binascii.crc32``
    only accepts bytes-like input on Python 3.

    The result is whatever ``binascii.crc32`` returns on the running
    interpreter (Python 3 always yields an unsigned value; Python 2 may
    yield a signed one).
    """
    try:
        # Assume it's a Seq object offering the legacy tostring() method
        data = seq.tostring()
    except AttributeError:
        # Assume it's a string (or an object whose str() is the sequence)
        data = seq
    if not isinstance(data, (bytes, bytearray)):
        # On Python 2, str is bytes, so plain strings never reach this line.
        data = str(data).encode("utf-8")
    return _crc32(data)
22
def _init_table_h():
    """Build the 256-entry lookup table consumed by crc64().

    For every byte value 0..255 a (high word, low word) register pair is
    shifted right eight times; whenever the bit shifted out of the low word
    is set, 0xD8000000 is XORed into the high word.  Only the resulting
    high word is stored, so the returned list holds 256 integers.
    """
    table = []
    for byte_value in range(256):
        low = byte_value
        part_h = 0
        for _ in range(8):
            rflag = low & 1
            low >>= 1
            # Carry the bit leaving the high word into the top of the low word.
            if part_h & 1:
                low |= 1 << 31
            part_h >>= 1
            if rflag:
                part_h ^= 0xD8000000
        table.append(part_h)
    return table
# Initialisation: build the crc64 lookup table once, when the module loads.
_table_h = _init_table_h()
def crc64(s):
    """Return the CRC-64 checksum for a sequence (string or Seq object).

    The 64-bit register is kept as two 32-bit halves (``crch`` high,
    ``crcl`` low).  For each character the register is shifted right by
    eight bits and the high half is XORed with the ``_table_h`` entry
    selected by the low byte of ``crcl`` XOR the character code.

    Returns a string of the form ``"CRC-"`` followed by 16 upper-case
    hexadecimal digits (high half first).
    """
    crcl = 0
    crch = 0
    for c in s:
        # Low byte of the high half moves into the top byte of the low half.
        shr = (crch & 0xFF) << 24
        temp1h = crch >> 8
        temp1l = (crcl >> 8) | shr
        idx = (crcl ^ ord(c)) & 0xFF
        crch = temp1h ^ _table_h[idx]
        crcl = temp1l

    return "CRC-%08X%08X" % (crch, crcl)
53 54
def gcg(seq):
    """Return the GCG checksum (int) for a sequence (string or Seq object).

    Given a nucleotide or amino-acid sequence (or any string),
    returns the GCG checksum (int). Checksum used by GCG program.
    Based on BioPerl GCG_checksum. Adapted by Sebastian Bassi
    with the help of John Lenton, Pablo Ziliani, and Gabriel Genellina.

    All characters are converted to uppercase before being summed.  Each
    character code is multiplied by a weight that cycles 1, 2, ..., 57 and
    the total is reduced modulo 10000.
    """
    try:
        # Assume it's a Seq object offering the legacy tostring() method
        seq = seq.tostring()
    except AttributeError:
        # Assume it's a string (or anything iterable over single characters)
        pass
    checksum = 0
    for position, char in enumerate(seq):
        # Weights run from 1 to 57 and then start over at 1.
        weight = position % 57 + 1
        checksum += weight * ord(char.upper())
    return checksum % 10000
72
def seguid(seq):
    """Return the SEGUID (string) for a sequence (string or Seq object).

    Given a nucleotide or amino-acid sequence (or any string), returns the
    SEGUID string (A SEquence Globally Unique IDentifier): the base64
    encoding of the SHA-1 digest of the upper-cased sequence, with the
    trailing "=" padding removed.

    For more information about SEGUID, see:
    http://bioinformatics.anl.gov/seguid/
    DOI: 10.1002/pmic.200600032
    """
    import base64
    import hashlib

    try:
        # Assume it's a Seq object offering the legacy tostring() method
        data = seq.tostring()
    except AttributeError:
        # Assume it's a string (or an object whose str() is the sequence)
        data = seq
    if not isinstance(data, (bytes, bytearray)):
        # hashlib only accepts bytes-like input on Python 3; on Python 2
        # str is bytes, so plain strings never reach this line.
        data = str(data).encode("utf-8")

    encoded = base64.b64encode(hashlib.sha1(data.upper()).digest())
    if not isinstance(encoded, str):
        # b64encode returns bytes on Python 3; callers expect text.
        encoded = encoded.decode("ascii")
    return encoded.rstrip("=")
if __name__ == "__main__":
    # Quick self test, run only when the module is executed as a script.
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print("Quick self test")

    str_light_chain_one = "QSALTQPASVSGSPGQSITISCTGTSSDVGSYNLVSWYQQHPGK" \
                        + "APKLMIYEGSKRPSGVSNRFSGSKSGNTASLTISGLQAEDEADY" \
                        + "YCSSYAGSSTLVFGGGTKLTVL"

    str_light_chain_two = "QSALTQPASVSGSPGQSITISCTGTSSDVGSYNLVSWYQQHPGK" \
                        + "APKLMIYEGSKRPSGVSNRFSGSKSGNTASLTISGLQAEDEADY" \
                        + "YCCSYAGSSTWVFGGGTKLTVL"

    # The two chains differ, yet are expected to share one CRC64 value...
    assert crc64(str_light_chain_one) == crc64(str_light_chain_two)
    assert 'CRC-44CAAD88706CC153' == crc64(str_light_chain_one)

    # ...while their SEGUIDs are expected to differ.
    assert 'BpBeDdcNUYNsdk46JoJdw7Pd3BI' == seguid(str_light_chain_one)
    assert 'X5XEaayob1nZLOc7eVT9qyczarY' == seguid(str_light_chain_two)

    print("Done")