Package Bio :: Package Restriction :: Package _Update :: Module RestrictionCompiler
[hide private]
[frames] | no frames]

Source Code for Module Bio.Restriction._Update.RestrictionCompiler

  1  #!/usr/bin/env python 
  2  # 
  3  #      Restriction Analysis Libraries. 
  4  #      Copyright (C) 2004. Frederic Sohm. 
  5  # 
  6  # This code is part of the Biopython distribution and governed by its 
  7  # license.  Please see the LICENSE file that should have been included 
  8  # as part of this package. 
  9  # 
 10  #   this script is used to produce the dictionary which will contains the data 
 11  #   about the restriction enzymes from the Emboss/Rebase data files 
 12  #   namely 
 13  #   emboss_e.### (description of the sites), 
 14  #   emboss_r.### (origin, methylation, references) 
 15  #   emboss_s.### (suppliers) 
 16  #   where ### is a number of three digits : 1 for the year two for the month 
 17  # 
 18  #   very dirty implementation but it does the job, so... 
 19  #   Not very quick either but you are not supposed to use it frequently. 
 20  # 
 21  #   The results are stored in 
 22  #   path/to/site-packages/Bio/Restriction/Restriction_Dictionary.py 
 23  #   the file contains two dictionary : 
 24  #   'rest_dict' which contains the data for the enzymes 
 25  #   and 
 26  #   'suppliers' which map the name of the suppliers to their abbreviation. 
 27  # 
 28   
 29  """Convert a serie of Rebase files into a Restriction_Dictionary.py module. 
 30   
 31  The Rebase files are in the emboss format : 
 32   
 33      emboss_e.###    -> contains informations about the restriction sites. 
 34      emboss_r.###    -> contains general informations about the enzymes. 
 35      emboss_s.###    -> contains informations about the suppliers. 
 36       
 37  ### is a 3 digit number. The first digit is the year and the two last the month. 
 38  """ 
 39   
 40  import sre 
 41  import os 
 42  import itertools 
 43  import pprint 
 44  import time 
 45  import sys 
 46  import site 
 47  import shutil 
 48   
 49  from Bio.Seq import Seq 
 50  from Bio.Alphabet import IUPAC 
 51   
 52  import Bio.Restriction.Restriction 
 53  from Bio.Restriction.Restriction import AbstractCut, RestrictionType, NoCut, OneCut,\ 
 54  TwoCuts, Meth_Dep, Meth_Undep, Palindromic, NonPalindromic, Unknown, Blunt,\ 
 55  Ov5, Ov3, NotDefined, Defined, Ambiguous, Commercially_available, Not_available  
 56   
 57  import Bio.Restriction.RanaConfig as config 
 58  from Bio.Restriction._Update.Update import RebaseUpdate 
 59  from Bio.Restriction.Restriction import * 
 60  from Bio.Restriction.DNAUtils import antiparallel 
 61   
 62  DNA=Seq 
 63  dna_alphabet = {'A':'A', 'C':'C', 'G':'G', 'T':'T', 
 64                  'R':'AG', 'Y':'CT', 'W':'AT', 'S':'CG', 'M':'AC', 'K':'GT', 
 65                  'H':'ACT', 'B':'CGT', 'V':'ACG', 'D':'AGT', 
 66                  'N':'ACGT', 
 67                  'a': 'a', 'c': 'c', 'g': 'g', 't': 't', 
 68                  'r':'ag', 'y':'ct', 'w':'at', 's':'cg', 'm':'ac', 'k':'gt', 
 69                  'h':'act', 'b':'cgt', 'v':'acg', 'd':'agt', 
 70                  'n':'acgt'} 
 71   
 72   
 73  complement_alphabet = {'A':'T', 'T':'A', 'C':'G', 'G':'C','R':'Y', 'Y':'R', 
 74                         'W':'W', 'S':'S', 'M':'K', 'K':'M', 'H':'D', 'D':'H', 
 75                         'B':'V', 'V':'B', 'N':'N','a':'t', 'c':'g', 'g':'c', 
 76                         't':'a', 'r':'y', 'y':'r', 'w':'w', 's':'s','m':'k', 
 77                         'k':'m', 'h':'d', 'd':'h', 'b':'v', 'v':'b', 'n':'n'} 
 78  enzymedict = {} 
 79  suppliersdict = {} 
 80  classdict = {} 
 81  typedict = {} 
 82   
 83   
84 -class OverhangError(ValueError) :
85 """Exception for dealing with overhang.""" 86 pass
87
88 -def BaseExpand(base) :
89 """BaseExpand(base) -> string. 90 91 given a degenerated base, returns its meaning in IUPAC alphabet. 92 93 i.e: 94 b= 'A' -> 'A' 95 b= 'N' -> 'ACGT' 96 etc...""" 97 base = base.upper() 98 return dna_alphabet[base]
99
100 -def regex(site) :
101 """regex(site) -> string. 102 103 Construct a regular expression from a DNA sequence. 104 i.e. : 105 site = 'ABCGN' -> 'A[CGT]CG.'""" 106 reg_ex = site 107 for base in reg_ex : 108 if base in ('A', 'T', 'C', 'G', 'a', 'c', 'g', 't') : 109 pass 110 if base in ('N', 'n') : 111 reg_ex = '.'.join(reg_ex.split('N')) 112 reg_ex = '.'.join(reg_ex.split('n')) 113 if base in ('R', 'Y', 'W', 'M', 'S', 'K', 'H', 'D', 'B', 'V') : 114 expand = '['+ str(BaseExpand(base))+']' 115 reg_ex = expand.join(reg_ex.split(base)) 116 return reg_ex
117
118 -def Antiparallel(sequence) :
119 """Antiparallel(sequence) -> string. 120 121 returns a string which represents the reverse complementary strand of 122 a DNA sequence.""" 123 return antiparallel(sequence.tostring())
124
125 -def is_palindrom(sequence) :
126 """is_palindrom(sequence) -> bool. 127 128 True is the sequence is a palindrom. 129 sequence is a DNA object.""" 130 return sequence == DNA(Antiparallel(sequence))
131
132 -def LocalTime() :
133 """LocalTime() -> string. 134 135 LocalTime calculate the extension for emboss file for the current year and 136 month.""" 137 t = time.gmtime() 138 year = str(t.tm_year)[-1] 139 month = str(t.tm_mon) 140 if len(month) == 1 : month = '0'+month 141 return year+month
142 143
144 -class newenzyme(object) :
145 """construct the attributes of the enzyme corresponding to 'name'."""
146 - def __init__(cls, name) :
147 cls.opt_temp = 37 148 cls.inact_temp = 65 149 cls.substrat = 'DNA' 150 target = enzymedict[name] 151 cls.site = target[0] 152 cls.size = target[1] 153 cls.suppl = tuple(target[9]) 154 cls.freq = target[11] 155 cls.ovhg = target[13] 156 cls.ovhgseq = target[14] 157 cls.bases = () 158 # 159 # Is the site palindromic? 160 # Important for the way the DNA is search for the site. 161 # Palindromic sites needs to be looked for only over 1 strand. 162 # Non Palindromic needs to be search for on the reverse complement 163 # as well. 164 # 165 if target[10] : cls.bases += ('Palindromic',) 166 else : cls.bases += ('NonPalindromic',) 167 # 168 # Number of cut the enzyme produce. 169 # 0 => unknown, the enzyme has not been fully characterised. 170 # 2 => 1 cut, (because one cut is realised by cutting 2 strands 171 # 4 => 2 cuts, same logic. 172 # A little bit confusing but it is the way EMBOSS/Rebase works. 173 # 174 if not target[2] : 175 # 176 # => undefined enzymes, nothing to be done. 177 # 178 cls.bases += ('NoCut','Unknown', 'NotDefined') 179 cls.fst5 = None 180 cls.fst3 = None 181 cls.scd5 = None 182 cls.scd3 = None 183 cls.ovhg = None 184 cls.ovhgseq = None 185 else : 186 # 187 # we will need to calculate the overhang. 188 # 189 if target[2] == 2 : 190 cls.bases += ('OneCut',) 191 cls.fst5 = target[4] 192 cls.fst3 = target[5] 193 cls.scd5 = None 194 cls.scd3 = None 195 else : 196 cls.bases += ('TwoCuts',) 197 cls.fst5 = target[4] 198 cls.fst3 = target[5] 199 cls.scd5 = target[6] 200 cls.scd3 = target[7] 201 # 202 # Now, prepare the overhangs which will be added to the DNA 203 # after the cut. 204 # Undefined enzymes will not be allowed to catalyse, 205 # they are not available commercially anyway. 206 # I assumed that if an enzyme cut twice the overhang will be of 207 # the same kind. The only exception is HaeIV. I do not deal 208 # with that at the moment (ie I don't include it, 209 # need to be fixed). 210 # They generally cut outside their recognition site and 211 # therefore the overhang is undetermined and dependent of 212 # the DNA sequence upon which the enzyme act. 213 # 214 if target[3] : 215 # 216 # rebase field for blunt: blunt == 1, other == 0. 217 # The enzyme is blunt. No overhang. 218 # 219 cls.bases += ('Blunt', 'Defined') 220 cls.ovhg = 0 221 elif isinstance(cls.ovhg, int) : 222 # 223 # => overhang is sequence dependent 224 # 225 if cls.ovhg > 0 : 226 # 227 # 3' overhang, ambiguous site (outside recognition site 228 # or site containing ambiguous bases (N, W, R,...) 229 # 230 cls.bases += ('Ov3', 'Ambiguous') 231 elif cls.ovhg < 0 : 232 # 233 # 5' overhang, ambiguous site (outside recognition site 234 # or site containing ambiguous bases (N, W, R,...) 235 # 236 cls.bases += ('Ov5', 'Ambiguous') 237 else : 238 # 239 # cls.ovhg is a string => overhang is constant 240 # 241 if cls.fst5 - (cls.fst3 + cls.size) < 0 : 242 cls.bases += ('Ov5', 'Defined') 243 cls.ovhg = - len(cls.ovhg) 244 else : 245 cls.bases += ('Ov3', 'Defined') 246 cls.ovhg = + len(cls.ovhg) 247 # 248 # Next class : sensibility to methylation. 249 # Set by EmbossMixer from emboss_r.txt file 250 # Not really methylation dependent at the moment, stands rather for 251 # 'is the site methylable?'. 252 # Proper methylation sensibility has yet to be implemented. 253 # But the class is there for further development. 254 # 255 if target[8] : 256 cls.bases += ('Meth_Dep', ) 257 cls.compsite = target[12] 258 else : 259 cls.bases += ('Meth_Undep',) 260 cls.compsite = target[12] 261 # 262 # Next class will allow to select enzymes in function of their 263 # suppliers. Not essential but can be useful. 264 # 265 if cls.suppl : 266 cls.bases += ('Commercially_available', ) 267 else : 268 cls.bases += ('Not_available', ) 269 cls.bases += ('AbstractCut', 'RestrictionType') 270 cls.__name__ = name 271 cls.results = None 272 cls.dna = None 273 cls.__bases__ = cls.bases 274 cls.charac = (cls.fst5, cls.fst3, cls.scd5, cls.scd3, cls.site) 275 if not target[2] and cls.suppl : 276 supp = ', '.join([suppliersdict[s][0] for s in cls.suppl]) 277 print 'WARNING : It seems that %s is both commercially available\ 278 \n\tand its characteristics are unknown. \ 279 \n\tThis seems counter-intuitive.\ 280 \n\tThere is certainly an error either in ranacompiler or\ 281 \n\tin this REBASE release.\ 282 \n\tThe supplier is : %s.' % (name, supp) 283 return
284 285
286 -class TypeCompiler(object) :
287 288 """Build the different types possible for Restriction Enzymes""" 289
290 - def __init__(self) :
291 """TypeCompiler() -> new TypeCompiler instance.""" 292 pass
293
294 - def buildtype(self) :
295 """TC.buildtype() -> generator. 296 297 build the new types that will be needed for constructing the 298 restriction enzymes.""" 299 baT = (AbstractCut, RestrictionType) 300 cuT = (NoCut, OneCut, TwoCuts) 301 meT = (Meth_Dep, Meth_Undep) 302 paT = (Palindromic, NonPalindromic) 303 ovT = (Unknown, Blunt, Ov5, Ov3) 304 deT = (NotDefined, Defined, Ambiguous) 305 coT = (Commercially_available, Not_available) 306 All = (baT, cuT, meT, paT, ovT, deT, coT) 307 # 308 # Now build the types. Only the most obvious are left out. 309 # Modified even the most obvious are not so obvious. 310 # emboss_*.403 AspCNI is unknown and commercially available. 311 # So now do not remove the most obvious. 312 # 313 types = [(p,c,o,d,m,co,baT[0],baT[1]) 314 for p in paT for c in cuT for o in ovT 315 for d in deT for m in meT for co in coT] 316 n= 1 317 for ty in types : 318 dct = {} 319 for t in ty : 320 dct.update(t.__dict__) 321 # 322 # here we need to customize the dictionary. 323 # i.e. types deriving from OneCut have always scd5 and scd3 324 # equal to None. No need therefore to store that in a specific 325 # enzyme of this type. but it then need to be in the type. 326 # 327 dct['results'] = [] 328 dct['substrat'] = 'DNA' 329 dct['dna'] = None 330 if t == NoCut : 331 dct.update({'fst5':None,'fst3':None, 332 'scd5':None,'scd3':None, 333 'ovhg':None,'ovhgseq':None}) 334 elif t == OneCut : 335 dct.update({'scd5':None, 'scd3':None}) 336 class klass(type) : 337 def __new__(cls) : 338 return type.__new__(cls, 'type%i'%n,ty,dct)
339 def __init__(cls) : 340 super(klass, cls).__init__('type%i'%n,ty,dct)
341 yield klass() 342 n+=1 343 344 start = '\n\ 345 #!/usr/bin/env python\n\ 346 #\n\ 347 # Restriction Analysis Libraries.\n\ 348 # Copyright (C) 2004. Frederic Sohm.\n\ 349 #\n\ 350 # This code is part of the Biopython distribution and governed by its\n\ 351 # license. Please see the LICENSE file that should have been included\n\ 352 # as part of this package.\n\ 353 #\n\ 354 #\n\ 355 rest_dict = \\\n' 356
357 -class DictionaryBuilder(object) :
358
359 - def __init__(self, e_mail='', ftp_proxy='') :
360 """DictionaryBuilder([e_mail[, ftp_proxy]) -> DictionaryBuilder instance. 361 362 If the emboss files used for the construction need to be updated this 363 class will download them if the ftp connection is correctly set. 364 either in RanaConfig.py or given at run time. 365 366 e_mail is the e-mail address used as password for the anonymous 367 ftp connection. 368 369 proxy is the ftp_proxy to use if any.""" 370 self.rebase_pass = e_mail or config.Rebase_password 371 self.proxy = ftp_proxy or config.ftp_proxy
372
373 - def build_dict(self) :
374 """DB.build_dict() -> None. 375 376 Construct the dictionary and build the files containing the new 377 dictionaries.""" 378 # 379 # first parse the emboss files. 380 # 381 emboss_e, emboss_r, emboss_s = self.lastrebasefile() 382 # 383 # the results will be stored into enzymedict. 384 # 385 self.information_mixer(emboss_r, emboss_e, emboss_s) 386 emboss_r.close() 387 emboss_e.close() 388 emboss_s.close() 389 # 390 # we build all the possible type 391 # 392 tdct = {} 393 for klass in TypeCompiler().buildtype() : 394 exec klass.__name__ +'= klass' 395 exec "tdct['"+klass.__name__+"'] = klass" 396 397 # 398 # Now we build the enzymes from enzymedict 399 # and store them in a dictionary. 400 # The type we will need will also be stored. 401 # 402 403 for name in enzymedict : 404 # 405 # the class attributes first: 406 # 407 cls = newenzyme(name) 408 # 409 # Now select the right type for the enzyme. 410 # 411 bases = cls.bases 412 clsbases = tuple([eval(x) for x in bases]) 413 typestuff = '' 414 for n, t in tdct.iteritems() : 415 # 416 # if the bases are the same. it is the right type. 417 # create the enzyme and remember the type 418 # 419 if t.__bases__ == clsbases : 420 typestuff = t 421 typename = t.__name__ 422 continue 423 # 424 # now we build the dictionaries. 425 # 426 dct = dict(cls.__dict__) 427 del dct['bases'] 428 del dct['__bases__'] 429 del dct['__name__']# no need to keep that, it's already in the type. 430 classdict[name] = dct 431 432 commonattr = ['fst5', 'fst3', 'scd5', 'scd3', 'substrat', 433 'ovhg', 'ovhgseq','results', 'dna'] 434 if typename in typedict : 435 typedict[typename][1].append(name) 436 else : 437 enzlst= [] 438 tydct = dict(typestuff.__dict__) 439 tydct = dict([(k,v) for k,v in tydct.iteritems() if k in commonattr]) 440 enzlst.append(name) 441 typedict[typename] = (bases, enzlst) 442 for letter in cls.__dict__['suppl'] : 443 supplier = suppliersdict[letter] 444 suppliersdict[letter][1].append(name) 445 if not classdict or not suppliersdict or not typedict : 446 print 'One of the new dictionaries is empty.' 447 print 'Check the integrity of the emboss file before continuing.' 448 print 'Update aborted.' 449 sys.exit() 450 # 451 # How many enzymes this time? 452 # 453 print '\nThe new database contains %i enzymes.\n' % len(classdict) 454 # 455 # the dictionaries are done. Build the file 456 # 457 #update = config.updatefolder 458 459 update = os.getcwd() 460 results = open(os.path.join(update, 'Restriction_Dictionary.py'), 'w') 461 print 'Writing the dictionary containing the new Restriction classes.\t', 462 results.write(start) 463 a = pprint.PrettyPrinter(1, 80, None, results) 464 a.pprint(classdict) 465 print 'OK.\n' 466 print 'Writing the dictionary containing the suppliers datas.\t\t', 467 results.write('suppliers = \\\n') 468 a.pprint(suppliersdict) 469 print 'OK.\n' 470 print 'Writing the dictionary containing the Restriction types.\t', 471 results.write('typedict = \\\n') 472 a.pprint(typedict) 473 print 'OK.\n' 474 results.close() 475 return
476
477 - def install_dict(self) :
478 """DB.install_dict() -> None. 479 480 Install the newly created dictionary in the site-packages folder. 481 482 May need super user privilege on some architectures.""" 483 print '\n ' +'*'*78 + ' \n' 484 print '\n\t\tInstalling Restriction_Dictionary.py' 485 try : 486 import Bio.Restriction.Restriction_Dictionary as rd 487 except ImportError : 488 print '\ 489 \n Unable to locate the previous Restriction_Dictionary.py module\ 490 \n Aborting installation.' 491 sys.exit() 492 # 493 # first save the old file in Updates 494 # 495 old = os.path.join(os.path.split(rd.__file__)[0], 496 'Restriction_Dictionary.py') 497 #update_folder = config.updatefolder 498 update_folder = os.getcwd() 499 shutil.copyfile(old, os.path.join(update_folder, 500 'Restriction_Dictionary.old')) 501 # 502 # Now test and install. 503 # 504 new = os.path.join(update_folder, 'Restriction_Dictionary.py') 505 try : 506 execfile(new) 507 print '\ 508 \n\tThe new file seems ok. Proceeding with the installation.' 509 except SyntaxError : 510 print '\ 511 \n The new dictionary file is corrupted. Aborting the installation.' 512 return 513 try : 514 shutil.copyfile(new, old) 515 print'\n\t Everything ok. If you need it a version of the old\ 516 \n\t dictionary have been saved in the Updates folder under\ 517 \n\t the name Restriction_Dictionary.old.' 518 print '\n ' +'*'*78 + ' \n' 519 except IOError : 520 print '\n ' +'*'*78 + ' \n' 521 print '\ 522 \n\t WARNING : Impossible to install the new dictionary.\ 523 \n\t Are you sure you have write permission to the folder :\n\ 524 \n\t %s ?\n\n' % os.path.split(old)[0] 525 return self.no_install() 526 return
527
528 - def no_install(self) :
529 """BD.no_install() -> None. 530 531 build the new dictionary but do not install the dictionary.""" 532 print '\n ' +'*'*78 + '\n' 533 #update = config.updatefolder 534 try : 535 import Bio.Restriction.Restriction_Dictionary as rd 536 except ImportError : 537 print '\ 538 \n Unable to locate the previous Restriction_Dictionary.py module\ 539 \n Aborting installation.' 540 sys.exit() 541 # 542 # first save the old file in Updates 543 # 544 old = os.path.join(os.path.split(rd.__file__)[0], 545 'Restriction_Dictionary.py') 546 update = os.getcwd() 547 shutil.copyfile(old, os.path.join(update, 'Restriction_Dictionary.old')) 548 places = update, os.path.split(Bio.Restriction.Restriction.__file__)[0] 549 print "\t\tCompilation of the new dictionary : OK.\ 550 \n\t\tInstallation : No.\n\ 551 \n You will find the newly created 'Restriction_Dictionary.py' file\ 552 \n in the folder : \n\ 553 \n\t%s\n\ 554 \n Make a copy of 'Restriction_Dictionary.py' and place it with \ 555 \n the other Restriction libraries.\n\ 556 \n note : \ 557 \n This folder should be :\n\ 558 \n\t%s\n" % places 559 print '\n ' +'*'*78 + '\n' 560 return
561 562
563 - def lastrebasefile(self) :
564 """BD.lastrebasefile() -> None. 565 566 Check the emboss files are up to date and download them if they are not. 567 """ 568 embossnames = ('emboss_e', 'emboss_r', 'emboss_s') 569 # 570 # first check if we have the last update : 571 # 572 emboss_now = ['.'.join((x,LocalTime())) for x in embossnames] 573 update_needed = False 574 #dircontent = os.listdir(config.Rebase) # local database content 575 dircontent = os.listdir(os.getcwd()) 576 base = os.getcwd() # added for biopython current directory 577 for name in emboss_now : 578 if name in dircontent : 579 pass 580 else : 581 update_needed = True 582 583 if not update_needed : 584 # 585 # nothing to be done 586 # 587 print '\n Using the files : %s'% ', '.join(emboss_now) 588 return tuple([open(os.path.join(base, n)) for n in emboss_now]) 589 else : 590 # 591 # may be download the files. 592 # 593 print '\n The rebase files are more than one month old.\ 594 \n Would you like to update them before proceeding?(y/n)' 595 r = raw_input(' update [n] >>> ') 596 if r in ['y', 'yes', 'Y', 'Yes'] : 597 updt = RebaseUpdate(self.rebase_pass, self.proxy) 598 updt.openRebase() 599 updt.getfiles() 600 updt.close() 601 print '\n Update complete. Creating the dictionaries.\n' 602 print '\n Using the files : %s'% ', '.join(emboss_now) 603 return tuple([open(os.path.join(base, n)) for n in emboss_now]) 604 else : 605 # 606 # we will use the last files found without updating. 607 # But first we check we have some file to use. 608 # 609 class NotFoundError(Exception) : 610 pass
611 for name in embossnames : 612 try : 613 for file in dircontent : 614 if file.startswith(name) : 615 break 616 else : 617 pass 618 raise NotFoundError 619 except NotFoundError : 620 print "\nNo %s file found. Upgrade is impossible.\n"%name 621 sys.exit() 622 continue 623 pass 624 # 625 # now find the last file. 626 # 627 last = [0] 628 for file in dircontent : 629 fs = file.split('.') 630 try : 631 if fs[0] in embossnames and int(fs[1]) > int(last[-1]) : 632 if last[0] : last.append(fs[1]) 633 else : last[0] = fs[1] 634 else : 635 continue 636 except ValueError : 637 continue 638 last.sort() 639 last = last[::-1] 640 if int(last[-1]) < 100 : last[0], last[-1] = last[-1], last[0] 641 for number in last : 642 files = [(name, name+'.%s'%number) for name in embossnames] 643 strmess = '\nLast EMBOSS files found are :\n' 644 try : 645 for name,file in files : 646 if os.path.isfile(os.path.join(base, file)) : 647 strmess += '\t%s.\n'%file 648 else : 649 raise ValueError 650 print strmess 651 emboss_e = open(os.path.join(base, 'emboss_e.%s'%number),'r') 652 emboss_r = open(os.path.join(base, 'emboss_r.%s'%number),'r') 653 emboss_s = open(os.path.join(base, 'emboss_s.%s'%number),'r') 654 return emboss_e, emboss_r, emboss_s 655 except ValueError : 656 continue
657
658 - def parseline(self, line) :
659 line = [line[0]]+[line[1].upper()]+[int(i) for i in line[2:9]]+line[9:] 660 name = line[0] 661 site = line[1] # sequence of the recognition site 662 dna = DNA(site) 663 size = line[2] # size of the recognition site 664 # 665 # Calculate the overhang. 666 # 667 fst5 = line[5] # first site sense strand 668 fst3 = line[6] # first site antisense strand 669 scd5 = line[7] # second site sense strand 670 scd3 = line[8] # second site antisense strand 671 672 # 673 # the overhang is the difference between the two cut 674 # 675 ovhg1 = fst5 - fst3 676 ovhg2 = scd5 - scd3 677 678 # 679 # 0 has the meaning 'do not cut' in rebase. So we get short of 1 680 # for the negative numbers so we add 1 to negative sites for now. 681 # We will deal with the record later. 682 # 683 684 if fst5 < 0 : fst5 += 1 685 if fst3 < 0 : fst3 += 1 686 if scd5 < 0 : scd5 += 1 687 if scd3 < 0 : scd3 += 1 688 689 if ovhg2 != 0 and ovhg1 != ovhg2 : 690 # 691 # different length of the overhang of the first and second cut 692 # it's a pain to deal with and at the moment it concerns only 693 # one enzyme which is not commercially available (HaeIV). 694 # So we don't deal with it but we check the progression 695 # of the affair. 696 # Should HaeIV become commercially available or other similar 697 # new enzymes be added, this might be modified. 698 # 699 print '\ 700 \nWARNING : %s cut twice with different overhang length each time.\ 701 \n\tUnable to deal with this behaviour. \ 702 \n\tThis enzyme will not be included in the database. Sorry.' %name 703 print '\tChecking :', 704 raise OverhangError 705 if 0 <= fst5 <= size and 0 <= fst3 <= size : 706 # 707 # cut inside recognition site 708 # 709 if fst5 < fst3 : 710 # 711 # 5' overhang 712 # 713 ovhg1 = ovhgseq = site[fst5:fst3] 714 elif fst5 > fst3 : 715 # 716 # 3' overhang 717 # 718 ovhg1 = ovhgseq = site[fst3:fst5] 719 else : 720 # 721 # blunt 722 # 723 ovhg1 = ovhgseq = '' 724 for base in 'NRYWMSKHDBV' : 725 if base in ovhg1 : 726 # 727 # site and overhang degenerated 728 # 729 ovhgseq = ovhg1 730 if fst5 < fst3 : ovhg1 = - len(ovhg1) 731 else : ovhg1 = len(ovhg1) 732 break 733 else : 734 continue 735 elif 0 <= fst5 <= size : 736 # 737 # 5' cut inside the site 3' outside 738 # 739 if fst5 < fst3 : 740 # 741 # 3' cut after the site 742 # 743 ovhgseq = site[fst5:] + (fst3 - size) * 'N' 744 elif fst5 > fst3 : 745 # 746 # 3' cut before the site 747 # 748 ovhgseq = abs(fst3) * 'N' + site[:fst5] 749 else : 750 # 751 # blunt outside 752 # 753 ovhg1 = ovhgseq = '' 754 elif 0 <= fst3 <= size : 755 # 756 # 3' cut inside the site, 5' outside 757 # 758 if fst5 < fst3 : 759 # 760 # 5' cut before the site 761 # 762 ovhgseq = abs(fst5) * 'N' + site[:fst3] 763 elif fst5 > fst3 : 764 # 765 # 5' cut after the site 766 # 767 ovhgseq = site[fst3:] + (fst5 - size) * 'N' 768 else : 769 # 770 # should not happend 771 # 772 raise ValueError('Error in #1') 773 elif fst3 < 0 and size < fst5 : 774 # 775 # 3' overhang. site is included. 776 # 777 ovhgseq = abs(fst3)*'N' + site + (fst5-size)*'N' 778 elif fst5 < 0 and size <fst3 : 779 # 780 # 5' overhang. site is included. 781 # 782 ovhgseq = abs(fst5)*'N' + site + (fst3-size)*'N' 783 else : 784 # 785 # 5' and 3' outside of the site 786 # 787 ovhgseq = 'N' * abs(ovhg1) 788 # 789 # Now line[5] to [8] are the location of the cut but we have to 790 # deal with the weird mathematics of biologists. 791 # 792 # EMBOSS sequence numbering give : 793 # DNA = 'a c g t A C G T' 794 # -1 1 2 3 4 795 # 796 # Biologists do not know about 0. Too much use of latin certainly. 797 # 798 # To compensate, we add 1 to the positions if they are negative. 799 # No need to modify 0 as it means no cut and will not been used. 800 # Positive numbers should be ok since our sequence starts 1. 801 # 802 # Moreover line[6] and line[8] represent cut on the reverse strand. 803 # They will be used for non palindromic sites and sre.finditer 804 # will detect the site in inverse orientation so we need to add the 805 # length of the site to compensate (+1 if they are negative). 806 # 807 for x in (5, 7) : 808 if line[x] < 0 : line[x] += 1 809 for x in (6, 8) : 810 if line[x] > 0 : line[x] -= size 811 elif line[x] < 0 : line[x] = line[x] - size + 1 812 # 813 # now is the site palindromic? 814 # produce the regular expression which correspond to the site. 815 # tag of the regex will be the name of the enzyme for palindromic 816 # enzymesband two tags for the other, the name for the sense sequence 817 # and the name with '_as' at the end for the antisense sequence. 818 # 819 rg = '' 820 if is_palindrom(dna) : 821 line.append(True) 822 rg = ''.join(['(?P<', name, '>', regex(site.upper()), ')']) 823 else : 824 line.append(False) 825 sense = ''.join(['(?P<', name, '>', regex(site.upper()), ')']) 826 antisense = ''.join(['(?P<', name, '_as>', 827 regex(Antiparallel(dna)), ')']) 828 rg = sense + '|' + antisense 829 # 830 # exact frequency of the site. (ie freq(N) == 1, ...) 831 # 832 f = [4/len(dna_alphabet[l]) for l in site.upper()] 833 freq = reduce(lambda x, y : x*y, f) 834 line.append(freq) 835 # 836 # append regex and ovhg1, they have not been appended before not to 837 # break the factory class. simply to leazy to make the changes there. 838 # 839 line.append(rg) 840 line.append(ovhg1) 841 line.append(ovhgseq) 842 return line
843
844 - def removestart(self, file) :
845 # 846 # remove the heading of the file. 847 # 848 return [l for l in itertools.dropwhile(lambda l:l.startswith('#'),file)]
849
850 - def getblock(self, file, index) :
851 # 852 # emboss_r.txt, separation between blocks is // 853 # 854 take = itertools.takewhile 855 block = [l for l in take(lambda l :not l.startswith('//'),file[index:])] 856 index += len(block)+1 857 return block, index
858
859 - def get(self, block) :
860 # 861 # take what we want from the block. 862 # Each block correspond to one enzyme. 863 # block[0] => enzyme name 864 # block[3] => methylation (position and type) 865 # block[5] => suppliers (as a string of single letter) 866 # 867 bl3 = block[3].strip() 868 if not bl3 : bl3 = False # site is not methylable 869 return (block[0].strip(), bl3, block[5].strip())
870
871 - def information_mixer(self, file1, file2, file3) :
872 # 873 # Mix all the information from the 3 files and produce a coherent 874 # restriction record. 875 # 876 methfile = self.removestart(file1) 877 sitefile = self.removestart(file2) 878 supplier = self.removestart(file3) 879 880 i1, i2= 0, 0 881 try : 882 while True : 883 block, i1 = self.getblock(methfile, i1) 884 bl = self.get(block) 885 line = (sitefile[i2].strip()).split() 886 name = line[0] 887 if name == bl[0] : 888 line.append(bl[1]) # -> methylation 889 line.append(bl[2]) # -> suppliers 890 else : 891 bl = self.get(oldblock) 892 if line[0] == bl[0] : 893 line.append(bl[1]) 894 line.append(bl[2]) 895 i2 += 1 896 else : 897 raise TypeError 898 oldblock = block 899 i2 += 1 900 try : 901 line = self.parseline(line) 902 except OverhangError : # overhang error 903 n = name # do not include the enzyme 904 if not bl[2] : 905 print 'Anyway, %s is not commercially available.\n' %n 906 else : 907 print 'Unfortunately, %s is commercially available.\n'%n 908 909 continue 910 if name in enzymedict : 911 # 912 # deal with TaqII and its two sites. 913 # 914 print '\nWARNING :', 915 print name, 'has two different sites.\n' 916 dna = DNA(line[1]) 917 sense1 = regex(dna.tostring()) 918 antisense1 = regex(Antiparallel(dna)) 919 dna = DNA(enzymedict[line[0]][0]) 920 sense2 = regex(dna.tostring()) 921 antisense2 = regex(Antiparallel(dna)) 922 sense = '(?P<'+line[0]+'>'+sense1+'|'+sense2+')' 923 antisense = '(?P<'+line[0]+'_as>'+antisense1+'|'+antisense2 + ')' 924 reg = sense + '|' + antisense 925 line[1] = line[1] + '|' + enzymedict[line[0]][0] 926 line[-1] = reg 927 # 928 # the data to produce the enzyme class are then stored in 929 # enzymedict. 930 # 931 enzymedict[name] = line[1:] 932 except IndexError : 933 pass 934 for i in supplier : 935 # 936 # construction of the list of suppliers. 937 # 938 t = i.strip().split(' ', 1) 939 suppliersdict[t[0]] = (t[1], []) 940 return
941