Package Bio :: Package SeqUtils
[hide private]
[frames] | no frames]

Source Code for Package Bio.SeqUtils

  1  #!/usr/bin/env python 
  2  # Created: Wed May 29 08:07:18 2002 
  3  # thomas@cbs.dtu.dk, Cecilia.Alsmark@ebc.uu.se 
  4  # Copyright 2001 by Thomas Sicheritz-Ponten and Cecilia Alsmark. 
  5  # All rights reserved. 
  6  # This code is part of the Biopython distribution and governed by its 
  7  # license.  Please see the LICENSE file that should have been included 
  8  # as part of this package. 
  9   
 10  import re, time 
 11  from Bio import SeqIO 
 12  from Bio import Translate 
 13  from Bio.Seq import Seq 
 14  from Bio import Alphabet 
 15  from Bio.Alphabet import IUPAC 
 16  from Bio.Data import IUPACData, CodonTable 
 17   
 18   
 19  ###################################### 
 20  # DNA 
 21  ###################### 
 22  # {{{  
 23   
24 -def reverse(seq):
25 """Reverse the sequence. Works on string sequences. 26 """ 27 r = map(None, seq) 28 r.reverse() 29 return ''.join(r)
30
31 -def GC(seq):
32 """ calculates G+C content """ 33 # 19/8/03: Iddo: added provision for lowercase 34 # 19/8/03: Iddo: divide by the sequence's length rather than by the 35 # A+T+G+C number. In that way, make provision for N. 36 37 d = {} 38 for nt in ['A','T','G','C','a','t','g','c','S','s']: 39 d[nt] = seq.count(nt) 40 gc = d.get('G',0) + d.get('C',0) + d.get('g',0) + d.get('c',0) + \ 41 d.get('S',0) + d.get('s',0) 42 43 if gc == 0: return 0 44 # return gc*100.0/(d['A'] +d['T'] + gc) 45 return gc*100.0/len(seq)
46
47 -def GC123(seq):
48 " calculates total G+C content plus first, second and third position " 49 50 d= {} 51 for nt in ['A','T','G','C']: 52 d[nt] = [0,0,0] 53 54 for i in range(0,len(seq),3): 55 codon = seq[i:i+3] 56 if len(codon) <3: codon += ' ' 57 for pos in range(0,3): 58 for nt in ['A','T','G','C']: 59 if codon[pos] == nt or codon[pos] == nt.lower(): 60 d[nt][pos] = d[nt][pos] +1 61 62 63 gc = {} 64 gcall = 0 65 nall = 0 66 for i in range(0,3): 67 try: 68 n = d['G'][i] + d['C'][i] +d['T'][i] + d['A'][i] 69 gc[i] = (d['G'][i] + d['C'][i])*100.0/n 70 except: 71 gc[i] = 0 72 73 gcall = gcall + d['G'][i] + d['C'][i] 74 nall = nall + n 75 76 gcall = 100.0*gcall/nall 77 return gcall, gc[0], gc[1], gc[2]
78
79 -def GC_skew(seq, window = 100):
80 """ calculates GC skew (G-C)/(G+C) """ 81 # 8/19/03: Iddo: added lowercase 82 values = [] 83 for i in range(0, len(seq), window): 84 s = seq[i: i + window] 85 g = s.count('G') + s.count('g') 86 c = s.count('C') + s.count('c') 87 skew = (g-c)/float(g+c) 88 values.append(skew) 89 return values
90 91 # 8/19/03 Iddo: moved these imports from within the function as 92 # ``import * '' is only 93 # allowed within the module 94 # Brad -- added an import check so you don't have to have Tkinter 95 # installed to use other sequtils functions. A little ugly but 96 # it really should be fixed by not using 'import *' which I'm not 97 # really excited about doing right now. 98 try: 99 from Tkinter import * 100 except ImportError: 101 pass 102 from math import pi, sin, cos, log
103 -def xGC_skew(seq, window = 1000, zoom = 100, 104 r = 300, px = 100, py = 100):
105 " calculates and plots normal and accumulated GC skew (GRAPHICS !!!) " 106 107 108 yscroll = Scrollbar(orient = VERTICAL) 109 xscroll = Scrollbar(orient = HORIZONTAL) 110 canvas = Canvas(yscrollcommand = yscroll.set, 111 xscrollcommand = xscroll.set, background = 'white') 112 win = canvas.winfo_toplevel() 113 win.geometry('700x700') 114 115 yscroll.config(command = canvas.yview) 116 xscroll.config(command = canvas.xview) 117 yscroll.pack(side = RIGHT, fill = Y) 118 xscroll.pack(side = BOTTOM, fill = X) 119 canvas.pack(fill=BOTH, side = LEFT, expand = 1) 120 canvas.update() 121 122 X0, Y0 = r + px, r + py 123 x1, x2, y1, y2 = X0 - r, X0 + r, Y0 -r, Y0 + r 124 125 ty = Y0 126 canvas.create_text(X0, ty, text = '%s...%s (%d nt)' % (seq[:7], seq[-7:], len(seq))) 127 ty +=20 128 canvas.create_text(X0, ty, text = 'GC %3.2f%%' % (GC(seq))) 129 ty +=20 130 canvas.create_text(X0, ty, text = 'GC Skew', fill = 'blue') 131 ty +=20 132 canvas.create_text(X0, ty, text = 'Accumulated GC Skew', fill = 'magenta') 133 ty +=20 134 canvas.create_oval(x1,y1, x2, y2) 135 136 acc = 0 137 start = 0 138 for gc in GC_skew(seq, window): 139 r1 = r 140 acc+=gc 141 # GC skew 142 alpha = pi - (2*pi*start)/len(seq) 143 r2 = r1 - gc*zoom 144 x1 = X0 + r1 * sin(alpha) 145 y1 = Y0 + r1 * cos(alpha) 146 x2 = X0 + r2 * sin(alpha) 147 y2 = Y0 + r2 * cos(alpha) 148 canvas.create_line(x1,y1,x2,y2, fill = 'blue') 149 # accumulated GC skew 150 r1 = r - 50 151 r2 = r1 - acc 152 x1 = X0 + r1 * sin(alpha) 153 y1 = Y0 + r1 * cos(alpha) 154 x2 = X0 + r2 * sin(alpha) 155 y2 = Y0 + r2 * cos(alpha) 156 canvas.create_line(x1,y1,x2,y2, fill = 'magenta') 157 158 canvas.update() 159 start = start + window 160 161 162 canvas.configure(scrollregion = canvas.bbox(ALL))
163
164 -def molecular_weight(seq):
165 if type(seq) == type(''): seq = Seq(seq, IUPAC.unambiguous_dna) 166 weight_table = IUPACData.unambiguous_dna_weights 167 sum = 0 168 for x in seq: 169 sum += weight_table[x] 170 return sum
171
172 -def nt_search(seq, subseq):
173 """ search for a DNA subseq in sequence 174 use ambiguous values (like N = A or T or C or G, R = A or G etc.) 175 searches only on forward strand 176 """ 177 pattern = '' 178 for nt in subseq: 179 value = IUPACData.ambiguous_dna_values[nt] 180 if len(value) == 1: 181 pattern += value 182 else: 183 pattern += '[%s]' % value 184 185 pos = -1 186 result = [pattern] 187 l = len(seq) 188 while 1: 189 pos+=1 190 s = seq[pos:] 191 m = re.search(pattern, s) 192 if not m: break 193 pos += int(m.start(0)) 194 result.append(pos) 195 196 return result
197 198 # }}} 199 200 ###################################### 201 # Protein 202 ###################### 203 # {{{ 204 205 # temporary hack for exception free translation of "dirty" DNA 206 # should be moved to ??? 207
208 -class ProteinX(Alphabet.ProteinAlphabet):
209 letters = IUPACData.extended_protein_letters + "X"
210 211 proteinX = ProteinX() 212
213 -class MissingTable:
214 - def __init__(self, table):
215 self._table = table
216 - def get(self, codon, stop_symbol):
217 try: 218 return self._table.get(codon, stop_symbol) 219 except CodonTable.TranslationError: 220 return 'X'
221
222 -def makeTableX(table):
223 assert table.protein_alphabet == IUPAC.extended_protein 224 return CodonTable.CodonTable(table.nucleotide_alphabet, proteinX, 225 MissingTable(table.forward_table), 226 table.back_table, table.start_codons, 227 table.stop_codons)
228 229 230 # end of hacks 231
232 -def seq3(seq):
233 """ 234 Method that returns the amino acid sequence as a 235 list of three letter codes. Output follows the IUPAC standard plus 'Ter' for 236 terminator. Any unknown character, including the default 237 unknown character 'X', is changed into 'Xaa'. A noncoded 238 aminoacid selenocystein is recognized (Sel, U). 239 """ 240 threecode = {'A':'Ala', 'B':'Asx', 'C':'Cys', 'D':'Asp', 241 'E':'Glu', 'F':'Phe', 'G':'Gly', 'H':'His', 242 'I':'Ile', 'K':'Lys', 'L':'Leu', 'M':'Met', 243 'N':'Asn', 'P':'Pro', 'Q':'Gln', 'R':'Arg', 244 'S':'Ser', 'T':'Thr', 'V':'Val', 'W':'Trp', 245 'Y':'Tyr', 'Z':'Glx', 'X':'Xaa', '*':'Ter', 246 'U':'Sel' 247 } 248 249 return ''.join([threecode.get(aa,'Xer') for aa in seq])
250 251 252 # }}} 253 254 ###################################### 255 # Mixed ??? 256 ###################### 257 # {{{ 258
259 -def translate(seq, frame = 1, genetic_code = 1, translator = None):
260 " translation of DNA in one of the six different reading frames " 261 if frame not in [1,2,3,-1,-2,-3]: 262 raise ValueError, 'invalid frame' 263 264 if not translator: 265 table = makeTableX(CodonTable.ambiguous_dna_by_id[genetic_code]) 266 translator = Translate.Translator(table) 267 268 return translator.translate(Seq(seq[frame-1:], IUPAC.ambiguous_dna)).data
269
270 -def GC_Frame(seq, genetic_code = 1):
271 " just an alias for six_frame_translations " 272 return six_frame_translations(seq, genetic_code)
273
274 -def six_frame_translations(seq, genetic_code = 1):
275 """ 276 nice looking 6 frame translation with GC content - code from xbbtools 277 similar to DNA Striders six-frame translation 278 """ 279 from Bio.Seq import reverse_complement 280 anti = reverse_complement(seq) 281 comp = anti[::-1] 282 length = len(seq) 283 frames = {} 284 for i in range(0,3): 285 frames[i+1] = translate(seq[i:], genetic_code) 286 frames[-(i+1)] = reverse(translate(anti[i:], genetic_code)) 287 288 # create header 289 if length > 20: 290 short = '%s ... %s' % (seq[:10], seq[-10:]) 291 else: 292 short = seq 293 date = time.strftime('%y %b %d, %X', time.localtime(time.time())) 294 header = 'GC_Frame: %s, ' % date 295 for nt in ['a','t','g','c']: 296 header += '%s:%d ' % (nt, seq.count(nt.upper())) 297 298 header += '\nSequence: %s, %d nt, %0.2f %%GC\n\n\n' % (short.lower(),length, GC(seq)) 299 res = header 300 301 for i in range(0,length,60): 302 subseq = seq[i:i+60] 303 csubseq = comp[i:i+60] 304 p = i/3 305 res = res + '%d/%d\n' % (i+1, i/3+1) 306 res = res + ' ' + ' '.join(map(None,frames[3][p:p+20])) + '\n' 307 res = res + ' ' + ' '.join(map(None,frames[2][p:p+20])) + '\n' 308 res = res + ' '.join(map(None,frames[1][p:p+20])) + '\n' 309 # seq 310 res = res + subseq.lower() + '%5d %%\n' % int(GC(subseq)) 311 res = res + csubseq.lower() + '\n' 312 # - frames 313 res = res + ' '.join(map(None,frames[-2][p:p+20])) +' \n' 314 res = res + ' ' + ' '.join(map(None,frames[-1][p:p+20])) + '\n' 315 res = res + ' ' + ' '.join(map(None,frames[-3][p:p+20])) + '\n\n' 316 return res
317 318 # }}} 319 320 ###################################### 321 # FASTA file utilities 322 ###################### 323 # {{{ 324
325 -def fasta_uniqids(file):
326 " checks and changes the name/ID's to be unique identifiers by adding numbers " 327 dict = {} 328 txt = open(file).read() 329 entries = [] 330 for entry in txt.split('>')[1:]: 331 name, seq= entry.split('\n',1) 332 name = name.split()[0].split(',')[0] 333 334 if dict.has_key(name): 335 n = 1 336 while 1: 337 n = n + 1 338 _name = name + str(n) 339 if not dict.has_key(_name): 340 name = _name 341 break 342 343 dict[name] = seq 344 345 for name, seq in dict.items(): 346 print '>%s\n%s' % (name, seq)
347
348 -def quick_FASTA_reader(file):
349 " simple and FASTA reader, preferable to be used on large files " 350 txt = open(file).read() 351 entries = [] 352 for entry in txt.split('>')[1:]: 353 name,seq= entry.split('\n',1) 354 seq = seq.replace('\n','').replace(' ','').upper() 355 entries.append((name, seq)) 356 357 return entries
358
359 -def apply_on_multi_fasta(file, function, *args):
360 " apply function on each sequence in a multiple FASTA file " 361 try: 362 f = globals()[function] 363 except: 364 raise NotImplementedError, "%s not implemented" % function 365 366 handle = open(file, 'r') 367 records = SeqIO.parse(handle, "fasta") 368 results = [] 369 for record in records: 370 arguments = [record.sequence] 371 for arg in args: arguments.append(arg) 372 result = f(*arguments) 373 if result: 374 results.append('>%s\n%s' % (record.title, result)) 375 return results
376
377 -def quicker_apply_on_multi_fasta(file, function, *args):
378 " apply function on each sequence in a multiple FASTA file " 379 try: 380 f = globals()[function] 381 except: 382 raise NotImplementedError, "%s not implemented" % function 383 384 entries = quick_FASTA_reader(file) 385 results = [] 386 for name, seq in entries: 387 arguments = [seq] 388 for arg in args: arguments.append(arg) 389 result = f(*arguments) 390 if result: 391 results.append('>%s\n%s' % (name, result)) 392 return results
393 394 # }}} 395 396 ###################################### 397 # Main 398 ##################### 399 # {{{ 400 401 if __name__ == '__main__': 402 import sys, getopt 403 # crude command line options to use most functions directly on a FASTA file 404 options = {'apply_on_multi_fasta':0, 405 'quick':0, 406 'uniq_ids':0, 407 } 408 409 optlist, args = getopt.getopt(sys.argv[1:], '', ['describe', 'apply_on_multi_fasta=', 410 'help', 'quick', 'uniq_ids', 'search=']) 411 for arg in optlist: 412 if arg[0] in ['-h', '--help']: 413 pass 414 elif arg[0] in ['--describe']: 415 # get all new functions from this file 416 mol_funcs = [x[0] for x in locals().items() if type(x[1]) == type(GC)] 417 mol_funcs.sort() 418 print 'available functions:' 419 for f in mol_funcs: print '\t--%s' % f 420 print '\n\ne.g.\n./sequtils.py --apply_on_multi_fasta GC test.fas' 421 422 sys.exit(0) 423 elif arg[0] in ['--apply_on_multi_fasta']: 424 options['apply_on_multi_fasta'] = arg[1] 425 elif arg[0] in ['--search']: 426 options['search'] = arg[1] 427 else: 428 key = re.search('-*(.+)', arg[0]).group(1) 429 options[key] = 1 430 431 432 if options.get('apply_on_multi_fasta'): 433 file = args[0] 434 function = options['apply_on_multi_fasta'] 435 arguments = [] 436 if options.get('search'): 437 arguments = options['search'] 438 if function == 'xGC_skew': 439 arguments = 1000 440 if options.get('quick'): 441 results = quicker_apply_on_multi_fasta(file, function, arguments) 442 else: 443 results = apply_on_multi_fasta(file, function, arguments) 444 for result in results: print result 445 446 elif options.get('uniq_ids'): 447 file = args[0] 448 fasta_uniqids(file) 449 450 # }}} 451