1
2
3
4
5
6
7
8
9
10 """Miscellaneous functions for dealing with sequences."""
11
12 import re, time
13 from Bio import SeqIO
14 from Bio.Seq import Seq
15 from Bio import Alphabet
16 from Bio.Alphabet import IUPAC
17 from Bio.Data import IUPACData, CodonTable
18
19
20
21
22
23
24
26 """Reverse the sequence. Works on string sequences.
27
28 e.g.
29 >>> reverse("ACGGT")
30 'TGGCA'
31
32 """
33 r = list(seq)
34 r.reverse()
35 return ''.join(r)
36
38 """Calculates G+C content, returns the percentage (float between 0 and 100).
39
40 Copes mixed case seuqneces, and with the ambiguous nucleotide S (G or C)
41 when counting the G and C content. The percentage is calculated against
42 the full length, e.g.:
43
44 >>> from Bio.SeqUtils import GC
45 >>> GC("ACTGN")
46 40.0
47
48 Note that this will return zero for an empty sequence.
49 """
50 try :
51 gc = sum(map(seq.count,['G','C','g','c','S','s']))
52 return gc*100.0/len(seq)
53 except ZeroDivisionError :
54 return 0.0
55
56
58 """Calculates total G+C content plus first, second and third positions.
59
60 Returns a tuple of four floats (percentages between 0 and 100) for the
61 entire sequence, and the three codon positions. e.g.
62
63 >>> from Bio.SeqUtils import GC123
64 >>> GC123("ACTGTN")
65 (40.0, 50.0, 50.0, 0.0)
66
67 Copes with mixed case sequences, but does NOT deal with ambiguous
68 nucleotides.
69 """
70 d= {}
71 for nt in ['A','T','G','C']:
72 d[nt] = [0,0,0]
73
74 for i in range(0,len(seq),3):
75 codon = seq[i:i+3]
76 if len(codon) <3: codon += ' '
77 for pos in range(0,3):
78 for nt in ['A','T','G','C']:
79 if codon[pos] == nt or codon[pos] == nt.lower():
80 d[nt][pos] += 1
81 gc = {}
82 gcall = 0
83 nall = 0
84 for i in range(0,3):
85 try:
86 n = d['G'][i] + d['C'][i] +d['T'][i] + d['A'][i]
87 gc[i] = (d['G'][i] + d['C'][i])*100.0/n
88 except:
89 gc[i] = 0
90
91 gcall = gcall + d['G'][i] + d['C'][i]
92 nall = nall + n
93
94 gcall = 100.0*gcall/nall
95 return gcall, gc[0], gc[1], gc[2]
96
98 """Calculates GC skew (G-C)/(G+C) for multuple windows along the sequence.
99
100 Returns a list of ratios (floats), controlled by the length of the sequence
101 and the size of the window.
102
103 Does NOT look at any ambiguous nucleotides.
104 """
105
106 values = []
107 for i in range(0, len(seq), window):
108 s = seq[i: i + window]
109 g = s.count('G') + s.count('g')
110 c = s.count('C') + s.count('c')
111 skew = (g-c)/float(g+c)
112 values.append(skew)
113 return values
114
115 from math import pi, sin, cos, log
116 -def xGC_skew(seq, window = 1000, zoom = 100,
117 r = 300, px = 100, py = 100):
118 """Calculates and plots normal and accumulated GC skew (GRAPHICS !!!)."""
119 from Tkinter import Scrollbar, Canvas, BOTTOM, BOTH, ALL, \
120 VERTICAL, HORIZONTAL, RIGHT, LEFT, X, Y
121 yscroll = Scrollbar(orient = VERTICAL)
122 xscroll = Scrollbar(orient = HORIZONTAL)
123 canvas = Canvas(yscrollcommand = yscroll.set,
124 xscrollcommand = xscroll.set, background = 'white')
125 win = canvas.winfo_toplevel()
126 win.geometry('700x700')
127
128 yscroll.config(command = canvas.yview)
129 xscroll.config(command = canvas.xview)
130 yscroll.pack(side = RIGHT, fill = Y)
131 xscroll.pack(side = BOTTOM, fill = X)
132 canvas.pack(fill=BOTH, side = LEFT, expand = 1)
133 canvas.update()
134
135 X0, Y0 = r + px, r + py
136 x1, x2, y1, y2 = X0 - r, X0 + r, Y0 -r, Y0 + r
137
138 ty = Y0
139 canvas.create_text(X0, ty, text = '%s...%s (%d nt)' % (seq[:7], seq[-7:], len(seq)))
140 ty +=20
141 canvas.create_text(X0, ty, text = 'GC %3.2f%%' % (GC(seq)))
142 ty +=20
143 canvas.create_text(X0, ty, text = 'GC Skew', fill = 'blue')
144 ty +=20
145 canvas.create_text(X0, ty, text = 'Accumulated GC Skew', fill = 'magenta')
146 ty +=20
147 canvas.create_oval(x1,y1, x2, y2)
148
149 acc = 0
150 start = 0
151 for gc in GC_skew(seq, window):
152 r1 = r
153 acc+=gc
154
155 alpha = pi - (2*pi*start)/len(seq)
156 r2 = r1 - gc*zoom
157 x1 = X0 + r1 * sin(alpha)
158 y1 = Y0 + r1 * cos(alpha)
159 x2 = X0 + r2 * sin(alpha)
160 y2 = Y0 + r2 * cos(alpha)
161 canvas.create_line(x1,y1,x2,y2, fill = 'blue')
162
163 r1 = r - 50
164 r2 = r1 - acc
165 x1 = X0 + r1 * sin(alpha)
166 y1 = Y0 + r1 * cos(alpha)
167 x2 = X0 + r2 * sin(alpha)
168 y2 = Y0 + r2 * cos(alpha)
169 canvas.create_line(x1,y1,x2,y2, fill = 'magenta')
170
171 canvas.update()
172 start += window
173
174 canvas.configure(scrollregion = canvas.bbox(ALL))
175
181
183 """Search for a DNA subseq in sequence.
184
185 use ambiguous values (like N = A or T or C or G, R = A or G etc.)
186 searches only on forward strand
187 """
188 pattern = ''
189 for nt in subseq:
190 value = IUPACData.ambiguous_dna_values[nt]
191 if len(value) == 1:
192 pattern += value
193 else:
194 pattern += '[%s]' % value
195
196 pos = -1
197 result = [pattern]
198 l = len(seq)
199 while True:
200 pos+=1
201 s = seq[pos:]
202 m = re.search(pattern, s)
203 if not m: break
204 pos += int(m.start(0))
205 result.append(pos)
206 return result
207
208
209
210
211
212
213
214
215
216
217
218 -class ProteinX(Alphabet.ProteinAlphabet):
220
221 proteinX = ProteinX()
222
226 - def get(self, codon, stop_symbol):
231
238
239
240
242 """Turn a one letter code protein sequence into one with three letter codes.
243
244 The single input argument 'seq' should be a protein sequence using single
245 letter codes, either as a python string or as a Seq or MutableSeq object.
246
247 This function returns the amino acid sequence as a string using the three
248 letter amino acid codes. Output follows the IUPAC standard (including
249 ambiguous characters B for "Asx", J for "Xle" and X for "Xaa", and also U
250 for "Sel" and O for "Pyl") plus "Ter" for a terminator given as an asterisk. Any unknown
251 character (including possible gap characters), is changed into 'Xaa'.
252
253 e.g.
254 >>> from Bio.SeqUtils import seq3
255 >>> seq3("MAIVMGRWKGAR*")
256 'MetAlaIleValMetGlyArgTrpLysGlyAlaArgTer'
257
258 This function was inspired by BioPerl's seq3.
259 """
260 threecode = {'A':'Ala', 'B':'Asx', 'C':'Cys', 'D':'Asp',
261 'E':'Glu', 'F':'Phe', 'G':'Gly', 'H':'His',
262 'I':'Ile', 'K':'Lys', 'L':'Leu', 'M':'Met',
263 'N':'Asn', 'P':'Pro', 'Q':'Gln', 'R':'Arg',
264 'S':'Ser', 'T':'Thr', 'V':'Val', 'W':'Trp',
265 'Y':'Tyr', 'Z':'Glx', 'X':'Xaa', '*':'Ter',
266 'U':'Sel', 'O':'Pyl', 'J':'Xle',
267 }
268
269
270 return ''.join([threecode.get(aa,'Xaa') for aa in seq])
271
272
273
274
275
276
277
278
279
280 -def translate(seq, frame = 1, genetic_code = 1, translator = None):
281 """Translation of DNA in one of the six different reading frames (DEPRECATED).
282
283 Use the Bio.Seq.Translate function, or the Seq object's translate method
284 instead:
285
286 >>> from Bio.Seq import Seq
287 >>> my_seq = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG")
288 >>> my_seq = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUA")
289 >>> for frame in [0,1,2] :
290 ... print my_seq[frame:].translate()
291 ...
292 MAIVMGR*KGAR*
293 WPL*WAAERVPDS
294 GHCNGPLKGCPIV
295 >>> for frame in [0,1,2] :
296 ... print my_seq.reverse_complement()[frame:].translate()
297 ...
298 YYRAPFQRPITMA
299 TIGHPFSGPLQWP
300 LSGTLSAAHYNGH
301 """
302 import warnings
303 warnings.warn("Bio.SeqUtils.translate() has been deprecated, and we intend" \
304 +" to remove it in a future release of Biopython. Please use"\
305 +" the method or function in Bio.Seq instead, as described in"\
306 +" the Tutorial.", DeprecationWarning)
307 from Bio import Translate
308
309 if frame not in [1,2,3,-1,-2,-3]:
310 raise ValueError('invalid frame')
311
312 if not translator:
313 table = makeTableX(CodonTable.ambiguous_dna_by_id[genetic_code])
314 translator = Translate.Translator(table)
315
316
317 return translator.translate(Seq(seq[frame-1:], IUPAC.ambiguous_dna)).data
318
320 """Just an alias for six_frame_translations (OBSOLETE).
321
322 Use six_frame_translation directly, as this function may be deprecated
323 in a future release."""
324 return six_frame_translations(seq, genetic_code)
325
327 """Formatted string showing the 6 frame translations and GC content.
328
329 nice looking 6 frame translation with GC content - code from xbbtools
330 similar to DNA Striders six-frame translation
331
332 e.g.
333 from Bio.SeqUtils import six_frame_translations
334 print six_frame_translations("AUGGCCAUUGUAAUGGGCCGCUGA")
335 """
336 from Bio.Seq import reverse_complement, translate
337 anti = reverse_complement(seq)
338 comp = anti[::-1]
339 length = len(seq)
340 frames = {}
341 for i in range(0,3):
342 frames[i+1] = translate(seq[i:], genetic_code)
343 frames[-(i+1)] = reverse(translate(anti[i:], genetic_code))
344
345
346 if length > 20:
347 short = '%s ... %s' % (seq[:10], seq[-10:])
348 else:
349 short = seq
350
351 date = time.strftime('%y %b %d, %X', time.localtime(time.time()))
352 header = 'GC_Frame: %s, ' % date
353 for nt in ['a','t','g','c']:
354 header += '%s:%d ' % (nt, seq.count(nt.upper()))
355
356 header += '\nSequence: %s, %d nt, %0.2f %%GC\n\n\n' % (short.lower(),length, GC(seq))
357 res = header
358
359 for i in range(0,length,60):
360 subseq = seq[i:i+60]
361 csubseq = comp[i:i+60]
362 p = i/3
363 res = res + '%d/%d\n' % (i+1, i/3+1)
364 res = res + ' ' + ' '.join(map(None,frames[3][p:p+20])) + '\n'
365 res = res + ' ' + ' '.join(map(None,frames[2][p:p+20])) + '\n'
366 res = res + ' '.join(map(None,frames[1][p:p+20])) + '\n'
367
368 res = res + subseq.lower() + '%5d %%\n' % int(GC(subseq))
369 res = res + csubseq.lower() + '\n'
370
371 res = res + ' '.join(map(None,frames[-2][p:p+20])) +' \n'
372 res = res + ' ' + ' '.join(map(None,frames[-1][p:p+20])) + '\n'
373 res = res + ' ' + ' '.join(map(None,frames[-3][p:p+20])) + '\n\n'
374 return res
375
376
377
378
379
380
381
382
384 """Checks and changes the name/ID's to be unique identifiers by adding numbers (OBSOLETE).
385
386 file - a FASTA format filename to read in.
387
388 No return value, the output is written to screen.
389 """
390 dict = {}
391 txt = open(file).read()
392 entries = []
393 for entry in txt.split('>')[1:]:
394 name, seq= entry.split('\n',1)
395 name = name.split()[0].split(',')[0]
396
397 if name in dict:
398 n = 1
399 while 1:
400 n = n + 1
401 _name = name + str(n)
402 if _name not in dict:
403 name = _name
404 break
405
406 dict[name] = seq
407
408 for name, seq in dict.items():
409 print '>%s\n%s' % (name, seq)
410
412 """Simple FASTA reader, returning a list of string tuples.
413
414 The single argument 'file' should be the filename of a FASTA format file.
415 This function will open and read in the entire file, constructing a list
416 of all the records, each held as a tuple of strings (the sequence name or
417 title, and its sequence).
418
419 This function was originally intended for use on large files, where its
420 low overhead makes it very fast. However, because it returns the data as
421 a single in memory list, this can require a lot of RAM on large files.
422
423 You are generally encouraged to use Bio.SeqIO.parse(handle, "fasta") which
424 allows you to iterate over the records one by one (avoiding having all the
425 records in memory at once). Using Bio.SeqIO also makes it easy to switch
426 between different input file formats. However, please note that rather
427 than simple strings, Bio.SeqIO uses SeqRecord objects for each record.
428 """
429
430
431
432 handle = open(file)
433 txt = "\n" + handle.read()
434 handle.close()
435 entries = []
436 for entry in txt.split('\n>')[1:]:
437 name,seq= entry.split('\n',1)
438 seq = seq.replace('\n','').replace(' ','').upper()
439 entries.append((name, seq))
440 return entries
441
443 """Apply a function on each sequence in a multiple FASTA file (OBSOLETE).
444
445 file - filename of a FASTA format file
446 function - the function you wish to invoke on each record
447 *args - any extra arguments you want passed to the function
448
449 This function will iterate over each record in a FASTA file as SeqRecord
450 objects, calling your function with the record (and supplied args) as
451 arguments.
452
453 This function returns a list. For those records where your function
454 returns a value, this is taken as a sequence and used to construct a
455 FASTA format string. If your function never has a return value, this
456 means apply_on_multi_fasta will return an empty list.
457 """
458 try:
459 f = globals()[function]
460 except:
461 raise NotImplementedError("%s not implemented" % function)
462
463 handle = open(file, 'r')
464 records = SeqIO.parse(handle, "fasta")
465 results = []
466 for record in records:
467 arguments = [record.sequence]
468 for arg in args: arguments.append(arg)
469 result = f(*arguments)
470 if result:
471 results.append('>%s\n%s' % (record.name, result))
472 handle.close()
473 return results
474
476 """Apply a function on each sequence in a multiple FASTA file (OBSOLETE).
477
478 file - filename of a FASTA format file
479 function - the function you wish to invoke on each record
480 *args - any extra arguments you want passed to the function
481
482 This function will use quick_FASTA_reader to load every record in the
483 FASTA file into memory as a list of tuples. For each record, it will
484 call your supplied function with the record as a tuple of the name and
485 sequence as strings (plus any supplied args).
486
487 This function returns a list. For those records where your function
488 returns a value, this is taken as a sequence and used to construct a
489 FASTA format string. If your function never has a return value, this
490 means quicker_apply_on_multi_fasta will return an empty list.
491 """
492 try:
493 f = globals()[function]
494 except:
495 raise NotImplementedError("%s not implemented" % function)
496
497 entries = quick_FASTA_reader(file)
498 results = []
499 for name, seq in entries:
500 arguments = [seq]
501 for arg in args: arguments.append(arg)
502 result = f(*arguments)
503 if result:
504 results.append('>%s\n%s' % (name, result))
505 handle.close()
506 return results
507
508
509
510
511
512
513
514
515 if __name__ == '__main__':
516 import sys, getopt
517
518 options = {'apply_on_multi_fasta':0,
519 'quick':0,
520 'uniq_ids':0,
521 }
522
523 optlist, args = getopt.getopt(sys.argv[1:], '', ['describe', 'apply_on_multi_fasta=',
524 'help', 'quick', 'uniq_ids', 'search='])
525 for arg in optlist:
526 if arg[0] in ['-h', '--help']:
527 pass
528 elif arg[0] in ['--describe']:
529
530 mol_funcs = [x[0] for x in locals().items() if type(x[1]) == type(GC)]
531 mol_funcs.sort()
532 print 'available functions:'
533 for f in mol_funcs: print '\t--%s' % f
534 print '\n\ne.g.\n./sequtils.py --apply_on_multi_fasta GC test.fas'
535
536 sys.exit(0)
537 elif arg[0] in ['--apply_on_multi_fasta']:
538 options['apply_on_multi_fasta'] = arg[1]
539 elif arg[0] in ['--search']:
540 options['search'] = arg[1]
541 else:
542 key = re.search('-*(.+)', arg[0]).group(1)
543 options[key] = 1
544
545
546 if options.get('apply_on_multi_fasta'):
547 file = args[0]
548 function = options['apply_on_multi_fasta']
549 arguments = []
550 if options.get('search'):
551 arguments = options['search']
552 if function == 'xGC_skew':
553 arguments = 1000
554 if options.get('quick'):
555 results = quicker_apply_on_multi_fasta(file, function, arguments)
556 else:
557 results = apply_on_multi_fasta(file, function, arguments)
558 for result in results: print result
559
560 elif options.get('uniq_ids'):
561 file = args[0]
562 fasta_uniqids(file)
563
564
565