1
2
3
4
5
6
7
8 """
9 Access to General Feature Format databases created with Bio::DB:GFF
10
11 based on documentation for Lincoln Stein's Perl Bio::DB::GFF
12
13 >>> import os
14 >>> import Bio.GFF
15 >>> PASSWORD = os.environ['MYSQLPASS']
16 >>> DATABASE_GFF = 'wormbase_ws93'
17 >>> FASTADIR = '/local/wormbase_ws93/'
18 >>> gff = Bio.GFF.Connection(passwd=PASSWORD, db=DATABASE_GFF, fastadir=FASTADIR)
19
20 """
21
22 __version__ = "$Revision: 1.10 $"
23
24
25 import exceptions
26 import operator
27 import os.path
28 import sys
29 import types
30
31 from Bio import MissingExternalDependencyError
32
33 try:
34 import MySQLdb
35 except:
36 raise MissingExternalDependencyError("Install MySQLdb if you want to use Bio.GFF.")
37
38 from Bio.Alphabet import IUPAC
39 from Bio import DocSQL
40 from Bio.Seq import Seq
41 from Bio.SeqRecord import SeqRecord
42
43 import binning
44 import easy
45 import GenericTools
46
47
48 DEFAULT_ALPHABET = IUPAC.unambiguous_dna
49
51 """
52 this will only work for the simplest of easy.Location objects
53 """
64
67
73
75 """
76 Connection to GFF database
77
78 also functions as whole segment
79 """
80
89
90 - def segment(self, *args, **keywds):
91 return Segment(self, *args, **keywds)
92
94 """
95 Singleton: contain records of loaded FASTA files
96
97 >>> RetrieveSeqname._dir = 'GFF'
98 >>> RetrieveSeqname._diagnostics = 1
99 >>> sys.stderr = sys.stdout # dump diagnostics to stdout so doctest can see them
100 >>> record1 = RetrieveSeqname('NC_001802.fna')
101 Loading GFF/NC_001802.fna... Done.
102 >>> record1.id
103 'gi|9629357|ref|NC_001802.1|'
104 >>> record2 = RetrieveSeqname('NC_001802.fna')
105 >>> record1.seq is record2.seq # should be same space in memory
106 1
107 """
108 __records = {}
109 _diagnostics = 0
110
123
125 """
126 strand may be:
127 +/0 = Watson
128 -/1 = Crick
129
130 I propose that we start calling these the Rosalind and Franklin strands
131
132 >>> RetrieveSeqname._dir = 'GFF'
133 >>> feature = Feature("NC_001802x.fna", 73, 78) # not having the x will interfere with the RetrieveSequence test
134 >>> feature.seq()
135 Seq('AATAAA', Alphabet())
136 >>> print feature.location()
137 NC_001802x.fna:73..78
138 >>> from Bio import SeqIO
139 >>> record = feature.record()
140 >>> records = [record]
141 >>> SeqIO.write(records, sys.stdout, 'fasta')
142 > NC_001802x.fna:73..78
143 AATAAA
144 >>> feature2 = Feature(location=easy.LocationFromString("NC_001802x.fna:73..78"))
145 >>> writer.write(feature2.record())
146 > NC_001802x.fna:73..78
147 AATAAA
148 >>> location3 = easy.LocationFromString("NC_001802x.fna:complement(73..78)")
149 >>> feature3 = Feature(location=location3)
150 >>> writer.write(feature3.record())
151 > NC_001802x.fna:complement(73..78)
152 TTTATT
153 >>> location4 = easy.LocationFromString("NC_001802x.fna:336..1631")
154 >>> feature4 = Feature(location=location4, frame=0)
155 >>> feature4.frame
156 0
157 >>> feature4.translate()[:7]
158 Seq('MGARASV', HasStopCodon(IUPACProtein(), '*'))
159 >>> feature4.frame = 6 # can't happen, but a useful demonstration
160 >>> feature4.translate()[:5]
161 Seq('ARASV', HasStopCodon(IUPACProtein(), '*'))
162 >>> feature4.frame = 1
163 >>> feature4.translate()[:5]
164 Seq('WVRER', HasStopCodon(IUPACProtein(), '*'))
165 >>> location5 = easy.LocationFromString("NC_001802lc.fna:336..1631") # lowercase data
166 >>> feature5 = Feature(location=location5, frame=0)
167 >>> feature5.translate()[:7]
168 Seq('MGARASV', HasStopCodon(IUPACProtein(), '*'))
169 >>> location6 = easy.LocationFromString("NC_001802lc.fna:335..351")
170 >>> feature6 = Feature(location=location6, frame=1)
171 >>> feature6.translate()
172 Seq('MGARA', HasStopCodon(IUPACProtein(), '*'))
173 """
174 - def __init__(self,
175 seqname=None,
176 start=None,
177 end=None,
178 strand="+",
179 frame=None,
180 location=None,
181 alphabet=DEFAULT_ALPHABET):
197
205
209
226
229
231 if self.target_start <= self.target_end:
232 return easy.LocationFromCoords(self.target_start-1, self.target_end-1, 0, seqname=self.gname)
233 else:
234
235 return easy.LocationFromCoords(self.target_end-1, self.target_start-1, 1, seqname=self.gname)
236
238 try:
239 return "%s:%s" % (self.gclass, self.gname)
240 except AttributeError:
241 return ""
242
245
248
250 return "Feature(%s)" % self.location()
251
253 """
254 row of FeatureQuery results
255 works like a Feature
256 """
266
268 """
269 SELECT fdata.fref AS seqname,
270 ftype.fsource AS source,
271 ftype.fmethod AS feature,
272 fdata.fstart AS start,
273 fdata.fstop AS end,
274 fdata.fscore AS score,
275 fdata.fstrand AS strand,
276 fdata.fphase AS frame,
277 fdata.ftarget_start AS target_start,
278 fdata.ftarget_stop AS target_end,
279 fdata.gid,
280 fgroup.gclass,
281 fgroup.gname
282 FROM fdata, ftype, fgroup
283 WHERE fdata.ftypeid = ftype.ftypeid
284 AND fdata.gid = fgroup.gid
285 """
286
287 row_class = FeatureQueryRow
288 - def __init__(self,
289 seqname=None,
290 coords=None,
291 complement=0,
292 method=None,
293 source=None,
294 object=None,
295 gid=None,
296 *args,
297 **keywds):
298
299 DocSQL.Query.__init__(self, *args, **keywds)
300
301 if seqname is not None:
302 self.statement += ' AND fref="%s"\n' % seqname
303 if coords is not None:
304 self.statement += " AND (%s)\n" % binning.query(coords[0], coords[1])
305 if coords[0] is not None:
306 self.statement += (" AND fstop >= %s\n" % coords[0])
307 if coords[1] is not None:
308 self.statement += (" AND fstart <= %s\n" % coords[1])
309 if method is not None:
310 self.statement += ' AND fmethod LIKE "%s"\n' % method
311 if source is not None:
312 self.statement += ' AND fsource LIKE "%s"\n' % source
313 if object is not None:
314 self.statement += ' AND %s\n' % object2fgroup_sql(object)
315 if gid is not None:
316 self.statement += ' AND fgroup.gid = %s\n' % gid
317
318 if complement:
319 self.statement += " ORDER BY 0-fstart, 0-fstop"
320 else:
321 self.statement += " ORDER BY fstart, fstop"
322
325
327 return 'gclass LIKE "%s" AND gname LIKE "%s"' % object_split(object)
328
330 """
331 >>> feature1_1 = Feature(location=easy.LocationFromString("NC_001802x.fna:336..1631"), frame=0) # gag-pol
332 >>> feature1_2 = Feature(location=easy.LocationFromString("NC_001802x.fna:1631..4642"), frame=0) # slippage
333 >>> aggregate = FeatureAggregate([feature1_1, feature1_2])
334 >>> print aggregate.location()
335 join(NC_001802x.fna:336..1631,NC_001802x.fna:1631..4642)
336 >>> xlate_str = aggregate.translate().tostring()
337 >>> xlate_str[:5], xlate_str[-5:]
338 ('MGARA', 'RQDED')
339
340 >>> location1 = easy.LocationFromString("NC_001802x.fna:complement(1..6)")
341 >>> location2 = easy.LocationFromString("NC_001802x.fna:complement(7..12)")
342 >>> feature2_1 = Feature(location=location1, frame=0)
343 >>> feature2_2 = Feature(location=location2, frame=0)
344 >>> aggregate2 = FeatureAggregate([feature2_1, feature2_2])
345 >>> print aggregate2.location()
346 complement(join(NC_001802x.fna:1..6,NC_001802x.fna:7..12))
347 >>> print aggregate2.translate()
348 Seq('TRET', HasStopCodon(IUPACProtein(), '*'))
349 >>> location1.reverse()
350 >>> location2.reverse()
351 >>> aggregate3 = FeatureAggregate([Feature(location=x, frame=0) for x in [location1, location2]])
352 >>> print aggregate3.location()
353 join(NC_001802x.fna:1..6,NC_001802x.fna:7..12)
354 >>> print aggregate3.translate()
355 Seq('GLSG', HasStopCodon(IUPACProtein(), '*'))
356 >>> aggregate3[0].frame = 3
357 >>> print aggregate3.translate()
358 Seq('LSG', HasStopCodon(IUPACProtein(), '*'))
359
360 >>> aggregate4 = FeatureAggregate()
361 >>> aggregate4.append(Feature(location=easy.LocationFromString("NC_001802x.fna:1..5"), frame=0))
362 >>> aggregate4.append(Feature(location=easy.LocationFromString("NC_001802x.fna:6..12"), frame=2))
363 >>> aggregate4.seq()
364 Seq('GGTCTCTCTGGT', Alphabet())
365 >>> aggregate4.translate()
366 Seq('GLSG', HasStopCodon(IUPACProtein(), '*'))
367 """
368 - def __init__(self, feature_query=None):
373
378
379 - def map(self, func):
384
387
389 attributes = ['alphabet', 'frame']
390 index = [0, -1][self.location().complement]
391 for attribute in attributes:
392 self.__dict__[attribute] = self[index].__dict__[attribute]
393 translation = Feature.translate(self)
394 try:
395 assert translation.tostring().index("*") == len(translation) - 1
396 return translation[:translation.tostring().index("*")]
397 except ValueError:
398 return translation
399
401 """
402 >>> object_split("Sequence:F02E9.2a")
403 ('Sequence', 'F02E9.2a')
404 """
405 return tuple(object.split(":"))
406
407 -def _test(*args, **keywds):
408 import doctest, sys
409 doctest.testmod(sys.modules[__name__], *args, **keywds)
410
411 if __name__ == "__main__":
412 if __debug__:
413 _test()
414