Package Bio :: Package GFF
[hide private]
[frames] | no frames]

Source Code for Package Bio.GFF

  1  #!/usr/bin/env python 
  2  # 
  3  # Copyright 2002 by Michael Hoffman.  All rights reserved. 
  4  # This code is part of the Biopython distribution and governed by its 
  5  # license.  Please see the LICENSE file that should have been included 
  6  # as part of this package. 
  7   
  8  """ 
  9  Access to General Feature Format databases created with Bio::DB:GFF 
 10   
 11  based on documentation for Lincoln Stein's Perl Bio::DB::GFF 
 12   
 13  >>> import os 
 14  >>> import Bio.GFF 
 15  >>> PASSWORD = os.environ['MYSQLPASS'] 
 16  >>> DATABASE_GFF = 'wormbase_ws93' 
 17  >>> FASTADIR = '/local/wormbase_ws93/' 
 18  >>> gff = Bio.GFF.Connection(passwd=PASSWORD, db=DATABASE_GFF, fastadir=FASTADIR) 
 19   
 20  """ 
 21   
 22  __version__ = "$Revision: 1.10 $" 
 23  # $Source: /home/repository/biopython/biopython/Bio/GFF/__init__.py,v $ 
 24   
 25  import exceptions 
 26  import operator 
 27  import os.path 
 28  import sys 
 29  import types 
 30   
 31  from Bio import MissingExternalDependencyError 
 32   
 33  try: 
 34      import MySQLdb 
 35  except: 
 36      raise MissingExternalDependencyError("Install MySQLdb if you want to use Bio.GFF.") 
 37   
 38  from Bio.Alphabet import IUPAC 
 39  from Bio import DocSQL 
 40  from Bio.Seq import Seq 
 41  from Bio.SeqRecord import SeqRecord 
 42   
 43  import binning 
 44  import easy 
 45  import GenericTools 
 46   
 47   
 48  DEFAULT_ALPHABET = IUPAC.unambiguous_dna 
 49   
50 -class Segment(object):
51 """ 52 this will only work for the simplest of easy.Location objects 53 """
54 - def __init__(self, gff, location=None):
55 self.gff = gff 56 if location is None: 57 self.seqname = None 58 self.coords = None 59 self.complement = None 60 else: 61 self.seqname = location.seqname 62 self.coords = [location.start(), location.end()] 63 self.complement = location.complement
64
65 - def features(self, *args, **keywds):
66 return FeatureQuery(self.seqname, self.coords, self.complement, connection=self.gff.db, *args, **keywds)
67
68 - def single_feature(self, *args, **keywds):
69 features = self.features(*args, **keywds) 70 all_features = GenericTools.all(features) 71 assert len(all_features) == 1 72 return all_features[0]
73
74 -class Connection(Segment):
75 """ 76 Connection to GFF database 77 78 also functions as whole segment 79 """ 80
81 - def __init__(self, *args, **keywds):
82 try: 83 RetrieveSeqname._dir = keywds['fastadir'] 84 del keywds['fastadir'] 85 except KeyError: 86 RetrieveSeqname._dir = '.' 87 self.db = MySQLdb.connect(*args, **keywds) 88 Segment.__init__(self, self)
89
90 - def segment(self, *args, **keywds):
91 return Segment(self, *args, **keywds)
92
93 -class RetrieveSeqname(GenericTools.Surrogate, SeqRecord):
94 """ 95 Singleton: contain records of loaded FASTA files 96 97 >>> RetrieveSeqname._dir = 'GFF' 98 >>> RetrieveSeqname._diagnostics = 1 99 >>> sys.stderr = sys.stdout # dump diagnostics to stdout so doctest can see them 100 >>> record1 = RetrieveSeqname('NC_001802.fna') 101 Loading GFF/NC_001802.fna... Done. 102 >>> record1.id 103 'gi|9629357|ref|NC_001802.1|' 104 >>> record2 = RetrieveSeqname('NC_001802.fna') 105 >>> record1.seq is record2.seq # should be same space in memory 106 1 107 """ 108 __records = {} 109 _diagnostics = 0 110
111 - def __init__(self, seqname):
112 try: 113 GenericTools.Surrogate.__init__(self, self.__records[seqname]) 114 except KeyError: 115 filename = os.path.join(self._dir, seqname) 116 if self._diagnostics: 117 sys.stderr.write("Loading %s..." % filename) 118 record = easy.fasta_single(filename) 119 self.__records[seqname] = record 120 GenericTools.Surrogate.__init__(self, self.__records[seqname]) 121 if self._diagnostics: 122 print >>sys.stderr, " Done."
123
124 -class Feature(object):
125 """ 126 strand may be: 127 +/0 = Watson 128 -/1 = Crick 129 130 I propose that we start calling these the Rosalind and Franklin strands 131 132 >>> RetrieveSeqname._dir = 'GFF' 133 >>> feature = Feature("NC_001802x.fna", 73, 78) # not having the x will interfere with the RetrieveSequence test 134 >>> feature.seq() 135 Seq('AATAAA', Alphabet()) 136 >>> print feature.location() 137 NC_001802x.fna:73..78 138 >>> from Bio import SeqIO 139 >>> record = feature.record() 140 >>> records = [record] 141 >>> SeqIO.write(records, sys.stdout, 'fasta') 142 > NC_001802x.fna:73..78 143 AATAAA 144 >>> feature2 = Feature(location=easy.LocationFromString("NC_001802x.fna:73..78")) 145 >>> writer.write(feature2.record()) 146 > NC_001802x.fna:73..78 147 AATAAA 148 >>> location3 = easy.LocationFromString("NC_001802x.fna:complement(73..78)") 149 >>> feature3 = Feature(location=location3) 150 >>> writer.write(feature3.record()) 151 > NC_001802x.fna:complement(73..78) 152 TTTATT 153 >>> location4 = easy.LocationFromString("NC_001802x.fna:336..1631") 154 >>> feature4 = Feature(location=location4, frame=0) 155 >>> feature4.frame 156 0 157 >>> feature4.translate()[:7] 158 Seq('MGARASV', HasStopCodon(IUPACProtein(), '*')) 159 >>> feature4.frame = 6 # can't happen, but a useful demonstration 160 >>> feature4.translate()[:5] 161 Seq('ARASV', HasStopCodon(IUPACProtein(), '*')) 162 >>> feature4.frame = 1 163 >>> feature4.translate()[:5] 164 Seq('WVRER', HasStopCodon(IUPACProtein(), '*')) 165 >>> location5 = easy.LocationFromString("NC_001802lc.fna:336..1631") # lowercase data 166 >>> feature5 = Feature(location=location5, frame=0) 167 >>> feature5.translate()[:7] 168 Seq('MGARASV', HasStopCodon(IUPACProtein(), '*')) 169 >>> location6 = easy.LocationFromString("NC_001802lc.fna:335..351") 170 >>> feature6 = Feature(location=location6, frame=1) 171 >>> feature6.translate() 172 Seq('MGARA', HasStopCodon(IUPACProtein(), '*')) 173 """
174 - def __init__(self, 175 seqname=None, 176 start=None, 177 end=None, 178 strand="+", 179 frame=None, 180 location=None, 181 alphabet=DEFAULT_ALPHABET):
182 if not isinstance(seqname, (types.StringType, types.NoneType)): 183 raise exceptions.TypeError("seqname needs to be string") 184 self.frame = frame 185 self.alphabet = alphabet 186 if location is None: 187 self.seqname = seqname 188 self.start = start 189 self.end = end 190 self.strand = strand 191 return 192 else: 193 self.seqname = location.seqname 194 self.start = location.start() + 1 195 self.end = location.end() + 1 196 self.strand = location.complement
197
198 - def __hash__(self):
199 return hash((self.seqname, 200 self.start, 201 self.end, 202 self.strand, 203 self.frame, 204 self.alphabet))
205
206 - def seq(self):
207 rec = RetrieveSeqname(self.seqname) 208 return easy.record_subseq(rec, self.location(), upper=1)
209
210 - def translate(self):
211 seq = self.seq() 212 try: 213 seq = Seq(seq.tostring()[self.frame:], self.alphabet) 214 except TypeError: 215 seq.alphabet = self.alphabet 216 try: 217 return seq.translate() #default table 218 except AssertionError: 219 # if the feature was pickled then we have problems 220 import cPickle 221 if cPickle.dumps(seq.alphabet) == cPickle.dumps(DEFAULT_ALPHABET): 222 seq.alphabet = DEFAULT_ALPHABET 223 return seq.translate() 224 else: 225 raise
226
227 - def location(self):
228 return easy.LocationFromCoords(self.start-1, self.end-1, self.strand, seqname=self.seqname)
229
230 - def target_location(self):
231 if self.target_start <= self.target_end: 232 return easy.LocationFromCoords(self.target_start-1, self.target_end-1, 0, seqname=self.gname) 233 else: 234 # switch start and end and make it complement: 235 return easy.LocationFromCoords(self.target_end-1, self.target_start-1, 1, seqname=self.gname)
236
237 - def id(self):
238 try: 239 return "%s:%s" % (self.gclass, self.gname) 240 except AttributeError: 241 return ""
242
243 - def record(self):
244 return SeqRecord(self.seq(), self.id(), "", self.location())
245
246 - def xrange(self):
247 return xrange(self.start, self.end)
248
249 - def __str__(self):
250 return "Feature(%s)" % self.location()
251
252 -class FeatureQueryRow(DocSQL.QueryRow, Feature):
253 """ 254 row of FeatureQuery results 255 works like a Feature 256 """
257 - def __init__(self, *args, **keywds):
258 DocSQL.QueryRow.__init__(self, *args, **keywds) 259 try: 260 self.frame = int(self.frame) 261 except ValueError: 262 self.frame = '' 263 except TypeError: 264 self.frame = None 265 self.alphabet = DEFAULT_ALPHABET
266
267 -class FeatureQuery(DocSQL.Query):
268 """ 269 SELECT fdata.fref AS seqname, 270 ftype.fsource AS source, 271 ftype.fmethod AS feature, 272 fdata.fstart AS start, 273 fdata.fstop AS end, 274 fdata.fscore AS score, 275 fdata.fstrand AS strand, 276 fdata.fphase AS frame, 277 fdata.ftarget_start AS target_start, 278 fdata.ftarget_stop AS target_end, 279 fdata.gid, 280 fgroup.gclass, 281 fgroup.gname 282 FROM fdata, ftype, fgroup 283 WHERE fdata.ftypeid = ftype.ftypeid 284 AND fdata.gid = fgroup.gid 285 """ 286 287 row_class = FeatureQueryRow
288 - def __init__(self, 289 seqname=None, 290 coords=None, 291 complement=0, 292 method=None, 293 source=None, 294 object=None, # "class:name" 295 gid=None, 296 *args, 297 **keywds):
298 299 DocSQL.Query.__init__(self, *args, **keywds) 300 301 if seqname is not None: 302 self.statement += ' AND fref="%s"\n' % seqname 303 if coords is not None: 304 self.statement += " AND (%s)\n" % binning.query(coords[0], coords[1]) 305 if coords[0] is not None: 306 self.statement += (" AND fstop >= %s\n" % coords[0]) 307 if coords[1] is not None: 308 self.statement += (" AND fstart <= %s\n" % coords[1]) 309 if method is not None: 310 self.statement += ' AND fmethod LIKE "%s"\n' % method # LIKE allows SQL "%" wildcards 311 if source is not None: 312 self.statement += ' AND fsource LIKE "%s"\n' % source 313 if object is not None: 314 self.statement += ' AND %s\n' % object2fgroup_sql(object) 315 if gid is not None: 316 self.statement += ' AND fgroup.gid = %s\n' % gid 317 318 if complement: 319 self.statement += " ORDER BY 0-fstart, 0-fstop" 320 else: 321 self.statement += " ORDER BY fstart, fstop"
322
323 - def aggregate(self):
324 return FeatureAggregate(self)
325
326 -def object2fgroup_sql(object):
327 return 'gclass LIKE "%s" AND gname LIKE "%s"' % object_split(object)
328
329 -class FeatureAggregate(list, Feature):
330 """ 331 >>> feature1_1 = Feature(location=easy.LocationFromString("NC_001802x.fna:336..1631"), frame=0) # gag-pol 332 >>> feature1_2 = Feature(location=easy.LocationFromString("NC_001802x.fna:1631..4642"), frame=0) # slippage 333 >>> aggregate = FeatureAggregate([feature1_1, feature1_2]) 334 >>> print aggregate.location() 335 join(NC_001802x.fna:336..1631,NC_001802x.fna:1631..4642) 336 >>> xlate_str = aggregate.translate().tostring() 337 >>> xlate_str[:5], xlate_str[-5:] 338 ('MGARA', 'RQDED') 339 340 >>> location1 = easy.LocationFromString("NC_001802x.fna:complement(1..6)") 341 >>> location2 = easy.LocationFromString("NC_001802x.fna:complement(7..12)") 342 >>> feature2_1 = Feature(location=location1, frame=0) 343 >>> feature2_2 = Feature(location=location2, frame=0) 344 >>> aggregate2 = FeatureAggregate([feature2_1, feature2_2]) 345 >>> print aggregate2.location() 346 complement(join(NC_001802x.fna:1..6,NC_001802x.fna:7..12)) 347 >>> print aggregate2.translate() 348 Seq('TRET', HasStopCodon(IUPACProtein(), '*')) 349 >>> location1.reverse() 350 >>> location2.reverse() 351 >>> aggregate3 = FeatureAggregate([Feature(location=x, frame=0) for x in [location1, location2]]) 352 >>> print aggregate3.location() 353 join(NC_001802x.fna:1..6,NC_001802x.fna:7..12) 354 >>> print aggregate3.translate() 355 Seq('GLSG', HasStopCodon(IUPACProtein(), '*')) 356 >>> aggregate3[0].frame = 3 357 >>> print aggregate3.translate() 358 Seq('LSG', HasStopCodon(IUPACProtein(), '*')) 359 360 >>> aggregate4 = FeatureAggregate() 361 >>> aggregate4.append(Feature(location=easy.LocationFromString("NC_001802x.fna:1..5"), frame=0)) 362 >>> aggregate4.append(Feature(location=easy.LocationFromString("NC_001802x.fna:6..12"), frame=2)) 363 >>> aggregate4.seq() 364 Seq('GGTCTCTCTGGT', Alphabet()) 365 >>> aggregate4.translate() 366 Seq('GLSG', HasStopCodon(IUPACProtein(), '*')) 367 """
368 - def __init__(self, feature_query=None):
369 if feature_query is None: 370 list.__init__(self, []) 371 else: 372 list.__init__(self, map(lambda x: x, feature_query))
373
374 - def location(self):
375 loc = easy.LocationJoin(map(lambda x: x.location(), self)) 376 loc.reorient() 377 return loc
378
379 - def map(self, func):
380 mapped = map(func, self) 381 if self.location().complement: 382 mapped.reverse() 383 return reduce(operator.add, mapped)
384
385 - def seq(self):
386 return self.map(lambda x: x.seq())
387
388 - def translate(self):
389 attributes = ['alphabet', 'frame'] 390 index = [0, -1][self.location().complement] 391 for attribute in attributes: 392 self.__dict__[attribute] = self[index].__dict__[attribute] 393 translation = Feature.translate(self) 394 try: 395 assert translation.tostring().index("*") == len(translation) - 1 396 return translation[:translation.tostring().index("*")] 397 except ValueError: 398 return translation
399
400 -def object_split(object):
401 """ 402 >>> object_split("Sequence:F02E9.2a") 403 ('Sequence', 'F02E9.2a') 404 """ 405 return tuple(object.split(":"))
406
407 -def _test(*args, **keywds):
408 import doctest, sys 409 doctest.testmod(sys.modules[__name__], *args, **keywds)
410 411 if __name__ == "__main__": 412 if __debug__: 413 _test() 414