1 """Represent a Sequence Feature holding info about a part of a sequence.
2
3 This is heavily modeled after the Biocorba SeqFeature objects, and
4 may be pretty biased towards GenBank stuff since I'm writing it
5 for the GenBank parser output...
6
7 What's here:
8
9 Base class to hold a Feature.
10 ----------------------------
11 classes:
12 o SeqFeature
13
14 Hold information about a Reference.
15 ----------------------------------
16
17 This is an attempt to create a General class to hold Reference type
18 information.
19
20 classes:
21 o Reference
22
23 Specify locations of a feature on a Sequence.
24 ---------------------------------------------
25
26 This aims to handle, in Ewan's words, 'the dreaded fuzziness issue' in
27 much the same way as Biocorba. This has the advantages of allowing us
28 to handle fuzzy stuff in case anyone needs it, and also be compatible
29 with Biocorba.
30
31 classes:
32 o FeatureLocation - Specify the start and end location of a feature.
33
34 o ExactPosition - Specify the position as being exact.
35 o WithinPosition - Specify a position occurring within some range.
36 o BetweenPosition - Specify a position occurring between a range.
37 o BeforePosition - Specify the position as being found before some base.
38 o AfterPosition - Specify the position as being found after some base.
39 """
40
42 """Represent a Sequence Feature on an object.
43
44 Attributes:
45 o location - the location of the feature on the sequence
46 o type - the specified type of the feature (ie. CDS, exon, repeat...)
47 o location_operator - a string specifying how this SeqFeature may
48 be related to others. For example, in the example GenBank feature
49 shown below, the location_operator would be "join"
50 o strand - A value specifying on which strand (of a DNA sequence, for
51 instance) the feature deals with. 1 indicates the plus strand, -1
52 indicates the minus strand, 0 indicates both strands, and None indicates
53 that strand doesn't apply (ie. for proteins) or is not known.
54 o id - A string identifier for the feature.
55 o ref - A reference to another sequence. This could be an accession
56 number for some different sequence.
57 o ref_db - A different database for the reference accession number.
58 o qualifiers - A dictionary of qualifiers on the feature. These are
59 analogous to the qualifiers from a GenBank feature table. The keys of
60 the dictionary are qualifier names, the values are the qualifier
61 values.
62 o sub_features - Additional SeqFeatures which fall under this 'parent'
63 feature. For instance, if we have something like:
64
65 CDS join(1..10,30..40,50..60)
66
67 Then the top level feature would be a CDS from 1 to 60, and the sub
68 features would be of 'CDS_join' type and would be from 1 to 10, 30 to
69 40 and 50 to 60, respectively.
70 """
def __init__(self, location = None, type = '', location_operator = '',
             strand = None, id = "<unknown id>",
             qualifiers = None, sub_features = None,
             ref = None, ref_db = None):
    """Build a SeqFeature from its individual pieces of annotation.

    Each argument is stored unchanged on the attribute of the same name.
    When qualifiers or sub_features is None, the new feature is given
    its own empty dictionary or list instead.
    """
    self.location = location

    self.type = type
    self.location_operator = location_operator
    self.strand = strand
    self.id = id

    # Create a fresh container per instance when none was supplied.
    if qualifiers is None:
        self.qualifiers = {}
    else:
        self.qualifiers = qualifiers

    if sub_features is None:
        self.sub_features = []
    else:
        self.sub_features = sub_features

    self.ref = ref
    self.ref_db = ref_db
91
93 """A string representation of the record for debugging."""
94 answer = "%s(%s" % (self.__class__.__name__, repr(self.location))
95 if self.type :
96 answer += ", type=%s" % repr(self.type)
97 if self.location_operator :
98 answer += ", location_operator=%s" % repr(self.location_operator)
99 if self.strand :
100 answer += ", strand=%s" % repr(self.strand)
101 if self.id and self.id != "<unknown id>" :
102 answer += ", id=%s" % repr(self.id)
103 if self.ref :
104 answer += ", ref=%s" % repr(self.ref)
105 if self.ref_db :
106 answer += ", ref_db=%s" % repr(self.ref_db)
107 answer += ")"
108 return answer
109
111 """A readable summary of the feature intended to be printed to screen.
112 """
113 out = "type: %s\n" % self.type
114 out += "location: %s\n" % self.location
115 out += "ref: %s:%s\n" % (self.ref, self.ref_db)
116 out += "strand: %s\n" % self.strand
117 out += "qualifiers: \n"
118 qualifier_keys = self.qualifiers.keys()
119 qualifier_keys.sort()
120 for qual_key in qualifier_keys:
121 out += " Key: %s, Value: %s\n" % (qual_key,
122 self.qualifiers[qual_key])
123 if len(self.sub_features) != 0:
124 out += "Sub-Features\n"
125 for sub_feature in self.sub_features:
126 out +="%s\n" % sub_feature
127
128 return out
129
131 """Returns a copy of the feature with its location shifted (PRIVATE).
132
133 The annotation qualifiers are copied."""
134 answer = SeqFeature(location = self.location._shift(offset),
135 type = self.type,
136 location_operator = self.location_operator,
137 strand = self.strand,
138 id = self.id,
139
140
141 ref = self.ref,
142 ref_db = self.ref_db)
143
144 answer.sub_features = [f._shift(offset) for f in self.sub_features]
145 answer.qualifiers = dict(self.qualifiers.iteritems())
146 return answer
147
148
149
150
152 """Represent a Generic Reference object.
153
154 Attributes:
155 o location - A list of Location objects specifying regions of
156 the sequence that the references correspond to. If no locations are
157 specified, the entire sequence is assumed.
158 o authors - A big old string, or a list split by author, of authors
159 for the reference.
160 o title - The title of the reference.
161 o journal - Journal the reference was published in.
162 o medline_id - A medline reference for the article.
163 o pubmed_id - A pubmed reference for the article.
164 o comment - A place to stick any comments about the reference.
165 """
175
177 """Output an informative string for debugging.
178 """
179 out = ""
180 for single_location in self.location:
181 out += "location: %s\n" % single_location
182 out += "authors: %s\n" % self.authors
183 if self.consrtm:
184 out += "consrtm: %s\n" % self.consrtm
185 out += "title: %s\n" % self.title
186 out += "journal: %s\n" % self.journal
187 out += "medline id: %s\n" % self.medline_id
188 out += "pubmed id: %s\n" % self.pubmed_id
189 out += "comment: %s\n" % self.comment
190 return out
191
193
194 return "%s(title=%s, ...)" % (self.__class__.__name__,
195 repr(self.title))
196
197
198
200 """Specify the location of a feature along a sequence.
201
202 This attempts to deal with fuzziness of position ends, but also
203 make it easy to get the start and end in the 'normal' case (no
204 fuzziness).
205
206 You should access the start and end attributes with
207 your_location.start and your_location.end. If the start and
208 end are exact, this will return the positions, if not, we'll return
209 the appropriate Fuzzy class with info about the position and fuzziness.
210
211 Note that the start and end location numbering follow Python's scheme,
212 thus a GenBank entry of 123..150 (one based counting) becomes a location
213 of [122:150] (zero based counting).
214 """
216 """Specify the start and end of a sequence feature.
217
218 start and end arguments specify the values where the feature begins
219 and ends. These can either be any of the *Position objects that
220 inherit from AbstractPosition, or can just be integers specifying the
221 position. In the case of integers, the values are assumed to be
222 exact and are converted into ExactPosition objects. This is meant
223 to make it easy to deal with non-fuzzy ends.
224 """
225 if isinstance(start, AbstractPosition):
226 self._start = start
227 else:
228 self._start = ExactPosition(start)
229
230 if isinstance(end, AbstractPosition):
231 self._end = end
232 else:
233 self._end = ExactPosition(end)
234
236 """Returns a representation of the location (with python counting).
237
238 For the simple case this uses the python slicing syntax, [122:150]
239 (zero based counting) which GenBank would call 123..150 (one based
240 counting).
241 """
242 return "[%s:%s]" % (self._start, self._end)
243
245 """A string representation of the location for debugging."""
246 return "%s(%s,%s)" \
247 % (self.__class__.__name__, repr(self.start), repr(self.end))
248
253
254 start = property(fget= lambda self : self._start,
255 doc="Start location (possibly a fuzzy position, read only).")
256
257 end = property(fget= lambda self : self._end,
258 doc="End location (possibly a fuzzy position, read only).")
259
268 nofuzzy_start = property(fget=_get_nofuzzy_start,
269 doc="""Start position (integer, approximated if fuzzy, read only).
270
271 To get non-fuzzy attributes (ie. the position only) ask for
272 'location.nofuzzy_start', 'location.nofuzzy_end'. These should return
273 the largest range of the fuzzy position. So something like:
274 (10.20)..(30.40) should return 10 for start, and 40 for end.
275 """)
276
278
279 if ((self._start == self._end) and isinstance(self._start,
280 BetweenPosition)):
281 return self._end.position
282 else:
283 return max(self._end.position,
284 self._end.position + self._end.extension)
285 nofuzzy_end = property(fget=_get_nofuzzy_end,
286 doc="""End position (integer, approximated if fuzzy, read only).
287
288 To get non-fuzzy attributes (ie. the position only) ask for
289 'location.nofuzzy_start', 'location.nofuzzy_end'. These should return
290 the largest range of the fuzzy position. So something like:
291 (10.20)..(30.40) should return 10 for start, and 40 for end.
292 """)
293
295 """Abstract base class representing a position.
296 """
def __init__(self, position, extension):
    """Record the position and its extension on the instance.

    o position - Value stored as self.position.
    o extension - Value stored as self.extension.
    """
    self.position, self.extension = position, extension
300
302 """String representation of the location for debugging."""
303 return "%s(%s,%s)" % (self.__class__.__name__, \
304 repr(self.position), repr(self.extension))
305
307 """A simple comparison function for positions.
308
309 This is very simple-minded and just compares the position attribute
310 of the features; extensions are not considered at all. This could
311 potentially be expanded to try to take advantage of extensions.
312 """
313 assert isinstance(other, AbstractPosition), \
314 "We can only do comparisons between Biopython Position objects."
315
316 return cmp(self.position, other.position)
317
319
320 return self.__class__(self.position + offset, self.extension)
321
323 """Specify the specific position of a boundary.
324
325 o position - The position of the boundary.
326 o extension - An optional argument which must be zero since we don't
327 have an extension. The argument is provided so that the same number of
328 arguments can be passed to all position types.
329
330 In this case, there is no fuzziness associated with the position.
331 """
def __init__(self, position, extension = 0):
    """Create a position that has no fuzziness.

    o position - The position of the boundary.
    o extension - Must be zero; it is accepted only so that every
    position type can be constructed with the same arguments.

    Raises AttributeError when a non-zero extension is supplied.
    """
    if extension != 0:
        message = "Non-zero extension %s for exact position." % extension
        raise AttributeError(message)
    # The stored extension is always the literal zero.
    AbstractPosition.__init__(self, position, 0)
337
339 """String representation of the ExactPosition location for debugging."""
340 assert self.extension == 0
341 return "%s(%s)" % (self.__class__.__name__, repr(self.position))
342
344 return str(self.position)
345
347 """Specify the position of a boundary within some coordinates.
348
349 Arguments:
350 o position - The start position of the boundary
351 o extension - The range to which the boundary can extend.
352
353 This allows dealing with a position like ((1.4)..100). This
354 indicates that the start of the sequence is somewhere between 1
355 and 4. To represent that with this class we would set position as
356 1 and extension as 3.
357 """
358 - def __init__(self, position, extension = 0):
360
362 return "(%s.%s)" % (self.position, self.position + self.extension)
363
365 """Specify the position of a boundary between two coordinates.
366
367 Arguments:
368 o position - The start position of the boundary.
369 o extension - The range to the other position of a boundary.
370
371 This specifies a coordinate which is found between the two positions.
372 So this allows us to deal with a position like ((1^2)..100). To
373 represent that with this class we set position as 1 and the
374 extension as 1.
375 """
376 - def __init__(self, position, extension = 0):
378
380 return "(%s^%s)" % (self.position, self.position + self.extension)
381
383 """Specify a position where the actual location occurs before it.
384
385 Arguments:
386 o position - The upper boundary of where the location can occur.
387 o extension - An optional argument which must be zero since we don't
388 have an extension. The argument is provided so that the same number of
389 arguments can be passed to all position types.
390
391 This is used to specify positions like (<10..100) where the location
392 occurs somewhere before position 10.
393 """
def __init__(self, position, extension = 0):
    """Create a position whose true location lies before the given one.

    o position - The upper boundary of where the location can occur.
    o extension - Must be zero; it is accepted only so that every
    position type can be constructed with the same arguments.

    Raises AttributeError when a non-zero extension is supplied.
    """
    if extension != 0:
        # Name the position type being built so the error is not
        # mistaken for one coming from an exact position.
        raise AttributeError("Non-zero extension %s for before position."
                             % extension)
    # The stored extension is always the literal zero.
    AbstractPosition.__init__(self, position, 0)
399
401 """A string representation of the location for debugging."""
402 assert self.extension == 0
403 return "%s(%s)" % (self.__class__.__name__, repr(self.position))
404
406 return "<%s" % self.position
407
409 """Specify a position where the actual location is found after it.
410
411 Arguments:
412 o position - The lower boundary of where the location can occur.
413 o extension - An optional argument which must be zero since we don't
414 have an extension. The argument is provided so that the same number of
415 arguments can be passed to all position types.
416
417 This is used to specify positions like (>10..100) where the location
418 occurs somewhere after position 10.
419 """
def __init__(self, position, extension = 0):
    """Create a position whose true location lies after the given one.

    o position - The lower boundary of where the location can occur.
    o extension - Must be zero; it is accepted only so that every
    position type can be constructed with the same arguments.

    Raises AttributeError when a non-zero extension is supplied.
    """
    if extension != 0:
        # Name the position type being built so the error is not
        # mistaken for one coming from an exact position.
        raise AttributeError("Non-zero extension %s for after position."
                             % extension)
    # The stored extension is always the literal zero.
    AbstractPosition.__init__(self, position, 0)
425
427 """A string representation of the location for debugging."""
428 assert self.extension == 0
429 return "%s(%s)" % (self.__class__.__name__, repr(self.position))
430
432 return ">%s" % self.position
433
435 """Specify a position where the location can be multiple positions.
436
437 This models the GenBank 'one-of(1888,1901)' function, and tries
438 to make this fit within the Biopython Position models. In our case
439 the position of the "one-of" is set as the lowest choice, and the
440 extension is the range to the highest choice.
441 """
443 """Initialize with a set of possible positions.
444
445 position_list is a list of AbstractPosition derived objects,
446 specifying possible locations.
447 """
448
449 self.position_choices = position_list
450
451 smallest = None
452 largest = None
453 for position_choice in self.position_choices:
454 assert isinstance(position_choice, AbstractPosition), \
455 "Expected position objects, got %r" % position_choice
456 if smallest is None and largest is None:
457 smallest = position_choice.position
458 largest = position_choice.position
459 elif position_choice.position > largest:
460 largest = position_choice.position
461 elif position_choice.position < smallest:
462 smallest = position_choice.position
463
464 AbstractPosition.__init__(self, smallest, largest - smallest)
465
467 """String representation of the OneOfPosition location for debugging."""
468 return "%s(%s)" % (self.__class__.__name__, \
469 repr(self.position_choices))
470
472 out = "one-of("
473 for position in self.position_choices:
474 out += "%s," % position
475
476 out = out[:-1] + ")"
477 return out
478
480 """Simple class to hold information about a gap between positions.
481 """
483 """Initialize with a position object containing the gap information.
484 """
485 self.gap_size = gap_size
486
488 """A string representation of the position gap for debugging."""
489 return "%s(%s)" % (self.__class__.__name__, repr(self.gap_size))
490
492 out = "gap(%s)" % self.gap_size
493 return out
494