1 """Represent a Sequence Feature holding info about a part of a sequence.
2
3 This is heavily modeled after the Biocorba SeqFeature objects, and
4 may be pretty biased towards GenBank stuff since I'm writing it
5 for the GenBank parser output...
6
7 What's here:
8
9 Base class to hold a Feature.
10 ----------------------------
11 classes:
12 o SeqFeature
13
14 Hold information about a Reference.
15 ----------------------------------
16
17 This is an attempt to create a General class to hold Reference type
18 information.
19
20 classes:
21 o Reference
22
23 Specify locations of a feature on a Sequence.
24 ---------------------------------------------
25
26 This aims to handle, in Ewan's words, 'the dreaded fuzziness issue' in
27 much the same way as Biocorba. This has the advantages of allowing us
28 to handle fuzzy stuff in case anyone needs it, and also be compatible
29 with Biocorba.
30
31 classes:
32 o FeatureLocation - Specify the start and end location of a feature.
33
34 o ExactPosition - Specify the position as being exact.
o WithinPosition - Specify a position occurring within some range.
o BetweenPosition - Specify a position occurring between a range.
37 o BeforePosition - Specify the position as being found before some base.
38 o AfterPosition - Specify the position as being found after some base.
39 """
40
42 """Represent a Sequence Feature on an object.
43
44 Attributes:
45 o location - the location of the feature on the sequence
46 o type - the specified type of the feature (ie. CDS, exon, repeat...)
47 o location_operator - a string specifying how this SeqFeature may
48 be related to others. For example, in the example GenBank feature
49 shown below, the location_operator would be "join"
50 o strand - A value specifying on which strand (of a DNA sequence, for
51 instance) the feature deals with. 1 indicates the plus strand, -1
52 indicates the minus strand, 0 indicates both strands, and None indicates
53 that strand doesn't apply (ie. for proteins) or is not known.
54 o id - A string identifier for the feature.
55 o ref - A reference to another sequence. This could be an accession
56 number for some different sequence.
57 o ref_db - A different database for the reference accession number.
58 o qualifier - A dictionary of qualifiers on the feature. These are
analogous to the qualifiers from a GenBank feature table. The keys of
60 the dictionary are qualifier names, the values are the qualifier
61 values.
62 o sub_features - Additional SeqFeatures which fall under this 'parent'
feature. For instance, if we have something like:
64
65 CDS join(1..10,30..40,50..60)
66
Then the top level feature would be a CDS from 1 to 60, and the sub
68 features would be of 'CDS_join' type and would be from 1 to 10, 30 to
69 40 and 50 to 60, respectively.
70 """
def __init__(self, location=None, type='', location_operator='',
             strand=None, id="<unknown id>",
             qualifiers=None, sub_features=None,
             ref=None, ref_db=None):
    """Initialize a SeqFeature on a Sequence.

    Arguments:
    o location - the location of the feature on the sequence.
    o type - the specified type of the feature (ie. CDS, exon, repeat...).
    o location_operator - a string specifying how this feature may be
    related to others (for example "join").
    o strand - 1, -1, 0 or None.
    o id - a string identifier for the feature.
    o qualifiers - an optional dictionary of qualifiers. When omitted
    (or None) a new empty dictionary is created for this instance.
    o sub_features - an optional list of sub-features. When omitted
    (or None) a new empty list is created for this instance.
    o ref - a reference to another sequence.
    o ref_db - a different database for the reference accession number.
    """
    self.location = location

    self.type = type
    self.location_operator = location_operator
    self.strand = strand
    self.id = id

    # None is used as the "not supplied" marker instead of {} / [] so that
    # no container object is shared through the function's default values;
    # every instance gets its own fresh dictionary and list.
    if qualifiers is None:
        qualifiers = {}
    if sub_features is None:
        sub_features = []

    # Honor the containers the caller supplied rather than silently
    # discarding them.
    self.qualifiers = qualifiers
    self.sub_features = sub_features
    self.ref = ref
    self.ref_db = ref_db
91
93 """Make it easier to debug features.
94 """
95 out = "type: %s\n" % self.type
96 out += "location: %s\n" % self.location
97 out += "ref: %s:%s\n" % (self.ref, self.ref_db)
98 out += "strand: %s\n" % self.strand
99 out += "qualifiers: \n"
100 qualifier_keys = self.qualifiers.keys()
101 qualifier_keys.sort()
102 for qual_key in qualifier_keys:
103 out += "\tKey: %s, Value: %s\n" % (qual_key,
104 self.qualifiers[qual_key])
105 if len(self.sub_features) != 0:
106 out += "Sub-Features\n"
107 for sub_feature in self.sub_features:
108 out +="%s\n" % sub_feature
109
110 return out
111
112
113
114
116 """Represent a Generic Reference object.
117
118 Attributes:
119 o location - A list of Location objects specifying regions of
120 the sequence that the references correspond to. If no locations are
121 specified, the entire sequence is assumed.
122 o authors - A big old string, or a list split by author, of authors
123 for the reference.
124 o title - The title of the reference.
125 o journal - Journal the reference was published in.
126 o medline_id - A medline reference for the article.
127 o pubmed_id - A pubmed reference for the article.
128 o comment - A place to stick any comments about the reference.
129 """
139
141 """Output an informative string for debugging.
142 """
143 out = ""
144 for single_location in self.location:
145 out += "location: %s\n" % single_location
146 out += "authors: %s\n" % self.authors
147 if self.consrtm:
148 out += "consrtm: %s\n" % self.consrtm
149 out += "title: %s\n" % self.title
150 out += "journal: %s\n" % self.journal
151 out += "medline id: %s\n" % self.medline_id
152 out += "pubmed id: %s\n" % self.pubmed_id
153 out += "comment: %s\n" % self.comment
154
155 return out
156
157
158
160 """Specify the location of a feature along a sequence.
161
162 This attempts to deal with fuzziness of position ends, but also
163 make it easy to get the start and end in the 'normal' case (no
164 fuzziness).
165
166 You should access the start and end attributes with
167 your_location.start and your_location.end. If the start and
168 end are exact, this will return the positions, if not, we'll return
the appropriate Fuzzy class with info about the position and fuzziness.
170
171 Note that the start and end location numbering follow Python's scheme,
172 thus a GenBank entry of 123..150 (one based counting) becomes a location
173 of [122:150] (zero based counting).
174 """
176 """Specify the start and end of a sequence feature.
177
178 start and end arguments specify the values where the feature begins
and ends. These can either be any of the *Position objects that
inherit from AbstractPosition, or can just be integers specifying the
position. In the case of integers, the values are assumed to be
exact and are converted into ExactPosition objects. This is meant
183 to make it easy to deal with non-fuzzy ends.
184 """
185 if isinstance(start, AbstractPosition):
186 self._start = start
187 else:
188 self._start = ExactPosition(start)
189
190 if isinstance(end, AbstractPosition):
191 self._end = end
192 else:
193 self._end = ExactPosition(end)
194
196 """Returns a representation of the location. For the simple case this
uses the Python slicing syntax, [122:150] (zero based counting) which
198 GenBank would call 123..150 (one based counting).
199 """
200 return "[%s:%s]" % (self._start, self._end)
201
203 """Make it easy to get non-fuzzy starts and ends.
204
205 We override get_attribute here so that in non-fuzzy cases we
206 can just return the start and end position without any hassle.
207
208 To get fuzzy start and ends, just ask for item.start and
209 item.end. To get non-fuzzy attributes (ie. the position only)
210 ask for 'item.nofuzzy_start', 'item.nofuzzy_end'. These should return
211 the largest range of the fuzzy position. So something like:
212 (10.20)..(30.40) should return 10 for start, and 40 for end.
213 """
214 if attr == 'start':
215 return self._start
216 elif attr == 'end':
217 return self._end
218 elif attr == 'nofuzzy_start':
219 return min(self._start.position,
220 self._start.position + self._start.extension)
221 elif attr == 'nofuzzy_end':
222 return max(self._end.position,
223 self._end.position + self._end.extension)
224 else:
225 raise AttributeError("Cannot evaluate attribute %s." % attr)
226
228 """Abstract base class representing a position.
229 """
def __init__(self, position, extension):
    """Store the position and its extension.

    Arguments:
    o position - the base position value.
    o extension - the range associated with the position; stored as
    given, without validation.
    """
    self.position = position
    self.extension = extension
233
235 """A simple comparison function for positions.
236
237 This is very simple-minded and just compares the position attribute
238 of the features; extensions are not considered at all. This could
239 potentially be expanded to try to take advantage of extensions.
240 """
241 assert isinstance(other, AbstractPosition), \
242 "We can only do comparisons between Biopython Position objects."
243
244 return cmp(self.position, other.position)
245
247 """Specify the specific position of a boundary.
248
249 o position - The position of the boundary.
250 o extension - An optional argument which must be zero since we don't
251 have an extension. The argument is provided so that the same number of
252 arguments can be passed to all position types.
253
254 In this case, there is no fuzziness associated with the position.
255 """
def __init__(self, position, extension=0):
    """Initialize an exact position.

    Arguments:
    o position - the position of the boundary.
    o extension - must be zero; accepted only so that every position
    type can be constructed with the same number of arguments.

    Raises AttributeError if a non-zero extension is supplied.
    """
    if extension != 0:
        raise AttributeError("Non-zero extension %s for exact position."
                             % extension)
    # The extension is always stored as 0 for an exact position.
    AbstractPosition.__init__(self, position, 0)
261
263 return str(self.position)
264
266 """Specify the position of a boundary within some coordinates.
267
268 Arguments:
269 o position - The start position of the boundary
270 o extension - The range to which the boundary can extend.
271
272 This allows dealing with a position like ((1.4)..100). This
273 indicates that the start of the sequence is somewhere between 1
274 and 4. To represent that with this class we would set position as
275 1 and extension as 3.
276 """
277 - def __init__(self, position, extension = 0):
279
281 return "(%s.%s)" % (self.position, self.position + self.extension)
282
284 """Specify the position of a boundary between two coordinates.
285
286 Arguments:
287 o position - The start position of the boundary.
288 o extension - The range to the other position of a boundary.
289
290 This specifies a coordinate which is found between the two positions.
291 So this allows us to deal with a position like ((1^2)..100). To
292 represent that with this class we set position as 1 and the
293 extension as 1.
294 """
295 - def __init__(self, position, extension = 0):
297
299 return "(%s^%s)" % (self.position, self.position + self.extension)
300
302 """Specify a position where the actual location occurs before it.
303
304 Arguments:
305 o position - The upper boundary of where the location can occur.
306 o extension - An optional argument which must be zero since we don't
307 have an extension. The argument is provided so that the same number of
308 arguments can be passed to all position types.
309
310 This is used to specify positions like (<10..100) where the location
311 occurs somewhere before position 10.
312 """
def __init__(self, position, extension=0):
    """Initialize a position that occurs before the given boundary.

    Arguments:
    o position - the upper boundary of where the location can occur.
    o extension - must be zero; accepted only so that every position
    type can be constructed with the same number of arguments.

    Raises AttributeError if a non-zero extension is supplied.
    """
    if extension != 0:
        # The message names this position type rather than "exact position".
        raise AttributeError("Non-zero extension %s for before position."
                             % extension)
    # The extension is always stored as 0 for this position type.
    AbstractPosition.__init__(self, position, 0)
318
320 return "<%s" % self.position
321
323 """Specify a position where the actual location is found after it.
324
325 Arguments:
326 o position - The lower boundary of where the location can occur.
327 o extension - An optional argument which must be zero since we don't
328 have an extension. The argument is provided so that the same number of
329 arguments can be passed to all position types.
330
331 This is used to specify positions like (>10..100) where the location
332 occurs somewhere after position 10.
333 """
def __init__(self, position, extension=0):
    """Initialize a position that occurs after the given boundary.

    Arguments:
    o position - the lower boundary of where the location can occur.
    o extension - must be zero; accepted only so that every position
    type can be constructed with the same number of arguments.

    Raises AttributeError if a non-zero extension is supplied.
    """
    if extension != 0:
        # The message names this position type rather than "exact position".
        raise AttributeError("Non-zero extension %s for after position."
                             % extension)
    # The extension is always stored as 0 for this position type.
    AbstractPosition.__init__(self, position, 0)
339
341 return ">%s" % self.position
342
344 """Specify a position where the location can be multiple positions.
345
346 This models the GenBank 'one-of(1888,1901)' function, and tries
347 to make this fit within the Biopython Position models. In our case
348 the position of the "one-of" is set as the lowest choice, and the
349 extension is the range to the highest choice.
350 """
"""Initialize with a set of possible positions.
353
354 position_list is a list of AbstractPosition derived objects,
355 specifying possible locations.
356 """
357
358 self.position_choices = position_list
359
360 smallest = None
361 largest = None
362 for position_choice in self.position_choices:
363 assert isinstance(position_choice, AbstractPosition), \
364 "Expected position objects, got %r" % position_choice
365 if smallest is None and largest is None:
366 smallest = position_choice.position
367 largest = position_choice.position
368 elif position_choice.position > largest:
369 largest = position_choice.position
370 elif position_choice.position < smallest:
371 smallest = position_choice.position
372
373 AbstractPosition.__init__(self, smallest, largest - smallest)
374
376 out = "one-of("
377 for position in self.position_choices:
378 out += "%s," % position
379
380 out = out[:-1] + ")"
381 return out
382
384 """Simple class to hold information about a gap between positions.
385 """
"""Initialize with a position object containing the gap information.
388 """
389 self.gap_size = gap_size
390
392 out = "gap(%s)" % self.gap_size
393 return out
394