Package Bio :: Package GenBank :: Module LocationParser
[hide private]
[frames] | no frames]

Source Code for Module Bio.GenBank.LocationParser

  1  # First pass at a parser for the location fields of a feature table. 
  2  # Everything likely to change. 
  3   
  4  # Based on the DDBJ/EMBL/GenBank Feature Table Definition Version 2.2 
  5  # Dec 15 1999 available from EBI, but the documentation is not 
  6  # completely internally consistent much less agree with real-life 
  7  # examples.  Conflicts resolved to agree with real examples. 
  8  # 
  9  # This does NOT cope with the Gap(), Gap(X), or Gap(unkXXX) tokens used 
 10  # in CONTIG lines, which are otherwise similar to feature locations. 
 11  # 
 12  # Uses John Aycock's SPARK for parsing 
 13  from Bio.Parsers.spark import GenericScanner, GenericParser 
 14   
class Token:
    """Punctuation token identified only by its type name (e.g. "dot").

    A token compares equal to its type name, so ``Token("dot") == "dot"``;
    presumably this is how the SPARK parser matches tokens against the
    terminal names used in the grammar rules (confirm against
    Bio.Parsers.spark).
    """

    def __init__(self, type):
        self.type = type

    def __cmp__(self, other):
        # Python 2 three-way comparison against the type name.  Written
        # without the cmp() builtin so the method is still callable on
        # interpreters that no longer provide it.
        return (self.type > other) - (self.type < other)

    def __eq__(self, other):
        # Rich comparison: Python 3 never calls __cmp__, so without this a
        # token would only ever equal itself.
        return self.type == other

    def __ne__(self, other):
        return self.type != other

    def __hash__(self):
        # Keep hashing consistent with equality against the type name.
        return hash(self.type)

    def __repr__(self):
        # Previously printed "Tokens(...)", which did not match the class name.
        return "Token(%r)" % (self.type,)
22 23 # "38"
class Integer:
    """Integer token such as "38"; carries the parsed value in ``val``.

    Every instance compares equal to the class-level ``type`` name
    ("integer"), regardless of its value.
    """

    type = "integer"

    def __init__(self, val):
        self.val = val

    def __cmp__(self, other):
        # Python 2 three-way comparison against the type name, written
        # without the cmp() builtin so it stays callable everywhere.
        return (self.type > other) - (self.type < other)

    def __eq__(self, other):
        # Rich comparison: Python 3 never calls __cmp__.
        return self.type == other

    def __ne__(self, other):
        return self.type != other

    def __hash__(self):
        # Keep hashing consistent with equality against the type name.
        return hash(self.type)

    def __str__(self):
        return str(self.val)

    def __repr__(self):
        # Wrap the operand in a tuple so a tuple-valued ``val`` is formatted
        # as one value instead of being unpacked by the % operator.
        return "Integer(%s)" % (self.val,)
# From the BNF definition, this isn't needed.  Does that mean
# that bases can be referred to with negative numbers?
class UnsignedInteger(Integer):
    """Integer token produced for a digit run without a sign.

    Inherits storage, comparison and ``__str__`` from Integer; only the
    type name and the repr differ.
    """

    type = "unsigned_integer"

    def __repr__(self):
        # Wrap the operand in a tuple so a tuple-valued ``val`` is formatted
        # as one value instead of being unpacked by the % operator.
        return "UnsignedInteger(%s)" % (self.val,)
41
class Symbol:
    """Identifier token (e.g. an operator name or accession) held in ``name``.

    Every instance compares equal to the class-level ``type`` name
    ("symbol"), regardless of the text it carries.
    """

    type = "symbol"

    def __init__(self, name):
        self.name = name

    def __cmp__(self, other):
        # Python 2 three-way comparison against the type name, written
        # without the cmp() builtin so it stays callable everywhere.
        return (self.type > other) - (self.type < other)

    def __eq__(self, other):
        # Rich comparison: Python 3 never calls __cmp__.
        return self.type == other

    def __ne__(self, other):
        return self.type != other

    def __hash__(self):
        # Keep hashing consistent with equality against the type name.
        return hash(self.type)

    def __str__(self):
        return str(self.name)

    def __repr__(self):
        return "Symbol(%s)" % repr(self.name)
52 53 # ">38" -- The BNF says ">" is for the lower bound.. seems wrong to me
class LowBound:
    """Result of parsing ">38"; ``base`` is the value that followed ">"."""

    def __init__(self, base):
        self.base = base

    def __repr__(self):
        # Wrap the operand in a tuple so a tuple-valued ``base`` is formatted
        # as one value instead of being unpacked by the % operator.
        return "LowBound(%r)" % (self.base,)
59 60 # "<38"
class HighBound:
    """Result of parsing "<38"; ``base`` is the value that followed "<"."""

    def __init__(self, base):
        self.base = base

    def __repr__(self):
        # Wrap the operand in a tuple so a tuple-valued ``base`` is formatted
        # as one value instead of being unpacked by the % operator.
        return "HighBound(%r)" % (self.base,)
66 67 # 12.34
class TwoBound:
    """Pair of positions joined by a single dot, e.g. "12.34"."""

    def __init__(self, low, high):
        self.low, self.high = low, high

    def __repr__(self):
        bounds = (self.low, self.high)
        return "TwoBound(%r, %r)" % bounds
74 75 # 12^34
class Between:
    """Pair of positions joined by a caret, e.g. "12^34"."""

    def __init__(self, low, high):
        self.low, self.high = low, high

    def __repr__(self):
        bounds = (self.low, self.high)
        return "Between(%r, %r)" % bounds
82 83 # 12..34
class Range:
    """Pair of endpoints joined by a double dot, e.g. "12..34"."""

    def __init__(self, low, high):
        self.low, self.high = low, high

    def __repr__(self):
        endpoints = (self.low, self.high)
        return "Range(%r, %r)" % endpoints
90
class Function:
    """Operator applied to a list of locations: ``name`` plus its ``args``."""

    def __init__(self, name, args):
        self.name, self.args = name, args

    def __repr__(self):
        parts = (self.name, self.args)
        return "Function(%r, %r)" % parts
97
class AbsoluteLocation:
    """A local location together with an optional path.

    ``path`` is None when the location was written without a
    "path:" prefix.
    """

    def __init__(self, path, local_location):
        self.path, self.local_location = path, local_location

    def __repr__(self):
        parts = (self.path, self.local_location)
        return "AbsoluteLocation(%r, %r)" % parts
104
class Path:
    """Reference to another entry: an optional database plus an accession.

    ``database`` is None when only the accession was given.
    """

    def __init__(self, database, accession):
        self.database, self.accession = database, accession

    def __repr__(self):
        parts = (self.database, self.accession)
        return "Path(%r, %r)" % parts
111
class FeatureName:
    """Feature label, optionally qualified by a path.

    ``path`` is None when the label was written on its own.
    """

    def __init__(self, path, label):
        self.path, self.label = path, label

    def __repr__(self):
        parts = (self.path, self.label)
        return "FeatureName(%r, %r)" % parts
118
class LocationScanner(GenericScanner):
    """Scanner that turns a feature-location string into a list of tokens.

    Each ``t_*`` method carries a regular expression as its docstring and
    appends the matching token object to ``self.rv``.  Those docstrings are
    therefore data, not documentation, and must not be reworded.
    (Presumably GenericScanner collects them into one pattern; see
    Bio.Parsers.spark.)
    """

    def __init__(self):
        # The body was missing, which is a syntax error.  Delegate to the
        # base-class initialiser, in the same way LocationParser.__init__
        # delegates to GenericParser.
        GenericScanner.__init__(self)

    def tokenize(self, input):
        """Scan ``input`` and return the list of tokens found."""
        # Start each scan from a fresh list; the t_* callbacks append to it
        # while the base class walks the input.
        self.rv = []
        GenericScanner.tokenize(self, input)
        return self.rv

    # "::"
    def t_double_colon(self, input):
        r" :: "
        self.rv.append(Token("double_colon"))

    # ".."
    def t_double_dot(self, input):
        r" \.\. "
        self.rv.append(Token("double_dot"))

    # A single "." that is not the first half of "..".
    def t_dot(self, input):
        r" \.(?!\.) "
        self.rv.append(Token("dot"))

    # "^"
    def t_caret(self, input):
        r" \^ "
        self.rv.append(Token("caret"))

    # ","
    def t_comma(self, input):
        r" \, "
        self.rv.append(Token("comma"))

    # Digit run with an optional leading minus sign.
    # NOTE(review): this pattern, t_unsigned_integer and t_symbol can all
    # match a plain digit run; which one wins depends on how the base
    # scanner orders its alternatives -- confirm against Bio.Parsers.spark.
    def t_integer(self, input):
        r" -?[0-9]+ "
        self.rv.append(Integer(int(input)))

    # Digit run without a sign.
    def t_unsigned_integer(self, input):
        r" [0-9]+ "
        self.rv.append(UnsignedInteger(int(input)))

    # A single ":" that is not the first half of "::".
    def t_colon(self, input):
        r" :(?!:) "
        self.rv.append(Token("colon"))

    # "("
    def t_open_paren(self, input):
        r" \( "
        self.rv.append(Token("open_paren"))

    # ")"
    def t_close_paren(self, input):
        r" \) "
        self.rv.append(Token("close_paren"))

    # Identifier-like text.
    def t_symbol(self, input):
        r" [A-Za-z0-9_'*-][A-Za-z0-9_'*.-]* "
        # Needed an extra '.'
        self.rv.append(Symbol(input))

    # "<"
    def t_less_than(self, input):
        r" < "
        self.rv.append(Token("less_than"))

    # ">"
    def t_greater_than(self, input):
        r" > "
        self.rv.append(Token("greater_than"))
168 169 # punctuation .. hmm, isn't needed for location 170 # r''' [ !#$%&'()*+,\-./:;<=>?@\[\\\]^_`{|}~] ''' 171
class LocationParser(GenericParser):
    """Parser that builds location objects from LocationScanner tokens.

    Each ``p_*`` method carries its grammar productions as its docstring and
    receives the matched right-hand-side values in ``args``.  Those
    docstrings are therefore data, not documentation; explanatory notes live
    in ``#`` comments instead.  (Presumably GenericParser collects the rules
    from them; see Bio.Parsers.spark.)
    """

    def __init__(self, start='location'):
        GenericParser.__init__(self, start)
        # NOTE(review): begin_pos is not read anywhere in this module;
        # kept because other code may rely on the attribute.
        self.begin_pos = 0

    # Top-level rule: pass through whichever alternative matched.
    def p_location(self, args):
        """
        location ::= absolute_location
        location ::= feature_name
        location ::= function
        """
        return args[0]

    # args: operator Symbol, "(", list of locations, ")".
    def p_function(self, args):
        """
        function ::= functional_operator open_paren location_list close_paren
        """
        return Function(args[0].name, args[2])

    # A location with no "path:" prefix gets a path of None.
    def p_absolute_location(self, args):
        """
        absolute_location ::= local_location
        absolute_location ::= path colon local_location
        """
        if len(args) == 1:
            return AbsoluteLocation(None, args[-1])
        return AbsoluteLocation(args[0], args[-1])

    # A bare accession gets a database of None.
    def p_path(self, args):
        """
        path ::= database double_colon primary_accession
        path ::= primary_accession
        """
        if len(args) == 3:
            return Path(args[0], args[2])
        return Path(None, args[0])

    # A bare label gets a path of None.
    def p_feature_name(self, args):
        """
        feature_name ::= path colon feature_label
        feature_name ::= feature_label
        """
        if len(args) == 3:
            return FeatureName(args[0], args[2])
        return FeatureName(None, args[0])

    # The left-hand side used to be "label", a nonterminal no other rule
    # refers to.  That left "feature_label" in p_feature_name undefined, so
    # no feature name could ever be parsed.
    def p_feature_label(self, args):
        """
        feature_label ::= symbol
        """
        return args[0].name

    def p_local_location(self, args):
        """
        local_location ::= base_position
        local_location ::= between_position
        local_location ::= base_range
        """
        return args[0]

    # Builds a flat Python list: a single location is returned as the
    # one-element args list itself; otherwise the new location is appended
    # to a copy of the list built so far (args[1] is the comma).
    def p_location_list(self, args):
        """
        location_list ::= location
        location_list ::= location_list comma location
        """
        if len(args) == 1:
            return args
        return args[0] + [args[2]]

    # Returns the Symbol token itself; p_function reads its .name.
    def p_functional_operator(self, args):
        """
        functional_operator ::= symbol
        """
        return args[0]

    def p_base_position(self, args):
        """
        base_position ::= integer
        base_position ::= low_base_bound
        base_position ::= high_base_bound
        base_position ::= two_base_bound
        """
        return args[0]

    def p_low_base_bound(self, args):
        """
        low_base_bound ::= greater_than integer
        """
        return LowBound(args[1])

    def p_high_base_bound(self, args):
        """
        high_base_bound ::= less_than integer
        """
        return HighBound(args[1])

    def p_two_base_bound_1(self, args):
        """
        two_base_bound ::= open_paren base_position dot base_position close_paren
        """
        # main example doesn't have parens but others do.. (?)
        return TwoBound(args[1], args[3])

    def p_two_base_bound_2(self, args):
        """
        two_base_bound ::= base_position dot base_position
        """
        # two_base_bound with no parentheses like 1.6
        return TwoBound(args[0], args[2])

    def p_between_position(self, args):
        """
        between_position ::= base_position caret base_position
        """
        return Between(args[0], args[2])

    # Either endpoint of a range may be a plain position or a function.
    def p_base_range(self, args):
        """
        base_range ::= base_position double_dot base_position
        base_range ::= function double_dot base_position
        base_range ::= base_position double_dot function
        base_range ::= function double_dot function
        """
        return Range(args[0], args[2])

    def p_database(self, args):
        """
        database ::= symbol
        """
        return args[0].name

    def p_primary_accession(self, args):
        """
        primary_accession ::= symbol
        """
        return args[0].name
# One scanner instance is built at import time and shared by every scan()
# call instead of constructing a new LocationScanner per call.
_cached_scanner = LocationScanner()


def scan(input):
    """Tokenize a location string with the shared scanner and return the tokens."""
    tokens = _cached_scanner.tokenize(input)
    return tokens
# One parser instance is built at import time and shared by every parse()
# call instead of constructing a new LocationParser per call.
_cached_parser = LocationParser()


def parse(tokens):
    """Run the shared parser over ``tokens`` and return its result."""
    result = _cached_parser.parse(tokens)
    return result
323