1
2
3
4
5
6
7
8 """
9 This module provides code to work with the prosite dat file from
10 Prosite.
11 http://www.expasy.ch/prosite/
12
13 Tested with:
14 Release 20.43, 10-Feb-2009
15
16
17 Functions:
18 read Reads a Prosite file containing one Prosite record
19 parse Iterates over records in a Prosite file.
20
21 Classes:
22 Record Holds Prosite data.
23 """
24
def parse(handle):
    """Iterate over the records of a Prosite file.

    Use this function for Prosite files that hold several records.  It is
    a generator: records are read from the handle one at a time and
    yielded in file order, and iteration stops the first time no further
    record can be read.

    handle   - handle to the file.
    """
    # Prime the loop with the first record, then keep reading for as long
    # as the low-level reader hands back something truthy.
    record = __read(handle)
    while record:
        yield record
        record = __read(handle)
37
def read(handle):
    """Return the only record of a single-record Prosite file.

    Use this function for Prosite files that hold exactly one record.

    handle   - handle to the file.

    Raises ValueError if any data is left in the handle once the first
    record has been read.
    """
    only_record = __read(handle)
    # Anything still unread at this point is taken to be a second record.
    if handle.read():
        raise ValueError("More than one Prosite record found")
    return only_record
52
class Record(object):
    """Holds information from a Prosite record.

    Every member listed here is created by __init__, so it can always be
    read, even when the corresponding line was absent from the record.

    Members:
    name           ID of the record.  e.g. ADH_ZINC
    type           Type of entry.  e.g. PATTERN, MATRIX, or RULE
    accession      e.g. PS00387
    created        Date the entry was created.  (MMM-YYYY)
    data_update    Date the 'primary' data was last updated.
    info_update    Date data other than 'primary' data was last updated.
    pdoc           ID of the PROSITE DOCumentation.

    description    Free-format description.
    pattern        The PROSITE pattern.  See docs.
    matrix         List of strings that describes a matrix entry.
    rules          List of rule definitions (from RU lines).  (strings)
    prorules       List of prorules (from PR lines).  (strings)
    postprocessing List of strings taken from the PP lines.

    NUMERICAL RESULTS
    nr_sp_release  SwissProt release.
    nr_sp_seqs     Number of seqs in that release of Swiss-Prot.  (int;
                   '' until it has been set)
    nr_total       Number of hits in Swiss-Prot.  tuple of (hits, seqs)
    nr_positive    True positives.  tuple of (hits, seqs)
    nr_unknown     Could be positives.  tuple of (hits, seqs)
    nr_false_pos   False positives.  tuple of (hits, seqs)
    nr_false_neg   False negatives.  (int)
    nr_partial     False negatives, because they are fragments.  (int)

    The four tuples start out as (None, None); nr_false_neg and
    nr_partial start out as None.

    COMMENTS
    cc_taxo_range  Taxonomic range.  See docs for format
    cc_max_repeat  Maximum number of repetitions in a protein
    cc_site        Interesting site.  list of tuples (pattern pos, desc.)
    cc_skip_flag   Can this entry be ignored?
    cc_matrix_type
    cc_scaling_db
    cc_author
    cc_ft_key
    cc_ft_desc
    cc_version     version number (introduced in release 19.0)

    All cc_* members except cc_site are strings and default to ''.

    DATA BANK REFERENCES - The following are all
                           lists of tuples (swiss-prot accession,
                                            swiss-prot name)
    dr_positive
    dr_false_neg
    dr_false_pos
    dr_potential   Potential hits, but fingerprint region not yet available.
    dr_unknown     Could possibly belong

    pdb_structs    List of PDB entries.

    """

    def __init__(self):
        """Create an empty record with every member set to its default."""
        # Identification
        self.name = ''
        self.type = ''
        self.accession = ''
        self.created = ''
        self.data_update = ''
        self.info_update = ''
        self.pdoc = ''

        # Description of the entry
        self.description = ''
        self.pattern = ''
        self.matrix = []
        self.rules = []
        self.prorules = []
        self.postprocessing = []

        # Numerical results
        self.nr_sp_release = ''
        self.nr_sp_seqs = ''
        self.nr_total = (None, None)
        self.nr_positive = (None, None)
        self.nr_unknown = (None, None)
        self.nr_false_pos = (None, None)
        self.nr_false_neg = None
        self.nr_partial = None

        # Comments
        self.cc_taxo_range = ''
        self.cc_max_repeat = ''
        self.cc_site = []
        self.cc_skip_flag = ''
        # These six are documented members too; give them a default so
        # that reading them never raises AttributeError on a record that
        # lacks the matching qualifier.
        self.cc_matrix_type = ''
        self.cc_scaling_db = ''
        self.cc_author = ''
        self.cc_ft_key = ''
        self.cc_ft_desc = ''
        self.cc_version = ''

        # Data bank references
        self.dr_positive = []
        self.dr_false_neg = []
        self.dr_false_pos = []
        self.dr_potential = []
        self.dr_unknown = []

        self.pdb_structs = []
142
143
144
145
def _strip_date_label(field, label, line):
    """Return the date held in one field of a DT line.

    field   - one '; '-separated part of the DT value,
              e.g. 'APR-1990 (CREATED)'.
    label   - the marker the field has to end with, e.g. '(CREATED)'.
    line    - the raw input line; only used in the error message.

    Raises ValueError if field does not end with label.
    """
    if not field.endswith(label):
        raise ValueError("I don't understand date line\n%s" % line)
    # Slice the label off.  str.rstrip(label) is not suitable: it treats
    # its argument as a set of characters and can eat into the date.
    return field[:-len(label)].rstrip()


def _parse_dt_value(value, line):
    """Return (created, data_update, info_update) from a DT line value.

    Raises ValueError if fewer than three fields are present or a field
    does not carry the expected label.
    """
    fields = value.rstrip('.').split("; ")
    if len(fields) < 3:
        raise ValueError("I don't understand date line\n%s" % line)
    labels = ('(CREATED)', '(DATA UPDATE)', '(INFO UPDATE)')
    # zip() stops after the three labels, so extra fields are ignored.
    return tuple([_strip_date_label(field, label, line)
                  for field, label in zip(fields, labels)])


def _parse_nr_value(record, value, line):
    """Store the '/QUAL=data' items of an NR line value on record.

    Raises ValueError for an unknown qualifier or for hit counts that are
    not of the form 'hits(seqs)'.
    """
    import re
    # Qualifiers whose data looks like '12(11)' and the member they fill.
    hit_members = {
        '/TOTAL': 'nr_total',
        '/POSITIVE': 'nr_positive',
        '/UNKNOWN': 'nr_unknown',
        '/FALSE_POS': 'nr_false_pos',
    }
    for col in value.split(";"):
        if not col:
            continue
        qual, data = [word.lstrip() for word in col.split("=")]
        if qual == '/RELEASE':
            release, seqs = data.split(",")
            record.nr_sp_release = release
            record.nr_sp_seqs = int(seqs)
        elif qual == '/FALSE_NEG':
            record.nr_false_neg = int(data)
        elif qual == '/PARTIAL':
            record.nr_partial = int(data)
        elif qual in hit_members:
            m = re.match(r'(\d+)\((\d+)\)', data)
            if not m:
                raise ValueError("Broken data %s in comment line\n%s"
                                 % (repr(data), line))
            setattr(record, hit_members[qual], tuple(map(int, m.groups())))
        else:
            raise ValueError("Unknown qual %s in comment line\n%s"
                             % (repr(qual), line))


def _parse_cc_value(record, value, line):
    """Store the '/QUAL=data' items of a CC line value on record.

    Items without an '=' and items starting with 'Automatic scaling' are
    skipped, so free-text CC lines do not touch record at all.

    Raises ValueError for an unknown qualifier.
    """
    # Qualifiers whose data is stored unchanged and the member they fill.
    text_members = {
        '/TAXO-RANGE': 'cc_taxo_range',
        '/MAX-REPEAT': 'cc_max_repeat',
        '/SKIP-FLAG': 'cc_skip_flag',
        '/MATRIX_TYPE': 'cc_matrix_type',
        '/SCALING_DB': 'cc_scaling_db',
        '/AUTHOR': 'cc_author',
        '/FT_KEY': 'cc_ft_key',
        '/FT_DESC': 'cc_ft_desc',
        '/VERSION': 'cc_version',
    }
    for col in value.split(";"):
        if not col or col.startswith('Automatic scaling'):
            continue
        if "=" not in col:
            # No qualifier to interpret; ignore this piece of text.
            continue
        qual, data = [word.lstrip() for word in col.split("=")]
        if qual == '/SITE':
            pos, desc = data.split(",")
            record.cc_site.append((int(pos), desc))
        elif qual in text_members:
            setattr(record, text_members[qual], data)
        else:
            raise ValueError("Unknown qual %s in comment line\n%s"
                             % (repr(qual), line))


def _parse_dr_value(record, value):
    """Append the 'accession, name, flag' items of a DR line to record.

    The one-letter flag selects which dr_* list receives the
    (accession, name) tuple.

    Raises ValueError for an unknown flag.
    """
    flag_members = {
        'T': 'dr_positive',
        'F': 'dr_false_pos',
        'N': 'dr_false_neg',
        'P': 'dr_potential',
        '?': 'dr_unknown',
    }
    for ref in value.split(";"):
        if not ref:
            continue
        acc, name, flag = [word.strip() for word in ref.split(",")]
        if flag not in flag_members:
            raise ValueError("I don't understand type flag %s" % flag)
        getattr(record, flag_members[flag]).append((acc, name))


def __read(handle):
    """Read the next Prosite record from handle.

    Lines are consumed up to and including the '//' line that closes the
    record.  The first two characters of a line are its keyword, the text
    from the sixth character on is its value.

    handle   - handle to the file.

    Returns the Record, or None if the handle runs out of lines before a
    record has been closed by '//'.

    Raises ValueError for an unknown keyword or a line that cannot be
    interpreted.
    """
    record = None
    for line in handle:
        keyword, value = line[:2], line[5:].rstrip()
        if keyword == 'ID':
            record = Record()
            cols = value.split("; ")
            if len(cols) != 2:
                raise ValueError("I don't understand identification line\n%s"
                                 % line)
            record.name = cols[0]
            record.type = cols[1].rstrip('.')
        elif keyword == 'AC':
            record.accession = value.rstrip(';')
        elif keyword == 'DT':
            created, data_update, info_update = _parse_dt_value(value, line)
            record.created = created
            record.data_update = data_update
            record.info_update = info_update
        elif keyword == 'DE':
            record.description = value
        elif keyword == 'PA':
            # A pattern may span several PA lines; join them up.
            record.pattern += value
        elif keyword == 'MA':
            record.matrix.append(value)
        elif keyword == 'PP':
            record.postprocessing.extend(value.split(";"))
        elif keyword == 'RU':
            record.rules.append(value)
        elif keyword == 'NR':
            _parse_nr_value(record, value, line)
        elif keyword == 'CC':
            _parse_cc_value(record, value, line)
        elif keyword == 'DR':
            _parse_dr_value(record, value)
        elif keyword == '3D':
            for entry in value.split():
                record.pdb_structs.append(entry.rstrip(';'))
        elif keyword == 'PR':
            record.prorules.extend(value.split(";"))
        elif keyword == 'DO':
            record.pdoc = value.rstrip(';')
        elif keyword == '//':
            if record:
                return record
            # A terminator before any ID line closes a leading section
            # that is not a record (presumably the file header); skip it
            # and keep reading.
        else:
            raise ValueError("Unknown keyword %s found" % keyword)
    # Ran out of lines without reaching the end of a record.
    return None
293