1
2
3
4
5
6
7
8
9
10
11
12
13 """
14 Parse Unigene flat file format files such as the Hs.data file.
15
16 Here is an overview of the flat file format that this parser deals with:
17 Line types/qualifiers:
18
19 ID UniGene cluster ID
20 TITLE Title for the cluster
21 GENE Gene symbol
22 CYTOBAND Cytological band
23 EXPRESS Tissues of origin for ESTs in cluster
24 RESTR_EXPR Single tissue or development stage contributes
25 more than half the total EST frequency for this gene.
26 GNM_TERMINUS genomic confirmation of presence of a 3' terminus;
27 T if a non-templated polyA tail is found among
28 a cluster's sequences; else
29 I if templated As are found in genomic sequence or
30 S if a canonical polyA signal is found on
31 the genomic sequence
32 GENE_ID Entrez gene identifier associated with at least one sequence in this cluster;
33 to be used instead of LocusLink.
34 LOCUSLINK LocusLink identifier associated with at least one sequence in this cluster;
35 deprecated in favor of GENE_ID
36 CHROMOSOME Chromosome. For plants, CHROMOSOME refers to mapping on the arabidopsis genome.
37 STS STS
38 NAME= Name of STS
39 ACC= GenBank/EMBL/DDBJ accession number of STS [optional field]
40 DSEG= GDB Dsegment number [optional field]
41 UNISTS= identifier in NCBI's UNISTS database
42 TXMAP Transcript map interval
43 MARKER= Marker found on at least one sequence in this cluster
44 RHPANEL= Radiation Hybrid panel used to place marker
45 PROTSIM Protein Similarity data for the sequence with highest-scoring protein similarity in this cluster
46 ORG= Organism
47 PROTGI= Sequence GI of protein
48 PROTID= Sequence ID of protein
49 PCT= Percent alignment
50 ALN= length of aligned region (aa)
51 SCOUNT Number of sequences in the cluster
52 SEQUENCE Sequence
53 ACC= GenBank/EMBL/DDBJ accession number of sequence
54 NID= Unique nucleotide sequence identifier (gi)
55 PID= Unique protein sequence identifier (used for non-ESTs)
56 CLONE= Clone identifier (used for ESTs only)
57 END= End (5'/3') of clone insert read (used for ESTs only)
58 LID= Library ID; see Hs.lib.info for library name and tissue
59 MGC= 5' CDS-completeness indicator; if present,
60 the clone associated with this sequence
61 is believed CDS-complete. A value greater than 511
62 is the gi of the CDS-complete mRNA matched by the EST,
63 otherwise the value is an indicator of the reliability
64 of the test indicating CDS completeness;
65 higher values indicate more reliable CDS-completeness predictions.
66 SEQTYPE= Description of the nucleotide sequence. Possible values are
67 mRNA, EST and HTC.
68 TRACE= The Trace ID of the EST sequence, as provided by NCBI Trace Archive
69 PERIPHERAL= Indicator that the sequence is a suboptimal
70 representative of the gene represented by this cluster.
71 Peripheral sequences are those that are in a cluster
72 which represents a spliced gene without sharing a
73 splice junction with any other sequence. In many
74 cases, they are unspliced transcripts originating
75 from the gene.
76
77 // End of record
78 """
79 from Bio.ParserSupport import *
80 import re
81
82
83
84
85 UG_INDENT=12
86
88 """Store the information for one SEQUENCE line from a Unigene file
89
90 Initialize with the text part of the SEQUENCE line, or nothing.
91
92 Attributes and descriptions (access as LOWER CASE)
93 ACC= GenBank/EMBL/DDBJ accession number of sequence
94 NID= Unique nucleotide sequence identifier (gi)
95 PID= Unique protein sequence identifier (used for non-ESTs)
96 CLONE= Clone identifier (used for ESTs only)
97 END= End (5'/3') of clone insert read (used for ESTs only)
98 LID= Library ID; see Hs.lib.info for library name and tissue
99 MGC= 5' CDS-completeness indicator; if present,
100 the clone associated with this sequence
101 is believed CDS-complete. A value greater than 511
102 is the gi of the CDS-complete mRNA matched by the EST,
103 otherwise the value is an indicator of the reliability
104 of the test indicating CDS completeness;
105 higher values indicate more reliable CDS-completeness predictions.
106 SEQTYPE= Description of the nucleotide sequence. Possible values are
107 mRNA, EST and HTC.
108 TRACE= The Trace ID of the EST sequence, as provided by NCBI Trace Archive
109 PERIPHERAL= Indicator that the sequence is a suboptimal
110 representative of the gene represented by this cluster.
111 Peripheral sequences are those that are in a cluster
112 which represents a spliced gene without sharing a
113 splice junction with any other sequence. In many
114 cases, they are unspliced transcripts originating
115 from the gene.
116 """
117
119 self.acc = ''
120 self.nid = ''
121 self.lid = ''
122 self.pid = ''
123 self.clone = ''
124 self.image = ''
125 self.is_image = False
126 self.end = ''
127 self.mgc = ''
128 self.seqtype = ''
129 self.Trace = ''
130 self.peripheral = ''
131 if not text==None:
132 self.text=text
133 return self._init_from_text(text)
134
def _init_from_text(self, text):
    """Populate attributes from the text part of a SEQUENCE line.

    ``text`` is split on '; ' and every KEY=VALUE field is stored on
    the instance as the attribute ``key.lower()``.  When the CLONE
    value starts with 'IMAGE', ``is_image`` is set to True and
    ``image`` receives everything after the sixth character (the
    character following 'IMAGE' is assumed to be a separator).

    Blank fields (for example when ``text`` is empty) are ignored.

    Raises:
        ValueError: if a non-blank field is not of the form KEY=VALUE.
    """
    for part in text.split('; '):
        part = part.strip()
        if not part:
            # An empty text (or stray separator) carries no field.
            continue
        match = re.match(r'(\w+)=(\S+)', part)
        if match is None:
            raise ValueError("Malformed SEQUENCE field: %r" % part)
        key, val = match.groups()
        if key == 'CLONE' and val[:5] == 'IMAGE':
            self.is_image = True
            self.image = val[6:]
        setattr(self, key.lower(), val)
144
147
148
150 """Store the information for one PROTSIM line from a Unigene file
151
152 Initialize with the text part of the PROTSIM line, or nothing.
153
154 Attributes and descriptions (access as LOWER CASE)
155 ORG= Organism
156 PROTGI= Sequence GI of protein
157 PROTID= Sequence ID of protein
158 PCT= Percent alignment
159 ALN= length of aligned region (aa)
160 """
161
163 self.org = ''
164 self.protgi = ''
165 self.protid = ''
166 self.pct = ''
167 self.aln = ''
168 if not text==None:
169 self.text=text
170 return self._init_from_text(text)
171
def _init_from_text(self, text):
    """Populate attributes from the text part of a PROTSIM line.

    ``text`` is split on '; ' and every KEY=VALUE field is stored on
    the instance as the attribute ``key.lower()``; values are kept as
    strings.

    Blank fields (for example when ``text`` is empty) are ignored.

    Raises:
        ValueError: if a non-blank field is not of the form KEY=VALUE.
    """
    for part in text.split('; '):
        part = part.strip()
        if not part:
            # An empty text (or stray separator) carries no field.
            continue
        match = re.match(r'(\w+)=(\S+)', part)
        if match is None:
            raise ValueError("Malformed PROTSIM field: %r" % part)
        key, val = match.groups()
        setattr(self, key.lower(), val)
178
181
182
184 """Store the information for one STS line from a Unigene file
185
186 Initialize with the text part of the STS line, or nothing.
187
188 Attributes and descriptions (access as LOWER CASE)
189
190 NAME= Name of STS
191 ACC= GenBank/EMBL/DDBJ accession number of STS [optional field]
192 DSEG= GDB Dsegment number [optional field]
193 UNISTS= identifier in NCBI's UNISTS database
194 """
195
204
def _init_from_text(self, text):
    """Populate attributes from the text part of an STS line.

    ``text`` is split on whitespace and every KEY=VALUE field is
    stored on the instance as the attribute ``key.lower()``.  Runs of
    spaces and leading/trailing whitespace do not produce fields.

    Raises:
        ValueError: if a field is not of the form KEY=VALUE.
    """
    # split() without an argument collapses repeated whitespace, so
    # doubled or trailing spaces no longer yield empty fields.
    for part in text.split():
        match = re.match(r'(\w+)=(\S+)', part)
        if match is None:
            raise ValueError("Malformed STS field: %r" % part)
        key, val = match.groups()
        setattr(self, key.lower(), val)
211
214
215
217 """Store a Unigene record
218
219 Here is what is stored:
220
221 self.ID = '' # ID line
222 self.species = '' # Hs, Bt, etc.
223 self.title = '' # TITLE line
224 self.symbol = '' # GENE line
225 self.cytoband = '' # CYTOBAND line
226 self.express = [] # EXPRESS line, parsed on ';'
227 # Will be an array of strings
228 self.restr_expr = '' # RESTR_EXPR line
229 self.gnm_terminus = '' # GNM_TERMINUS line
230 self.gene_id = '' # GENE_ID line
231 self.chromosome = '' # CHROMOSOME
232 self.protsim = [] # PROTSIM entries, array of Protsims
233 # Type UnigeneProtsimRecord
234 self.sequence = [] # SEQUENCE entries, array of Sequence entries
235 # Type UnigeneSequenceRecord
236 self.sts = [] # STS entries, array of STS entries
237 # Type UnigeneSTSRecord
238 self.txmap = [] # TXMAP entries, array of TXMap entries
239 """
240
242 self.ID = ''
243 self.species = ''
244 self.title = ''
245 self.symbol = ''
246 self.cytoband = ''
247 self.express = []
248 self.restr_expr = ''
249 self.gnm_terminus = ''
250 self.gene_id = ''
251 self.chromosome = ''
252 self.protsim = []
253 self.sequence = []
254 self.sts = []
255 self.txmap = []
256
258 return "<%s> %s %s\n%s" % (self.__class__.__name__,
259 self.ID, self.symbol, self.title)
260
261
263
271 - def GENE(self,line):
289 - def STS(self,line):
292
293
def _get_single_entry(self, line):
    """Return the value of a single-value line.

    The leading UG_INDENT characters of ``line`` are dropped and the
    remainder is returned unchanged.
    """
    value_start = UG_INDENT
    return line[value_start:]
298
def _get_array_entry(self, line, split_on):
    """Return the values of a multi-value line as a list.

    The leading UG_INDENT characters of ``line`` are dropped and the
    remainder is split on the separator ``split_on``.
    """
    payload = line[UG_INDENT:]
    return payload.split(split_on)
303
304
306 """Scans a Unigene Flat File Format file
307 """
308
def feed(self, handle, consumer):
    """feed(self, handle, consumer)

    Feed events from parsing a Unigene file to a consumer.
    handle is a file-like object (any iterable of lines), and consumer
    is a consumer object that will receive events as the file is scanned.

    consumer.start_record() is called first.  For every line, the text
    before the first space is taken as the tag and, if the consumer
    has a callable attribute of that name, it is called with the
    right-stripped line.  A '//' line triggers consumer.end_record()
    and stops the scan; remaining lines are left unread.  Blank lines
    are skipped.  Tags without a matching consumer attribute are
    reported on standard output.
    """
    consumer.start_record()
    for line in handle:
        # Strip before extracting the tag so that a tag-only line
        # (e.g. 'TITLE\n') does not yield a tag ending in a newline.
        line = line.rstrip()
        if line == '//':
            consumer.end_record()
            break
        if not line:
            continue
        tag = line.split(' ', 1)[0]
        try:
            handler = getattr(consumer, tag)
        except AttributeError:
            # Single-argument form prints the same text on Python 2 and 3.
            print('no method called %s' % tag)
        else:
            if callable(handler):
                handler(line)
331
332
337
338 - def parse(self, handle):
345
347 - def __init__(self, handle, parser=None):
349
351 self._parser = RecordParser()
352 lines = []
353 while 1:
354 line = self._uhandle.readline()
355 if not line: break
356 if line[:2] == '//':
357 break
358 lines.append(line)
359 if not lines:
360 return None
361 lines.append('//')
362 data = string.join(lines,'')
363 if self._parser is not None:
364 return self._parser.parse(File.StringHandle(data))
365 return data
366
368 return iter(self.next, None)
369