1
2
3
4
5
6
7
8
9
10
11
12
13 """
14 Parse Unigene flat file format files such as the Hs.data file.
15
16 Here is an overview of the flat file format that this parser deals with:
17 Line types/qualifiers:
18
19 ID UniGene cluster ID
20 TITLE Title for the cluster
21 GENE Gene symbol
22 CYTOBAND Cytological band
23 EXPRESS Tissues of origin for ESTs in cluster
24 RESTR_EXPR Single tissue or development stage contributes
25 more than half the total EST frequency for this gene.
26 GNM_TERMINUS genomic confirmation of presence of a 3' terminus;
27 T if a non-templated polyA tail is found among
28 a cluster's sequences; else
29 I if templated As are found in genomic sequence or
30 S if a canonical polyA signal is found on
31 the genomic sequence
32 GENE_ID Entrez gene identifier associated with at least one
33 sequence in this cluster;
34 to be used instead of LocusLink.
35 LOCUSLINK LocusLink identifier associated with at least one
36 sequence in this cluster;
37 deprecated in favor of GENE_ID
38 HOMOL Homology;
39 CHROMOSOME Chromosome. For plants, CHROMOSOME refers to mapping
40 on the arabidopsis genome.
41 STS STS
42 ACC= GenBank/EMBL/DDBJ accession number of STS
43 [optional field]
44 UNISTS= identifier in NCBI's UNISTS database
45 TXMAP Transcript map interval
46 MARKER= Marker found on at least one sequence in this
47 cluster
48 RHPANEL= Radiation Hybrid panel used to place marker
49 PROTSIM Protein Similarity data for the sequence with
50 highest-scoring protein similarity in this cluster
51 ORG= Organism
52 PROTGI= Sequence GI of protein
53 PROTID= Sequence ID of protein
54 PCT= Percent alignment
55 ALN= length of aligned region (aa)
56 SCOUNT Number of sequences in the cluster
57 SEQUENCE Sequence
58 ACC= GenBank/EMBL/DDBJ accession number of sequence
59 NID= Unique nucleotide sequence identifier (gi)
60 PID= Unique protein sequence identifier (used for
61 non-ESTs)
62 CLONE= Clone identifier (used for ESTs only)
63 END= End (5'/3') of clone insert read (used for
64 ESTs only)
65 LID= Library ID; see Hs.lib.info for library name
66 and tissue
67 MGC= 5' CDS-completeness indicator; if present, the
68 clone associated with this sequence is believed
69 CDS-complete. A value greater than 511 is the gi
70 of the CDS-complete mRNA matched by the EST,
71 otherwise the value is an indicator of the
72 reliability of the test indicating CDS
73 completeness; higher values indicate more
74 reliable CDS-completeness predictions.
75 SEQTYPE= Description of the nucleotide sequence.
76 Possible values are mRNA, EST and HTC.
77 TRACE= The Trace ID of the EST sequence, as provided by
78 NCBI Trace Archive
79 """
80
81
83 """Store the information for one SEQUENCE line from a Unigene file
84
85 Initialize with the text part of the SEQUENCE line, or nothing.
86
87 Attributes and descriptions (access as LOWER CASE)
88 ACC= GenBank/EMBL/DDBJ accession number of sequence
89 NID= Unique nucleotide sequence identifier (gi)
90 PID= Unique protein sequence identifier (used for non-ESTs)
91 CLONE= Clone identifier (used for ESTs only)
92 END= End (5'/3') of clone insert read (used for ESTs only)
93 LID= Library ID; see Hs.lib.info for library name and tissue
94 MGC= 5' CDS-completeness indicator; if present,
95 the clone associated with this sequence
96 is believed CDS-complete. A value greater than 511
97 is the gi of the CDS-complete mRNA matched by the EST,
98 otherwise the value is an indicator of the reliability
99 of the test indicating CDS completeness;
100 higher values indicate more reliable CDS-completeness
101 predictions.
102 SEQTYPE= Description of the nucleotide sequence. Possible values
103 are mRNA, EST and HTC.
104 TRACE= The Trace ID of the EST sequence, as provided by NCBI
105 Trace Archive
106 """
107
109 self.acc = ''
110 self.nid = ''
111 self.lid = ''
112 self.pid = ''
113 self.clone = ''
114 self.image = ''
115 self.is_image = False
116 self.end = ''
117 self.mgc = ''
118 self.seqtype = ''
119 self.trace = ''
120 if not text==None:
121 self.text=text
122 self._init_from_text(text)
123
124 - def _init_from_text(self,text):
125 parts = text.split('; ');
126 for part in parts:
127 key, val = part.split("=")
128 if key=='CLONE':
129 if val[:5]=='IMAGE':
130 self.is_image=True
131 self.image = val[6:]
132 setattr(self,key.lower(),val)
133
136
137
139 """Store the information for one PROTSIM line from a Unigene file
140
141 Initialize with the text part of the PROTSIM line, or nothing.
142
143 Attributes and descriptions (access as LOWER CASE)
144 ORG= Organism
145 PROTGI= Sequence GI of protein
146 PROTID= Sequence ID of protein
147 PCT= Percent alignment
148 ALN= length of aligned region (aa)
149 """
150
152 self.org = ''
153 self.protgi = ''
154 self.protid = ''
155 self.pct = ''
156 self.aln = ''
157 if not text==None:
158 self.text=text
159 self._init_from_text(text)
160
161 - def _init_from_text(self,text):
162 parts = text.split('; ');
163
164 for part in parts:
165 key, val = part.split("=")
166 setattr(self,key.lower(),val)
167
170
171
173 """Store the information for one STS line from a Unigene file
174
175 Initialize with the text part of the STS line, or nothing.
176
177 Attributes and descriptions (access as LOWER CASE)
178
179 ACC= GenBank/EMBL/DDBJ accession number of STS [optional field]
180 UNISTS= identifier in NCBI's UNISTS database
181 """
182
189
190 - def _init_from_text(self,text):
191 parts = text.split(' ');
192
193 for part in parts:
194 key, val = part.split("=")
195 setattr(self,key.lower(),val)
196
199
200
202 """Store a Unigene record
203
204 Here is what is stored:
205
206 self.ID = '' # ID line
207 self.species = '' # Hs, Bt, etc.
208 self.title = '' # TITLE line
209 self.symbol = '' # GENE line
210 self.cytoband = '' # CYTOBAND line
211 self.express = [] # EXPRESS line, parsed on ';'
212 # Will be an array of strings
213 self.restr_expr = '' # RESTR_EXPR line
214 self.gnm_terminus = '' # GNM_TERMINUS line
215 self.gene_id = '' # GENE_ID line
216 self.locuslink = '' # LOCUSLINK line
217 self.homol = '' # HOMOL line
218 self.chromosome = '' # CHROMOSOME line
219 self.protsim = [] # PROTSIM entries, array of Protsims
220 # Type ProtsimLine
221 self.sequence = [] # SEQUENCE entries, array of Sequence entries
222 # Type SequenceLine
223 self.sts = [] # STS entries, array of STS entries
224 # Type STSLine
225 self.txmap = [] # TXMAP entries, array of TXMap entries
226 """
227
229 self.ID = ''
230 self.species = ''
231 self.title = ''
232 self.symbol = ''
233 self.cytoband = ''
234 self.express = []
235 self.restr_expr = ''
236 self.gnm_terminus = ''
237 self.gene_id = ''
238 self.locuslink = ''
239 self.homol = ''
240 self.chromosome = ''
241 self.protsim = []
242 self.sequence = []
243 self.sts = []
244 self.txmap = []
245
247 return "<%s> %s %s\n%s" % (self.__class__.__name__,
248 self.ID, self.symbol, self.title)
249
250
257
258
268
269
270
271
272
325
326
327
328
329
330 from Bio.ParserSupport import *
331 import re
332
333
334
335
336 UG_INDENT=12
337
339 """Store the information for one SEQUENCE line from a Unigene file
340
341 Initialize with the text part of the SEQUENCE line, or nothing.
342
343 Attributes and descriptions (access as LOWER CASE)
344 ACC= GenBank/EMBL/DDBJ accession number of sequence
345 NID= Unique nucleotide sequence identifier (gi)
346 PID= Unique protein sequence identifier (used for non-ESTs)
347 CLONE= Clone identifier (used for ESTs only)
348 END= End (5'/3') of clone insert read (used for ESTs only)
349 LID= Library ID; see Hs.lib.info for library name and tissue
350 MGC= 5' CDS-completeness indicator; if present,
351 the clone associated with this sequence
352 is believed CDS-complete. A value greater than 511
353 is the gi of the CDS-complete mRNA matched by the EST,
354 otherwise the value is an indicator of the reliability
355 of the test indicating CDS comleteness;
356 higher values indicate more reliable CDS-completeness predictions.
357 SEQTYPE= Description of the nucleotide sequence. Possible values are
358 mRNA, EST and HTC.
359 TRACE= The Trace ID of the EST sequence, as provided by NCBI Trace Archive
360 PERIPHERAL= Indicator that the sequence is a suboptimal
361 representative of the gene represented by this cluster.
362 Peripheral sequences are those that are in a cluster
363 which represents a spliced gene without sharing a
364 splice junction with any other sequence. In many
365 cases, they are unspliced transcripts originating
366 from the gene.
367 """
368
370 self.acc = ''
371 self.nid = ''
372 self.lid = ''
373 self.pid = ''
374 self.clone = ''
375 self.image = ''
376 self.is_image = False
377 self.end = ''
378 self.mgc = ''
379 self.seqtype = ''
380 self.Trace = ''
381 self.peripheral = ''
382 if not text==None:
383 self.text=text
384 return self._init_from_text(text)
385
386 - def _init_from_text(self,text):
387 parts = text.split('; ');
388 for part in parts:
389 key,val = re.match('(\w+)=(\S+)',part).groups()
390 if key=='CLONE':
391 if val[:5]=='IMAGE':
392 self.is_image=True
393 self.image = val[6:]
394 setattr(self,key.lower(),val)
395
398
399
401 """Store the information for one PROTSIM line from a Unigene file
402
403 Initialize with the text part of the PROTSIM line, or nothing.
404
405 Attributes and descriptions (access as LOWER CASE)
406 ORG= Organism
407 PROTGI= Sequence GI of protein
408 PROTID= Sequence ID of protein
409 PCT= Percent alignment
410 ALN= length of aligned region (aa)
411 """
412
414 self.org = ''
415 self.protgi = ''
416 self.protid = ''
417 self.pct = ''
418 self.aln = ''
419 if not text==None:
420 self.text=text
421 return self._init_from_text(text)
422
423 - def _init_from_text(self,text):
424 parts = text.split('; ');
425
426 for part in parts:
427 key,val = re.match('(\w+)=(\S+)',part).groups()
428 setattr(self,key.lower(),val)
429
432
433
435 """Store the information for one STS line from a Unigene file
436
437 Initialize with the text part of the STS line, or nothing.
438
439 Attributes and descriptions (access as LOWER CASE)
440
441 NAME= Name of STS
442 ACC= GenBank/EMBL/DDBJ accession number of STS [optional field]
443 DSEG= GDB Dsegment number [optional field]
444 UNISTS= identifier in NCBI's UNISTS database
445 """
446
455
456 - def _init_from_text(self,text):
457 parts = text.split(' ');
458
459 for part in parts:
460 key,val = re.match('(\w+)=(\S+)',part).groups()
461 setattr(self,key.lower(),val)
462
465
466
468 """Store a Unigene record
469
470 Here is what is stored:
471
472 self.ID = '' # ID line
473 self.species = '' # Hs, Bt, etc.
474 self.title = '' # TITLE line
475 self.symbol = '' # GENE line
476 self.cytoband = '' # CYTOBAND line
477 self.express = [] # EXPRESS line, parsed on ';'
478 # Will be an array of strings
479 self.restr_expr = '' # RESTR_EXPR line
480 self.gnm_terminus = '' # GNM_TERMINUS line
481 self.gene_id = '' # GENE_ID line
482 self.chromosome = '' # CHROMOSOME
483 self.protsim = [] # PROTSIM entries, array of Protsims
484 # Type UnigeneProtsimRecord
485 self.sequence = [] # SEQUENCE entries, array of Sequence entries
486 # Type UnigeneSequenceRecord
487 self.sts = [] # STS entries, array of STS entries
488 # Type UnigeneSTSRecord
489 self.txmap = [] # TXMAP entries, array of TXMap entries
490 """
491
493 self.ID = ''
494 self.species = ''
495 self.title = ''
496 self.symbol = ''
497 self.cytoband = ''
498 self.express = []
499 self.restr_expr = ''
500 self.gnm_terminus = ''
501 self.gene_id = ''
502 self.chromosome = ''
503 self.protsim = []
504 self.sequence = []
505 self.sts = []
506 self.txmap = []
507
509 return "<%s> %s %s\n%s" % (self.__class__.__name__,
510 self.ID, self.symbol, self.title)
511
512
514
522 - def GENE(self,line):
540 - def STS(self,line):
543
544
545 - def _get_single_entry(self,line):
546 """Consume a single-value line
547 """
548 return line[UG_INDENT:]
549
550 - def _get_array_entry(self,line,split_on):
551 """Consume a multi-value line by splitting on split_on
552 """
553 return line[UG_INDENT:].split(split_on)
554
555
557 """Scans a Unigene Flat File Format file
558 """
559
560 - def feed(self, handle, consumer):
561 """feed(self, handle, consumer)
562
563 Feed events from parsing a Unigene file to a consumer.
564 handle is a file-like object, and consumer is a consumer object
565 that will receive events as the file is scanned
566
567 """
568 consumer.start_record()
569 for line in handle:
570 tag = line.split(' ')[0]
571 line = line.rstrip()
572 if line=='//':
573 consumer.end_record()
574 break
575 try:
576 f = getattr(consumer, tag)
577 except AttributeError:
578 print 'no method called', tag
579 else:
580 if callable(f):
581 f(line)
582
583
588
589 - def parse(self, handle):
596
598 - def __init__(self, handle, parser=None):
600
602 self._parser = RecordParser()
603 lines = []
604 while 1:
605 line = self._uhandle.readline()
606 if not line: break
607 if line[:2] == '//':
608 break
609 lines.append(line)
610 if not lines:
611 return None
612 lines.append('//')
613 data = ''.join(lines)
614 if self._parser is not None:
615 return self._parser.parse(File.StringHandle(data))
616 return data
617
619 return iter(self.next, None)
620