Package Bio :: Package UniGene
[hide private]
[frames] | no frames]

Source Code for Package Bio.UniGene

  1  # Copyright 2006 by Sean Davis.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  # 
  6  # $Id: __init__.py,v 1.12 2009/04/24 12:03:45 mdehoon Exp $ 
  7  # Sean Davis <sdavis2 at mail dot nih dot gov> 
  8  # National Cancer Institute 
  9  # National Institutes of Health 
 10  # Bethesda, MD, USA 
 11  # 
 12   
 13  """ 
 14  Parse Unigene flat file format files such as the Hs.data file. 
 15   
 16  Here is an overview of the flat file format that this parser deals with: 
 17     Line types/qualifiers: 
 18   
 19         ID           UniGene cluster ID 
 20         TITLE        Title for the cluster 
 21         GENE         Gene symbol 
 22         CYTOBAND     Cytological band 
 23         EXPRESS      Tissues of origin for ESTs in cluster 
 24         RESTR_EXPR   Single tissue or development stage contributes  
 25                      more than half the total EST frequency for this gene. 
 26         GNM_TERMINUS genomic confirmation of presence of a 3' terminus;  
 27                      T if a non-templated polyA tail is found among  
 28                        a cluster's sequences; else 
 29                      I if templated As are found in genomic sequence or 
 30                      S if a canonical polyA signal is found on  
 31                        the genomic sequence 
 32         GENE_ID      Entrez gene identifier associated with at least one 
 33                      sequence in this cluster;  
 34                      to be used instead of LocusLink.   
 35         LOCUSLINK    LocusLink identifier associated with at least one 
 36                      sequence in this cluster;   
 37                      deprecated in favor of GENE_ID 
 38         HOMOL        Homology; 
 39         CHROMOSOME   Chromosome.  For plants, CHROMOSOME refers to mapping 
 40                      on the arabidopsis genome. 
 41         STS          STS 
 42              ACC=         GenBank/EMBL/DDBJ accession number of STS 
 43                           [optional field] 
 44              UNISTS=      identifier in NCBI's UNISTS database 
 45         TXMAP        Transcript map interval 
 46              MARKER=      Marker found on at least one sequence in this 
 47                           cluster 
 48              RHPANEL=     Radiation Hybrid panel used to place marker 
 49         PROTSIM      Protein Similarity data for the sequence with 
 50                      highest-scoring protein similarity in this cluster 
 51              ORG=         Organism 
 52              PROTGI=      Sequence GI of protein 
 53              PROTID=      Sequence ID of protein 
 54              PCT=         Percent alignment 
 55              ALN=         length of aligned region (aa) 
 56         SCOUNT       Number of sequences in the cluster 
 57         SEQUENCE     Sequence 
 58              ACC=         GenBank/EMBL/DDBJ accession number of sequence 
 59              NID=         Unique nucleotide sequence identifier (gi) 
 60              PID=         Unique protein sequence identifier (used for 
 61                           non-ESTs) 
 62              CLONE=       Clone identifier (used for ESTs only) 
 63              END=         End (5'/3') of clone insert read (used for 
 64                           ESTs only)  
 65              LID=         Library ID; see Hs.lib.info for library name 
 66                           and tissue      
 67              MGC=         5' CDS-completeness indicator; if present, the 
 68                           clone associated with this sequence is believed 
 69                           CDS-complete. A value greater than 511 is the gi 
 70                           of the CDS-complete mRNA matched by the EST, 
 71                           otherwise the value is an indicator of the 
 72                           reliability of the test indicating CDS 
 73                           completeness; higher values indicate more 
 74                           reliable CDS-completeness predictions.  
 75             SEQTYPE=      Description of the nucleotide sequence. 
 76                           Possible values are mRNA, EST and HTC. 
 77             TRACE=        The Trace ID of the EST sequence, as provided by 
 78                           NCBI Trace Archive 
 79  """ 
 80   
 81   
82 -class SequenceLine:
83 """Store the information for one SEQUENCE line from a Unigene file 84 85 Initialize with the text part of the SEQUENCE line, or nothing. 86 87 Attributes and descriptions (access as LOWER CASE) 88 ACC= GenBank/EMBL/DDBJ accession number of sequence 89 NID= Unique nucleotide sequence identifier (gi) 90 PID= Unique protein sequence identifier (used for non-ESTs) 91 CLONE= Clone identifier (used for ESTs only) 92 END= End (5'/3') of clone insert read (used for ESTs only) 93 LID= Library ID; see Hs.lib.info for library name and tissue 94 MGC= 5' CDS-completeness indicator; if present, 95 the clone associated with this sequence 96 is believed CDS-complete. A value greater than 511 97 is the gi of the CDS-complete mRNA matched by the EST, 98 otherwise the value is an indicator of the reliability 99 of the test indicating CDS completeness; 100 higher values indicate more reliable CDS-completeness 101 predictions. 102 SEQTYPE= Description of the nucleotide sequence. Possible values 103 are mRNA, EST and HTC. 104 TRACE= The Trace ID of the EST sequence, as provided by NCBI 105 Trace Archive 106 """ 107
108 - def __init__(self,text=None):
109 self.acc = '' 110 self.nid = '' 111 self.lid = '' 112 self.pid = '' 113 self.clone = '' 114 self.image = '' 115 self.is_image = False 116 self.end = '' 117 self.mgc = '' 118 self.seqtype = '' 119 self.trace = '' 120 if not text==None: 121 self.text=text 122 self._init_from_text(text)
123
124 - def _init_from_text(self,text):
125 parts = text.split('; '); 126 for part in parts: 127 key, val = part.split("=") 128 if key=='CLONE': 129 if val[:5]=='IMAGE': 130 self.is_image=True 131 self.image = val[6:] 132 setattr(self,key.lower(),val)
133
134 - def __repr__(self):
135 return self.text
136 137
138 -class ProtsimLine:
139 """Store the information for one PROTSIM line from a Unigene file 140 141 Initialize with the text part of the PROTSIM line, or nothing. 142 143 Attributes and descriptions (access as LOWER CASE) 144 ORG= Organism 145 PROTGI= Sequence GI of protein 146 PROTID= Sequence ID of protein 147 PCT= Percent alignment 148 ALN= length of aligned region (aa) 149 """ 150
151 - def __init__(self,text=None):
152 self.org = '' 153 self.protgi = '' 154 self.protid = '' 155 self.pct = '' 156 self.aln = '' 157 if not text==None: 158 self.text=text 159 self._init_from_text(text)
160
161 - def _init_from_text(self,text):
162 parts = text.split('; '); 163 164 for part in parts: 165 key, val = part.split("=") 166 setattr(self,key.lower(),val)
167
168 - def __repr__(self):
169 return self.text
170 171
172 -class STSLine:
173 """Store the information for one STS line from a Unigene file 174 175 Initialize with the text part of the STS line, or nothing. 176 177 Attributes and descriptions (access as LOWER CASE) 178 179 ACC= GenBank/EMBL/DDBJ accession number of STS [optional field] 180 UNISTS= identifier in NCBI's UNISTS database 181 """ 182
183 - def __init__(self,text=None):
184 self.acc = '' 185 self.unists = '' 186 if not text==None: 187 self.text=text 188 self._init_from_text(text)
189
190 - def _init_from_text(self,text):
191 parts = text.split(' '); 192 193 for part in parts: 194 key, val = part.split("=") 195 setattr(self,key.lower(),val)
196
197 - def __repr__(self):
198 return self.text
199 200
201 -class Record:
202 """Store a Unigene record 203 204 Here is what is stored: 205 206 self.ID = '' # ID line 207 self.species = '' # Hs, Bt, etc. 208 self.title = '' # TITLE line 209 self.symbol = '' # GENE line 210 self.cytoband = '' # CYTOBAND line 211 self.express = [] # EXPRESS line, parsed on ';' 212 # Will be an array of strings 213 self.restr_expr = '' # RESTR_EXPR line 214 self.gnm_terminus = '' # GNM_TERMINUS line 215 self.gene_id = '' # GENE_ID line 216 self.locuslink = '' # LOCUSLINK line 217 self.homol = '' # HOMOL line 218 self.chromosome = '' # CHROMOSOME line 219 self.protsim = [] # PROTSIM entries, array of Protsims 220 # Type ProtsimLine 221 self.sequence = [] # SEQUENCE entries, array of Sequence entries 222 # Type SequenceLine 223 self.sts = [] # STS entries, array of STS entries 224 # Type STSLine 225 self.txmap = [] # TXMAP entries, array of TXMap entries 226 """ 227
228 - def __init__(self):
229 self.ID = '' # ID line 230 self.species = '' # Hs, Bt, etc. 231 self.title = '' # TITLE line 232 self.symbol = '' # GENE line 233 self.cytoband = '' # CYTOBAND line 234 self.express = [] # EXPRESS line, parsed on ';' 235 self.restr_expr = '' # RESTR_EXPR line 236 self.gnm_terminus = '' # GNM_TERMINUS line 237 self.gene_id = '' # GENE_ID line 238 self.locuslink = '' # LOCUSLINK line 239 self.homol = '' # HOMOL line 240 self.chromosome = '' # CHROMOSOME line 241 self.protsim = [] # PROTSIM entries, array of Protsims 242 self.sequence = [] # SEQUENCE entries, array of Sequence entries 243 self.sts = [] # STS entries, array of STS entries 244 self.txmap = [] # TXMAP entries, array of TXMap entries
245
246 - def __repr__(self):
247 return "<%s> %s %s\n%s" % (self.__class__.__name__, 248 self.ID, self.symbol, self.title)
249 250
251 -def parse(handle):
252 while True: 253 record = _read(handle) 254 if not record: 255 return 256 yield record
257 258
259 -def read(handle):
260 record = _read(handle) 261 if not record: 262 raise ValueError("No SwissProt record found") 263 # We should have reached the end of the record by now 264 remainder = handle.read() 265 if remainder: 266 raise ValueError("More than one SwissProt record found") 267 return record
268 269 270 # Everything below is private 271 272
273 -def _read(handle):
274 UG_INDENT = 12 275 record = None 276 for line in handle: 277 tag, value = line[:UG_INDENT].rstrip(), line[UG_INDENT:].rstrip() 278 line = line.rstrip() 279 if tag=="ID": 280 record = Record() 281 record.ID = value 282 record.species = record.ID.split('.')[0] 283 elif tag=="TITLE": 284 record.title = value 285 elif tag=="GENE": 286 record.symbol = value 287 elif tag=="GENE_ID": 288 record.gene_id = value 289 elif tag=="LOCUSLINK": 290 record.locuslink = value 291 elif tag=="HOMOL": 292 if value=="YES": 293 record.homol = True 294 elif value=="NO": 295 record.homol = True 296 else: 297 raise ValueError, "Cannot parse HOMOL line %s" % line 298 elif tag=="EXPRESS": 299 record.express = [word.strip() for word in value.split("|")] 300 elif tag=="RESTR_EXPR": 301 record.restr_expr = [word.strip() for word in value.split("|")] 302 elif tag=="CHROMOSOME": 303 record.chromosome = value 304 elif tag=="CYTOBAND": 305 record.cytoband = value 306 elif tag=="PROTSIM": 307 protsim = ProtsimLine(value) 308 record.protsim.append(protsim) 309 elif tag=="SCOUNT": 310 scount = int(value) 311 elif tag=="SEQUENCE": 312 sequence = SequenceLine(value) 313 record.sequence.append(sequence) 314 elif tag=="STS": 315 sts = STSLine(value) 316 record.sts.append(sts) 317 elif tag=='//': 318 if len(record.sequence)!=scount: 319 raise ValueError, "The number of sequences specified in the record (%d) does not agree with the number of sequences found (%d)" % (scount, len(record.sequence)) 320 return record 321 else: 322 raise ValueError, "Unknown tag %s" % tag 323 if record: 324 raise ValueError("Unexpected end of stream.")
325 326 327 # Everything below is considered obsolete 328 329 330 from Bio.ParserSupport import * 331 import re 332 333 # 334 # CONSTANTS 335 # 336 UG_INDENT=12 337
338 -class UnigeneSequenceRecord:
339 """Store the information for one SEQUENCE line from a Unigene file 340 341 Initialize with the text part of the SEQUENCE line, or nothing. 342 343 Attributes and descriptions (access as LOWER CASE) 344 ACC= GenBank/EMBL/DDBJ accession number of sequence 345 NID= Unique nucleotide sequence identifier (gi) 346 PID= Unique protein sequence identifier (used for non-ESTs) 347 CLONE= Clone identifier (used for ESTs only) 348 END= End (5'/3') of clone insert read (used for ESTs only) 349 LID= Library ID; see Hs.lib.info for library name and tissue 350 MGC= 5' CDS-completeness indicator; if present, 351 the clone associated with this sequence 352 is believed CDS-complete. A value greater than 511 353 is the gi of the CDS-complete mRNA matched by the EST, 354 otherwise the value is an indicator of the reliability 355 of the test indicating CDS comleteness; 356 higher values indicate more reliable CDS-completeness predictions. 357 SEQTYPE= Description of the nucleotide sequence. Possible values are 358 mRNA, EST and HTC. 359 TRACE= The Trace ID of the EST sequence, as provided by NCBI Trace Archive 360 PERIPHERAL= Indicator that the sequence is a suboptimal 361 representative of the gene represented by this cluster. 362 Peripheral sequences are those that are in a cluster 363 which represents a spliced gene without sharing a 364 splice junction with any other sequence. In many 365 cases, they are unspliced transcripts originating 366 from the gene. 367 """ 368
369 - def __init__(self,text=None):
370 self.acc = '' 371 self.nid = '' 372 self.lid = '' 373 self.pid = '' 374 self.clone = '' 375 self.image = '' 376 self.is_image = False 377 self.end = '' 378 self.mgc = '' 379 self.seqtype = '' 380 self.Trace = '' 381 self.peripheral = '' 382 if not text==None: 383 self.text=text 384 return self._init_from_text(text)
385
386 - def _init_from_text(self,text):
387 parts = text.split('; '); 388 for part in parts: 389 key,val = re.match('(\w+)=(\S+)',part).groups() 390 if key=='CLONE': 391 if val[:5]=='IMAGE': 392 self.is_image=True 393 self.image = val[6:] 394 setattr(self,key.lower(),val)
395
396 - def __repr__(self):
397 return self.text
398 399
400 -class UnigeneProtsimRecord:
401 """Store the information for one PROTSIM line from a Unigene file 402 403 Initialize with the text part of the PROTSIM line, or nothing. 404 405 Attributes and descriptions (access as LOWER CASE) 406 ORG= Organism 407 PROTGI= Sequence GI of protein 408 PROTID= Sequence ID of protein 409 PCT= Percent alignment 410 ALN= length of aligned region (aa) 411 """ 412
413 - def __init__(self,text=None):
414 self.org = '' 415 self.protgi = '' 416 self.protid = '' 417 self.pct = '' 418 self.aln = '' 419 if not text==None: 420 self.text=text 421 return self._init_from_text(text)
422
423 - def _init_from_text(self,text):
424 parts = text.split('; '); 425 426 for part in parts: 427 key,val = re.match('(\w+)=(\S+)',part).groups() 428 setattr(self,key.lower(),val)
429
430 - def __repr__(self):
431 return self.text
432 433
434 -class UnigeneSTSRecord:
435 """Store the information for one STS line from a Unigene file 436 437 Initialize with the text part of the STS line, or nothing. 438 439 Attributes and descriptions (access as LOWER CASE) 440 441 NAME= Name of STS 442 ACC= GenBank/EMBL/DDBJ accession number of STS [optional field] 443 DSEG= GDB Dsegment number [optional field] 444 UNISTS= identifier in NCBI's UNISTS database 445 """ 446
447 - def __init__(self,text=None):
448 self.name = '' 449 self.acc = '' 450 self.dseg = '' 451 self.unists = '' 452 if not text==None: 453 self.text=text 454 return self._init_from_text(text)
455
456 - def _init_from_text(self,text):
457 parts = text.split(' '); 458 459 for part in parts: 460 key,val = re.match('(\w+)=(\S+)',part).groups() 461 setattr(self,key.lower(),val)
462
463 - def __repr__(self):
464 return self.text
465 466
467 -class UnigeneRecord:
468 """Store a Unigene record 469 470 Here is what is stored: 471 472 self.ID = '' # ID line 473 self.species = '' # Hs, Bt, etc. 474 self.title = '' # TITLE line 475 self.symbol = '' # GENE line 476 self.cytoband = '' # CYTOBAND line 477 self.express = [] # EXPRESS line, parsed on ';' 478 # Will be an array of strings 479 self.restr_expr = '' # RESTR_EXPR line 480 self.gnm_terminus = '' # GNM_TERMINUS line 481 self.gene_id = '' # GENE_ID line 482 self.chromosome = '' # CHROMOSOME 483 self.protsim = [] # PROTSIM entries, array of Protsims 484 # Type UnigeneProtsimRecord 485 self.sequence = [] # SEQUENCE entries, array of Sequence entries 486 # Type UnigeneSequenceRecord 487 self.sts = [] # STS entries, array of STS entries 488 # Type UnigeneSTSRecord 489 self.txmap = [] # TXMAP entries, array of TXMap entries 490 """ 491
492 - def __init__(self):
493 self.ID = '' # ID line 494 self.species = '' # Hs, Bt, etc. 495 self.title = '' # TITLE line 496 self.symbol = '' # GENE line 497 self.cytoband = '' # CYTOBAND line 498 self.express = [] # EXPRESS line, parsed on ';' 499 self.restr_expr = '' # RESTR_EXPR line 500 self.gnm_terminus = '' # GNM_TERMINUS line 501 self.gene_id = '' # GENE_ID line 502 self.chromosome = '' # CHROMOSOME 503 self.protsim = [] # PROTSIM entries, array of Protsims 504 self.sequence = [] # SEQUENCE entries, array of Sequence entries 505 self.sts = [] # STS entries, array of STS entries 506 self.txmap = [] # TXMAP entries, array of TXMap entries
507
508 - def __repr__(self):
509 return "<%s> %s %s\n%s" % (self.__class__.__name__, 510 self.ID, self.symbol, self.title)
511 512
513 -class _RecordConsumer(AbstractConsumer):
514
515 - def __init__(self):
516 self.unigene_record = UnigeneRecord()
517 - def ID(self,line):
518 self.unigene_record.ID = self._get_single_entry(line) 519 self.unigene_record.species = self.unigene_record.ID.split('.')[0]
520 - def TITLE(self,line):
521 self.unigene_record.title = self._get_single_entry(line)
522 - def GENE(self,line):
523 self.unigene_record.symbol = self._get_single_entry(line)
524 - def EXPRESS(self,line):
525 self.unigene_record.express = self._get_array_entry(line,split_on='; ')
526 - def RESTR_EXPR(self,line):
527 self.unigene_record.restr_expr = self._get_single_entry(line)
528 - def GENE_ID(self,line):
529 self.unigene_record.gene_id = self._get_single_entry(line)
530 - def CHROMOSOME(self,line):
531 self.unigene_record.chromosome = self._get_single_entry(line)
532 - def GENE_ID(self,line):
533 self.unigene_record.gene_id = self._get_single_entry(line)
534 - def SEQUENCE(self,line):
535 ug_seqrecord = UnigeneSequenceRecord(self._get_single_entry(line)) 536 self.unigene_record.sequence.append(ug_seqrecord)
537 - def PROTSIM(self,line):
538 ug_protsimrecord = UnigeneProtsimRecord(self._get_single_entry(line)) 539 self.unigene_record.protsim.append(ug_protsimrecord)
540 - def STS(self,line):
541 ug_stsrecord = UnigeneSTSRecord(self._get_single_entry(line)) 542 self.unigene_record.sts.append(ug_stsrecord)
543 544
545 - def _get_single_entry(self,line):
546 """Consume a single-value line 547 """ 548 return line[UG_INDENT:]
549
550 - def _get_array_entry(self,line,split_on):
551 """Consume a multi-value line by splitting on split_on 552 """ 553 return line[UG_INDENT:].split(split_on)
554 555
556 -class _Scanner:
557 """Scans a Unigene Flat File Format file 558 """ 559
560 - def feed(self, handle, consumer):
561 """feed(self, handle, consumer) 562 563 Feed events from parsing a Unigene file to a consumer. 564 handle is a file-like object, and consumer is a consumer object 565 that will receive events as the file is scanned 566 567 """ 568 consumer.start_record() 569 for line in handle: 570 tag = line.split(' ')[0] 571 line = line.rstrip() 572 if line=='//': 573 consumer.end_record() 574 break 575 try: 576 f = getattr(consumer, tag) 577 except AttributeError: 578 print 'no method called', tag 579 else: 580 if callable(f): 581 f(line)
582 583
584 -class RecordParser(AbstractParser):
585 - def __init__(self):
586 self._scanner = _Scanner() 587 self._consumer = _RecordConsumer()
588
589 - def parse(self, handle):
590 if isinstance(handle, File.UndoHandle): 591 uhandle = handle 592 else: 593 uhandle = File.UndoHandle(handle) 594 self._scanner.feed(uhandle, self._consumer) 595 return self._consumer.unigene_record
596
597 -class Iterator:
598 - def __init__(self, handle, parser=None):
599 self._uhandle = File.UndoHandle(handle)
600
601 - def next(self):
602 self._parser = RecordParser() 603 lines = [] 604 while 1: 605 line = self._uhandle.readline() 606 if not line: break 607 if line[:2] == '//': 608 break 609 lines.append(line) 610 if not lines: 611 return None 612 lines.append('//') 613 data = ''.join(lines) 614 if self._parser is not None: 615 return self._parser.parse(File.StringHandle(data)) 616 return data
617
618 - def __iter__(self):
619 return iter(self.next, None)
620