1
2
3
4
5
6
7
8 """Bio.SeqIO support for the "swiss" (aka SwissProt/UniProt) file format.
9
10 You are expected to use this module via the Bio.SeqIO functions.
11 See also the Bio.SwissProt module which offers more than just accessing
12 the sequences as SeqRecord objects."""
13
14 from Bio import Seq
15 from Bio import SeqRecord
16 from Bio import Alphabet
17 from Bio import SeqFeature
18 from Bio import SwissProt
19
20
22 """Breaks up a Swiss-Prot/UniProt file into SeqRecord objects.
23
24 Every section from the ID line to the terminating // becomes
25 a single SeqRecord with associated annotation and features.
26
27 This parser is for the flat file "swiss" format as used by:
28 * Swiss-Prot aka SwissProt
29 * TrEMBL
30 * UniProtKB aka UniProt Knowledgebase
31
32 It does NOT read their new XML file format.
33 http://www.expasy.org/sprot/
34
35 For consistency with BioPerl and EMBOSS we call this the "swiss"
36 format.
37 """
38 swiss_records = SwissProt.parse(handle)
39 for swiss_record in swiss_records:
40
41 seq = Seq.Seq(swiss_record.sequence, Alphabet.generic_protein)
42 record = SeqRecord.SeqRecord(seq,
43 id=swiss_record.accessions[0],
44 name=swiss_record.entry_name,
45 description=swiss_record.description,
46 )
47 record.description = swiss_record.description
48 for cross_reference in swiss_record.cross_references:
49 if len(cross_reference) < 2: continue
50 database, accession = cross_reference[:2]
51 dbxref = "%s:%s" % (database, accession)
52 if not dbxref in record.dbxrefs:
53 record.dbxrefs.append(dbxref)
54 annotations = record.annotations
55 annotations['accessions'] = swiss_record.accessions
56 annotations['date'] = swiss_record.created[0]
57 annotations['date_last_sequence_update'] = swiss_record.sequence_update[0]
58 if swiss_record.annotation_update:
59 annotations['date_last_annotation_update'] = swiss_record.annotation_update[0]
60 if swiss_record.gene_name:
61 annotations['gene_name'] = swiss_record.gene_name
62 annotations['organism'] = swiss_record.organism.rstrip(".")
63 annotations['taxonomy'] = swiss_record.organism_classification
64 annotations['ncbi_taxid'] = swiss_record.taxonomy_id
65 if swiss_record.host_organism:
66 annotations['organism_host'] = [word.rstrip(".") for word in swiss_record.host_organism]
67 if swiss_record.comments:
68 annotations['comment'] = "\n".join(swiss_record.comments)
69 if swiss_record.references:
70 annotations['references'] = []
71 for reference in swiss_record.references:
72 feature = SeqFeature.Reference()
73 feature.comment = " ".join(["%s=%s;" % (key, value) for key, value in reference.comments])
74 for key, value in reference.references:
75 if key=='PubMed':
76 feature.pubmed_id = value
77 elif key=='MEDLINE':
78 feature.medline_id = value
79 elif key=='DOI':
80 pass
81 elif key=='AGRICOLA':
82 pass
83 else:
84 raise ValueError, "Unknown key %s found in references" % key
85 feature.authors = reference.authors
86 feature.title = reference.title
87 feature.journal = reference.location
88 annotations['references'].append(feature)
89 if swiss_record.keywords:
90 record.annotations['keywords'] = swiss_record.keywords
91 yield record
92
93 if __name__ == "__main__" :
94 print "Quick self test..."
95
96 example_filename = "../../Tests/SwissProt/sp008"
97
98 import os
99 if not os.path.isfile(example_filename):
100 print "Missing test file %s" % example_filename
101 else :
102
103 handle = open(example_filename)
104 records = SwissIterator(handle)
105 for record in records:
106 print record.name
107 print record.id
108 print record.annotations['keywords']
109 print repr(record.annotations['organism'])
110 print record.seq.tostring()[:20] + "..."
111 handle.close()
112