1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
# Module summary, kept as an explicit assignment so it is set regardless of
# what precedes it in the file.
__doc__ = "Parse the header of a PDB file."
25
26 import sys
27 import os, re
28 import urllib
29 import types
30
31
40
42
43
def _get_references(inl):
    """Collect the literature references stored in ``REMARK 1`` records.

    Every ``REMARK 1 REFERENCE`` line starts a new reference.  The text in
    columns 20-72 of the following ``REMARK 1`` lines is lower-cased and
    concatenated, and runs of whitespace are collapsed to a single blank.

    :param inl: iterable of header lines (strings).
    :returns: list of reference strings, in file order.  References that
        consist of whitespace only are dropped.
    """
    # The remark number is right-justified in a fixed-width field, so the
    # amount of blank space between "REMARK" and "1" must not be hard-coded.
    # The word boundary keeps REMARK 10, 100, ... from being picked up.
    is_remark_1 = re.compile(r"\AREMARK\s+1\b").search
    starts_reference = re.compile(r"\AREMARK\s+1\s+REFERENCE").search

    references = []

    def store(fragments):
        """Normalise the collected fragments and keep them if non-blank."""
        text = "".join(fragments)
        if text != "":
            text = re.sub(r"\s\s+", " ", text)
            if text != " ":
                references.append(text)

    pieces = []  # fragments of the reference currently being read
    for line in inl:
        if not is_remark_1(line):
            continue
        if starts_reference(line):
            store(pieces)
            pieces = []
        else:
            pieces.append(line[19:72].lower())

    # The last reference has no following REFERENCE line to flush it.
    store(pieces)
    return references
62
63
64
81
82
def _chop_end_codes(line):
    """Chop a trailing entry code and serial number off *line*.

    Removes a suffix made of at least four whitespace characters, a
    four-character code, one arbitrary character, more whitespace and an
    optional number (for example ``'      1CSA  14'``).  Lines without such
    a suffix are returned unchanged.
    """
    return re.sub(r"\s\s\s\s+[\w]{4}.\s+\d*\Z", "", line)
87
def _chop_end_misc(line):
    """Chop everything from the first run of four or more blanks onwards.

    Used for lines that end in trailing columns such as
    ``'      14-JUL-97   1CSA'``.  Lines without such a run of whitespace
    are returned unchanged.
    """
    return re.sub(r"\s\s\s\s+.*\Z", "", line)
92
def _nice_case(line):
    """Make A Lowercase String With Capitals.

    The input is lower-cased, then the first ASCII letter of the string and
    the first ASCII letter following any of the separators
    ``' . , ; : \\t - _'`` is converted to upper case.  Characters that are
    neither letters nor separators (digits, for instance) leave a pending
    capitalisation in place.

    :param line: string to convert.
    :returns: the converted string.
    """
    separators = " .,;:\t-_"
    out = []
    capitalize_next = True
    for c in line.lower():
        if capitalize_next and "a" <= c <= "z":
            c = c.upper()
            capitalize_next = False
        elif c in separators:
            capitalize_next = True
        out.append(c)
    return "".join(out)
110
def parse_pdb_header(file):
    """Return the header lines of a pdb file as a dictionary.

    Lines are read up to (not including) the first ATOM, HETATM or MODEL
    record and handed to ``_parse_pdb_header_list``.

    Dictionary keys are: name, head, deposition_date, release_date,
    structure_method, resolution, structure_reference, journal_reference,
    author, compound and source.

    :param file: either a file name (``str``), which is opened for reading,
        or an already open, line-iterable file object.
    :returns: the dictionary built by ``_parse_pdb_header_list``.

    The file object is closed before returning, including when it was
    supplied by the caller.
    """
    if isinstance(file, str):
        handle = open(file, 'r')
    else:
        handle = file

    header = []
    try:
        for line in handle:
            # Record names are padded to six columns; compare the stripped
            # name so that "ATOM  " is recognised as well.
            if line[0:6].rstrip() in ("ATOM", "HETATM", "MODEL"):
                break
            header.append(line)
    finally:
        # Close even if reading fails, so the handle is never leaked.
        handle.close()
    return _parse_pdb_header_list(header)
132
134
def _store_molecule_field(section, text, mol_id, last_key):
    """File one COMPND/SOURCE specification *text* into *section*.

    :param section: the ``'compound'`` or ``'source'`` sub-dictionary.
    :param text: lower-cased record text, trailing ``;`` already removed.
    :param mol_id: id of the molecule currently being described.
    :param last_key: key that received the previous value.
    :returns: the ``(mol_id, last_key)`` pair to use for the next line.
    """
    tok = text.split(":")
    if len(tok) < 2:
        # No "key: value" pair, so this continues the previous value.
        section[mol_id][last_key] += tok[0] + " "
        return mol_id, last_key
    ckey = tok[0]
    cval = re.sub(r"\A\s*", "", tok[1])
    if ckey == 'mol_id':
        # A new molecule starts; later fields are filed under its id.
        section[cval] = {'misc': ''}
        return cval, "misc"
    section[mol_id][ckey] = cval
    return mol_id, ckey


def _parse_pdb_header_list(header):
    """Build the header dictionary from a list of PDB header lines.

    :param header: list of header lines (strings).  It is traversed more
        than once, so it must not be a one-shot iterator.
    :returns: dictionary with the keys name, head, deposition_date,
        release_date, structure_method, resolution, structure_reference,
        journal_reference, author, compound and source; ``keywords`` and
        ``journal`` are added when KEYWDS / JRNL records are present.
        ``resolution`` is ``None`` when the REMARK 2 value is not a number.
    """
    pdbh_dict = {
        'name': "",
        'head': '',
        'deposition_date': "1909-01-08",
        'release_date': "1909-01-08",
        'structure_method': "unknown",
        'resolution': 0.0,
        'structure_reference': "unknown",
        'journal_reference': "unknown",
        'author': "",
        'compound': {'1': {'misc': ''}},
        'source': {'1': {'misc': ''}},
    }

    pdbh_dict['structure_reference'] = _get_references(header)
    pdbh_dict['journal_reference'] = _get_journal(header)
    comp_molid = "1"
    src_molid = "1"
    last_comp_key = "misc"
    last_src_key = "misc"

    date_pattern = re.compile(r"\d\d-\w\w\w-\d\d")
    # The remark number sits in a fixed-width, right-justified field, so
    # the blanks around it are matched loosely.  The trailing '.' is left
    # as a wildcard, exactly as in the original pattern.
    resolution_pattern = re.compile(r"REMARK\s+2\s+RESOLUTION.")

    for hh in header:
        h = re.sub(r"[\s\n\r]*\Z", "", hh)        # drop trailing whitespace
        key = re.sub(r"\s.+\s*", "", h)           # record name
        tail = re.sub(r"\A\w+\s+\d*\s*", "", h)   # text after name/continuation

        if key == "TITLE":
            name = _chop_end_codes(tail).lower()
            # 'name' is pre-seeded with "", so every TITLE line is appended
            # after a blank (the result keeps its leading blank, as before).
            pdbh_dict['name'] += " " + name
        elif key == "HEADER":
            rr = date_pattern.search(tail)
            if rr is not None:
                pdbh_dict['deposition_date'] = _format_date(_nice_case(rr.group()))
            pdbh_dict['head'] = _chop_end_misc(tail).lower()
        elif key == "COMPND":
            tt = re.sub(r"\;\s*\Z", "", _chop_end_codes(tail)).lower()
            # Pull out an EC number and remove its parenthesised form.
            rec = re.search(r'\d+\.\d+\.\d+\.\d+', tt)
            if rec:
                pdbh_dict['compound'][comp_molid]['ec_number'] = rec.group()
                tt = re.sub(r"\((e\.c\.)*\d+\.\d+\.\d+\.\d+\)", "", tt)
            comp_molid, last_comp_key = _store_molecule_field(
                pdbh_dict['compound'], tt, comp_molid, last_comp_key)
        elif key == "SOURCE":
            tt = re.sub(r"\;\s*\Z", "", _chop_end_codes(tail)).lower()
            # SOURCE keeps its own molecule id; reusing the COMPND id here
            # misfiled entries or raised KeyError for ids unknown to 'source'.
            src_molid, last_src_key = _store_molecule_field(
                pdbh_dict['source'], tt, src_molid, last_src_key)
        elif key == "KEYWDS":
            kwd = _chop_end_codes(tail).lower()
            if 'keywords' in pdbh_dict:
                pdbh_dict['keywords'] += " " + kwd
            else:
                pdbh_dict['keywords'] = kwd
        elif key == "EXPDTA":
            expd = _chop_end_codes(tail)
            # Cut off anything following a run of seven or more blanks.
            expd = re.sub(r'\s\s\s\s\s\s\s.*\Z', '', expd)
            pdbh_dict['structure_method'] = expd.lower()
        elif key == "REVDAT":
            rr = date_pattern.search(tail)
            if rr is not None:
                pdbh_dict['release_date'] = _format_date(_nice_case(rr.group()))
        elif key == "JRNL":
            if 'journal' in pdbh_dict:
                pdbh_dict['journal'] += tail
            else:
                pdbh_dict['journal'] = tail
        elif key == "AUTHOR":
            # 'author' is pre-seeded with "", so plain appending suffices.
            pdbh_dict['author'] += _nice_case(_chop_end_codes(tail))
        elif key == "REMARK":
            if resolution_pattern.search(hh):
                r = _chop_end_codes(resolution_pattern.sub('', hh))
                r = re.sub(r"\s+ANGSTROM.*", "", r)
                try:
                    pdbh_dict['resolution'] = float(r)
                except ValueError:
                    # Non-numeric resolution text.
                    pdbh_dict['resolution'] = None
        # All other record types (CAVEAT included) are ignored.

    if pdbh_dict['structure_method'] == 'unknown':
        resolution = pdbh_dict['resolution']
        # Guard against None: ordering None against a float fails on Python 3.
        if resolution is not None and resolution > 0.0:
            pdbh_dict['structure_method'] = 'x-ray diffraction'
    return pdbh_dict
255
if __name__ == '__main__':
    # Reads the PDB file passed as argument, parses its header and prints
    # every entry of the resulting dictionary.
    if len(sys.argv) < 2:
        sys.exit("usage: %s <pdb-file>" % sys.argv[0])

    with open(sys.argv[1], 'r') as handle:
        header_dict = parse_pdb_header(handle)

    # Single-argument print(...) behaves the same on Python 2 and 3.
    for key in header_dict:
        print("-" * 40)
        print(key)
        print(header_dict[key])
270