Package Bio :: Package Sequencing :: Module Ace
[hide private]
[frames] | no frames]

Source Code for Module Bio.Sequencing.Ace

  1  # Copyright 2004 by Frank Kauff and Cymon J. Cox.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  """ 
  6  Parser for ACE files output by PHRAP. 
  7   
  8  Written by Frank Kauff (fkauff@duke.edu) and 
  9  Cymon J. Cox (cymon@duke.edu) 
 10   
 11  Uses the Biopython Parser interface: ParserSupport.py 
 12   
 13  Usage: 
 14   
 15  There are two ways of reading an ace file: 
 16  1) The function 'read' reads the whole file at once; 
 17  2) The function 'parse' reads the file contig after contig. 
 18       
 19  1) Parse whole ace file at once: 
 20   
 21          from Bio.Sequencing import Ace 
 22          acefilerecord=Ace.read(open('my_ace_file.ace')) 
 23   
 24  This gives you: 
 25          acefilerecord.ncontigs (the number of contigs in the ace file) 
 26          acefilerecord.nreads (the number of reads in the ace file) 
 27          acefilerecord.contigs[] (one instance of the Contig class for each contig) 
 28   
 29  The Contig class holds the info of the CO tag, CT and WA tags, and all the reads used 
 30  for this contig in a list of instances of the Read class, e.g.: 
 31   
 32          contig3=acefilerecord.contigs[2] 
 33          read4=contig3.reads[3] 
 34          RD_of_read4=read4.rd 
 35          DS_of_read4=read4.ds 
 36   
 37  CT, WA, RT tags from the end of the file can appear anywhere are automatically 
 38  sorted into the right place. 
 39   
 40  see _RecordConsumer for details. 
 41   
 42  2) Or you can iterate over the contigs of an ace file one by one in the ususal way: 
 43   
 44          from Bio.Sequencing import Ace 
 45          contigs=Ace.parse(open('my_ace_file.ace')) 
 46          for contig in contigs: 
 47              print contig.name 
 48              ... 
 49   
 50  Please note that for memory efficiency, when using the iterator approach, only one 
 51  contig is kept in memory at once.  However, there can be a footer to the ACE file 
 52  containing WA, CT, RT or WR tags which contain additional meta-data on the contigs. 
 53  Because the parser doesn't see this data until the final record, it cannot be added to 
 54  the appropriate records.  Instead these tags will be returned with the last contig record. 
 55  Thus an ace file does not entirerly suit the concept of iterating. If WA, CT, RT, WR tags 
 56  are needed, the 'read' function rather than the 'parse' function might be more appropriate. 
 57  """ 
 58   
 59   
60 -class rd:
61 """RD (reads), store a read with its name, sequence etc."""
62 - def __init__(self):
63 self.name='' 64 self.padded_bases=None 65 self.info_items=None 66 self.read_tags=None 67 self.sequence=''
68
69 -class qa:
70 """QA (read quality), including which part if any was used as the consensus."""
71 - def __init__(self, line=None):
72 self.qual_clipping_start=None 73 self.qual_clipping_end=None 74 self.align_clipping_start=None 75 self.align_clipping_end=None 76 if line: 77 header=map(eval,line.split()[1:]) 78 self.qual_clipping_start=header[0] 79 self.qual_clipping_end=header[1] 80 self.align_clipping_start=header[2] 81 self.align_clipping_end=header[3]
82
83 -class ds:
84 """DS lines, include file name of a read's chromatogram file."""
85 - def __init__(self, line=None):
86 self.chromat_file='' 87 self.phd_file='' 88 self.time='' 89 self.chem='' 90 self.dye='' 91 self.template='' 92 self.direction='' 93 if line: 94 tags=['CHROMAT_FILE','PHD_FILE','TIME','CHEM','DYE','TEMPLATE','DIRECTION'] 95 poss=map(line.find,tags) 96 tagpos=dict(zip(poss,tags)) 97 if -1 in tagpos: 98 del tagpos[-1] 99 ps=tagpos.keys() 100 ps.sort() 101 for (p1,p2) in zip(ps,ps[1:]+[len(line)+1]): 102 setattr(self,tagpos[p1].lower(),line[p1+len(tagpos[p1])+1:p2].strip())
103 104
105 -class af:
106 """AF lines, define the location of the read within the contig."""
107 - def __init__(self, line=None):
108 self.name='' 109 self.coru=None 110 self.padded_start=None 111 if line: 112 header = line.split() 113 self.name = header[1] 114 self.coru = header[2] 115 self.padded_start = int(header[3])
116
117 -class bs:
118 """"BS (base segment), which read was chosen as the consensus at each position."""
119 - def __init__(self, line=None):
120 self.name='' 121 self.padded_start=None 122 self.padded_end=None 123 if line: 124 header = line.split() 125 self.padded_start = int(header[1]) 126 self.padded_end = int(header[2]) 127 self.name = header[3]
128
129 -class rt:
130 """RT (transient read tags), generated by crossmatch and phrap."""
131 - def __init__(self, line=None):
132 self.name='' 133 self.tag_type='' 134 self.program='' 135 self.padded_start=None 136 self.padded_end=None 137 self.date='' 138 if line: 139 header=line.split() 140 self.name=header[0] 141 self.tag_type=header[1] 142 self.program=header[2] 143 self.padded_start=int(header[3]) 144 self.padded_end=int(header[4]) 145 self.date=header[5]
146
147 -class ct:
148 """CT (consensus tags)."""
149 - def __init__(self, line=None):
150 self.name='' 151 self.tag_type='' 152 self.program='' 153 self.padded_start=None 154 self.padded_end=None 155 self.date='' 156 self.notrans='' 157 self.info=[] 158 self.comment=[] 159 if line: 160 header=line.split() 161 self.name = header[0] 162 self.tag_type = header[1] 163 self.program = header[2] 164 self.padded_start = int(header[3]) 165 self.padded_end = int(header[4]) 166 self.date = header[5] 167 if len(header)==7: 168 self.notrans = header[6]
169
170 -class wa:
171 """WA (whole assembly tag), holds the assembly program name, version, etc."""
172 - def __init__(self, line=None):
173 self.tag_type='' 174 self.program='' 175 self.date='' 176 self.info=[] 177 if line: 178 header = line.split() 179 self.tag_type = header[0] 180 self.program = header[1] 181 self.date = header[2]
182
183 -class wr:
184 """WR lines."""
185 - def __init__(self, line=None):
186 self.name='' 187 self.aligned='' 188 self.program='' 189 self.date=[] 190 if line: 191 header = line.split() 192 self.name = header[0] 193 self.aligned = header[1] 194 self.program = header[2] 195 self.date = header[3]
196
197 -class Reads:
198 """Holds information about a read supporting an ACE contig."""
199 - def __init__(self, line=None):
200 self.rd=None # one per read 201 self.qa=None # one per read 202 self.ds=None # none or one per read 203 self.rt=None # none or many per read 204 self.wr=None # none or many per read 205 if line: 206 self.rd = rd() 207 header = line.split() 208 self.rd.name = header[1] 209 self.rd.padded_bases = int(header[2]) 210 self.rd.info_items = int(header[3]) 211 self.rd.read_tags = int(header[4])
212
213 -class Contig:
214 """Holds information about a contig from an ACE record."""
215 - def __init__(self, line=None):
216 self.name = '' 217 self.nbases=None 218 self.nreads=None 219 self.nsegments=None 220 self.uorc=None 221 self.sequence="" 222 self.quality=[] 223 self.af=[] 224 self.bs=[] 225 self.reads=[] 226 self.ct=None # none or many 227 self.wa=None # none or many 228 if line: 229 header = line.split() 230 self.name = header[1] 231 self.nbases = int(header[2]) 232 self.nreads = int(header[3]) 233 self.nsegments = int(header[4]) 234 self.uorc = header[5]
235
236 -def parse(handle):
237 """parse(handle) 238 239 where handle is a file-like object. 240 241 This function returns an iterator that allows you to iterate 242 over the ACE file record by record: 243 244 records = parse(handle) 245 for record in records: 246 # do something with the record 247 248 where each record is a Contig object. 249 """ 250 251 handle = iter(handle) 252 253 line = "" 254 while True: 255 # at beginning, skip the AS and look for first CO command 256 try: 257 while True: 258 if line.startswith('CO'): 259 break 260 line = handle.next() 261 except StopIteration: 262 return 263 264 record = Contig(line) 265 266 for line in handle: 267 line = line.strip() 268 if not line: 269 break 270 record.sequence+=line 271 272 for line in handle: 273 if line.strip(): 274 break 275 if not line.startswith("BQ"): 276 raise ValueError("Failed to find BQ line") 277 278 for line in handle: 279 if not line.strip(): 280 break 281 record.quality.extend(map(int,line.split())) 282 283 for line in handle: 284 if line.strip(): 285 break 286 287 while True: 288 if not line.startswith("AF "): 289 break 290 record.af.append(af(line)) 291 try: 292 line = handle.next() 293 except StopIteration: 294 raise ValueError("Unexpected end of AF block") 295 296 while True: 297 if line.strip(): 298 break 299 try: 300 line = handle.next() 301 except StopIteration: 302 raise ValueError("Unexpected end of file") 303 304 while True: 305 if not line.startswith("BS "): 306 break 307 record.bs.append(bs(line)) 308 try: 309 line = handle.next() 310 except StopIteration: 311 raise ValueError("Failed to find end of BS block") 312 313 # now read all the read data 314 # it starts with a 'RD', and then a mandatory QA 315 # then follows an optional DS 316 # CT,RT,WA,WR may or may not be there in unlimited quantity. They might refer to the actual read or contig, 317 # or, if encountered at the end of file, to any previous read or contig. the sort() method deals 318 # with that later. 319 while True: 320 321 # each read must have a rd and qa 322 try: 323 while True: 324 # If I've met the condition, then stop reading the line. 325 if line.startswith("RD "): 326 break 327 line = handle.next() 328 except StopIteration: 329 raise ValueError("Failed to find RD line") 330 331 record.reads.append(Reads(line)) 332 333 for line in handle: 334 line = line.strip() 335 if not line: 336 break 337 record.reads[-1].rd.sequence+=line 338 339 for line in handle: 340 if line.strip(): 341 break 342 if not line.startswith("QA "): 343 raise ValueError("Failed to find QA line") 344 record.reads[-1].qa = qa(line) 345 346 # now one ds can follow 347 for line in handle: 348 if line.strip(): 349 break 350 else: 351 break 352 353 if line.startswith("DS "): 354 record.reads[-1].ds = ds(line) 355 line = "" 356 # the file could just end, or there's some more stuff. In ace files, anything can happen. 357 # the following tags are interspersed between reads and can appear multiple times. 358 while True: 359 # something left 360 try: 361 while True: 362 if line.strip(): 363 break 364 line = handle.next() 365 except StopIteration: 366 # file ends here 367 break 368 if line.startswith("RT{"): 369 # now if we're at the end of the file, this rt could 370 # belong to a previous read, not the actual one. 371 # we store it here were it appears, the user can sort later. 372 if record.reads[-1].rt is None: 373 record.reads[-1].rt=[] 374 for line in handle: 375 line=line.strip() 376 if line=='}': break 377 record.reads[-1].rt.append(rt(line)) 378 line = "" 379 elif line.startswith("WR{"): 380 if record.reads[-1].wr is None: 381 record.reads[-1].wr=[] 382 for line in handle: 383 line=line.strip() 384 if line=='}': break 385 record.reads[-1].wr.append(wr(line)) 386 line = "" 387 elif line.startswith("WA{"): 388 if record.wa is None: 389 record.wa=[] 390 try: 391 line = handle.next() 392 except StopIteration: 393 raise ValueError("Failed to read WA block") 394 record.wa.append(wa(line)) 395 for line in handle: 396 line=line.strip() 397 if line=='}': break 398 record.wa[-1].info.append(line) 399 line = "" 400 elif line.startswith("CT{"): 401 if record.ct is None: 402 record.ct=[] 403 try: 404 line = handle.next() 405 except StopIteration: 406 raise ValueError("Failed to read CT block") 407 record.ct.append(ct(line)) 408 for line in handle: 409 line=line.strip() 410 if line=="COMMENT{": 411 for line in handle: 412 line = line.strip() 413 if line.endswith("C}"): 414 break 415 record.ct[-1].comment.append(line) 416 elif line=='}': 417 break 418 else: 419 record.ct[-1].info.append(line) 420 line = "" 421 else: 422 break 423 424 if not line.startswith('RD'): # another read? 425 break 426 427 yield record
428
429 -class ACEFileRecord:
430 """Holds data of an ACE file. 431 """
432 - def __init__(self):
433 self.ncontigs=None 434 self.nreads=None 435 self.contigs=[] 436 self.wa=None # none or many
437
438 - def sort(self):
439 """Sorts wr, rt and ct tags into the appropriate contig / read instance, if possible. """ 440 441 ct=[] 442 rt=[] 443 wr=[] 444 # search for tags that aren't in the right position 445 for i in range(len(self.contigs)): 446 c = self.contigs[i] 447 if c.wa: 448 if not self.wa: 449 self.wa=[] 450 self.wa.extend(c.wa) 451 if c.ct: 452 newcts=[ct_tag for ct_tag in c.ct if ct_tag.name!=c.name] 453 map(self.contigs[i].ct.remove,newcts) 454 ct.extend(newcts) 455 for j in range(len(c.reads)): 456 r = c.reads[j] 457 if r.rt: 458 newrts=[rt_tag for rt_tag in r.rt if rt_tag.name!=r.rd.name] 459 map(self.contigs[i].reads[j].rt.remove,newrts) 460 rt.extend(newrts) 461 if r.wr: 462 newwrs=[wr_tag for wr_tag in r.wr if wr_tag.name!=r.rd.name] 463 map(self.contigs[i].reads[j].wr.remove,newwrs) 464 wr.extend(newwrs) 465 # now sort them into their proper place 466 for i in range(len(self.contigs)): 467 c = self.contigs[i] 468 for ct_tag in ct: 469 if ct_tag.name==c.name: 470 if self.contigs[i].ct is None: 471 self.contigs[i].ct=[] 472 self.contigs[i].ct.append(ct_tag) 473 if rt or wr: 474 for j in range(len(c.reads)): 475 r = c.reads[j] 476 for rt_tag in rt: 477 if rt_tag.name==r.rd.name: 478 if self.contigs[i].reads[j].rt is None: 479 self.contigs[i].reads[j].rt=[] 480 self.contigs[i].reads[j].rt.append(rt_tag) 481 for wr_tag in wr: 482 if wr_tag.name==r.rd.name: 483 if self.contigs[i].reads[j].wr is None: 484 self.contigs[i].reads[j].wr=[] 485 self.contigs[i].reads[j].wr.append(wr_tag)
486
487 -def read(handle):
488 """Parses the full ACE file in list of contigs. 489 490 """ 491 492 handle = iter(handle) 493 494 record=ACEFileRecord() 495 496 try: 497 line = handle.next() 498 except StopIteration: 499 raise ValueError("Premature end of file") 500 501 # check if the file starts correctly 502 if not line.startswith('AS'): 503 raise ValueError("File does not start with 'AS'.") 504 505 words = line.split() 506 record.ncontigs, record.nreads = map(int, words[1:3]) 507 508 # now read all the records 509 record.contigs = list(parse(handle)) 510 # wa, ct, rt rags are usually at the end of the file, but not necessarily (correct?). 511 # If the iterator is used, the tags are returned with the contig or the read after which they appear, 512 # if all tags are at the end, they are read with the last contig. The concept of an 513 # iterator leaves no other choice. But if the user uses the ACEParser, we can check 514 # them and put them into the appropriate contig/read instance. 515 # Conclusion: An ACE file is not a filetype for which iteration is 100% suitable... 516 record.sort() 517 return record
518