Package Bio :: Package SeqIO :: Module FastaIO
[hide private]
[frames | no frames]

Source Code for Module Bio.SeqIO.FastaIO

# Copyright 2006-2008 by Peter Cock.  All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.
#
# This module is for reading and writing FASTA format files as SeqRecord
# objects.  The code is partly inspired by earlier Biopython modules,
# Bio.Fasta.* and the now deprecated Bio.SeqIO.FASTA.
  9   
 10  """Bio.SeqIO support for the "fasta" (aka FastA or Pearson) file format. 
 11   
 12  You are expected to use this module via the Bio.SeqIO functions.""" 
 13   
 14  from Bio.Alphabet import single_letter_alphabet 
 15  from Bio.Seq import Seq 
 16  from Bio.SeqRecord import SeqRecord 
 17  from Interfaces import SequentialSequenceWriter 
 18   
 19  #This is a generator function! 
def FastaIterator(handle, alphabet=single_letter_alphabet, title2ids=None):
    """Generator function to iterate over Fasta records (as SeqRecord objects).

    handle - input file
    alphabet - optional alphabet
    title2ids - A function that, when given the title of the FASTA
    file (without the beginning >), will return the id, name and
    description (in that order) for the record as a tuple of strings.

    If this is not given, then the entire title line will be used
    as the description, and the first word as the id and name.  A
    title line consisting of just ">" gives an empty id, name and
    description rather than raising an IndexError.

    Note that use of title2ids matches that of Bio.Fasta.SequenceParser
    but the defaults are slightly different.

    Any text before the first ">" line is skipped.  Sequence lines have
    trailing whitespace and internal spaces removed before being joined.

    Raises ValueError if a record does not start with the '>' character.
    """
    # Skip any text before the first record (e.g. blank lines, comments)
    while True:
        line = handle.readline()
        if line == "":
            return  # Premature end of file, or just empty?
        if line[0] == ">":
            break

    # Invariant: at the top of each pass, line holds a record's title line.
    while True:
        if line[0] != ">":
            raise ValueError("Records in Fasta files should start with '>' character")
        title = line[1:].rstrip()
        if title2ids:
            record_id, name, descr = title2ids(title)
        else:
            descr = title
            words = descr.split()
            if words:
                record_id = words[0]
            else:
                # Bare ">" line: there is no first word to use as the id.
                record_id = ""
            name = record_id

        # Collect sequence lines up to the next title line or end of file.
        seq_lines = []
        line = handle.readline()
        while line and line[0] != ">":
            # Remove trailing whitespace, and any internal spaces
            seq_lines.append(line.rstrip().replace(" ", ""))
            line = handle.readline()

        # Return the record and then continue...
        yield SeqRecord(Seq("".join(seq_lines), alphabet),
                        id=record_id, name=name, description=descr)

        if not line:
            return  # End of file reached, so stop iterating
67
class FastaWriter(SequentialSequenceWriter):
    """Class to write Fasta format files."""

    def __init__(self, handle, wrap=60, record2title=None):
        """Create a Fasta writer.

        handle - Handle to an output file, e.g. as returned
                 by open(filename, "w")
        wrap -   Optional line length used to wrap sequence lines.
                 Defaults to wrapping the sequence at 60 characters
                 Use zero (or None) for no wrapping, giving a single
                 long line for the sequence.
        record2title - Optional function to return the text to be
                 used for the title line of each record.  By default
                 a combination of the record.id and record.description
                 is used.  If the record.description starts with the
                 record.id, then just the record.description is used.

        Raises ValueError if wrap is given but is less than one.

        You can either use:

        myWriter = FastaWriter(open(filename,"w"))
        myWriter.write_file(myRecords)

        Or, follow the sequential file writer system, for example:

        myWriter = FastaWriter(open(filename,"w"))
        myWriter.write_header() # does nothing for Fasta files
        ...
        Multiple calls to myWriter.write_record() and/or myWriter.write_records()
        ...
        myWriter.write_footer() # does nothing for Fasta files
        myWriter.close()
        """
        SequentialSequenceWriter.__init__(self, handle)
        # A false value (zero or None) means "do not wrap"; it is stored as None.
        self.wrap = None
        if wrap:
            if wrap < 1:
                raise ValueError("wrap should be a positive integer, "
                                 "or zero/None for no wrapping")
            self.wrap = wrap
        self.record2title = record2title

    def write_record(self, record):
        """Write a single Fasta record to the file.

        The title line is produced by self.record2title(record) when a
        title function was supplied; otherwise it is built from record.id
        and record.description.  The sequence is then written, wrapped at
        self.wrap characters per line when wrapping is enabled.
        """
        assert self._header_written
        assert not self._footer_written
        self._record_written = True

        # NOTE(review): self.clean() is inherited and not visible here;
        # presumably it removes line breaks -- confirm in the base class.
        if self.record2title:
            # The title function lives on the instance; there is no global
            # record2title, so the bare name would raise NameError.
            title = self.clean(self.record2title(record))
        else:
            record_id = self.clean(record.id)
            description = self.clean(record.description)

            if description and description.split(None, 1)[0] == record_id:
                # The description includes the id at the start
                title = description
            elif description:
                title = "%s %s" % (record_id, description)
            else:
                # No description: avoid a trailing space after the id
                title = record_id

        assert "\n" not in title
        assert "\r" not in title
        self.handle.write(">%s\n" % title)

        data = record.seq.tostring()
        assert "\n" not in data
        assert "\r" not in data

        if self.wrap:
            for start in range(0, len(data), self.wrap):
                self.handle.write(data[start:start + self.wrap] + "\n")
        else:
            self.handle.write(data + "\n")
if __name__ == "__main__":
    # Quick self test: parses two optional GenBank FASTA files if they are
    # present in the current directory, then checks an empty input.
    print("Running quick self test")

    import os
    from Bio.Alphabet import generic_protein, generic_nucleotide

    #Download the files from here:
    #ftp://ftp.ncbi.nlm.nih.gov/genomes/Bacteria/Nanoarchaeum_equitans
    fna_filename = "NC_005213.fna"
    faa_filename = "NC_005213.faa"

    def genbank_name_function(text):
        """Split a GenBank style FASTA title into (id, name, description).

        The first word is split on "|" and its fourth field is used as
        the id; the name is the id up to the first ".".  The rest of the
        title is the description (empty if the title is a single word).
        """
        parts = text.split(None, 1)
        if len(parts) > 1:
            descr = parts[1]
        else:
            # Title with no description after the identifier word
            descr = ""
        identifier = parts[0].split("|")[3]
        name = identifier.split(".", 1)[0]
        return identifier, name, descr

    # NOTE(review): print_record() is called below, but its definition is not
    # part of the visible listing (which skips from line 158 to line 171).
    # Confirm the helper is defined at this point in the real file.

    if os.path.isfile(fna_filename):
        print("--------")
        print("FastaIterator (single sequence)")
        handle = open(fna_filename, "r")
        try:
            iterator = FastaIterator(handle, alphabet=generic_nucleotide,
                                     title2ids=genbank_name_function)
            count = 0
            for record in iterator:
                count = count + 1
                print_record(record)
        finally:
            # Close the input file even if parsing fails
            handle.close()
        assert count == 1
        print(str(record.__class__))

    if os.path.isfile(faa_filename):
        print("--------")
        print("FastaIterator (multiple sequences)")
        handle = open(faa_filename, "r")
        try:
            iterator = FastaIterator(handle, alphabet=generic_protein,
                                     title2ids=genbank_name_function)
            count = 0
            for record in iterator:
                count = count + 1
                print_record(record)
                break  # Only the first record is inspected
        finally:
            handle.close()
        assert count > 0
        print(str(record.__class__))

    from cStringIO import StringIO
    print("--------")
    print("FastaIterator (empty input file)")
    #Just to make sure no errors happen
    iterator = FastaIterator(StringIO(""))
    count = 0
    for record in iterator:
        count = count + 1
    assert count == 0

    print("Done")