Package Bio :: Package Fasta
[hide private]
[frames | no frames]

Source Code for Package Bio.Fasta

  1  """Utilities for working with FASTA-formatted sequences (DEPRECATED). 
  2   
  3  Classes: 
  4  Record             Holds FASTA sequence data. 
  5  Iterator           Iterates over sequence data in a FASTA file. 
  6  RecordParser       Parses FASTA sequence data into a Record object. 
  7  SequenceParser     Parses FASTA sequence data into a SeqRecord object. 
  8   
  9  For a long time this module was the most commonly used and best documented 
 10  FASTA parser in Biopython.  However, we now recommend using Bio.SeqIO instead. 
 11  After being declared obsolete, Bio.Fasta has now been officially deprecated 
 12  (with a warning message when imported) and will be removed in a future 
 13  release. 
 14   
 15  If you are already using Bio.Fasta with the SequenceParser to get SeqRecord 
 16  objects, then you should be able to switch to the more recent Bio.SeqIO module 
 17  very easily as that too uses SeqRecord objects.  For example, 
 18   
 19  from Bio import Fasta 
 20  handle = open("example.fas") 
 21  for seq_record in Fasta.Iterator(handle, Fasta.SequenceParser()) : 
 22      print seq_record.description 
 23      print seq_record.seq 
 24  handle.close() 
 25   
 26  Using Bio.SeqIO instead this becomes: 
 27   
 28  from Bio import SeqIO 
 29  handle = open("example.fas") 
 30  for seq_record in SeqIO.parse(handle, "fasta") : 
 31      print seq_record.description 
 32      print seq_record.seq 
 33  handle.close() 
 34   
 35  Converting existing code which uses the RecordParser is a little more 
 36  complicated as the Bio.Fasta.Record object differs from the SeqRecord. 
 37   
 38  from Bio import Fasta 
 39  handle = open("example.fas") 
 40  for record in Fasta.Iterator(handle, Fasta.RecordParser()) : 
 41      #record is a Bio.Fasta.Record object 
 42      print record.title #The full title line as a string 
 43      print record.sequence #The sequence as a string 
 44  handle.close() 
 45   
 46  Using Bio.SeqIO instead this becomes: 
 47   
 48  from Bio import SeqIO 
 49  handle = open("example.fas") 
 50  for seq_record in SeqIO.parse(handle, "fasta") : 
 51      print seq_record.description #The full title line as a string 
 52      print str(seq_record.seq) #The sequence as a string 
 53  handle.close() 
 54   
 55  Very old code may have used Bio.Fasta.index_file and Dictionary, which were 
 56  deprecated in Biopython 1.44 and removed in Biopython 1.46. These allowed 
 57  indexing of a FASTA file and access to the records with a dictionary-like 
 58  interface. Currently, using Bio.SeqIO.to_dict to create an in-memory dictionary 
 59  of SeqRecord objects is the best replacement, but for very large files 
 60  additional indexing support for Bio.SeqIO is being considered. 
 61  """ 
 62  from Bio import Seq 
 63  from Bio import SeqRecord 
 64  from Bio import Alphabet 
 65   
 66  import warnings 
     # Runs at import time: importing this module issues a DeprecationWarning
     # that points users at Bio.SeqIO / Bio.AlignIO.
 67  warnings.warn('Bio.Fasta is deprecated. Please use the "fasta" support in ' 
 68                'Bio.SeqIO (or Bio.AlignIO) instead.', DeprecationWarning) 
 69   
class Record:
    """Holds information from a FASTA record.

    Members:
    title       Title line ('>' character not included).
    sequence    The sequence.

    """

    def __init__(self, colwidth=60):
        """Create a new, empty Record.

        colwidth specifies the number of residues to put on each line
        when generating FASTA format with str().  A value of zero or less
        disables wrapping, i.e. the sequence is written on a single line.

        """
        self.title = ''
        self.sequence = ''
        self._colwidth = colwidth

    def __str__(self):
        """Return the record as FASTA text (no trailing newline).

        The first line is '>' followed by the title; the sequence follows,
        wrapped at the column width given to the constructor.  An empty
        sequence produces only the title line.
        """
        lines = ['>%s' % self.title]
        width = self._colwidth
        if width > 0:
            for start in range(0, len(self.sequence), width):
                lines.append(self.sequence[start:start + width])
        elif self.sequence:
            # A non-positive width cannot be used as a step (the old index
            # loop never terminated here), so emit the sequence unwrapped.
            lines.append(self.sequence)
        # Join with "\n" rather than os.linesep: using os.linesep caused
        # problems getting the tests to pass on Windows.
        return "\n".join(lines)
99
class Iterator:
    """Returns one record at a time from a FASTA file.

    Each record is returned either as raw text (the '>' title line and the
    sequence lines joined with newlines) or, when a parser is supplied, as
    whatever that parser's parse_string() method returns.
    """

    def __init__(self, handle, parser=None, debug=0):
        """Initialize a new iterator.

        Arguments:
        o handle - An object with a readline() method supplying FASTA text.
        o parser - Optional object with a parse_string(text) method.  If
        None, next() returns the raw text of each record.
        o debug - If true, diagnostic messages are printed.

        Any lines before the first line starting with '>' are skipped.
        """
        self.handle = handle
        self._parser = parser
        self._debug = debug

        # Skip any text before the first record (e.g. blank lines)
        while True:
            line = handle.readline()
            if not line or line[0] == ">":
                break
            if debug:
                # Single-argument print(...) behaves the same whether print
                # is a statement or a function.
                print("Skipping: " + line)
        # Either the first title line, or an empty string at end of file.
        self._lookahead = line

    def __iter__(self):
        """Iterate over the records, stopping when next() returns None."""
        return iter(self.next, None)

    def next(self):
        """Return the next record in the file, or None when exhausted.

        Lines starting with '#' inside a record are ignored, and trailing
        whitespace is stripped from every line that is kept.
        """
        line = self._lookahead
        if not line:
            return None
        assert line[0] == ">", line
        lines = [line.rstrip()]
        line = self.handle.readline()
        while line:
            if line[0] == ">":
                break
            if line[0] == "#":
                if self._debug:
                    print("Ignoring comment line")
            else:
                lines.append(line.rstrip())
            line = self.handle.readline()
        # Remember the next record's title line (or "" at end of file).
        self._lookahead = line
        if self._debug:
            # lines[0] is the '>' title line; the rest is sequence data.
            # (This message used to reference an undefined name, "title".)
            print("Debug: '%s' and '%s'"
                  % (lines[0][1:], "".join(lines[1:])))
        text = "\n".join(lines)
        if self._parser is None:
            return text
        return self._parser.parse_string(text)
143
class RecordParser:
    """Parses FASTA sequence data into a Fasta.Record object.
    """

    def __init__(self, debug=0):
        """Initialize the parser.

        The debug argument is accepted but not used.
        """
        pass

    def parse_string(self, text):
        """Parse the first FASTA record in text into a Record object.

        text must start with '>'.  If it holds more than one record, only
        the first is parsed.  The returned Record has the title line
        (without the leading '>') as its title and the sequence lines
        joined together as its sequence.  A record consisting of a title
        line only gives an empty sequence.
        """
        text = text.replace("\r\n", "\n")  # Crude way of dealing with \r\n
        assert text[0] == ">", text
        text = text.split("\n>", 1)[0]  # Only do the first record if more than one
        # Split into the title line and (optionally) the sequence lines.  A
        # title-only record has no newline, so do not unpack into two names.
        parts = text.split("\n", 1)
        rec = Record()
        rec.title = parts[0][1:]
        if len(parts) > 1:
            rec.sequence = parts[1].replace("\n", "")
        else:
            rec.sequence = ""
        return rec

    def parse(self, handle):
        """Read all of handle and parse its first FASTA record."""
        return self.parse_string(handle.read())
163
class SequenceParser:
    """Parses FASTA sequence data into a SeqRecord object.
    """

    def __init__(self, alphabet=Alphabet.generic_alphabet, title2ids=None,
                 debug=0):
        """Initialize the parser.

        Arguments:
        o alphabet - The alphabet of the sequences to be parsed. If not
        passed, this will be set as generic_alphabet.
        o title2ids - A function that, when given the title of the FASTA
        file (without the beginning >), will return the id, name and
        description (in that order) for the record. If this is not given,
        then the entire title line will be used as the description.
        o debug - Accepted but not used.
        """
        self.alphabet = alphabet
        self.title2ids = title2ids

    def parse_string(self, text):
        """Parse the first FASTA record in text into a SeqRecord object.

        text must start with '>'.  If it holds more than one record, only
        the first is parsed.  A record consisting of a title line only
        gives an empty sequence.

        When title2ids was supplied it is called with the title (without
        the leading '>') and its three return values are stored as the
        record's id, name and description.  Otherwise only the description
        is set, to the whole title.
        """
        text = text.replace("\r\n", "\n")  # Crude way of dealing with \r\n
        assert text[0] == ">", text
        text = text.split("\n>", 1)[0]  # Only do the first record if more than one
        # Split into the title line and (optionally) the sequence lines.  A
        # title-only record has no newline, so do not unpack into two names.
        parts = text.split("\n", 1)
        title = parts[0][1:]
        if len(parts) > 1:
            sequence = parts[1].replace("\n", "")
        else:
            sequence = ""

        seq = Seq.Seq(sequence, self.alphabet)
        rec = SeqRecord.SeqRecord(seq)

        if self.title2ids:
            seq_id, name, descr = self.title2ids(title)
            rec.id = seq_id
            rec.name = name
            rec.description = descr
        else:
            rec.description = title

        return rec

    def parse(self, handle):
        """Read all of handle and parse its first FASTA record."""
        return self.parse_string(handle.read())
204