1
2
3
4
5
6
7
8
9 """
10 Parser for PHD files output by PHRED and used by PHRAP and CONSED.
11
12 This module can be used used directly which will return Record objects
13 which should contain all the original data in the file.
14
15 Alternatively, using Bio.SeqIO with the "phd" format will call this module
16 internally. This will give SeqRecord objects for each contig sequence.
17 """
18
19 from Bio import Seq
20 from Bio.Alphabet import generic_dna
21
22 CKEYWORDS=['CHROMAT_FILE','ABI_THUMBPRINT','PHRED_VERSION','CALL_METHOD',\
23 'QUALITY_LEVELS','TIME','TRACE_ARRAY_MIN_INDEX','TRACE_ARRAY_MAX_INDEX',\
24 'TRIM','TRACE_PEAK_AREA_RATIO','CHEM','DYE']
25
27 """Hold information from a PHD file."""
29 self.file_name = ''
30 self.comments = {}
31 for kw in CKEYWORDS:
32 self.comments[kw.lower()] = None
33 self.sites = []
34 self.seq = ''
35 self.seq_trimmed = ''
36
37
39 """Reads the next PHD record from the file, returning it as a Record object.
40
41 This function reads PHD file data line by line from the handle,
42 and returns a single Record object.
43 """
44 for line in handle:
45 if line.startswith("BEGIN_SEQUENCE"):
46 record = Record()
47 record.file_name = line[15:].rstrip()
48 break
49 else:
50 return
51
52 for line in handle:
53 if line.startswith("BEGIN_COMMENT"):
54 break
55 else:
56 raise ValueError("Failed to find BEGIN_COMMENT line")
57
58 for line in handle:
59 line = line.strip()
60 if not line:
61 continue
62 if line=="END_COMMENT":
63 break
64 keyword, value = line.split(":", 1)
65 keyword = keyword.lower()
66 value = value.strip()
67 if keyword in ('chromat_file',
68 'phred_version',
69 'call_method',
70 'chem',
71 'dye',
72 'time',
73 'basecaller_version',
74 'trace_processor_version'):
75 record.comments[keyword] = value
76 elif keyword in ('abi_thumbprint',
77 'quality_levels',
78 'trace_array_min_index',
79 'trace_array_max_index'):
80 record.comments[keyword] = int(value)
81 elif keyword=='trace_peak_area_ratio':
82 record.comments[keyword] = float(value)
83 elif keyword=='trim':
84 first, last, prob = value.split()
85 record.comments[keyword] = (int(first), int(last), float(prob))
86 else:
87 raise ValueError("Failed to find END_COMMENT line")
88
89 for line in handle:
90 if line.startswith('BEGIN_DNA'):
91 break
92 else:
93 raise ValueError("Failed to find BEGIN_DNA line")
94
95 for line in handle:
96 if line.startswith('END_DNA'):
97 break
98 else:
99
100
101
102 parts = line.split()
103 if len(parts) in [2,3] :
104 record.sites.append(tuple(parts))
105 else:
106 raise ValueError("DNA line must contain a base and quality "
107 "score, and optionally a peak location.")
108
109 for line in handle:
110 if line.startswith("END_SEQUENCE"):
111 break
112 else:
113 raise ValueError("Failed to find END_SEQUENCE line")
114
115 record.seq = Seq.Seq(''.join([n[0] for n in record.sites]), generic_dna)
116 if record.comments['trim'] is not None:
117 first, last = record.comments['trim'][:2]
118 record.seq_trimmed = record.seq[first:last]
119
120 return record
121
123 """Iterates over a file returning multiple PHD records.
124
125 The data is read line by line from the handle. The handle can be a list
126 of lines, an open file, or similar; the only requirement is that we can
127 iterate over the handle to retrieve lines from it.
128
129 Typical usage:
130
131 records = parse(handle)
132 for record in records:
133 # do something with the record object
134 """
135 while True:
136 record = read(handle)
137 if not record:
138 return
139 yield record
140