1
2
3 """
4 A set of classes to interact with the multiple alignment command
5 line program clustalw.
6
7 Clustalw is the command line version of the graphical Clustalx
8 aligment program.
9
10 This requires clustalw available from:
11
12 ftp://ftp-igbmc.u-strasbg.fr/pub/ClustalW/.
13
14 functions:
15 o read
16 o parse_file
17 o do_alignment
18
19 classes:
20 o ClustalAlignment
21 o MultipleAlignCL"""
22
23
24 import os
25 import sys
26
27
28 from Bio import Alphabet
29 from Bio.Alphabet import IUPAC
30 from Bio.Align.Generic import Alignment
31
33 """Parse the given file into a clustal aligment object.
34
35 Arguments:
36 o file_name - The name of the file to parse.
37 o alphabet - The type of alphabet to use for the alignment sequences.
38 This should correspond to the type of information contained in the file.
39 Defaults to be unambiguous_dna sequence.
40
41 There is a deprecated optional argument debug_level which has no effect.
42 """
43
44
45 handle = open(file_name, 'r')
46 from Bio import AlignIO
47 generic_alignment = AlignIO.read(handle, "clustal")
48 handle.close()
49
50
51 if isinstance(alphabet, Alphabet.Gapped) :
52 alpha = alphabet
53 else :
54 alpha = Alphabet.Gapped(alphabet)
55 clustal_alignment = ClustalAlignment(alpha)
56 clustal_alignment._records = generic_alignment._records
57 for record in clustal_alignment._records :
58 record.seq.alphabet = alpha
59 clustal_alignment._version = generic_alignment._version
60 clustal_alignment._star_info = generic_alignment._star_info
61
62 return clustal_alignment
63
65 """Perform an alignment with the given command line.
66
67 Arguments:
68 o command_line - A command line object that can give out
69 the command line we will input into clustalw.
70 o alphabet - the alphabet to use in the created alignment. If not
71 specified IUPAC.unambiguous_dna and IUPAC.protein will be used for
72 dna and protein alignment respectively.
73
74 Returns:
75 o A clustal alignment object corresponding to the created alignment.
76 If the alignment type was not a clustal object, None is returned.
77 """
78 run_clust = os.popen(str(command_line))
79 status = run_clust.close()
80
81
82
83
84 value = 0
85 if status: value = status / 256
86
87
88
89 if value == 1:
90 raise ValueError("Bad command line option in the command: %s"
91 % str(command_line))
92
93 elif value == 2:
94 raise IOError("Cannot open sequence file %s"
95 % command_line.sequence_file)
96
97 elif value == 3:
98 raise IOError("Sequence file %s has an invalid format."
99 % command_line.sequence_file)
100
101 elif value == 4:
102 raise IOError("Sequence file %s has only one sequence present."
103 % command_line.sequence_file)
104
105
106 if command_line.output_file:
107 out_file = command_line.output_file
108 else:
109 out_file = os.path.splitext(command_line.sequence_file)[0] + '.aln'
110
111
112 if command_line.output_type and command_line.output_type != 'CLUSTAL':
113 return None
114
115 else:
116 if not alphabet:
117 alphabet = (IUPAC.unambiguous_dna, IUPAC.protein)[
118 command_line.type == 'PROTEIN']
119
120
121 if not(os.path.exists(out_file)):
122 raise IOError("Output .aln file %s not produced, commandline: %s"
123 % (out_file, command_line))
124
125 return parse_file(out_file, alphabet)
126
127
129 """Work with the clustal aligment format.
130
131 This format is the default output from clustal -- these files normally
132 have an extension of .aln.
133 """
134
135 DEFAULT_VERSION = '1.81'
136
144
146 """Print out the alignment so it looks pretty.
147
148 The output produced from this should also be formatted in valid
149 clustal format.
150 """
151
152 from Bio import AlignIO
153 from StringIO import StringIO
154 handle = StringIO()
155 AlignIO.write([self], handle, "clustal")
156 handle.seek(0)
157 return handle.read()
158
159
161 """Add all of the stars, which indicate consensus sequence.
162 """
163 self._star_info = stars
164
166 """Add the version information about the clustal file being read.
167 """
168 self._version = version
169
170
172 """Represent a clustalw multiple alignment command line.
173
174 This is meant to make it easy to code the command line options you
175 want to submit to clustalw.
176
177 Clustalw has a ton of options and things to do but this is set up to
178 represent a clustalw mutliple alignment.
179
180 Warning: I don't use all of these options personally, so if you find
181 one to be broken for any reason, please let us know!
182 """
183
184 OUTPUT_TYPES = ['GCG', 'GDE', 'PHYLIP', 'PIR', 'NEXUS', 'FASTA']
185 OUTPUT_ORDER = ['INPUT', 'ALIGNED']
186 OUTPUT_CASE = ['LOWER', 'UPPER']
187 OUTPUT_SEQNOS = ['OFF', 'ON']
188 RESIDUE_TYPES = ['PROTEIN', 'DNA']
189 PROTEIN_MATRIX = ['BLOSUM', 'PAM', 'GONNET', 'ID']
190 DNA_MATRIX = ['IUB', 'CLUSTALW']
191
192 - def __init__(self, sequence_file, command = 'clustalw'):
193 """Initialize some general parameters that can be set as attributes.
194
195 Arguments:
196 o sequence_file - The file to read the sequences for alignment from.
197 o command - The command used to run clustalw. This defaults to
198 just 'clustalw' (ie. assumes you have it on your path somewhere).
199
200 General attributes that can be set:
201 o is_quick - if set as 1, will use a fast algorithm to create
202 the alignment guide tree.
203 o allow_negative - allow negative values in the alignment matrix.
204
205 Multiple alignment attributes that can be set as attributes:
206 o gap_open_pen - Gap opening penalty
207 o gap_ext_pen - Gap extension penalty
208 o is_no_end_pen - A flag as to whether or not there should be a gap
209 separation penalty for the ends.
210 o gap_sep_range - The gap separation penalty range.
211 o is_no_pgap - A flag to turn off residue specific gaps
212 o is_no_hgap - A flag to turn off hydrophilic gaps
213 o h_gap_residues - A list of residues to count a hydrophilic
214 o max_div - A percent identity to use for delay (? - I don't undertand
215 this!)
216 o trans_weight - The weight to use for transitions
217 """
218 self.sequence_file = sequence_file
219 self.command = command
220
221 self.is_quick = None
222 self.allow_negative = None
223
224 self.gap_open_pen = None
225 self.gap_ext_pen = None
226 self.is_no_end_pen = None
227 self.gap_sep_range = None
228 self.is_no_pgap = None
229 self.is_no_hgap = None
230 self.h_gap_residues = []
231 self.max_div = None
232 self.trans_weight = None
233
234
235
236 self.output_file = None
237 self.output_type = None
238 self.output_order = None
239 self.change_case = None
240 self.add_seqnos = None
241
242
243 self.guide_tree = None
244 self.new_tree = None
245
246
247 self.protein_matrix = None
248 self.dna_matrix = None
249
250
251 self.type = None
252
254 """Write out the command line as a string."""
255
256 if sys.platform <> "win32" :
257
258
259
260
261
262
263
264
265
266
267
268
269
270 cline = self.command + " " + self.sequence_file
271 else :
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302 if self.command.count(" ") > 0 :
303 cline = '"%s"' % self.command
304 else :
305 cline = self.command
306 if self.sequence_file.count(" ") > 0 :
307 cline += ' /INFILE="%s"' % self.sequence_file
308 else :
309 cline += ' /INFILE=%s' % self.sequence_file
310
311
312 if self.type:
313 cline += " -TYPE=%s" % self.type
314 if self.is_quick == 1:
315
316
317 cline += " -quicktree"
318 if self.allow_negative == 1:
319 cline += " -NEGATIVE"
320
321
322 if self.output_file:
323 cline += " -OUTFILE=%s" % self.output_file
324 if self.output_type:
325 cline += " -OUTPUT=%s" % self.output_type
326 if self.output_order:
327 cline += " -OUTORDER=%s" % self.output_order
328 if self.change_case:
329 cline += " -CASE=%s" % self.change_case
330 if self.add_seqnos:
331 cline += " -SEQNOS=%s" % self.add_seqnos
332 if self.new_tree:
333
334 cline += " -NEWTREE=%s -align" % self.new_tree
335
336
337 if self.guide_tree:
338 cline += " -USETREE=%s" % self.guide_tree
339 if self.protein_matrix:
340 cline += " -MATRIX=%s" % self.protein_matrix
341 if self.dna_matrix:
342 cline += " -DNAMATRIX=%s" % self.dna_matrix
343 if self.gap_open_pen:
344 cline += " -GAPOPEN=%s" % self.gap_open_pen
345 if self.gap_ext_pen:
346 cline += " -GAPEXT=%s" % self.gap_ext_pen
347 if self.is_no_end_pen == 1:
348 cline += " -ENDGAPS"
349 if self.gap_sep_range:
350 cline += " -GAPDIST=%s" % self.gap_sep_range
351 if self.is_no_pgap == 1:
352 cline += " -NOPGAP"
353 if self.is_no_hgap == 1:
354 cline += " -NOHGAP"
355 if len(self.h_gap_residues) != 0:
356
357 residue_list = ''
358 for residue in self.h_gap_residues:
359 residue_list = residue_list + residue
360 cline += " -HGAPRESIDUES=%s" % residue_list
361 if self.max_div:
362 cline += " -MAXDIV=%s" % self.max_div
363 if self.trans_weight:
364 cline += " -TRANSWEIGHT=%s" % self.trans_weight
365
366 return cline
367
368 - def set_output(self, output_file, output_type = None, output_order = None,
369 change_case = None, add_seqnos = None):
370 """Set the output parameters for the command line.
371 """
372 self.output_file = output_file
373
374 if output_type:
375 output_type = output_type.upper()
376 if output_type not in self.OUTPUT_TYPES:
377 raise ValueError("Invalid output type %s. Valid choices are %s"
378 % (output_type, self.OUTPUT_TYPES))
379 else:
380 self.output_type = output_type
381
382 if output_order:
383 output_order = output_order.upper()
384 if output_order not in self.OUTPUT_ORDER:
385 raise ValueError("Invalid output order %s. Valid choices are %s"
386 % (output_order, self.OUTPUT_ORDER))
387 else:
388 self.output_order = output_order
389
390 if change_case:
391 change_case = change_case.upper()
392 if output_type != "GDE":
393 raise ValueError("Change case only valid for GDE output.")
394 elif change_case not in self.CHANGE_CASE:
395 raise ValueError("Invalid change case %s. Valid choices are %s"
396 % (change_case, self.CHANGE_CASE))
397 else:
398 self.change_case = change_case
399
400 if add_seqnos:
401 add_seqnos = add_seqnos.upper()
402 if output_type:
403 raise ValueError("Add SeqNos only valid for CLUSTAL output.")
404 elif add_seqnos not in self.OUTPUT_SEQNOS:
405 raise ValueError("Invalid seqnos option %s. Valid choices: %s"
406 % (add_seqnos, self.OUTPUT_SEQNOS))
407 else:
408 self.add_seqnos = add_seqnos
409
411 """Provide a file to use as the guide tree for alignment.
412
413 Raises:
414 o IOError - If the tree_file doesn't exist."""
415 if not(os.path.exists(tree_file)):
416 raise IOError("Could not find the guide tree file %s." %
417 tree_file)
418 else:
419 self.guide_tree = tree_file
420
422 """Set the name of the guide tree file generated in the alignment.
423 """
424 self.new_tree = tree_file
425
427 """Set the type of protein matrix to use.
428
429 Protein matrix can be either one of the defined types (blosum, pam,
430 gonnet or id) or a file with your own defined matrix.
431 """
432 if protein_matrix.upper() in self.PROTEIN_MATRIX:
433 self.protein_matrix = protein_matrix.upper()
434 elif os.path.exists(protein_matrix):
435 self.protein_matrix = protein_matrix
436 else:
437 raise ValueError("Invalid matrix %s. Options are %s or a file." %
438 (protein_matrix.upper(), self.PROTEIN_MATRIX))
439
441 """Set the type of DNA matrix to use.
442
443 The dna_matrix can either be one of the defined types (iub or clustalw)
444 or a file with the matrix to use."""
445 if dna_matrix.upper() in self.DNA_MATRIX:
446 self.dna_matrix = dna_matrix.upper()
447 elif os.path.exists(dna_matrix):
448 self.dna_matrix = dna_matrix
449 else:
450 raise ValueError("Invalid matrix %s. Options are %s or a file." %
451 (dna_matrix, self.DNA_MATRIX))
452
454 """Set the type of residues within the file.
455
456 Clustal tries to guess whether the info is protein or DNA based on
457 the number of GATCs, but this can be wrong if you have a messed up
458 protein or DNA you are working with, so this allows you to set it
459 explicitly.
460 """
461 residue_type = residue_type.upper()
462 if residue_type in self.RESIDUE_TYPES:
463 self.type = residue_type
464 else:
465 raise ValueError("Invalid residue type %s. Valid choices are %s"
466 % (residue_type, self.RESIDUE_TYPES))
467