1 """Deal with Motifs or Signatures allowing ambiguity in the sequences.
2
3 This class contains Schema which deal with Motifs and Signatures at
4 a higher level, by introducing `don't care` (ambiguity) symbols into
5 the sequences. For instance, you could combine the following Motifs:
6
7 'GATC', 'GATG', 'GATG', 'GATT'
8
9 as all falling under a schema like 'GAT*', where the star indicates a
10 character can be anything. This helps us condense a whole ton of
11 motifs or signatures.
12 """
13
14 import random
15 import re
16
17
18 from Bio import Alphabet
19 from Bio.Seq import MutableSeq
20
21
22 from Pattern import PatternRepository
23
24
25 from Bio.GA import Organism
26 from Bio.GA.Evolver import GenerationEvolver
27 from Bio.GA.Mutation.Simple import SinglePositionMutation
28 from Bio.GA.Crossover.Point import SinglePointCrossover
29 from Bio.GA.Repair.Stabilizing import AmbiguousRepair
30 from Bio.GA.Selection.Tournament import TournamentSelection
31 from Bio.GA.Selection.Diversity import DiversitySelection
32
34 """Deal with motifs that have ambiguity characters in it.
35
36 This motif class allows specific ambiguity characters and tries to
37 speed up finding motifs using regular expressions.
38
39 This is likely to be a replacement for the Schema representation,
40 since it allows multiple ambiguity characters to be used.
41 """
43 """Initialize with ambiguity information.
44
45 Arguments:
46
47 o ambiguity_info - A dictionary which maps letters in the motifs to
48 the ambiguous characters which they might represent. For example,
49 {'R' : 'AG'} specifies that Rs in the motif can match a A or a G.
50 All letters in the motif must be represented in the ambiguity_info
51 dictionary.
52 """
53 self._ambiguity_info = ambiguity_info
54
55
56 self._motif_cache = {}
57
59 """Encode the passed motif as a regular expression pattern object.
60
61 Arguments:
62
63 o motif - The motif we want to encode. This should be a string.
64
65 Returns:
66 A compiled regular expression pattern object that can be used
67 for searching strings.
68 """
69 regexp_string = ""
70
71 for motif_letter in motif:
72 try:
73 letter_matches = self._ambiguity_info[motif_letter]
74 except KeyError:
75 raise KeyError("No match information for letter %s"
76 % motif_letter)
77
78 if len(letter_matches) > 1:
79 regexp_match = "[" + letter_matches + "]"
80 elif len(letter_matches) == 1:
81 regexp_match = letter_matches
82 else:
83 raise ValueError("Unexpected match information %s"
84 % letter_matches)
85
86 regexp_string += regexp_match
87
88 return re.compile(regexp_string)
89
91 """Return the location of ambiguous items in the motif.
92
93 This just checks through the motif and compares each letter
94 against the ambiguity information. If a letter stands for multiple
95 items, it is ambiguous.
96 """
97 ambig_positions = []
98 for motif_letter_pos in range(len(motif)):
99 motif_letter = motif[motif_letter_pos]
100 try:
101 letter_matches = self._ambiguity_info[motif_letter]
102 except KeyError:
103 raise KeyError("No match information for letter %s"
104 % motif_letter)
105
106 if len(letter_matches) > 1:
107 ambig_positions.append(motif_letter_pos)
108
109 return ambig_positions
110
112 """Return the number of ambiguous letters in a given motif.
113 """
114 ambig_positions = self.find_ambiguous(motif)
115 return len(ambig_positions)
116
118 """Return all non-overlapping motif matches in the query string.
119
120 This utilizes the regular expression findall function, and will
121 return a list of all non-overlapping occurances in query that
122 match the ambiguous motif.
123 """
124 try:
125 motif_pattern = self._motif_cache[motif]
126 except KeyError:
127 motif_pattern = self.encode_motif(motif)
128 self._motif_cache[motif] = motif_pattern
129
130 return motif_pattern.findall(query)
131
133 """Find the number of non-overlapping times motif occurs in query.
134 """
135 all_matches = self.find_matches(motif, query)
136 return len(all_matches)
137
139 """Return a listing of all unambiguous letters allowed in motifs.
140 """
141 all_letters = self._ambiguity_info.keys()
142 all_letters.sort()
143
144 unambig_letters = []
145
146 for letter in all_letters:
147 possible_matches = self._ambiguity_info[letter]
148 if len(possible_matches) == 1:
149 unambig_letters.append(letter)
150
151 return unambig_letters
152
153
154
155
156
158 """Alphabet of a simple Schema for DNA sequences.
159
160 This defines a simple alphabet for DNA sequences that has a single
161 character which can match any other character.
162
163 o G,A,T,C - The standard unambiguous DNA alphabet.
164
165 o * - Any letter
166 """
167 letters = ["G", "A", "T", "C", "*"]
168
169 alphabet_matches = {"G" : "G",
170 "A" : "A",
171 "T" : "T",
172 "C" : "C",
173 "*" : "GATC"}
174
175
176
178 """Find schemas using a genetic algorithm approach.
179
180 This approach to finding schema uses Genetic Algorithms to evolve
181 a set of schema and find the best schema for a specific set of
182 records.
183
184 The 'default' finder searches for ambiguous DNA elements. This
185 can be overridden easily by creating a GeneticAlgorithmFinder
186 with a different alphabet.
187 """
189 """Initialize a finder to get schemas using Genetic Algorithms.
190
191 Arguments:
192
193 o alphabet -- The alphabet which specifies the contents of the
194 schemas we'll be generating. This alphabet must contain the
195 attribute 'alphabet_matches', which is a dictionary specifying
196 the potential ambiguities of each letter in the alphabet. These
197 ambiguities will be used in building up the schema.
198 """
199 self.alphabet = alphabet
200
201 self.initial_population = 500
202 self.min_generations = 10
203
204 self._set_up_genetic_algorithm()
205
225
227 """Find the given number of unique schemas using a genetic algorithm
228
229 Arguments:
230
231 o fitness - A callable object (ie. function) which will evaluate
232 the fitness of a motif.
233
234 o num_schemas - The number of unique schemas with good fitness
235 that we want to generate.
236 """
237 start_population = \
238 Organism.function_population(self.motif_generator.random_motif,
239 self.initial_population,
240 fitness)
241 finisher = SimpleFinisher(num_schemas, self.min_generations)
242
243
244 evolver = GenerationEvolver(start_population, self.selector)
245 evolved_pop = evolver.evolve(finisher.is_finished)
246
247
248 schema_info = {}
249 for org in evolved_pop:
250
251
252 seq_genome = org.genome.toseq()
253 schema_info[seq_genome.data] = org.fitness
254
255 return PatternRepository(schema_info)
256
257
258
260 """Calculate fitness for schemas that differentiate between sequences.
261 """
262 - def __init__(self, positive_seqs, negative_seqs, schema_evaluator):
263 """Initialize with different sequences to evaluate
264
265 Arguments:
266
267 o positive_seq - A list of SeqRecord objects which are the 'positive'
268 sequences -- the ones we want to select for.
269
270 o negative_seq - A list of SeqRecord objects which are the 'negative'
271 sequences that we want to avoid selecting.
272
273 o schema_evaluator - An Schema class which can be used to
274 evaluate find motif matches in sequences.
275 """
276 self._pos_seqs = positive_seqs
277 self._neg_seqs = negative_seqs
278 self._schema_eval = schema_evaluator
279
281 """Calculate the fitness for a given schema.
282
283 Fitness is specified by the number of occurances of the schema in
284 the positive sequences minus the number of occurances in the
285 negative examples.
286
287 This fitness is then modified by multiplying by the length of the
288 schema and then dividing by the number of ambiguous characters in
289 the schema. This helps select for schema which are longer and have
290 less redundancy.
291 """
292
293 seq_motif = genome.toseq()
294 motif = seq_motif.data
295
296
297 num_pos = 0
298 for seq_record in self._pos_seqs:
299 cur_counts = self._schema_eval.num_matches(motif,
300 seq_record.seq.data)
301 num_pos += cur_counts
302
303
304 num_neg = 0
305 for seq_record in self._neg_seqs:
306 cur_counts = self._schema_eval.num_matches(motif,
307 seq_record.seq.data)
308
309 num_neg += cur_counts
310
311 num_ambiguous = self._schema_eval.num_ambiguous(motif)
312
313 num_ambiguous = pow(2.0, num_ambiguous)
314
315 num_ambiguous += 1
316
317 motif_size = len(motif)
318 motif_size = motif_size * 4.0
319
320 discerning_power = num_pos - num_neg
321
322 diff = (discerning_power * motif_size) / float(num_ambiguous)
323 return diff
324
326 """Calculate a fitness giving weight to schemas that match many times.
327
328 This fitness function tries to maximize schemas which are found many
329 times in a group of sequences.
330 """
331 - def __init__(self, seq_records, schema_evaluator):
332 """Initialize with sequences to evaluate.
333
334 Arguments:
335
336 o seq_records -- A set of SeqRecord objects which we use to
337 calculate the fitness.
338
339 o schema_evaluator - An Schema class which can be used to
340 evaluate find motif matches in sequences.
341 """
342 self._records = seq_records
343 self._evaluator = schema_evaluator
344
346 """Calculate the fitness of a genome based on schema matches.
347
348 This bases the fitness of a genome completely on the number of times
349 it matches in the set of seq_records. Matching more times gives a
350 better fitness
351 """
352
353 seq_motif = genome.toseq()
354 motif = seq_motif.data
355
356
357 num_times = 0
358 for seq_record in self._records:
359 cur_counts = self._evaluator.num_matches(motif,
360 seq_record.seq.data)
361 num_times += cur_counts
362
363 return num_times
364
365
367 """Generate a random motif within given parameters.
368 """
369 - def __init__(self, alphabet, min_size = 12, max_size = 17):
370 """Initialize with the motif parameters.
371
372 Arguments:
373
374 o alphabet - An alphabet specifying what letters can be inserted in
375 a motif.
376
377 o min_size, max_size - Specify the range of sizes for motifs.
378 """
379 self._alphabet = alphabet
380 self._min_size = min_size
381 self._max_size = max_size
382
384 """Create a random motif within the given parameters.
385
386 This returns a single motif string with letters from the given
387 alphabet. The size of the motif will be randomly chosen between
388 max_size and min_size.
389 """
390 motif_size = random.randrange(self._min_size, self._max_size)
391
392 motif = ""
393 for letter_num in range(motif_size):
394 cur_letter = random.choice(self._alphabet.letters)
395 motif += cur_letter
396
397 return MutableSeq(motif, self._alphabet)
398
400 """Determine when we are done evolving motifs.
401
402 This takes the very simple approach of halting evolution when the
403 GA has proceeded for a specified number of generations and has
404 a given number of unique schema with positive fitness.
405 """
406 - def __init__(self, num_schemas, min_generations = 100):
407 """Initialize the finisher with its parameters.
408
409 Arguments:
410
411 o num_schemas -- the number of useful (positive fitness) schemas
412 we want to generation
413
414 o min_generations -- The minimum number of generations to allow
415 the GA to proceed.
416 """
417 self.num_generations = 0
418
419 self.num_schemas = num_schemas
420 self.min_generations = min_generations
421
423 """Determine when we can stop evolving the population.
424 """
425 self.num_generations += 1
426
427
428 if self.num_generations >= self.min_generations:
429 all_seqs = []
430 for org in organisms:
431 if org.fitness > 0:
432 if org.genome not in all_seqs:
433 all_seqs.append(org.genome)
434
435 if len(all_seqs) >= self.num_schemas:
436 return 1
437
438 return 0
439
440
442 """Find schema in a set of sequences using a genetic algorithm approach.
443
444 Finding good schemas is very difficult because it takes forever to
445 enumerate all of the potential schemas. This finder using a genetic
446 algorithm approach to evolve good schema which match many times in
447 a set of sequences.
448
449 The default implementation of the finder is ready to find schemas
450 in a set of DNA sequences, but the finder can be customized to deal
451 with any type of data.
452 """
459
460 - def find(self, seq_records):
461 """Find well-represented schemas in the given set of SeqRecords.
462 """
463 fitness_evaluator = MostCountSchemaFitness(seq_records,
464 self.evaluator)
465
466 return self._finder.find_schemas(fitness_evaluator.calculate_fitness,
467 self.num_schemas)
468
470 """Find schemas which differentiate between the two sets of SeqRecords.
471 """
472 fitness_evaluator = DifferentialSchemaFitness(first_records,
473 second_records,
474 self.evaluator)
475
476 return self._finder.find_schemas(fitness_evaluator.calculate_fitness,
477 self.num_schemas)
478
480 """Convert a sequence into a representation of ambiguous motifs (schemas).
481
482 This takes a sequence, and returns the number of times specified
483 motifs are found in the sequence. This lets you represent a sequence
484 as just a count of (possibly ambiguous) motifs.
485 """
486 - def __init__(self, schemas, ambiguous_converter):
487 """Initialize the coder to convert sequences
488
489 Arguments:
490
491 o schema - A list of all of the schemas we want to search for
492 in input sequences.
493
494 o ambiguous_converter - An Schema class which can be
495 used to convert motifs into regular expressions for searching.
496 """
497 self._schemas = schemas
498 self._converter = ambiguous_converter
499
501 """Represent the given input sequence as a bunch of motif counts.
502
503 Arguments:
504
505 o sequence - A Bio.Seq object we are going to represent as schemas.
506
507 This takes the sequence, searches for the motifs within it, and then
508 returns counts specifying the relative number of times each motifs
509 was found. The frequencies are in the order the original motifs were
510 passed into the initializer.
511 """
512 schema_counts = []
513
514 for schema in self._schemas:
515 num_counts = self._converter.num_matches(schema, sequence.data)
516 schema_counts.append(num_counts)
517
518
519 min_count = 0
520 max_count = max(schema_counts)
521
522
523
524 if max_count > 0:
525 for count_num in range(len(schema_counts)):
526 schema_counts[count_num] = (float(schema_counts[count_num]) -
527 float(min_count)) / float(max_count)
528
529 return schema_counts
530
532 """Determine whether or not the given pattern matches the schema.
533
534 Arguments:
535
536 o pattern - A string representing the pattern we want to check for
537 matching. This pattern can contain ambiguity characters (which are
538 assumed to be the same as those in the schema).
539
540 o schema - A string schema with ambiguity characters.
541
542 o ambiguity_character - The character used for ambiguity in the schema.
543 """
544 if len(pattern) != len(schema):
545 return 0
546
547
548
549 for pos in range(len(pattern)):
550 if (schema[pos] != ambiguity_character and
551 pattern[pos] != ambiguity_character and
552 pattern[pos] != schema[pos]):
553
554 return 0
555
556 return 1
557
559 """Generate Schema from inputs of Motifs or Signatures.
560 """
561 - def __init__(self, ambiguity_symbol = '*'):
562 """Initialize the SchemaFactory
563
564 Arguments:
565
566 o ambiguity_symbol -- The symbol to use when specifying that
567 a position is arbitrary.
568 """
569 self._ambiguity_symbol = ambiguity_symbol
570
571 - def from_motifs(self, motif_repository, motif_percent, num_ambiguous):
572 """Generate schema from a list of motifs.
573
574 Arguments:
575
576 o motif_repository - A MotifRepository class that has all of the
577 motifs we want to convert to Schema.
578
579 o motif_percent - The percentage of motifs in the motif bank which
580 should be matches. We'll try to create schema that match this
581 percentage of motifs.
582
583 o num_ambiguous - The number of ambiguous characters to include
584 in each schema. The positions of these ambiguous characters will
585 be randomly selected.
586 """
587
588 all_motifs = motif_repository.get_top_percentage(motif_percent)
589
590
591 schema_info = {}
592
593
594 total_count = self._get_num_motifs(motif_repository, all_motifs)
595 matched_count = 0
596 assert total_count > 0, "Expected to have motifs to match"
597 while (float(matched_count) / float(total_count)) < motif_percent:
598
599 new_schema, matching_motifs = \
600 self._get_unique_schema(schema_info.keys(),
601 all_motifs, num_ambiguous)
602
603
604
605 schema_counts = 0
606 for motif in matching_motifs:
607
608 schema_counts += motif_repository.count(motif)
609
610
611
612 all_motifs.remove(motif)
613
614
615
616 schema_info[new_schema] = schema_counts
617
618 matched_count += schema_counts
619
620
621
622 return PatternRepository(schema_info)
623
625 """Return the number of motif counts for the list of motifs.
626 """
627 motif_count = 0
628 for motif in motif_list:
629 motif_count += repository.count(motif)
630
631 return motif_count
632
634 """Retrieve a unique schema from a motif.
635
636 We don't want to end up with schema that match the same thing,
637 since this could lead to ambiguous results, and be messy. This
638 tries to create schema, and checks that they do not match any
639 currently existing schema.
640 """
641
642
643
644 num_tries = 0
645
646 while 1:
647
648 cur_motif = random.choice(motif_list)
649
650 num_tries += 1
651
652 new_schema, matching_motifs = \
653 self._schema_from_motif(cur_motif, motif_list,
654 num_ambiguous)
655
656 has_match = 0
657 for old_schema in cur_schemas:
658 if matches_schema(new_schema, old_schema,
659 self._ambiguity_symbol):
660 has_match = 1
661
662
663
664 if not(has_match):
665 break
666
667
668 assert num_tries < 150, \
669 "Could not generate schema in %s tries from %s with %s" \
670 % (num_tries, motif_list, cur_schemas)
671
672 return new_schema, matching_motifs
673
675 """Create a schema from a given starting motif.
676
677 Arguments:
678
679 o motif - A motif with the pattern we will start from.
680
681 o motif_list - The total motifs we have.to match to.
682
683 o num_ambiguous - The number of ambiguous characters that should
684 be present in the schema.
685
686 Returns:
687
688 o A string representing the newly generated schema.
689
690 o A list of all of the motifs in motif_list that match the schema.
691 """
692 assert motif in motif_list, \
693 "Expected starting motif present in remaining motifs."
694
695
696
697 new_schema_list = list(motif)
698 for add_ambiguous in range(num_ambiguous):
699
700 while 1:
701 ambig_pos = random.choice(range(len(new_schema_list)))
702
703
704
705 if new_schema_list[ambig_pos] != self._ambiguity_symbol:
706 new_schema_list[ambig_pos] = self._ambiguity_symbol
707 break
708
709
710 new_schema = ''.join(new_schema_list)
711
712
713 matched_motifs = []
714 for motif in motif_list:
715 if matches_schema(motif, new_schema, self._ambiguity_symbol):
716 matched_motifs.append(motif)
717
718 return new_schema, matched_motifs
719
721 raise NotImplementedError("Still need to code this.")
722