Package Bio :: Package NeuralNetwork :: Package Gene :: Module Motif
[hide private]
[frames | no frames]

Source Code for Module Bio.NeuralNetwork.Gene.Motif

  1  """Find and deal with motifs in biological sequence data. 
  2   
  3  Representing DNA (or RNA or proteins) in a neural network can be difficult 
  4  since input sequences can have different lengths. One way to get around 
  5  this problem is to deal with sequences by finding common motifs, and counting 
  6  the number of times those motifs occur in a sequence. This information can 
  7  then be used for creating the neural networks, with occurrences of motifs 
  8  going into the network instead of raw sequence data. 
  9  """ 
 10  # biopython 
 11  from Bio import utils 
 12  from Bio.Seq import Seq 
 13   
 14  # local modules 
 15  from Pattern import PatternRepository 
 16   
class MotifFinder:
    """Find motifs in a set of Sequence Records.

    A motif is every contiguous window of a fixed size taken from the
    sequence of each record; the finder counts how often each window occurs.
    """

    def __init__(self, alphabet_strict=1):
        """Initialize a finder to get motifs.

        Arguments:

        o alphabet_strict - Whether or not motifs should be
        restricted to having all of their elements within the alphabet
        of the sequences. This requires that the Sequences have a real
        alphabet, and that all sequences have the same alphabet.
        """
        self.alphabet_strict = alphabet_strict

    def find(self, seq_records, motif_size):
        """Find all motifs of the given size in the passed SeqRecords.

        Arguments:

        o seq_records - A list of SeqRecord objects which the motifs
        will be found from.

        o motif_size - The size of the motifs we want to look for.

        Returns:
        A PatternRepository object that contains all of the motifs (and
        their counts) found in the training sequences.
        """
        motif_info = self._get_motif_dict(seq_records, motif_size)

        return PatternRepository(motif_info)

    def _get_motif_dict(self, seq_records, motif_size):
        """Return a dictionary with information on motifs.

        This internal function essentially does all of the hard work for
        finding motifs, and returns a dictionary mapping each found motif
        to its count. This is internal so it can be reused by
        find_differences.

        Arguments:

        o seq_records - A list of objects with a 'seq' attribute. Each
        'seq' must support len() and slicing, and its slices must expose
        the motif through a 'data' attribute.

        o motif_size - The size of the motifs we want to look for.

        An empty list of records gives an empty dictionary. When being
        alphabet strict, an AssertionError is raised if a record uses a
        different alphabet than the first record.
        """
        # the reference alphabet comes from the first record; with no
        # records there is nothing to look at, so skip the lookup instead
        # of failing on seq_records[0]
        if self.alphabet_strict and seq_records:
            alphabet = seq_records[0].seq.alphabet
        else:
            alphabet = None

        # loop through all records to find the motifs in the sequences
        all_motifs = {}
        for seq_record in seq_records:
            seq = seq_record.seq

            # if we are working with alphabets, make sure we are consistent
            if alphabet is not None:
                assert seq.alphabet == alphabet, \
                       "Working with alphabet %s and got %s" % \
                       (alphabet, seq.alphabet)

            # slide a window of motif_size over the sequence; sequences
            # shorter than motif_size give an empty range
            for start in range(len(seq) - (motif_size - 1)):
                motif = seq[start:start + motif_size].data

                # if we are being alphabet strict, skip any motif that
                # does not fall within the specified alphabet
                if alphabet is not None:
                    motif_seq = Seq(motif, alphabet)
                    if not utils.verify_alphabet(motif_seq):
                        continue

                all_motifs = self._add_motif(all_motifs, motif)

        return all_motifs

    def find_differences(self, first_records, second_records, motif_size):
        """Find motifs in two sets of records and return the differences.

        This is used for finding motifs, but instead of just counting up all
        of the motifs in a set of records, this returns the differences
        between two listings of seq_records.

        o first_records, second_records - Two listings of SeqRecord objects
        to have their motifs compared.

        o motif_size - The size of the motifs we are looking for.

        Returns:
        A PatternRepository object that has motifs, but instead of their
        raw counts, this has the counts in the second set of records
        subtracted from the counts in the first set. A motif missing from
        one of the sets is treated as having a count of zero there.
        """
        first_motifs = self._get_motif_dict(first_records, motif_size)
        second_motifs = self._get_motif_dict(second_records, motif_size)

        motif_diffs = {}

        # first deal with all of the motifs from the first set
        for cur_key, first_count in first_motifs.items():
            motif_diffs[cur_key] = first_count - second_motifs.get(cur_key, 0)

        # now put in the motifs that only show up in the second set
        for cur_key, second_count in second_motifs.items():
            if cur_key not in first_motifs:
                motif_diffs[cur_key] = 0 - second_count

        return PatternRepository(motif_diffs)

    def _add_motif(self, motif_dict, motif_to_add):
        """Add a motif to the given dictionary.

        The count of the motif is incremented if it is already present,
        otherwise it is added with a count of one. The dictionary is
        changed in place and also returned.
        """
        motif_dict[motif_to_add] = motif_dict.get(motif_to_add, 0) + 1

        return motif_dict
145
class MotifCoder:
    """Convert motifs and a sequence into neural network representations.

    This is designed to convert a sequence into a representation that
    can be fed as an input into a neural network. It does this by
    representing a sequence based on the motifs present.
    """

    def __init__(self, motifs):
        """Initialize an input producer with motifs to look for.

        Arguments:

        o motifs - A complete list of motifs, in order, that are to be
        searched for in a sequence.

        Raises a ValueError if the motifs are not all the same size as
        the first one. An empty list fails with an IndexError, since the
        expected size is read from the first motif.
        """
        self._motifs = motifs

        # check to be sure the motifs make sense (all the same size)
        self._motif_size = len(self._motifs[0])
        for motif in self._motifs:
            if len(motif) != self._motif_size:
                raise ValueError("Motif %s given, expected motif size %s"
                                 % (motif, self._motif_size))

    def representation(self, sequence):
        """Represent a sequence as a set of motifs.

        Arguments:

        o sequence - A Bio.Seq object to represent as a motif.

        This converts a sequence into a representation based on the motifs.
        The representation is returned as a list with one value per motif,
        in the input order of the motifs specified in the initializer.
        Each value is the number of times the motif occurred, minus the
        smallest count among the motifs, divided by the largest count
        among the motifs. If none of the motifs occur in the sequence,
        every value is 0.
        """
        # initialize a dictionary to hold the motifs in this sequence
        seq_motifs = dict.fromkeys(self._motifs, 0)

        # count all of the motifs we are looking for in the sequence;
        # windows that are not one of our motifs are ignored
        for start in range(len(sequence) - (self._motif_size - 1)):
            motif = sequence[start:start + self._motif_size].data

            if motif in seq_motifs:
                seq_motifs[motif] += 1

        # scale the counts using the smallest and largest count
        min_count = min(seq_motifs.values())
        max_count = max(seq_motifs.values())

        # as long as we have some motifs present, normalize them
        # otherwise we'll just return 0 for everything
        if max_count > 0:
            for motif in seq_motifs:
                seq_motifs[motif] = (float(seq_motifs[motif] - min_count)
                                     / float(max_count))

        # return the relative motif counts in the specified order
        return [seq_motifs[motif] for motif in self._motifs]
212