1 """Generic functionality useful for all gene representations.
2
3 This module contains classes which can be used for all the different
4 types of patterns available for representing gene information (i.e. motifs,
5 signatures and schemas). These are the general classes which should be
6 able to handle any of the different specific patterns.
7 """
8
9 import string
10 import random
11
12
13 from Bio import utils
14 from Bio.Seq import Seq, MutableSeq
15
17 """Allow reading and writing of patterns to files.
18
19 This just defines a simple persistence class for patterns, making
20 it easy to write them to a file and read 'em back.
21 """
23 """Initialize the reader and writer class.
24
25 Arguments:
26
27 o alphabet - An optional argument specifying the alphabet
28 which patterns should follow. If an alphabet is set it'll be used
29 to verify that all patterns follow it.
30
31 Attributes:
32 o separator - A character to use in separating items in a signature
33 when it is written to a file and read back. This character should
34 not be in the possible alphabet of the sequences, or there will
35 be trouble.
36 """
37 self._alphabet = alphabet
38
39 self.separator = ";"
40
def write(self, pattern_list, output_handle):
    """Write a list of patterns to the given handle, one pattern per line.

    Arguments:

    o pattern_list - The patterns to write. An item that is a list or a
    tuple is treated as a multi-part pattern: its parts are joined with
    self.separator. Any other item is written as-is through string
    formatting.

    o output_handle - An object with a write() method accepting strings.
    """
    for pattern in pattern_list:
        # isinstance (instead of an exact type comparison) also covers
        # list/tuple subclasses, which must be joined rather than
        # %-formatted directly.
        if isinstance(pattern, (list, tuple)):
            string_pattern = self.separator.join(pattern)
        else:
            string_pattern = pattern

        # Wrap in a 1-tuple so the value is always a single format argument.
        output_handle.write("%s\n" % (string_pattern,))
54
def write_seq(self, seq_pattern_list, output_handle):
    """Convenience function to write Seq objects to a file.

    Takes Seq and MutableSeq objects, extracts their string data and
    passes the resulting strings on to write().

    Arguments:

    o seq_pattern_list - The Seq / MutableSeq objects to write.

    o output_handle - The handle given to write().

    Raises ValueError if an item is neither a Seq nor a MutableSeq.
    """
    all_patterns = []

    for seq_pattern in seq_pattern_list:
        # MutableSeq is tested first, as in the original ordering.
        if isinstance(seq_pattern, MutableSeq):
            all_patterns.append(seq_pattern.toseq().data)
        elif isinstance(seq_pattern, Seq):
            all_patterns.append(seq_pattern.data)
        else:
            # The 1-tuple keeps a tuple-valued item from being unpacked
            # as several format arguments (which would raise TypeError
            # instead of the intended ValueError).
            raise ValueError("Unexpected pattern type %r" % (seq_pattern,))

    self.write(all_patterns, output_handle)
74
def read(self, input_handle):
    """Read patterns from the specified handle.

    Each line holds one pattern. Trailing whitespace is stripped. A line
    containing self.separator is split on it and returned as a tuple of
    parts; any other line is returned as a plain string. When
    self._alphabet is not None, every pattern (or each part of a
    multi-part pattern) is verified against it.

    Arguments:

    o input_handle - An object with a readline() method. Reading stops
    at the first empty value it returns.

    Returns a list of the patterns in the order they were read.

    Raises ValueError if a pattern does not match the alphabet.
    """
    all_patterns = []

    while True:
        cur_line = input_handle.readline()
        if not cur_line:
            break

        cur_pattern = cur_line.rstrip()

        # Multi-part patterns come back as tuples of their parts.
        if self.separator in cur_pattern:
            cur_pattern = tuple(cur_pattern.split(self.separator))

        if self._alphabet is not None:
            # Normalise to a sequence of parts so that single and
            # multi-part patterns are checked the same way.
            if isinstance(cur_pattern, tuple):
                test_pattern = cur_pattern
            else:
                test_pattern = [cur_pattern]

            for pattern_item in test_pattern:
                pattern_seq = Seq(pattern_item, self._alphabet)
                if not utils.verify_alphabet(pattern_seq):
                    raise ValueError("Pattern %s not matching alphabet %s"
                                     % (cur_pattern, self._alphabet))

        all_patterns.append(cur_pattern)

    return all_patterns
107
109 """This holds a list of specific patterns found in sequences.
110
111 This is designed to be a general holder for a set of patterns and
112 should be subclassed for specific implementations (ie. holding Motifs
113 or Signatures).
114 """
116 """Initialize a repository with patterns.
117
118 Arguments:
119
120 o pattern_info - A representation of all of the patterns found in
121 a *Finder search. This should be a dictionary, where the keys
122 are patterns, and the values are the number of times a pattern is
123 found.
124
125 The patterns are represented internally as a list of
126 two-tuples, where the first element is the number of times a pattern
127 occurs, and the second is the pattern itself. This makes it easy
128 to sort the list and return the top N patterns.
129 """
130 self._pattern_dict = pattern_info
131
132
133 self._pattern_list = []
134 for pattern_name in self._pattern_dict.keys():
135 self._pattern_list.append((self._pattern_dict[pattern_name],
136 pattern_name))
137
138 self._pattern_list.sort()
139 self._pattern_list.reverse()
140
142 """Retrieve all of the patterns in the repository.
143 """
144 patterns = []
145 for pattern_info in self._pattern_list:
146 patterns.append(pattern_info[1])
147
148 return patterns
149
151 """Retrieve the specified number of patterns randomly.
152
153 Randomly selects patterns from the list and returns them.
154
155 Arguments:
156
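157 o num_patterns - The total number of patterns to return. This must not exceed the number of distinct patterns held, otherwise the selection loop never finishes.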
158 """
159 all_patterns = []
160
161 while len(all_patterns) < num_patterns:
162
163 new_pattern_info = random.choice(self._pattern_list)
164
165 if new_pattern_info[1] not in all_patterns:
166 all_patterns.append(new_pattern_info[1])
167
168 return all_patterns
169
171 """Return a percentage of the patterns.
172
173 This returns the top 'percent' of the patterns in the repository,
174 where 'percent' is given as a fraction (e.g. 0.25 for the top 25%).
175 """
176 all_patterns = self.get_all()
177
178 num_to_return = int(len(all_patterns) * percent)
179
180 return all_patterns[:num_to_return]
181
183 """Return the specified number of most frequently occurring patterns
184
185 Arguments:
186
187 o num_patterns - The number of patterns to return.
188 """
189 all_patterns = []
190 for pattern_info in self._pattern_list[:num_patterns]:
191 all_patterns.append(pattern_info[1])
192
193 return all_patterns
194
196 """Retrieve patterns that are at the extreme ranges.
197
198 This returns both patterns at the top of the list (ie. the same as
199 returned by get_top) and at the bottom of the list. This
200 is especially useful for patterns that are the differences between
201 two sets of patterns.
202
203 Arguments:
204
205 o top_num - The number of patterns to take from the top of the list.
206
207 o bottom_num - The number of patterns to take from the bottom of
208 the list. Note that 0 selects every pattern, not none (slice [-0:]).
209 """
210 all_patterns = []
211
212 for pattern_info in self._pattern_list[:top_num]:
213 all_patterns.append(pattern_info[1])
214
215
216 for pattern_info in self._pattern_list[-bottom_num:]:
217 all_patterns.append(pattern_info[1])
218
219 return all_patterns
220
222 """Remove patterns which are likely due to polyA tails from the lists.
223
224 This is just a helper function to remove patterns which are likely
225 just due to polyA tails, and thus are not really great motifs.
226 This will also get rid of stuff like ATATAT, which might be a
227 useful motif, so use at your own discretion.
228
229 XXX Could we write a more general function, based on info content
230 or something like that?
231
232 Arguments:
233
234 o at_percentage - The fraction (0 to 1) of A and T residues in a pattern
235 above which the pattern is removed.
236 """
237 remove_list = []
238
239 for pattern_info in self._pattern_list:
240 pattern_at = (float(string.count(pattern_info[1], 'A') +
241 string.count(pattern_info[1], 'T')) /
242 float(len(pattern_info[1])))
243 if pattern_at > at_percentage:
244 remove_list.append(pattern_info)
245
246
247 for to_remove in remove_list:
248 self._pattern_list.remove(to_remove)
249
def count(self, pattern):
    """Return the number of times the specified pattern is found.

    The value is looked up in self._pattern_dict; a pattern that is not
    a key there yields 0.
    """
    try:
        return self._pattern_dict[pattern]
    except KeyError:
        return 0
257