Package Bio :: Package AlignAce :: Module Motif
[hide private]
[frames] | no frames]

Source Code for Module Bio.AlignAce.Motif

  1  # Copyright 2003 by Bartek Wilczynski.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  """ 
  6  Implementation of sequence motifs. 
  7   
  8  Changes: 
  9  9.2007 (BW) : added the to_faste() and .weblogo() methods allowing to use the Berkeley weblogo server at http://weblogo.berkeley.edu/ 
 10  """ 
 11   
 12  from __future__ import generators 
 13  from Bio.SubsMat import FreqTable 
 14   
15 -class Motif(object):
16 """ 17 A class representing sequence motifs. 18 """
19 - def __init__(self):
20 self.instances = [] 21 self.score = 0.0 22 self.mask = [] 23 self._pwm_is_current = 0 24 self._pwm = [] 25 self.alphabet=None 26 self.length=None
27
28 - def _check_length(self, len):
29 if self.length==None: 30 self.length = len 31 elif self.length != len: 32 raise ValueError, "You can't change the length of the motif"
33
34 - def _check_alphabet(self,alphabet):
35 if self.alphabet==None: 36 self.alphabet=alphabet 37 elif self.alphabet != alphabet: 38 raise ValueError, "Wrong Alphabet"
39
40 - def add_instance(self,instance):
41 """ 42 adds new instance to the motif 43 """ 44 self._check_alphabet(instance.alphabet) 45 self._check_length(len(instance)) 46 self.instances.append(instance) 47 self._pwm_is_current = False
48
49 - def set_mask(self,mask):
50 """ 51 sets the mask for the motif 52 53 The mask should be a string containing asterisks in the position of significant columns and spaces in other columns 54 """ 55 self._check_length(len(mask)) 56 self.mask=[] 57 for char in mask: 58 if char=="*": 59 self.mask.append(1) 60 elif char==" ": 61 self.mask.append(0) 62 else: 63 raise ValueError("Mask should contain only '*' or ' ' and not a '%s'"%char)
64
65 - def pwm(self):
66 """ 67 returns the PWM computed for the set of instances 68 """ 69 70 if self._pwm_is_current: 71 return self._pwm 72 #we need to compute new pwm 73 self._pwm = [] 74 for i in xrange(len(self.mask)): 75 dict = {} 76 #filling the dict with 0's 77 for letter in self.alphabet.letters: 78 dict[letter]=0 79 #counting the occurences of letters in instances 80 for seq in self.instances: 81 dict[seq[i]]=dict[seq[i]]+1 82 self._pwm.append(FreqTable.FreqTable(dict,FreqTable.COUNT,self.alphabet)) 83 self._pwm_is_current=True 84 return self._pwm
85
86 - def search_instances(self,sequence):
87 """ 88 a generator function, returning found positions of instances of the motif in a given sequence 89 """ 90 for pos in xrange(0,len(sequence)-self.length+1): 91 for instance in self.instances: 92 if instance.tostring()==sequence[pos:pos+self.length].tostring(): 93 yield(pos,instance) 94 break # no other instance will fit (we don't want to return multiple hits)
95
96 - def score_hit(self,sequence,position,normalized=1,masked=0):
97 """ 98 give the pwm score for a given position 99 """ 100 score = 0.0 101 for pos in xrange(self.length): 102 if not masked or self.mask[pos]: 103 score += self.pwm()[pos][sequence[position+pos]] 104 if normalized: 105 if not masked: 106 score/=self.length 107 else: 108 score/=len(filter(lambda x: x, self.mask)) 109 return score
110
111 - def search_pwm(self,sequence,threshold=0.0,normalized=1,masked=1):
112 """ 113 a generator function, returning found hits in a given sequence with the pwm score higher than the threshold 114 """ 115 116 for pos in xrange(0,len(sequence)-self.length+1): 117 score = self.score_hit(sequence,pos,normalized,masked) 118 if score > threshold: 119 yield (pos,score)
120 121
122 - def sim(self, motif, masked = 0):
123 """ 124 return the similarity score for the given motif against self. 125 126 We use the Pearson's correlation of the respective probabilities. 127 If the motifs have different length or mask raise the ValueError. 128 """ 129 130 from math import sqrt 131 132 if self.alphabet != motif.alphabet: 133 raise ValueError("Wrong alphabet") 134 135 if self.length != motif.length: 136 raise ValueError("Wrong length") 137 138 if masked and self.mask!=motif.mask: 139 raise ValueError("Wrong mask") 140 141 sxx = 0 # \sum x^2 142 sxy = 0 # \sum x \cdot y 143 sx = 0 # \sum x 144 sy = 0 # \sum y 145 syy = 0 # \sum x^2 146 147 for pos in xrange(self.length): 148 if not masked or self.mask: 149 for l in self.alphabet.letters: 150 xi = self.pwm()[pos][l] 151 yi = motif.pwm()[pos][l] 152 sx = sx + xi 153 sy = sy + yi 154 sxx = sxx + xi * xi 155 syy = syy + yi * yi 156 sxy = sxy + xi * yi 157 158 if masked: 159 norm = len(filter(lambda x: x,self.mask)) 160 else: 161 norm = self.length 162 163 norm *= len(self.alphabet.letters) 164 s1 = (sxy - sx*sy*1.0/norm) 165 s2 = (sxx - sx*sx*1.0/norm)*(syy- sy*sy*1.0/norm) 166 167 return s1/sqrt(s2)
168
169 - def read(self,stream):
170 """ 171 reads the motif from the stream 172 173 the self.alphabet variable must be set before 174 """ 175 from Bio.Seq import Seq 176 while 1: 177 ln = stream.readline() 178 if "*" in ln: 179 self.set_mask(ln.strip("\n\c")) 180 break 181 self.add_instance(Seq(ln.strip(),self.alphabet))
182
183 - def __str__(self):
184 """ 185 string representation of motif 186 """ 187 str = "" 188 for inst in self.instances: 189 str = str + inst.tostring() + "\n" 190 191 for i in xrange(self.length): 192 if self.mask[i]: 193 str = str + "*" 194 else: 195 str = str + " " 196 str = str + "\n" 197 198 return str
199
200 - def write(self,stream):
201 """ 202 writes the motif to the stream 203 """ 204 205 stream.write(self.__str__())
206 207 208
209 - def to_fasta(self):
210 """ 211 FASTA representation of motif 212 """ 213 str = "" 214 for i,inst in enumerate(self.instances): 215 str = str + "> instance %d\n"%i + inst.tostring() + "\n" 216 217 return str
218
219 - def weblogo(self,fname,format="PNG",**kwds):
220 """ 221 uses the Berkeley weblogo service to download and save a weblogo of itself 222 223 requires an internet connection. 224 The parameters from **kwds are passed directly to the weblogo server. 225 """ 226 import urllib 227 import urllib2 228 #import Image 229 al= self.to_fasta() 230 231 url = 'http://weblogo.berkeley.edu/logo.cgi' 232 values = {'sequence' : al, 233 'format' : format, 234 'logowidth' : '18', 235 'logoheight' : '5', 236 'logounits' : 'cm', 237 'kind' : 'AUTO', 238 'firstnum' : "1", 239 'command' : 'Create Logo', 240 'smallsamplecorrection' : "on", 241 'symbolsperline' : 32, 242 'res' : '96', 243 'res_units' : 'ppi', 244 'antialias' : 'on', 245 'title' : '', 246 'barbits' : '', 247 'xaxis': 'on', 248 'xaxis_label' : '', 249 'yaxis': 'on', 250 'yaxis_label' : '', 251 'showends' : 'on', 252 'shrink' : '0.5', 253 'fineprint' : 'on', 254 'ticbits' : '1', 255 'colorscheme' : 'DEFAULT', 256 'color1' : 'green', 257 'color2' : 'blue', 258 'color3' : 'red', 259 'color4' : 'black', 260 'color5' : 'purple', 261 'color6' : 'orange', 262 'color1' : 'black', 263 } 264 for k,v in kwds.items(): 265 values[k]=str(v) 266 267 data = urllib.urlencode(values) 268 req = urllib2.Request(url, data) 269 response = urllib2.urlopen(req) 270 f=open(fname,"w") 271 im=response.read() 272 273 f.write(im) 274 f.close()
275