Package Bio :: Package expressions :: Module hmmpfam
[hide private]
[frames] | no frames]

Source Code for Module Bio.expressions.hmmpfam

  1  """Martel expression for the hmmpfam database search program in hmmer. 
  2   
  3  This has been tested with version 2.2g. I should also make it work with 
  4  2.1.1 output. 
  5   
  6  XXX This isn't completely finished and doesn't do everything quite right. 
  7  The main two problems being that it isn't well tested for a wide variety of 
  8  outputs and that the family line is not parsed into it's respective parts  
  9  (see multitude of comments on this below). 
 10  """ 
 11   
 12  import warnings 
 13  warnings.warn("Bio.expressions was deprecated, as it does not work with recent versions of mxTextTools. If you want to continue to use this module, please get in contact with the Biopython developers at biopython-dev@biopython.org to avoid permanent removal of this module from Biopython", DeprecationWarning) 
 14   
 15   
 16   
 17  from Martel import * 
 18  from Martel import RecordReader 
 19  from Bio import Std 
 20   
 21  # -- header 
 22  # hmmpfam - search one or more sequences against HMM database 
 23  program_description = (Std.application_name(Str("hmmpfam")) +  
 24                         ToEol()) 
 25   
 26  # HMMER 2.2g (August 2001) 
 27  program_version = (Str("HMMER ") + 
 28                     Std.application_version(Re(r"\d\.\d\w") | 
 29                                             Re(r"\d\.\d\.\d")) + 
 30                     ToEol()) 
 31   
 32  # Copyright (C) 1992-2001 HHMI/Washington University School of Medicine 
 33  # Freely distributed under the GNU General Public License (GPL) 
 34   
 35  copyright = (ToEol() + 
 36               ToEol()) 
 37   
 38  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
 39  # HMM file:                 /data/hmm/Pfam_fs 
 40  # Sequence file:            hmmpfam_test.fasta 
 41  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -  
 42            
 43  files = (ToEol() + 
 44           Str("HMM file:") + Spaces() + 
 45           Std.database_name(UntilEol()) + AnyEol() + 
 46           Str("Sequence file:") + Spaces() + 
 47           Group("inputfile_name", UntilEol()) + AnyEol() + 
 48           ToEol()) 
 49   
 50  header = Std.search_header(program_description + program_version + 
 51                             copyright + files) 
 52   
 53  # -- record 
 54   
 55  # 
 56  # Query sequence: AT1G01040 
 57  # Accession:      [none] 
 58  # Description:    Test Sequence for hmmpfam 
 59   
 60  sequence_info = (ToEol() + # blank line 
 61                   Str("Query sequence:") + Spaces() + 
 62                   Group("query_name", UntilEol()) + AnyEol() + 
 63                   Str("Accession:") + Spaces() + 
 64                   Group("query_accession", UntilEol()) + AnyEol() + 
 65                   Str("Description:") + Spaces() + 
 66                   Std.query_description(UntilEol()) + AnyEol()) 
 67   
 68  # generic name for models 
 69  # most model names are something like PFwhatever, but we also have some 
 70  # different kinds like AAA and Sigma54_activat 
 71  model_name = Re(r"[\w-]+") 
 72   
 73  # 
 74  # Scores for sequence family classification (score includes all domains): 
 75  # Model          Description                              Score    E-value  N  
 76  # --------       -----------                              -----    ------- --- 
 77   
 78  family_header = (ToEol() + # blank line 
 79                   Str("Scores") + ToEol() + 
 80                   Str("Model") + ToEol() + 
 81                   Str("-----") + ToEol()) 
 82   
 83  # family lines can either be a hit: 
 84  # PF00636        RNase3 domain                            354.9   1.1e-118   2 
 85  # or no hit: 
 86  #         [no hits above thresholds] 
 87   
 88  no_hit_line = (Spaces() + Str("[no hits above thresholds]") + AnyEol()) 
 89   
 90  family_hit_line = (Group("family_model", model_name) + Spaces() + 
 91                     # XXX This is a pain in the ass, so I gave up 
 92                     # We need to match descriptions but stop when we get to the 
 93                     # score value. 
 94                     # This is very difficult since the description can contain 
 95                     # letters, numbers, (, ), /, - and periods. The numbers and 
 96                     # periods leave me unable to think of a way to distinguish 
 97                     # a "description word" from a score word. 
 98                     # You also can't use spaces as descriptions can have 2 
 99                     # spaces in them (not just one separating every word) and 
100                     # the description to score value can also be separated by 
101                     # only 2 spaces. 
102                     # I can't for the life of me figure this so I just 
103                     # eat the rest of the line for right now and downstream 
104                     # apps will have to get the information out from there. 
105                      ToEol("family_information")) 
106                     #Group("family_description", 
107                     #    Rep1(Re(r"[a-zA-Z0-9_()/-]+") + 
108                     #         Alt(Str("  "), Str(" ")))) + 
109                     # Alt(Spaces()) + 
110                     # Float("family_score") + Spaces() + 
111                     # Float("family_evalue") + Spaces() + 
112                     # Integer("family_n") + AnyEol()) 
113   
114  # 
115  # Parsed for domains: 
116  # Model          Domain  seq-f seq-t    hmm-f hmm-t      score  E-value 
117  # --------       ------- ----- -----    ----- -----      -----  ------- 
118  domain_header = (ToEol() + # blank line 
119                   Str("Parsed for domains:") + AnyEol() + 
120                   Str("Model") + ToEol() + 
121                   Str("-----") + ToEol()) 
122   
123  # domain lines can also be either a hit: 
124  # PF00270          1/1     239   382 ..     1   141 [.    30.3  2.2e-09 
125  # or not hit: 
126  #         [no hits above thresholds] 
127   
128  # I have no idea what these things exactly mean, but parse 'em anyways 
129  symbol_forward = Str(".") | Str("[") 
130  symbol_reverse = Str(".") | Str("]") 
131  match_symbols = symbol_forward + symbol_reverse 
132   
133  domain_hit_line = (Group("domain_model", model_name) + Spaces() + 
134                     Group("domain_domain", Integer() + Str("/") + Integer()) + 
135                     Spaces() + 
136                     Integer("domain_seq-f") + Spaces() + 
137                     Integer("domain_seq-r") + Spaces() + 
138                     Group("domain_seq_symbols", match_symbols) + Spaces() + 
139                     Integer("domain_hmm-f") + Spaces() + 
140                     Integer("domain_hmm-t") + Spaces() + 
141                     Group("domain_hmm_symbols", match_symbols) + Spaces() + 
142                     Float("domain_score") + Spaces() + 
143                     Float("domain_evalue") + AnyEol()) 
144   
145  #  
146  # Alignments of top-scoring domains: 
147  alignment_header = (ToEol() + # empty line 
148                      Str("Alignments of top-scoring domains:") + 
149                      AnyEol()) 
150   
151  # PF00271: domain 1 of 1, from 686 to 767: score 58.6, E = 8.9e-17 
152  #                    *->ell.epgikvarlhG.....glsqeeReeilekFrngkskvLvaTdv 
153  #                       el ++  i++a++ G+++++++++++ ++++ kFr+g + +LvaT+v 
154  #    AT1G01040   686    ELPsLSFIRCASMIGhnnsqEMKSSQMQDTISKFRDGHVTLLVATSV 732   
155  #  
156  #                    aarGlDipdvnlVInydlpwnpesyiQRiGRaGRaG<-* 
157  #                    a++GlDi ++n+V  +dl +++  yiQ +GR +R      
158  #    AT1G01040   733 AEEGLDIRQCNVVMRFDLAKTVLAYIQSRGR-ARKP    767 
159  # 
160  domain_align_header = (Group("dalign_name", model_name) + 
161                         Str(": domain ") + 
162                         Integer("dalign_of_domain") + 
163                         Str(" of ") + 
164                         Integer("dalign_total_domain") + 
165                         Str(", from ") + 
166                         Integer("dalign_domain_start") + 
167                         Str(" to ") + 
168                         Integer("dalign_domain_end") + 
169                         Str(": score ") + 
170                         Float("dalign_score") + 
171                         Str(", E = ") + 
172                         Float("dalign_evalue") + 
173                         AnyEol()) 
174   
175  # occasionally we have a line like: 
176  #                 RF xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx 
177  # preceeding the alignment. I'm not sure what this means. 
178  rf_line = (Spaces() + Str("RF ") + ToEol()) 
179   
180  domain_align_top = (Spaces() + 
181                      UntilEol("dalign_match_top") + AnyEol()) 
182   
183  domain_align_middle = (Spaces() + 
184                         UntilEol("dalign_match_middle") + AnyEol()) 
185   
186  domain_align_bottom = (Spaces() + 
187                         ToSep("dalign_query_name", " ") + Spaces() + 
188                         # some match ends can have: 
189                         # AT1G01230     -   - 
190                         # instead of an actual line, so there are fixes 
191                         # in here for that messed up case 
192                         Alt(Integer("dalign_query_start") + Spaces() + 
193                             Group("dalign_match_bottom", 
194                                 Re("[\w\-]+")) + Spaces() + 
195                             Integer("dalign_query_end") + 
196                             Spaces() + AnyEol(), 
197                             Str("- ") + ToEol()) + 
198                         ToEol()) # blank line 
199   
200  domain_alignment = (domain_align_header + 
201                      Rep1(Opt(rf_line) + 
202                           domain_align_top + 
203                           domain_align_middle + 
204                           domain_align_bottom)) 
205   
206  # // 
207  record_end = Str("//") + AnyEol() 
208   
209   
210  record = Std.record(Rep(sequence_info + 
211                          family_header + 
212                          (no_hit_line | Rep1(family_hit_line)) + 
213                          domain_header + 
214                          (no_hit_line | Rep1(domain_hit_line)) + 
215                          alignment_header + 
216                          (no_hit_line | Rep1(domain_alignment)) + 
217                          record_end 
218                      )) 
219   
220  format = HeaderFooter("hmmpfam", {}, 
221                        header, RecordReader.CountLines, (8,), 
222                        record, RecordReader.EndsWith, ("//\n",), 
223                        None, None, None) 
224