
Source Code for Module translate.tools.pogrep

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright 2002-2008 Zuza Software Foundation
#
# This file is part of translate.
#
# translate is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# translate is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with translate; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

 22  """Grep XLIFF, Gettext PO and TMX localization files 
 23   
 24  Matches are output to snippet files of the same type which can then be reviewed  
 25  and later merged using pomerge 
 26   
 27  See: http://translate.sourceforge.net/wiki/toolkit/pogrep for examples and 
 28  usage instructions 
 29  """ 
 30   
from translate.storage import factory
from translate.misc import optrecurse
from translate.misc.multistring import multistring
from translate.lang import data
import re
import locale


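# A hedged example of typical command-line usage, based on the module
# docstring above and the options defined in cmdlineparser() below
# (file names are hypothetical):
#
#   pogrep --search=source --ignore-case "open file" messages.po matches.po
#
# matches.po then contains only the units that matched, and can later be
# reviewed and merged back with pomerge.

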
class GrepMatch(object):
    """Just a small data structure that represents a search match."""

    # INITIALIZERS #
    def __init__(self, unit, part='target', part_n=0, start=0, end=0):
        self.unit = unit
        self.part = part
        self.part_n = part_n
        self.start = start
        self.end = end

    # ACCESSORS #
    def get_getter(self):
        if self.part == 'target':
            if self.unit.hasplural():
                getter = lambda: self.unit.target.strings[self.part_n]
            else:
                getter = lambda: self.unit.target
            return getter
        elif self.part == 'source':
            if self.unit.hasplural():
                getter = lambda: self.unit.source.strings[self.part_n]
            else:
                getter = lambda: self.unit.source
            return getter
        elif self.part == 'notes':
            def getter():
                return self.unit.getnotes()[self.part_n]
            return getter
        elif self.part == 'locations':
            def getter():
                return self.unit.getlocations()[self.part_n]
            return getter

    def get_setter(self):
        if self.part == 'target':
            if self.unit.hasplural():
                def setter(value):
                    strings = self.unit.target.strings
                    strings[self.part_n] = value
                    self.unit.target = strings
            else:
                def setter(value):
                    self.unit.target = value
            return setter

    # SPECIAL METHODS #
    def __str__(self):
        start, end = self.start, self.end
        if start < 3:
            start = 3
        if end > len(self.get_getter()()) - 3:
            end = len(self.get_getter()()) - 3
        matchpart = self.get_getter()()[start-2:end+2]
        return '<GrepMatch "%s" part=%s[%d] start=%d end=%d>' % (matchpart, self.part, self.part_n, self.start, self.end)

    def __repr__(self):
        return str(self)

def real_index(string, nfc_index):
    """Calculate the real index in the unnormalized string that corresponds
    to the index nfc_index in the normalized string."""
    length = nfc_index
    max_length = len(string)
    while len(data.normalize(string[:length])) <= nfc_index:
        if length == max_length:
            return length
        length += 1
    return length - 1


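# A hedged worked example for real_index(): if string is u'a\u0301bc'
# ("a" followed by a combining acute accent), data.normalize() is assumed
# to compose it to u'\xe1bc', so index 1 in the normalized string (pointing
# at "b") maps back to index 2 in the original, unnormalized string.

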
def find_matches(unit, part, strings, re_search):
    """Return the GrepMatch objects where re_search matches in strings."""
    matches = []
    for part_n, string in enumerate(strings):
        normalized = data.normalize(string)
        for matchobj in re_search.finditer(normalized):
            start = real_index(string, matchobj.start())
            end = real_index(string, matchobj.end())
            matches.append(GrepMatch(unit, part=part, part_n=part_n, start=start, end=end))
    return matches

class GrepFilter:

    def __init__(self, searchstring, searchparts, ignorecase=False, useregexp=False,
                 invertmatch=False, accelchar=None, encoding='utf-8', includeheader=False,
                 max_matches=0):
        """builds a GrepFilter with the given search string and options"""
        if isinstance(searchstring, unicode):
            self.searchstring = searchstring
        else:
            self.searchstring = searchstring.decode(encoding)
        self.searchstring = data.normalize(self.searchstring)
        if searchparts:
            # For now we still support the old terminology, except for the old
            # 'source' which has a new meaning now.
            self.search_source = ('source' in searchparts) or ('msgid' in searchparts)
            self.search_target = ('target' in searchparts) or ('msgstr' in searchparts)
            self.search_notes = ('notes' in searchparts) or ('comment' in searchparts)
            self.search_locations = 'locations' in searchparts
        else:
            self.search_source = True
            self.search_target = True
            self.search_notes = False
            self.search_locations = False
        self.ignorecase = ignorecase
        if self.ignorecase:
            self.searchstring = self.searchstring.lower()
        self.useregexp = useregexp
        if self.useregexp:
            self.searchpattern = re.compile(self.searchstring)
        self.invertmatch = invertmatch
        self.accelchar = accelchar
        self.includeheader = includeheader
        self.max_matches = max_matches

    def matches(self, teststr):
        if teststr is None:
            return False
        teststr = data.normalize(teststr)
        if self.ignorecase:
            teststr = teststr.lower()
        if self.accelchar:
            teststr = re.sub(self.accelchar + self.accelchar, "#", teststr)
            teststr = re.sub(self.accelchar, "", teststr)
        if self.useregexp:
            found = self.searchpattern.search(teststr)
        else:
            found = teststr.find(self.searchstring) != -1
        if self.invertmatch:
            found = not found
        return found

    def filterunit(self, unit):
        """runs filters on an element"""
        if unit.isheader():
            return []

        if self.search_source:
            if isinstance(unit.source, multistring):
                strings = unit.source.strings
            else:
                strings = [unit.source]
            for string in strings:
                if self.matches(string):
                    return True

        if self.search_target:
            if isinstance(unit.target, multistring):
                strings = unit.target.strings
            else:
                strings = [unit.target]
            for string in strings:
                if self.matches(string):
                    return True

        if self.search_notes:
            return self.matches(unit.getnotes())
        if self.search_locations:
            return self.matches(u" ".join(unit.getlocations()))
        return False

    def filterfile(self, thefile):
        """runs filters on a translation file object"""
        thenewfile = type(thefile)()
        thenewfile.setsourcelanguage(thefile.sourcelanguage)
        thenewfile.settargetlanguage(thefile.targetlanguage)
        for unit in thefile.units:
            if self.filterunit(unit):
                thenewfile.addunit(unit)
        if self.includeheader and len(thenewfile.units) > 0:
            if thefile.units[0].isheader():
                thenewfile.units.insert(0, thefile.units[0])
            else:
                thenewfile.units.insert(0, thenewfile.makeheader())
        return thenewfile

    def getmatches(self, units):
        if not self.searchstring:
            return [], []

        searchstring = self.searchstring
        flags = re.LOCALE | re.MULTILINE | re.UNICODE

        if self.ignorecase:
            flags |= re.IGNORECASE
        if not self.useregexp:
            searchstring = re.escape(searchstring)
        self.re_search = re.compile(u'(%s)' % (searchstring), flags)

        matches = []
        indexes = []

        for index, unit in enumerate(units):
            old_length = len(matches)

            if self.search_target:
                if unit.hasplural():
                    targets = unit.target.strings
                else:
                    targets = [unit.target]
                matches.extend(find_matches(unit, 'target', targets, self.re_search))
            if self.search_source:
                if unit.hasplural():
                    sources = unit.source.strings
                else:
                    sources = [unit.source]
                matches.extend(find_matches(unit, 'source', sources, self.re_search))
            if self.search_notes:
                matches.extend(find_matches(unit, 'notes', unit.getnotes(), self.re_search))
            if self.search_locations:
                matches.extend(find_matches(unit, 'locations', unit.getlocations(), self.re_search))

            # A search for a single letter or an all-inclusive regular
            # expression could give enough results to cause performance
            # problems. The answer is probably not very useful at this scale.
            if self.max_matches and len(matches) > self.max_matches:
                raise Exception("Too many matches found")

            if len(matches) > old_length:
                old_length = len(matches)
                indexes.append(index)

        return matches, indexes

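# A minimal programmatic sketch of using GrepFilter directly (hypothetical
# file name; assumes translate.storage.factory can load it):
#
#   store = factory.getobject("messages.po")
#   grepfilter = GrepFilter(u"save", ["source", "target"], ignorecase=True)
#   matches, indexes = grepfilter.getmatches(store.units)
#   for match in matches:
#       print match
#
# getmatches() returns individual GrepMatch objects together with the indexes
# of the units that matched, while filterfile() returns a new store containing
# only the matching units.

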
class GrepOptionParser(optrecurse.RecursiveOptionParser):
    """a specialized Option Parser for the grep tool..."""

    def parse_args(self, args=None, values=None):
        """parses the command line options, handling implicit input/output args"""
        (options, args) = optrecurse.optparse.OptionParser.parse_args(self, args, values)
        # some intelligence as to what reasonable people might give on the command line
        if args:
            options.searchstring = args[0]
            args = args[1:]
        else:
            self.error("At least one argument must be given for the search string")
        if args and not options.input:
            if not options.output:
                options.input = args[:-1]
                args = args[-1:]
            else:
                options.input = args
                args = []
        if args and not options.output:
            options.output = args[-1]
            args = args[:-1]
        if args:
            self.error("You have used an invalid combination of --input, --output and freestanding args")
        if isinstance(options.input, list) and len(options.input) == 1:
            options.input = options.input[0]
        return (options, args)

    def set_usage(self, usage=None):
        """sets the usage string - if usage not given, uses getusagestring for each option"""
        if usage is None:
            self.usage = "%prog searchstring " + " ".join([self.getusagestring(option) for option in self.option_list])
        else:
            super(GrepOptionParser, self).set_usage(usage)

    def run(self):
        """parses the arguments, and runs recursiveprocess with the resulting options"""
        (options, args) = self.parse_args()
        options.inputformats = self.inputformats
        options.outputoptions = self.outputoptions
        options.checkfilter = GrepFilter(options.searchstring, options.searchparts,
                                         options.ignorecase, options.useregexp,
                                         options.invertmatch, options.accelchar,
                                         locale.getpreferredencoding(), options.includeheader)
        self.usepsyco(options)
        self.recursiveprocess(options)

def rungrep(inputfile, outputfile, templatefile, checkfilter):
    """reads in inputfile, filters using checkfilter, writes to outputfile"""
    fromfile = factory.getobject(inputfile)
    tofile = checkfilter.filterfile(fromfile)
    if tofile.isempty():
        return False
    outputfile.write(str(tofile))
    return True

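# A minimal sketch of driving rungrep() directly instead of via the command
# line (hypothetical file names; assumes factory.getobject() accepts the
# given input):
#
#   checkfilter = GrepFilter(u"error", ["target"])
#   output = open("matches.po", "w")
#   rungrep("messages.po", output, None, checkfilter)
#   output.close()

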
def cmdlineparser():
    formats = {"po": ("po", rungrep), "pot": ("pot", rungrep),
               "mo": ("mo", rungrep), "gmo": ("gmo", rungrep),
               "tmx": ("tmx", rungrep),
               "xliff": ("xliff", rungrep), "xlf": ("xlf", rungrep), "xlff": ("xlff", rungrep),
               None: ("po", rungrep)}
    parser = GrepOptionParser(formats)
    parser.add_option("", "--search", dest="searchparts",
                      action="append", type="choice",
                      choices=["source", "target", "notes", "locations", "msgid", "msgstr", "comment"],
                      metavar="SEARCHPARTS",
                      help="searches the given parts (source, target, notes and locations)")
    parser.add_option("-I", "--ignore-case", dest="ignorecase",
                      action="store_true", default=False, help="ignore case distinctions")
    parser.add_option("-e", "--regexp", dest="useregexp",
                      action="store_true", default=False, help="use regular expression matching")
    parser.add_option("-v", "--invert-match", dest="invertmatch",
                      action="store_true", default=False, help="select non-matching lines")
    parser.add_option("", "--accelerator", dest="accelchar",
                      action="store", type="choice", choices=["&", "_", "~"],
                      metavar="ACCELERATOR", help="ignores the given accelerator when matching")
    parser.add_option("", "--header", dest="includeheader",
                      action="store_true", default=False,
                      help="include a PO header in the output")
    parser.set_usage()
    parser.passthrough.append('checkfilter')
    parser.description = __doc__
    return parser

def main():
    parser = cmdlineparser()
    parser.run()


if __name__ == '__main__':
    main()