Package translate :: Package storage :: Module dtd
[hide private]
[frames] | no frames]

Source Code for Module translate.storage.dtd

  1  #!/usr/bin/env python 
  2  # -*- coding: utf-8 -*- 
  3  #  
  4  # Copyright 2002-2006 Zuza Software Foundation 
  5  #  
  6  # This file is part of translate. 
  7  # 
  8  # translate is free software; you can redistribute it and/or modify 
  9  # it under the terms of the GNU General Public License as published by 
 10  # the Free Software Foundation; either version 2 of the License, or 
 11  # (at your option) any later version. 
 12  #  
 13  # translate is distributed in the hope that it will be useful, 
 14  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 15  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 16  # GNU General Public License for more details. 
 17  # 
 18  # You should have received a copy of the GNU General Public License 
 19  # along with translate; if not, write to the Free Software 
 20  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
 21   
 22  """classes that hold units of .dtd files (dtdunit) or entire files (dtdfile) 
 23  these are specific .dtd files for localisation used by mozilla""" 
 24   
 25  from translate.storage import base 
 26  from translate.misc import quote 
 27   
 28  import re 
 29  import warnings 
 30  try: 
 31      from lxml import etree 
 32      import StringIO 
 33  except ImportError: 
 34      etree = None 
 35   
 36  labelsuffixes = (".label", ".title") 
 37  """Label suffixes: entries with this suffix are able to be combined with accesskeys 
 38  found in entries ending with L{accesskeysuffixes}""" 
 39  accesskeysuffixes = (".accesskey", ".accessKey", ".akey") 
 40  """Accesskey Suffixes: entries with this suffix may be combined with labels 
 41  ending in L{labelsuffixes} into accelerator notation""" 
 42   
def quotefordtd(source):
    """Quote a string so that it can be written as a DTD entity value.

    The quoting style depends on which quote characters occur in the text:
      - no double quote: delegated to L{quote.quotestr}
      - double quotes but no apostrophe: delegated to L{quote.singlequotestr}
      - both: wrapped in apostrophes, with every embedded apostrophe
        written as the C{&apos;} entity so it cannot end the value early

    @type source: String
    @param source: The unquoted entity text
    @rtype: String
    @return: The quoted definition
    """
    if '"' not in source:
        return quote.quotestr(source)
    if "'" in source:
        # Both quote characters are present, so neither can delimit the value
        # unescaped; L{unquotefromdtd} reverses this &apos; escaping.
        return "'" + source.replace("'", '&apos;') + "'"
    return quote.singlequotestr(source)
51
def unquotefromdtd(source):
    """Unquote a quoted DTD entity definition.

    The first character of the definition is taken as the quote character.
    An empty definition is treated as an empty double-quoted string.  For
    apostrophe-quoted definitions the C{&apos;} entity is converted back to
    a literal apostrophe, reversing what L{quotefordtd} does when the text
    contains both kinds of quote.

    @type source: String
    @param source: The quoted definition as it appears in the DTD
    @rtype: String
    @return: The text between the quotes
    """
    if len(source) == 0:
        source = '""'
    quotechar = source[0]
    # The second return value is not needed here.
    extracted, quotefinished = quote.extractwithoutquotes(source, quotechar, quotechar, allowreentry=False)
    if quotechar == "'" and "&apos;" in extracted:
        extracted = extracted.replace("&apos;", "'")
    # The quote characters should be the first and last characters in the
    # string; quote characters inside the string are not handled here.
    return extracted
64
def removeinvalidamps(name, value):
    """Find and remove ampersands that are not part of an entity definition.

    A stray & in a DTD file can break an applications ability to parse the file.  In Mozilla
    localisation this is very important and these can break the parsing of files used in XUL
    and thus break interface rendering.  Tracking down the problem is very difficult,
    thus by removing potential broken & and warning the users we can ensure that the output
    DTD will always be parsable.

    An ampersand is kept only when it is followed, up to the next semicolon,
    by a valid entity name.  If anything is removed a single warning naming
    the entity is issued.

    @type name: String
    @param name: Entity name
    @type value: String
    @param value: Entity text value
    @rtype: String
    @return: Entity value without bad ampersands
    """
    def is_valid_entity_name(name):
        """Check that supplied L{name} is a valid entity name"""
        if name.replace('.', '').isalnum():
            return True
        # startswith() instead of name[0]: an empty name (from "&;") must be
        # reported as invalid rather than raise IndexError.
        return name.startswith('#') and name[1:].isalnum()

    invalid_amps = []
    amppos = value.find("&")
    while amppos != -1:
        namestart = amppos + 1
        semipos = value.find(";", namestart)
        if semipos == -1 or not is_valid_entity_name(value[namestart:semipos]):
            invalid_amps.append(amppos)
        # Continue just after this ampersand so that an ampersand inside a
        # rejected candidate name is examined as well.
        amppos = value.find("&", namestart)

    if invalid_amps:
        warnings.warn("invalid ampersands in dtd entity %s" % (name))
        # Rebuild the value from the pieces between the rejected ampersands.
        pieces = []
        keepfrom = 0
        for badpos in invalid_amps:
            pieces.append(value[keepfrom:badpos])
            keepfrom = badpos + 1
        pieces.append(value[keepfrom:])
        value = "".join(pieces)
    return value
class dtdunit(base.TranslationUnit):
    """this class represents an entity definition from a dtd file (and possibly associated comments)

    The quoted entity value is kept in C{self.definition}; C{source} and
    C{target} are both unquoted views of that single value.
    """

    def __init__(self, source=""):
        """construct the dtdunit, prepare it for parsing"""
        super(dtdunit, self).__init__(source)
        # list of (commenttype, text) tuples collected while parsing
        self.comments = []
        # lines that are neither comments nor part of an entity declaration
        self.unparsedlines = []
        # parser state, carried from one line to the next
        self.incomment = False
        self.inentity = False
        self.entity = "FakeEntityOnlyForInitialisationAndTesting"
        self.source = source

    # Note that source and target are equivalent for monolingual units
    def setsource(self, source):
        """Sets the definition to the quoted value of source

        None is stored as an empty string, matching L{settarget}.
        """
        if source is None:
            source = ""
        self.definition = quotefordtd(source)

    def getsource(self):
        """gets the unquoted source string"""
        return unquotefromdtd(self.definition)
    source = property(getsource, setsource)

    def settarget(self, target):
        """Sets the definition to the quoted value of target"""
        if target is None:
            target = ""
        self.definition = quotefordtd(target)

    def gettarget(self):
        """gets the unquoted target string"""
        return unquotefromdtd(self.definition)
    target = property(gettarget, settarget)

    def isnull(self):
        """returns whether this dtdunit doesn't actually have an entity definition"""
        # for dtds, we currently return a blank string if there is no .entity (==location in other files)
        # TODO: this needs to work better with base class expectations
        return self.entity is None

    def parse(self, dtdsrc):
        """read the first dtd element from the source code into this object, return linesprocessed

        The text is processed line by line.  Comments are classified and
        collected, lines outside any comment or entity are kept in
        C{self.unparsedlines}, and an entity declaration is split into its
        name, optional parameter (for C{%} entities) and quoted definition.
        Processing stops at the end of the first complete entity definition.

        @type dtdsrc: String
        @param dtdsrc: DTD source text, possibly holding several elements
        @rtype: Integer
        @return: Number of lines consumed (0 for empty input)
        @raise ValueError: if the definition does not start with a quote
        """
        self.comments = []
        # make all the lists the same
        self.locfilenotes = self.comments
        self.locgroupstarts = self.comments
        self.locgroupends = self.comments
        self.locnotes = self.comments
        self.entity = None
        self.definition = ''
        if not dtdsrc:
            return 0
        lines = dtdsrc.split("\n")
        linesprocessed = 0
        comment = ""
        for line in lines:
            line += "\n"
            linesprocessed += 1
            if not self.incomment:
                if (line.find('<!--') != -1):
                    self.incomment = True
                    self.continuecomment = False
                    # now work out the type of comment, and save it (remember we're not in the comment yet)
                    (comment, dummy) = quote.extract(line, "<!--", "-->", None, 0)
                    if comment.find('LOCALIZATION NOTE') != -1:
                        l = quote.findend(comment,'LOCALIZATION NOTE')
                        while (comment[l] == ' '):
                            l += 1
                        # the keyword directly after the marker selects the note type
                        if comment.find('FILE', l) == l:
                            self.commenttype = "locfile"
                        elif comment.find('BEGIN', l) == l:
                            self.commenttype = "locgroupstart"
                        elif comment.find('END', l) == l:
                            self.commenttype = "locgroupend"
                        else:
                            self.commenttype = "locnote"
                    else:
                        # plain comment
                        self.commenttype = "comment"
                # FIXME: an entity reference might share a line with something important
                elif not self.inentity and re.search("%.*;", line):
                    # a line matching "%...;" outside an entity is stored
                    # verbatim as a comment
                    self.comments.append(("comment", line))
                    line = ""
                    continue

            if self.incomment:
                # some kind of comment
                (comment, self.incomment) = quote.extract(line, "<!--", "-->", None, self.continuecomment)
                self.continuecomment = self.incomment
                # strip the comment out of what will be parsed
                line = line.replace(comment, "", 1)
                # add an end of line if this is the end of the comment
                if not self.incomment:
                    if line.isspace():
                        comment += line
                        line = ''
                    else:
                        comment += '\n'
                # TODO: check for commented-out entity definitions, parse
                # them and store them as obsolete messages
                # depending on the type of comment (worked out at the start), put it in the right place
                # make it record the comment and type as a tuple
                commentpair = (self.commenttype, comment)
                if self.commenttype == "locfile":
                    self.locfilenotes.append(commentpair)
                elif self.commenttype == "locgroupstart":
                    self.locgroupstarts.append(commentpair)
                elif self.commenttype == "locgroupend":
                    self.locgroupends.append(commentpair)
                elif self.commenttype == "locnote":
                    self.locnotes.append(commentpair)
                elif self.commenttype == "comment":
                    self.comments.append(commentpair)

            if not self.inentity and not self.incomment:
                entitypos = line.find('<!ENTITY')
                if entitypos != -1:
                    self.inentity = True
                    # text before the declaration starting with "#" (e.g.
                    # "#expand") is kept so it can be written back out
                    beforeentity = line[:entitypos].strip()
                    if beforeentity.startswith("#"):
                        self.hashprefix = beforeentity
                    self.entitypart = "start"
                else:
                    self.unparsedlines.append(line)

            if self.inentity:
                if self.entitypart == "start":
                    # the entity definition
                    e = quote.findend(line,'<!ENTITY')
                    line = line[e:]
                    self.entitypart = "name"
                    self.entitytype = "internal"
                if self.entitypart == "name":
                    e = 0
                    while (e < len(line) and line[e].isspace()):
                        e += 1
                    self.entity = ''
                    if (e < len(line) and line[e] == '%'):
                        # "<!ENTITY % name ..." declares a parameter entity
                        self.entitytype = "external"
                        self.entityparameter = ""
                        e += 1
                        while (e < len(line) and line[e].isspace()):
                            e += 1
                    while (e < len(line) and not line[e].isspace()):
                        self.entity += line[e]
                        e += 1
                    while (e < len(line) and line[e].isspace()):
                        e += 1
                    if self.entity:
                        if self.entitytype == "external":
                            self.entitypart = "parameter"
                        else:
                            self.entitypart = "definition"
                        # remember the start position and the quote character
                        if e == len(line):
                            # rest of the declaration is on a following line
                            self.entityhelp = None
                            e = 0
                            continue
                        elif self.entitypart == "definition":
                            self.entityhelp = (e, line[e])
                            self.instring = False
                if self.entitypart == "parameter":
                    while (e < len(line) and line[e].isspace()):
                        e += 1
                    paramstart = e
                    while (e < len(line) and line[e].isalnum()):
                        e += 1
                    self.entityparameter += line[paramstart:e]
                    while (e < len(line) and line[e].isspace()):
                        e += 1
                    line = line[e:]
                    e = 0
                    if not line:
                        continue
                    if line[0] in ('"', "'"):
                        self.entitypart = "definition"
                        self.entityhelp = (e, line[e])
                        self.instring = False
                if self.entitypart == "definition":
                    if self.entityhelp is None:
                        # definition starts on this line: find its first
                        # non-blank character, which must be the quote
                        e = 0
                        while (e < len(line) and line[e].isspace()):
                            e += 1
                        if e == len(line):
                            continue
                        self.entityhelp = (e, line[e])
                        self.instring = False
                    # actually the lines below should remember instring, rather than using it as dummy
                    e = self.entityhelp[0]
                    if (self.entityhelp[1] == "'"):
                        (defpart, self.instring) = quote.extract(line[e:], "'", "'", startinstring=self.instring, allowreentry=False)
                    elif (self.entityhelp[1] == '"'):
                        (defpart, self.instring) = quote.extract(line[e:], '"', '"', startinstring=self.instring, allowreentry=False)
                    else:
                        raise ValueError("Unexpected quote character... %r" % (self.entityhelp[1]))
                    # for any following lines, start at the beginning of the line. remember the quote character
                    self.entityhelp = (0, self.entityhelp[1])
                    self.definition += defpart
                    if not self.instring:
                        # closing quote seen: this unit is complete
                        self.inentity = False
                        break

        # change the 0 below to 1 to debug processing
        if 0:
            for attr in dir(self):
                r = repr(getattr(self, attr))
                if len(r) > 60:
                    r = r[:57]+"..."
                self.comments.append(("comment", "self.%s = %s" % (attr, r) ))
        return linesprocessed

    def __str__(self):
        """convert to a string. double check that unicode is handled somehow here"""
        source = self.getoutput()
        if isinstance(source, unicode):
            return source.encode(getattr(self, "encoding", "UTF-8"))
        return source

    def getoutput(self):
        """convert the dtd entity back to string form

        Comments and unparsed lines are written first, followed by the
        C{<!ENTITY ...>} declaration when the unit has a non-empty entity
        name.  A unit without an entity yields only the comment and
        unparsed text, right-stripped and ended with one newline.
        """
        lines = []
        lines.extend([comment for commenttype, comment in self.comments])
        lines.extend(self.unparsedlines)
        if self.isnull():
            result = "".join(lines)
            return result.rstrip() + "\n"
        if len(self.entity) > 0:
            if getattr(self, 'entitytype', None) == 'external':
                entityline = '<!ENTITY % '+self.entity+' '+self.entityparameter+' '+self.definition+'>'
            else:
                entityline = '<!ENTITY '+self.entity+' '+self.definition+'>'
            if getattr(self, 'hashprefix', None):
                entityline = self.hashprefix + " " + entityline
            if isinstance(entityline, unicode):
                entityline = entityline.encode('UTF-8')
            lines.append(entityline+'\n')
        return "".join(lines)
358
359 -class dtdfile(base.TranslationStore):
360 """this class represents a .dtd file, made up of dtdunits""" 361 UnitClass = dtdunit
362 - def __init__(self, inputfile=None):
363 """construct a dtdfile, optionally reading in from inputfile""" 364 base.TranslationStore.__init__(self, unitclass = self.UnitClass) 365 self.filename = getattr(inputfile, 'name', '') 366 if inputfile is not None: 367 dtdsrc = inputfile.read() 368 self.parse(dtdsrc) 369 self.makeindex()
370
371 - def parse(self, dtdsrc):
372 """read the source code of a dtd file in and include them as dtdunits in self.units""" 373 start = 0 374 end = 0 375 lines = dtdsrc.split("\n") 376 while end < len(lines): 377 if (start == end): 378 end += 1 379 foundentity = False 380 while end < len(lines): 381 if end >= len(lines): 382 break 383 if lines[end].find('<!ENTITY') > -1: 384 foundentity = True 385 if foundentity and re.match("[\"']\s*>", lines[end]): 386 end += 1 387 break 388 end += 1 389 # print "processing from %d to %d" % (start,end) 390 391 linesprocessed = 1 # to initialise loop 392 while linesprocessed >= 1: 393 newdtd = dtdunit() 394 try: 395 linesprocessed = newdtd.parse("\n".join(lines[start:end])) 396 if linesprocessed >= 1 and (not newdtd.isnull() or newdtd.unparsedlines): 397 self.units.append(newdtd) 398 except Exception, e: 399 warnings.warn("%s\nError occured between lines %d and %d:\n%s" % (e, start+1, end, "\n".join(lines[start:end]))) 400 start += linesprocessed
401
402 - def __str__(self):
403 """convert to a string. double check that unicode is handled somehow here""" 404 source = self.getoutput() 405 if not self._valid_store(): 406 warnings.warn("DTD file '%s' does not validate" % self.filename) 407 return None 408 if isinstance(source, unicode): 409 return source.encode(getattr(self, "encoding", "UTF-8")) 410 return source
411
412 - def getoutput(self):
413 """convert the units back to source""" 414 sources = [str(dtd) for dtd in self.units] 415 return "".join(sources)
416
417 - def makeindex(self):
418 """makes self.index dictionary keyed on entities""" 419 self.index = {} 420 for dtd in self.units: 421 if not dtd.isnull(): 422 self.index[dtd.entity] = dtd
423
424 - def _valid_store(self):
425 """Validate the store to determine if it is valid 426 427 This uses ElementTree to parse the DTD 428 429 @return: If the store passes validation 430 @rtype: Boolean 431 """ 432 if etree is not None: 433 try: 434 # #expand is a Mozilla hack and are removed as they are not valid in DTDs 435 dtd = etree.DTD(StringIO.StringIO(re.sub("#expand", "", self.getoutput()))) 436 except etree.DTDParseError: 437 return False 438 return True
439