Package translate :: Package storage :: Module dtd
[hide private]
[frames] | no frames]

Source Code for Module translate.storage.dtd

  1  #!/usr/bin/env python 
  2  # -*- coding: utf-8 -*- 
  3  #  
  4  # Copyright 2002-2006 Zuza Software Foundation 
  5  #  
  6  # This file is part of translate. 
  7  # 
  8  # translate is free software; you can redistribute it and/or modify 
  9  # it under the terms of the GNU General Public License as published by 
 10  # the Free Software Foundation; either version 2 of the License, or 
 11  # (at your option) any later version. 
 12  #  
 13  # translate is distributed in the hope that it will be useful, 
 14  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 15  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 16  # GNU General Public License for more details. 
 17  # 
 18  # You should have received a copy of the GNU General Public License 
 19  # along with translate; if not, write to the Free Software 
 20  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
 21   
 22  """classes that hold units of .dtd files (dtdunit) or entire files (dtdfile) 
 23  these are specific .dtd files for localisation used by mozilla""" 
 24   
 25  from translate.storage import base 
 26  from translate.misc import quote 
 27   
 28  import re 
 29  import warnings 
 30  try: 
 31      from lxml import etree 
 32      import StringIO 
 33  except ImportError: 
 34      etree = None 
 35   
 36  labelsuffixes = (".label", ".title") 
 37  """Label suffixes: entries with this suffix are able to be combined with accesskeys 
 38  found in entries ending with L{accesskeysuffixes}""" 
 39  accesskeysuffixes = (".accesskey", ".accessKey", ".akey") 
 40  """Accesskey Suffixes: entries with this suffix may be combined with labels 
 41  ending in L{labelsuffixes} into accelerator notation""" 
 42   
def quotefordtd(source):
    """Quote a string so it can be written as a DTD entity value.

    The quote character is chosen from the content of the string:
      - no double quote in it: delegated to C{quote.quotestr}
      - double quotes but no apostrophe: delegated to C{quote.singlequotestr}
      - both kinds of quote: wrapped in apostrophes, with every embedded
        apostrophe escaped as the C{&apos;} entity so that it cannot
        terminate the value early

    @type source: String
    @param source: The unquoted entity value
    @rtype: String
    @return: The quoted value, including the surrounding quote characters
    """
    if '"' in source:
        if "'" in source:
            # Neither quote character is free to act as the delimiter, so the
            # apostrophes inside the value are written as an entity instead.
            return "'" + source.replace("'", '&apos;') + "'"
        else:
            return quote.singlequotestr(source)
    else:
        return quote.quotestr(source)
51
def unquotefromdtd(source):
    """Unquote a quoted DTD entity definition.

    This is the inverse of L{quotefordtd}: the surrounding quote characters
    are removed and, for apostrophe-delimited values, C{&apos;} entities are
    turned back into literal apostrophes.

    @type source: String
    @param source: The quoted definition; an empty string is treated as C{""}
    @rtype: String
    @return: The definition without its surrounding quotes
    """
    # extract the string, get rid of quoting
    if len(source) == 0:
        source = '""'
    # the first character decides which quote character delimits the value
    quotechar = source[0]
    extracted, quotefinished = quote.extractwithoutquotes(source, quotechar, quotechar, allowreentry=False)
    if quotechar == "'" and "&apos;" in extracted:
        # undo the escaping applied by quotefordtd for values holding both quote kinds
        extracted = extracted.replace("&apos;", "'")
    # the quote characters should be the first and last characters in the string
    # of course there could also be quote characters within the string; not handled here
    return extracted
64
def removeinvalidamps(name, value):
    """Find and remove ampersands that are not part of an entity definition.

    A stray & in a DTD file can break an applications ability to parse the file.  In Mozilla
    localisation this is very important and these can break the parsing of files used in XUL
    and thus break interface rendering.  Tracking down the problem is very difficult,
    thus by removing potential broken & and warning the users we can ensure that the output
    DTD will always be parsable.

    An ampersand is kept only when the text between it and the next C{;} is a
    valid entity name (alphanumerics and dots, or C{#} followed by
    alphanumerics).  A warning is issued once per call if anything is removed.

    @type name: String
    @param name: Entity name
    @type value: String
    @param value: Entity text value
    @rtype: String
    @return: Entity value without bad ampersands
    """
    def is_valid_entity_name(name):
        """Check that supplied L{name} is a valid entity name"""
        if name.replace('.', '').isalnum():
            return True
        # startswith() instead of name[0]: the candidate is empty for input
        # such as "&;", and indexing it would raise IndexError
        elif name.startswith('#') and name[1:].isalnum():
            return True
        return False

    amppos = 0
    invalid_amps = []
    while amppos >= 0:
        amppos = value.find("&", amppos)
        if amppos != -1:
            # step past the ampersand so the next search does not find it again
            amppos += 1
            semipos = value.find(";", amppos)
            if semipos != -1:
                if is_valid_entity_name(value[amppos:semipos]):
                    continue
            # no terminating ';' or not a valid name: remember where the '&' was
            invalid_amps.append(amppos-1)
    if len(invalid_amps) > 0:
        warnings.warn("invalid ampersands in dtd entity %s" % (name))
        # every removal shifts the remaining positions one place to the left
        adjustment = 0
        for amppos in invalid_amps:
            value = value[:amppos-adjustment] + value[amppos-adjustment+1:]
            adjustment += 1
    return value
class dtdunit(base.TranslationUnit):
    """A single entity definition from a .dtd file, together with any
    comments and unparsed lines that precede it.

    The quoted entity value is kept in C{self.definition}; C{source} and
    C{target} are both properties that quote/unquote that same attribute.
    """

    def __init__(self, source=""):
        """construct the dtdunit, prepare it for parsing

        @type source: String
        @param source: The unquoted entity value to start with
        """
        super(dtdunit, self).__init__(source)
        self.comments = []
        self.unparsedlines = []
        # parser state; parse() reads and updates these while walking the lines
        self.incomment = False
        self.inentity = False
        self.entity = "FakeEntityOnlyForInitialisationAndTesting"
        # assigned through the source property, which stores the quoted form in self.definition
        self.source = source

    # Note that source and target are equivalent for monolingual units
    def setsource(self, source):
        """Sets the definition to the quoted value of source"""
        self.definition = quotefordtd(source)

    def getsource(self):
        """gets the unquoted source string"""
        return unquotefromdtd(self.definition)
    source = property(getsource, setsource)

    def settarget(self, target):
        """Sets the definition to the quoted value of target (None is treated as an empty string)"""
        if target is None:
            target = ""
        self.definition = quotefordtd(target)

    def gettarget(self):
        """gets the unquoted target string"""
        return unquotefromdtd(self.definition)
    target = property(gettarget, settarget)

    def isnull(self):
        """returns whether this dtdunit doesn't actually have an entity definition

        @rtype: Boolean
        @return: True when C{self.entity} is None
        """
        # for dtds, we currently return a blank string if there is no .entity (==location in other files)
        # TODO: this needs to work better with base class expectations
        return self.entity is None

    def parse(self, dtdsrc):
        """read the first dtd element from the source code into this object, return linesprocessed

        The text is walked line by line.  Comments are collected into
        C{self.comments} as C{(commenttype, text)} tuples, lines that are
        neither comment nor entity go to C{self.unparsedlines}, and the first
        entity found fills C{self.entity} and C{self.definition}.  Parsing
        stops as soon as that entity's quoted value is complete.

        @type dtdsrc: String
        @param dtdsrc: DTD source text, possibly containing several elements
        @rtype: Integer
        @return: The number of lines consumed (0 for empty input)
        @raise ValueError: If the entity value does not start with a quote character
        """
        self.comments = []
        # make all the lists the same: every kind of note ends up in self.comments,
        # so the original ordering of the comments is kept
        self.locfilenotes = self.comments
        self.locgroupstarts = self.comments
        self.locgroupends = self.comments
        self.locnotes = self.comments
        # self.locfilenotes = []
        # self.locgroupstarts = []
        # self.locgroupends = []
        # self.locnotes = []
        # self.comments = []
        self.entity = None
        self.definition = ''
        if not dtdsrc:
            return 0
        lines = dtdsrc.split("\n")
        linesprocessed = 0
        comment = ""
        for line in lines:
            line += "\n"
            linesprocessed += 1
            # print "line(%d,%d): " % (self.incomment,self.inentity),line[:-1]
            if not self.incomment:
                if (line.find('<!--') != -1):
                    self.incomment = True
                    self.continuecomment = False
                    # now work out the type of comment, and save it (remember we're not in the comment yet)
                    (comment, dummy) = quote.extract(line, "<!--", "-->", None, 0)
                    if comment.find('LOCALIZATION NOTE') != -1:
                        # the keyword after "LOCALIZATION NOTE" (FILE, BEGIN, END or none) picks the type
                        l = quote.findend(comment,'LOCALIZATION NOTE')
                        while (comment[l] == ' '):
                            l += 1
                        if comment.find('FILE', l) == l:
                            self.commenttype = "locfile"
                        elif comment.find('BEGIN', l) == l:
                            self.commenttype = "locgroupstart"
                        elif comment.find('END', l) == l:
                            self.commenttype = "locgroupend"
                        else:
                            self.commenttype = "locnote"
                    else:
                        # plain comment
                        self.commenttype = "comment"
                #FIXME: bloody entity might share a line with something important
                elif not self.inentity and re.search("%.*;", line):
                    # a line containing "%...;" outside an entity (presumably a parameter
                    # entity reference) is kept verbatim as a plain comment
                    self.comments.append(("comment", line))
                    line = ""
                    continue

            if self.incomment:
                # some kind of comment
                (comment, self.incomment) = quote.extract(line, "<!--", "-->", None, self.continuecomment)
                # print "comment(%d,%d): " % (self.incomment,self.continuecomment),comment
                self.continuecomment = self.incomment
                # strip the comment out of what will be parsed
                line = line.replace(comment, "", 1)
                # add an end of line if this is the end of the comment
                if not self.incomment:
                    if line.isspace():
                        comment += line
                        line = ''
                    else:
                        comment += '\n'
                # check if there's actually an entity definition that's commented out
                # TODO: parse these, store as obsolete messages
                # if comment.find('<!ENTITY') != -1:
                #     # remove the entity from the comment
                #     comment, dummy = quote.extractwithoutquotes(comment, ">", "<!ENTITY", None, 1)
                # depending on the type of comment (worked out at the start), put it in the right place
                # make it record the comment and type as a tuple
                commentpair = (self.commenttype, comment)
                if self.commenttype == "locfile":
                    self.locfilenotes.append(commentpair)
                elif self.commenttype == "locgroupstart":
                    self.locgroupstarts.append(commentpair)
                elif self.commenttype == "locgroupend":
                    self.locgroupends.append(commentpair)
                elif self.commenttype == "locnote":
                    self.locnotes.append(commentpair)
                elif self.commenttype == "comment":
                    self.comments.append(commentpair)

            if not self.inentity and not self.incomment:
                entitypos = line.find('<!ENTITY')
                if entitypos != -1:
                    self.inentity = True
                    # text starting with "#" before the entity is kept and re-emitted by getoutput()
                    beforeentity = line[:entitypos].strip()
                    if beforeentity.startswith("#"):
                        self.hashprefix = beforeentity
                    self.entitypart = "start"
                else:
                    self.unparsedlines.append(line)

            if self.inentity:
                # self.entitypart moves through "start" -> "name" -> ("parameter" ->) "definition";
                # each stage falls through to the next when its text is on the same line
                if self.entitypart == "start":
                    # the entity definition
                    e = quote.findend(line,'<!ENTITY')
                    line = line[e:]
                    self.entitypart = "name"
                    self.entitytype = "internal"
                if self.entitypart == "name":
                    e = 0
                    while (e < len(line) and line[e].isspace()):
                        e += 1
                    self.entity = ''
                    # a leading "%" marks the entity as "external"
                    if (e < len(line) and line[e] == '%'):
                        self.entitytype = "external"
                        self.entityparameter = ""
                        e += 1
                        while (e < len(line) and line[e].isspace()):
                            e += 1
                    while (e < len(line) and not line[e].isspace()):
                        self.entity += line[e]
                        e += 1
                    while (e < len(line) and line[e].isspace()):
                        e += 1
                    if self.entity:
                        if self.entitytype == "external":
                            self.entitypart = "parameter"
                        else:
                            self.entitypart = "definition"
                        # remember the start position and the quote character
                        if e == len(line):
                            # the value starts on a later line; its position is worked out there
                            self.entityhelp = None
                            e = 0
                            continue
                        elif self.entitypart == "definition":
                            self.entityhelp = (e, line[e])
                            self.instring = False
                if self.entitypart == "parameter":
                    while (e < len(line) and line[e].isspace()):
                        e += 1
                    paramstart = e
                    while (e < len(line) and line[e].isalnum()):
                        e += 1
                    self.entityparameter += line[paramstart:e]
                    while (e < len(line) and line[e].isspace()):
                        e += 1
                    line = line[e:]
                    e = 0
                    if not line:
                        continue
                    if line[0] in ('"', "'"):
                        self.entitypart = "definition"
                        self.entityhelp = (e, line[e])
                        self.instring = False
                if self.entitypart == "definition":
                    if self.entityhelp is None:
                        e = 0
                        while (e < len(line) and line[e].isspace()):
                            e += 1
                        if e == len(line):
                            continue
                        self.entityhelp = (e, line[e])
                        self.instring = False
                    # actually the lines below should remember instring, rather than using it as dummy
                    e = self.entityhelp[0]
                    if (self.entityhelp[1] == "'"):
                        (defpart, self.instring) = quote.extract(line[e:], "'", "'", startinstring=self.instring, allowreentry=False)
                    elif (self.entityhelp[1] == '"'):
                        (defpart, self.instring) = quote.extract(line[e:], '"', '"', startinstring=self.instring, allowreentry=False)
                    else:
                        raise ValueError("Unexpected quote character... %r" % (self.entityhelp[1]))
                    # for any following lines, start at the beginning of the line. remember the quote character
                    self.entityhelp = (0, self.entityhelp[1])
                    self.definition += defpart
                    if not self.instring:
                        # the closing quote was reached: this unit is complete
                        self.inentity = False
                        break

        # debugging aid, disabled: change the condition to a true value to dump
        # every attribute of the unit into the comments
        if 0:
            for attr in dir(self):
                r = repr(getattr(self, attr))
                if len(r) > 60:
                    r = r[:57]+"..."
                self.comments.append(("comment", "self.%s = %s" % (attr, r) ))
        return linesprocessed

    def __str__(self):
        """convert to a string. double check that unicode is handled somehow here

        Unicode output is encoded with C{self.encoding} if set, else UTF-8.
        """
        source = self.getoutput()
        if isinstance(source, unicode):
            return source.encode(getattr(self, "encoding", "UTF-8"))
        return source

    def getoutput(self):
        """convert the dtd entity back to string form

        @rtype: String
        @return: The comments and unparsed lines, followed by the
            C{<!ENTITY ...>} line when the unit has a non-empty entity name
        """
        lines = []
        lines.extend([comment for commenttype, comment in self.comments])
        lines.extend(self.unparsedlines)
        if self.isnull():
            # no entity: only comments/unparsed text, normalised to end in a single newline
            result = "".join(lines)
            return result.rstrip() + "\n"
        # for f in self.locfilenotes: yield f
        # for ge in self.locgroupends: yield ge
        # for gs in self.locgroupstarts: yield gs
        # for n in self.locnotes: yield n
        if len(self.entity) > 0:
            if getattr(self, 'entitytype', None) == 'external':
                entityline = '<!ENTITY % '+self.entity+' '+self.entityparameter+' '+self.definition+'>'
            else:
                entityline = '<!ENTITY '+self.entity+' '+self.definition+'>'
            # restore the "#..." prefix recorded by parse(), if there was one
            if getattr(self, 'hashprefix', None):
                entityline = self.hashprefix + " " + entityline
            if isinstance(entityline, unicode):
                entityline = entityline.encode('UTF-8')
            lines.append(entityline+'\n')
        return "".join(lines)
359
class dtdfile(base.TranslationStore):
    """this class represents a .dtd file, made up of dtdunits"""
    UnitClass = dtdunit

    def __init__(self, inputfile=None):
        """construct a dtdfile, optionally reading in from inputfile

        @param inputfile: Optional file-like object; its C{read()} result is
            parsed and its C{name} attribute (if any) becomes C{self.filename}
        """
        base.TranslationStore.__init__(self, unitclass=self.UnitClass)
        self.filename = getattr(inputfile, 'name', '')
        if inputfile is not None:
            dtdsrc = inputfile.read()
            self.parse(dtdsrc)
            self.makeindex()

    def parse(self, dtdsrc):
        """read the source code of a dtd file in and include them as dtdunits in self.units

        The source is cut into chunks of lines.  Each chunk is handed to a
        fresh L{dtdunit} repeatedly until the unit reports that it consumed
        no more lines.  A failure while parsing a unit is reported as a
        warning rather than raised.

        @type dtdsrc: String
        @param dtdsrc: The complete text of the .dtd file
        """
        start = 0
        end = 0
        lines = dtdsrc.split("\n")
        while end < len(lines):
            if (start == end):
                end += 1
            foundentity = False
            # extend the chunk; once an <!ENTITY has been seen, stop after the
            # first line that begins with a closing quote followed by ">"
            while end < len(lines):
                if lines[end].find('<!ENTITY') > -1:
                    foundentity = True
                if foundentity and re.match("[\"']\\s*>", lines[end]):
                    end += 1
                    break
                end += 1

            linesprocessed = 1 # to initialise loop
            while linesprocessed >= 1:
                newdtd = dtdunit()
                try:
                    linesprocessed = newdtd.parse("\n".join(lines[start:end]))
                    # keep the unit if it holds an entity or at least some unparsed text
                    if linesprocessed >= 1 and (not newdtd.isnull() or newdtd.unparsedlines):
                        self.units.append(newdtd)
                except Exception as e:
                    # best effort: report the problem and carry on with the rest of the file
                    warnings.warn("%s\nError occured between lines %d and %d:\n%s" % (e, start+1, end, "\n".join(lines[start:end])))
                start += linesprocessed

    def __str__(self):
        """convert to a string. double check that unicode is handled somehow here

        Unicode output is encoded with C{self.encoding} if set, else UTF-8.
        """
        source = self.getoutput()
        if not self._valid_store():
            warnings.warn("DTD file '%s' does not validate" % self.filename)
            # NOTE(review): returning None makes str(store) raise TypeError; only
            # direct callers of __str__() can see the None - confirm this is intended
            return None
        if isinstance(source, unicode):
            return source.encode(getattr(self, "encoding", "UTF-8"))
        return source

    def getoutput(self):
        """convert the units back to source"""
        return "".join([str(dtd) for dtd in self.units])

    def makeindex(self):
        """makes self.index dictionary keyed on entities"""
        self.index = {}
        for dtd in self.units:
            if not dtd.isnull():
                self.index[dtd.entity] = dtd

    def _valid_store(self):
        """Validate the store to determine if it is valid

        This uses ElementTree to parse the DTD.  When lxml could not be
        imported (C{etree} is None) no check is possible and the store is
        reported as valid.

        @return: If the store passes validation
        @rtype: Boolean
        """
        if etree is not None:
            try:
                # #expand is a Mozilla hack and are removed as they are not valid in DTDs
                etree.DTD(StringIO.StringIO(re.sub("#expand", "", self.getoutput())))
            except etree.DTDParseError:
                return False
        return True
440