1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 """classes that hold units of .dtd files (dtdunit) or entire files (dtdfile)
23 these are specific .dtd files for localisation used by mozilla
24
25 Specifications
26 ==============
27 The following information is provided by Mozilla::
28
29 * U{Specification<http://www.w3.org/TR/REC-xml/#sec-entexpand>}
30
31 There is a grammar for entity definitions, which isn't really precise,
32 as the spec says. There's no formal specification for DTD files, it's
33 just "whatever makes this work" basically. The whole piece is clearly not
34 the strongest point of the xml spec
35
36 XML elements are allowed in entity values. A number of things that are
37 allowed will just break the resulting document, Mozilla forbids these
38 in their DTD parser.
39 """
40
41 from translate.storage import base
42 from translate.misc import quote
43
44 import re
45 import warnings
46 try:
47 from lxml import etree
48 import StringIO
49 except ImportError:
50 etree = None
51
52 labelsuffixes = (".label", ".title")
53 """Label suffixes: entries with this suffix are able to be comibed with accesskeys
54 found in in entries ending with L{accesskeysuffixes}"""
55 accesskeysuffixes = (".accesskey", ".accessKey", ".akey")
56 """Accesskey Suffixes: entries with this suffix may be combined with labels
57 ending in L{labelsuffixes} into accelerator notation"""
58
59
68
69
82
83
85 """Find and remove ampersands that are not part of an entity definition.
86
87 A stray & in a DTD file can break an applications ability to parse the file. In Mozilla
88 localisation this is very important and these can break the parsing of files used in XUL
89 and thus break interface rendering. Tracking down the problem is very difficult,
90 thus by removing potential broken & and warning the users we can ensure that the output
91 DTD will always be parsable.
92
93 @type name: String
94 @param name: Entity name
95 @type value: String
96 @param value: Entity text value
97 @rtype: String
98 @return: Entity value without bad ampersands
99 """
100
101 def is_valid_entity_name(name):
102 """Check that supplied L{name} is a valid entity name"""
103 if name.replace('.', '').isalnum():
104 return True
105 elif name[0] == '#' and name[1:].isalnum():
106 return True
107 return False
108
109 amppos = 0
110 invalid_amps = []
111 while amppos >= 0:
112 amppos = value.find("&", amppos)
113 if amppos != -1:
114 amppos += 1
115 semipos = value.find(";", amppos)
116 if semipos != -1:
117 if is_valid_entity_name(value[amppos:semipos]):
118 continue
119 invalid_amps.append(amppos-1)
120 if len(invalid_amps) > 0:
121 warnings.warn("invalid ampersands in dtd entity %s" % (name))
122 adjustment = 0
123 for amppos in invalid_amps:
124 value = value[:amppos-adjustment] + value[amppos-adjustment+1:]
125 adjustment += 1
126 return value
127
128
129 -class dtdunit(base.TranslationUnit):
130 """this class represents an entity definition from a dtd file (and possibly associated comments)"""
131
133 """construct the dtdunit, prepare it for parsing"""
134 super(dtdunit, self).__init__(source)
135 self.comments = []
136 self.unparsedlines = []
137 self.incomment = False
138 self.inentity = False
139 self.entity = "FakeEntityOnlyForInitialisationAndTesting"
140 self.source = source
141 self.space_pre_entity = ' '
142 self.space_pre_definition = ' '
143 self.closing = ">"
144
145
150
152 """gets the unquoted source string"""
153 return unquotefromdtd(self.definition)
154 source = property(getsource, setsource)
155
162
164 """gets the unquoted target string"""
165 return unquotefromdtd(self.definition)
166 target = property(gettarget, settarget)
167
169 """returns whether this dtdunit doesn't actually have an entity definition"""
170
171
172 return self.entity is None
173
def parse(self, dtdsrc):
    """read the first dtd element from the source code into this object, return linesprocessed

    The comment lists, the entity name and the definition are reset on every
    call. self.incomment, self.inentity and self.unparsedlines are not reset
    here, so they carry over from the constructor or from an earlier call.

    @type dtdsrc: String
    @param dtdsrc: DTD source text; it is split on newlines and consumed line by line
    @rtype: Integer
    @return: Number of lines consumed (0 when dtdsrc is empty)
    @raise ValueError: If an entity value does not start with a single or double quote
    """
    self.comments = []
    # all comment categories are aliases of the same list, so every comment
    # ends up in self.comments in file order whatever its type
    self.locfilenotes = self.comments
    self.locgroupstarts = self.comments
    self.locgroupends = self.comments
    self.locnotes = self.comments





    self.entity = None
    self.definition = ''
    if not dtdsrc:
        return 0
    lines = dtdsrc.split("\n")
    linesprocessed = 0
    comment = ""
    for line in lines:
        # split() removed the newline; put it back so stored text keeps it
        line += "\n"
        linesprocessed += 1

        if not self.incomment:
            if (line.find('<!--') != -1):
                self.incomment = True
                self.continuecomment = False
                # work out the type of the comment from its text before it
                # is actually consumed further down
                (comment, dummy) = quote.extract(line, "<!--", "-->", None, 0)
                if comment.find('LOCALIZATION NOTE') != -1:
                    # l is the index just after 'LOCALIZATION NOTE'
                    # (presumably what quote.findend returns - confirm);
                    # skip spaces to reach the next keyword
                    l = quote.findend(comment, 'LOCALIZATION NOTE')
                    while (comment[l] == ' '):
                        l += 1
                    if comment.find('FILE', l) == l:
                        self.commenttype = "locfile"
                    elif comment.find('BEGIN', l) == l:
                        self.commenttype = "locgroupstart"
                    elif comment.find('END', l) == l:
                        self.commenttype = "locgroupend"
                    else:
                        self.commenttype = "locnote"
                else:
                    # plain comment without a localization note
                    self.commenttype = "comment"

            elif not self.inentity and re.search("%.*;", line):
                # a line containing "%...;" outside any entity is stored
                # verbatim as a comment and not parsed any further
                self.comments.append(("comment", line))
                line = ""
                continue

        if self.incomment:
            # pull out the comment text on this line; the second value is
            # stored as the "still inside the comment" flag
            (comment, self.incomment) = quote.extract(line, "<!--", "-->", None, self.continuecomment)

            self.continuecomment = self.incomment
            # strip the comment out of what will be parsed below
            line = line.replace(comment, "", 1)
            # the comment ended on this line: make sure it ends in a newline
            if not self.incomment:
                if line.isspace():
                    comment += line
                    line = ''
                else:
                    comment += '\n'






            # record the comment together with the type worked out earlier
            commentpair = (self.commenttype, comment)
            if self.commenttype == "locfile":
                self.locfilenotes.append(commentpair)
            elif self.commenttype == "locgroupstart":
                self.locgroupstarts.append(commentpair)
            elif self.commenttype == "locgroupend":
                self.locgroupends.append(commentpair)
            elif self.commenttype == "locnote":
                self.locnotes.append(commentpair)
            elif self.commenttype == "comment":
                self.comments.append(commentpair)

        if not self.inentity and not self.incomment:
            entitypos = line.find('<!ENTITY')
            if entitypos != -1:
                self.inentity = True
                # text before <!ENTITY that starts with "#" is remembered
                # and written back in front of the entity line
                beforeentity = line[:entitypos].strip()
                if beforeentity.startswith("#"):
                    self.hashprefix = beforeentity
                self.entitypart = "start"
            else:
                # neither comment nor entity: keep the line as-is
                self.unparsedlines.append(line)

        if self.inentity:
            # self.entitypart tracks progress through the entity:
            # "start" -> "name" -> ("parameter" ->) "definition"
            if self.entitypart == "start":
                # drop everything up to and including '<!ENTITY'
                e = quote.findend(line, '<!ENTITY')
                line = line[e:]
                self.entitypart = "name"
                self.entitytype = "internal"
            if self.entitypart == "name":
                s = 0
                e = 0
                # remember how much whitespace precedes the name
                while (e < len(line) and line[e].isspace()):
                    e += 1
                self.space_pre_entity = ' ' * (e - s)
                s = e
                self.entity = ''
                # a leading '%' marks the entity as "external"
                if (e < len(line) and line[e] == '%'):
                    self.entitytype = "external"
                    self.entityparameter = ""
                    e += 1
                    while (e < len(line) and line[e].isspace()):
                        e += 1
                # the name runs up to the next whitespace character
                while (e < len(line) and not line[e].isspace()):
                    self.entity += line[e]
                    e += 1
                s = e
                # remember how much whitespace follows the name
                while (e < len(line) and line[e].isspace()):
                    e += 1
                self.space_pre_definition = ' ' * (e - s)
                if self.entity:
                    if self.entitytype == "external":
                        self.entitypart = "parameter"
                    else:
                        self.entitypart = "definition"
                    # entityhelp holds (start offset, quote character) of the
                    # value; None means the value starts on a later line
                    if e == len(line):
                        self.entityhelp = None
                        e = 0
                        continue
                    elif self.entitypart == "definition":
                        self.entityhelp = (e, line[e])
                        self.instring = False
            if self.entitypart == "parameter":
                while (e < len(line) and line[e].isspace()):
                    e += 1
                paramstart = e
                # the parameter is the run of alphanumeric characters
                while (e < len(line) and line[e].isalnum()):
                    e += 1
                self.entityparameter += line[paramstart:e]
                while (e < len(line) and line[e].isspace()):
                    e += 1
                line = line[e:]
                e = 0
                if not line:
                    continue
                # a quote right after the parameter starts the definition
                if line[0] in ('"', "'"):
                    self.entitypart = "definition"
                    self.entityhelp = (e, line[e])
                    self.instring = False
            if self.entitypart == "definition":
                if self.entityhelp is None:
                    # the value starts on this line: find its first
                    # non-whitespace character (expected to be the quote)
                    e = 0
                    while (e < len(line) and line[e].isspace()):
                        e += 1
                    if e == len(line):
                        continue
                    self.entityhelp = (e, line[e])
                    self.instring = False

                e = self.entityhelp[0]
                # extract the quoted text; self.instring stays set while the
                # closing quote has not been reached yet
                if (self.entityhelp[1] == "'"):
                    (defpart, self.instring) = quote.extract(line[e:], "'", "'", startinstring=self.instring, allowreentry=False)
                elif (self.entityhelp[1] == '"'):
                    (defpart, self.instring) = quote.extract(line[e:], '"', '"', startinstring=self.instring, allowreentry=False)
                else:
                    raise ValueError("Unexpected quote character... %r" % (self.entityhelp[1]))
                # following lines start at offset 0 with the same quote character
                self.entityhelp = (0, self.entityhelp[1])
                self.definition += defpart
                if not self.instring:
                    # value complete: keep whatever follows it on the line
                    # (without the line ending) and stop after this entity
                    self.closing = line[e+len(defpart):].rstrip("\n\r")
                    self.inentity = False
                    break

    # debugging aid, disabled: dumps every attribute into the comments
    if 0:
        for attr in dir(self):
            r = repr(getattr(self, attr))
            if len(r) > 60:
                r = r[:57] + "..."
            self.comments.append(("comment", "self.%s = %s" % (attr, r)))
    return linesprocessed
361
368
370 """convert the dtd entity back to string form"""
371 lines = []
372 lines.extend([comment for commenttype, comment in self.comments])
373 lines.extend(self.unparsedlines)
374 if self.isnull():
375 result = "".join(lines)
376 return result.rstrip() + "\n"
377
378
379
380
381 if len(self.entity) > 0:
382 if getattr(self, 'entitytype', None) == 'external':
383 entityline = '<!ENTITY % ' + self.entity + ' ' + self.entityparameter + ' ' + self.definition + self.closing
384 else:
385 entityline = '<!ENTITY' + self.space_pre_entity + self.entity + self.space_pre_definition + self.definition + self.closing
386 if getattr(self, 'hashprefix', None):
387 entityline = self.hashprefix + " " + entityline
388 if isinstance(entityline, unicode):
389 entityline = entityline.encode('UTF-8')
390 lines.append(entityline + '\n')
391 return "".join(lines)
392
393
class dtdfile(base.TranslationStore):
    """this class represents a .dtd file, made up of dtdunits"""
    UnitClass = dtdunit

    def __init__(self, inputfile=None):
        """construct a dtdfile, optionally reading in from inputfile

        @param inputfile: Object with a read() method (and optionally a
            name attribute, stored as self.filename), or None for an
            empty store
        """
        base.TranslationStore.__init__(self, unitclass=self.UnitClass)
        self.filename = getattr(inputfile, 'name', '')
        if inputfile is not None:
            dtdsrc = inputfile.read()
            self.parse(dtdsrc)
            self.makeindex()

    def parse(self, dtdsrc):
        """read the source code of a dtd file in and include them as dtdunits in self.units

        The source is cut into chunks of lines; each chunk is handed to
        fresh dtdunit objects until it is used up. A unit that raises while
        parsing is reported with warnings.warn and skipped.

        @type dtdsrc: String
        @param dtdsrc: Complete text of the DTD file
        """
        start = 0
        end = 0
        lines = dtdsrc.split("\n")
        while end < len(lines):
            if (start == end):
                end += 1
            # Extend the chunk until a line that closes an entity is seen.
            # NOTE(review): re.match only matches at the start of the line,
            # so only a closing quote at the very beginning of a line ends
            # the chunk; otherwise the chunk runs to the end of the file.
            foundentity = False
            while end < len(lines):
                if '<!ENTITY' in lines[end]:
                    foundentity = True
                if foundentity and re.match(r"[\"']\s*>", lines[end]):
                    end += 1
                    break
                end += 1

            # Parse units out of lines[start:end] until a unit consumes
            # nothing (dtdunit.parse returns 0 for empty input).
            linesprocessed = 1
            while linesprocessed >= 1:
                newdtd = dtdunit()
                try:
                    linesprocessed = newdtd.parse("\n".join(lines[start:end]))
                    if linesprocessed >= 1 and (not newdtd.isnull() or newdtd.unparsedlines):
                        self.units.append(newdtd)
                except Exception as e:
                    # Best effort: report the problem and carry on.
                    # linesprocessed keeps its previous value, so start
                    # still moves forward.
                    warnings.warn("%s\nError occured between lines %d and %d:\n%s" % (e, start+1, end, "\n".join(lines[start:end])))
                start += linesprocessed

    def __str__(self):
        """convert to a string. double check that unicode is handled somehow here

        @return: The serialised file, encoded when it is unicode, or None
            (after a warning) when the output does not validate
        """
        source = self.getoutput()
        if not self._valid_store():
            warnings.warn("DTD file '%s' does not validate" % self.filename)
            # NOTE(review): str(store) raises TypeError when __str__ returns
            # None; kept because callers may rely on the None result.
            return None
        if isinstance(source, unicode):
            return source.encode(getattr(self, "encoding", "UTF-8"))
        return source

    def getoutput(self):
        """convert the units back to source"""
        return "".join([str(dtd) for dtd in self.units])

    def makeindex(self):
        """makes self.index dictionary keyed on entities"""
        self.index = {}
        for dtd in self.units:
            # units without an entity (comments only) are not indexed
            if not dtd.isnull():
                self.index[dtd.entity] = dtd

    def _valid_store(self):
        """Validate the store to determine if it is valid

        This uses ElementTree to parse the DTD

        @return: If the store passes validation
        @rtype: Boolean
        """
        # Without lxml no validation is possible; the store counts as valid.
        if etree is not None:
            try:
                # "#expand" markers are removed before handing the text to
                # the DTD parser (presumably a preprocessor directive it
                # would reject - confirm).
                etree.DTD(StringIO.StringIO(re.sub("#expand", "", self.getoutput())))
            except etree.DTDParseError:
                return False
        return True
475