1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 """module for parsing html files for translation"""
24
25 import re
26 from translate.storage import base
27 from HTMLParser import HTMLParser
28
30 """A unit of translatable/localisable HTML content"""
34
38
41 source = property(getsource, setsource)
42
44 self.locations.append(location)
45
48
49
50 -class htmlfile(HTMLParser, base.TranslationStore):
51 UnitClass = htmlunit
52 markingtags = ["p", "title", "h1", "h2", "h3", "h4", "h5", "h6", "th", "td", "div", "li", "dt", "dd", "address", "caption"]
53 markingattrs = []
54 includeattrs = ["alt", "summary", "standby", "abbr", "content"]
55
def __init__(self, includeuntaggeddata=None, inputfile=None):
    """Initialise the parser state and optionally parse ``inputfile``.

    :param includeuntaggeddata: stored as ``self.includeuntaggeddata``;
        consulted later when character data arrives while no block is
        open.
    :param inputfile: optional object offering ``read()`` and ``close()``.
        Its ``name`` attribute, when present, becomes ``self.filename``.
        The object is read in full, closed, and the content is handed to
        ``self.parse``.
    """
    self.units = []
    self.filename = getattr(inputfile, 'name', None)
    self.currentblock = ""
    self.currentblocknum = 0
    self.currentcomment = ""
    self.currenttag = None
    self.includeuntaggeddata = includeuntaggeddata
    HTMLParser.__init__(self)

    if inputfile is not None:
        try:
            htmlsrc = inputfile.read()
        finally:
            # Release the handle even when reading fails, so a bad input
            # file does not leak an open descriptor.
            inputfile.close()
        self.parse(htmlsrc)
70
def guess_encoding(self, htmlsrc):
    """Return the charset announced by the HTML text, or None.

    The charset is taken from the first 'charset=' declaration found
    inside a meta tag's content attribute.
    """
    charset_re = '''(?i)<meta.*content.*=.*charset.*=\\s*([^\\s]*)\\s*["']'''
    found = re.search(charset_re, htmlsrc)
    if found is None:
        return None
    return found.group(1)
83
def do_encoding(self, htmlsrc):
    """Decode the html text using the charset it declares.

    The charset comes from ``self.guess_encoding``; when that finds
    nothing, ``htmlsrc`` is handed back untouched.
    """
    charset = self.guess_encoding(htmlsrc)
    if not charset:
        return htmlsrc
    return htmlsrc.decode(charset)
91
def phprep(self, text):
    """Replace the contents of every PHP section with a placeholder hash.

    Each ``<?foo?>`` in ``text`` becomes ``<?md5(foo)?>``. The
    hash => code pairs are stored in ``self.phpdict`` (reset on every
    call) so the real PHP can be restored later.

    The purpose of this is to remove all potential "tag-like" code from
    inside PHP. The hash looks nothing like an HTML tag, but the following
    PHP::
        $a < $b ? $c : ($d > $e ? $f : $g)
    looks like it contains an HTML tag::
        < $b ? $c : ($d >
    to nearly any regex. Hence, we replace all contents of PHP with simple
    strings to help our regexes out.

    :param text: the HTML source, possibly containing ``<? ... ?>``.
    :returns: ``text`` with the PHP contents swapped for their hashes.
    """
    from translate.misc import hash

    self.phpdict = {}

    def mask(match):
        # Rewrite only the span between the delimiters. A plain
        # text.replace(code, digest) would also hit identical text outside
        # PHP, damage digests inserted earlier, and blow up on '<??>'.
        code = match.group(1)
        digest = hash.md5_f(code).hexdigest()
        self.phpdict[digest] = code
        return '<?' + digest + '?>'

    return re.sub(r'(?s)<\?(.*?)\?>', mask, text)
118
124
def parse(self, htmlsrc):
    """Decode ``htmlsrc``, mask its PHP sections and feed the parser."""
    decoded = self.do_encoding(htmlsrc)
    self.feed(self.phprep(decoded))
129
138
def strip_html(self, text):
    """Peel off markup that wraps the whole of ``text``.

    A tag pair is removed when it fully encloses the snippet, e.g.
    '<a href="index.html">Home Page</a>' is reduced to 'Home Page'.

    Tags occurring within the normal flow of text are kept, e.g.
    'This is a link to the <a href="index.html">Home Page</a>.' comes
    back unchanged (apart from surrounding whitespace being trimmed).

    A snippet that starts with '<?' and ends with '?>' yields "".
    """
    stripped = text.strip()

    # Nothing but PHP from the first to the last character: no text left.
    if re.match(r'(?s)^<\?.*?\?>$', stripped) is not None:
        return ""

    # One opening tag at the very start, one closing tag at the very end,
    # and the captured group in between.
    enclosing = re.compile(r'''
    (?s)^ # We allow newlines, and match start of line
    <[^?>] # Match start of tag and the first character (not ? or >)
    (?:
    (?:
    [^>] # Anything that's not a > is valid tag material
    |
    (?:<\?.*?\?>) # Matches <? foo ?> lazily; PHP is valid
    )* # Repeat over valid tag material
    [^?>] # If we have > 1 char, the last char can't be ? or >
    )? # The repeated chars are optional, so that <a>, <p> work
    > # Match ending > of opening tag

    (.*) # Match actual contents of tag

    </.*[^?]> # Match ending tag; can't end with ?> and must be >=1 char
    $ # Match end of line
    ''', re.VERBOSE)
    wrapped = enclosing.match(stripped)
    if wrapped is None:
        return stripped
    # Keep peeling: the contents may themselves be wrapped in another tag.
    return self.strip_html(wrapped.group(1))
179
181 """Check if the supplied HTML snippet has any content that needs to be translated."""
182
183 text = text.strip()
184 result = re.findall('(?i).*(charset.*=.*)', text)
185 if len(result) == 1:
186 return False
187
188
189 if text == ' ':
190 return False
191
192 pattern = '<\?.*?\?>'
193 result = re.sub(pattern, '', text).strip()
194 pattern = '<[^>]*>'
195 result = re.sub(pattern, '', result).strip()
196 if result:
197 return True
198 else:
199 return False
200
201
202
def startblock(self, tag):
    """Flush the text gathered so far and begin a new block for ``tag``."""
    # Hand over the pending text before any state is reset.
    self.addhtmlblock(self.currentblock)
    self.currentblock = self.currentcomment = ""
    self.currenttag = tag
208
def endblock(self):
    """Flush the text gathered so far and leave no block open."""
    # Hand over the pending text before any state is reset.
    self.addhtmlblock(self.currentblock)
    self.currentblock, self.currentcomment, self.currenttag = "", "", None
214
216 newblock = 0
217 if tag in self.markingtags:
218 newblock = 1
219 for attrname, attrvalue in attrs:
220 if attrname in self.markingattrs:
221 newblock = 1
222 if attrname in self.includeattrs:
223 self.addhtmlblock(attrvalue)
224
225 if newblock:
226 self.startblock(tag)
227 elif self.currenttag is not None:
228 self.currentblock += self.get_starttag_text()
229
231 for attrname, attrvalue in attrs:
232 if attrname in self.includeattrs:
233 self.addhtmlblock(attrvalue)
234 if self.currenttag is not None:
235 self.currentblock += self.get_starttag_text()
236
238 if tag == self.currenttag:
239 self.endblock()
240 elif self.currenttag is not None:
241 self.currentblock += '</%s>' % tag
242
244 if self.currenttag is not None:
245 self.currentblock += data
246 elif self.includeuntaggeddata:
247 self.startblock(None)
248 self.currentblock += data
249
252
255
262
265
268