Package translate :: Package storage :: Module html
[hide private]
[frames | no frames]

Source Code for Module translate.storage.html

  1  #!/usr/bin/env python 
  2  # -*- coding: utf-8 -*- 
  3  # 
  4  # Copyright 2004-2006,2008 Zuza Software Foundation 
  5  #  
  6  # This file is part of translate. 
  7  # 
  8  # translate is free software; you can redistribute it and/or modify 
  9  # it under the terms of the GNU General Public License as published by 
 10  # the Free Software Foundation; either version 2 of the License, or 
 11  # (at your option) any later version. 
 12  #  
 13  # translate is distributed in the hope that it will be useful, 
 14  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 15  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 16  # GNU General Public License for more details. 
 17  # 
 18  # You should have received a copy of the GNU General Public License 
 19  # along with translate; if not, write to the Free Software 
 20  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
 21  # 
 22   
 23  """module for parsing html files for translation""" 
 24   
 25  import re 
 26  from translate.storage import base 
 27  from HTMLParser import HTMLParser 
 28   
class htmlunit(base.TranslationUnit):
    """A unit of translatable/localisable HTML content.

    The text is kept in ``self.text`` with ``&`` and ``<`` escaped as
    ``&amp;`` and ``&lt;``; the ``source`` property converts on the way in
    and on the way out.
    """

    def __init__(self, source=None):
        """Create a unit with no locations and the given *source* text.

        *source* may be None (the default), in which case the unit simply
        has no text yet.
        """
        self.locations = []
        self.setsource(source)

    def getsource(self):
        """Return the stored text with entities decoded and newlines flattened.

        ``&amp;`` and ``&lt;`` become ``&`` and ``<``, and every line break
        (``\\r\\n``, ``\\n`` or ``\\r``) becomes a single space.  Returns None
        when no source has been set.
        """
        if self.text is None:
            return None
        #TODO: Rethink how clever we should try to be with html entities.
        # NOTE(review): "&amp;" is decoded before "&lt;", so a literal "&lt;"
        # given to setsource() (stored as "&amp;lt;") is returned as "<".
        # Left unchanged because existing output may depend on it.
        text = self.text.replace("&amp;", "&").replace("&lt;", "<")
        # "\r\n" must be handled first so that it yields one space, not two.
        for linebreak in ("\r\n", "\n", "\r"):
            text = text.replace(linebreak, " ")
        return text

    def setsource(self, source):
        """Store *source*, escaping ``&`` and ``<``.

        None is stored as-is instead of raising AttributeError, so that
        constructing a unit without a source works.
        """
        if source is None:
            self.text = None
            return
        self.text = source.replace("&", "&amp;").replace("<", "&lt;")
    source = property(getsource, setsource)

    def addlocation(self, location):
        """Append *location* to this unit's list of locations."""
        self.locations.append(location)

    def getlocations(self):
        """Return the list of locations recorded for this unit."""
        return self.locations
48 49
class htmlfile(HTMLParser, base.TranslationStore):
    """Store that extracts translatable blocks from an HTML document.

    The document is fed through HTMLParser.  Text found inside one of the
    ``markingtags`` is accumulated into a block and turned into a unit when
    the block ends; values of the ``includeattrs`` attributes become units
    of their own.  PHP sections (``<? ... ?>``) are swapped for placeholder
    hashes while parsing and restored before a unit is stored.
    """
    UnitClass = htmlunit
    # Tags whose start opens a new translatable block.
    markingtags = ["p", "title", "h1", "h2", "h3", "h4", "h5", "h6", "th", "td", "div", "li", "dt", "dd", "address", "caption"]
    # Attributes whose mere presence on a tag opens a new block.
    markingattrs = []
    # Attributes whose values are extracted as separate units.
    includeattrs = ["alt", "summary", "standby", "abbr", "content"]

    # One PHP section; group 1 is the code between "<?" and "?>".
    _php_re = re.compile(r'(?s)<\?(.*?)\?>')
    # Text made up of nothing but PHP sections.  The lookahead stops a
    # section at its first "?>", so "<?a?> text <?b?>" does NOT match.
    _only_php_re = re.compile(r'(?s)^(?:<\?(?:(?!\?>).)*\?>\s*)+$')
    # A tag pair that fully encloses the text; group 1 is the content.
    # NOTE(review): both ".*" parts are greedy, so text such as
    # "<a>x</a> and <b>y</b>" is also treated as enclosed.  Kept as is.
    _enclosing_tag_re = re.compile(r'''
        (?s)^       # We allow newlines, and match start of line
        <[^?>]      # Match start of tag and the first character (not ? or >)
        (?:
          (?:
            [^>]    # Anything that's not a > is valid tag material
              |
            (?:<\?.*?\?>)   # Matches <? foo ?> lazily; PHP is valid
          )*        # Repeat over valid tag material
          [^?>]     # If we have > 1 char, the last char can't be ? or >
        )?          # The repeated chars are optional, so that <a>, <p> work
        >           # Match ending > of opening tag

        (.*)        # Match actual contents of tag

        </.*[^?]>   # Match ending tag; can't end with ?> and must be >=1 char
        $           # Match end of line
        ''', re.VERBOSE)

    def __init__(self, includeuntaggeddata=None, inputfile=None):
        """Set up the parser state and, if *inputfile* is given, parse it.

        includeuntaggeddata -- when true, text outside any marking tag is
                               collected as well.
        inputfile           -- optional file-like object; it is read
                               completely and then closed.  Its ``name``
                               attribute (if any) is used in unit locations.
        """
        self.units = []
        self.filename = getattr(inputfile, 'name', None)
        self.currentblock = ""
        self.currentblocknum = 0
        self.currentcomment = ""
        self.currenttag = None
        self.includeuntaggeddata = includeuntaggeddata
        # Placeholder hash -> PHP code.  Defined here so that reintrophp()
        # is usable even when phprep() has not run yet.
        self.phpdict = {}
        HTMLParser.__init__(self)

        if inputfile is not None:
            # Close the file even if reading it fails.
            try:
                htmlsrc = inputfile.read()
            finally:
                inputfile.close()
            self.parse(htmlsrc)

    def guess_encoding(self, htmlsrc):
        """Returns the encoding of the html text.

        We look for 'charset=' within a meta tag to do this.  Returns the
        first charset found, or None if there is none.
        """
        pattern = r'''(?i)<meta.*content.*=.*charset.*=\s*([^\s]*)\s*["']'''
        found = re.findall(pattern, htmlsrc)
        if found:
            return found[0]
        return None

    def do_encoding(self, htmlsrc):
        """Return the html text decoded with the charset it declares.

        The text is returned unchanged when no charset can be found.
        """
        charset = self.guess_encoding(htmlsrc)
        if charset:
            return htmlsrc.decode(charset)
        return htmlsrc

    def phprep(self, text):
        """Replaces all instances of PHP with placeholder tags, and returns
        the new text.  The current implementation replaces <?foo?> with
        <?md5(foo)?>.  The hash => code conversions are stored in
        self.phpdict for later use in restoring the real PHP.

        The purpose of this is to remove all potential "tag-like" code from
        inside PHP.  The hash looks nothing like an HTML tag, but the
        following PHP::
          $a < $b ? $c : ($d > $e ? $f : $g)
        looks like it contains an HTML tag::
          < $b ? $c : ($d >
        to nearly any regex.  Hence, we replace all contents of PHP with
        simple strings to help our regexes out.

        Only the text between "<?" and "?>" is substituted.  A plain
        str.replace() of the code would also rewrite identical text outside
        the PHP sections (every space for "<? ?>"; between every character
        for "<??>").
        """
        # Imported under another name so the builtin hash() is not shadowed.
        from translate.misc import hash as hashutil

        self.phpdict = {}

        def stash(match):
            code = match.group(1)
            digest = hashutil.md5_f(code).hexdigest()
            self.phpdict[digest] = code
            return "<?%s?>" % digest

        return self._php_re.sub(stash, text)

    def reintrophp(self, text):
        """Replaces the PHP placeholders in text with the real code"""
        for placeholder, code in self.phpdict.items():
            text = text.replace(placeholder, code)
        return text

    def parse(self, htmlsrc):
        """Decode *htmlsrc*, hide its PHP and feed it to the HTML parser."""
        htmlsrc = self.do_encoding(htmlsrc)
        htmlsrc = self.phprep(htmlsrc) #Clear out the PHP before parsing
        self.feed(htmlsrc)

    def addhtmlblock(self, text):
        """Store *text* as a new unit if it has translatable content.

        The unit gets a "filename:blocknumber" location and the comment
        collected so far as a note.  None is ignored; HTMLParser reports
        attributes without a value (e.g. <img alt>) that way.
        """
        if text is None:
            return
        text = self.strip_html(text)
        text = self.reintrophp(text) #Before adding anything, restore PHP
        if self.has_translatable_content(text):
            self.currentblocknum += 1
            unit = self.addsourceunit(text)
            unit.addlocation("%s:%d" % (self.filename, self.currentblocknum))
            unit.addnote(self.currentcomment)

    def strip_html(self, text):
        """Strip unnecessary html from the text.

        HTML tags are deemed unnecessary if it fully encloses the translatable
        text, eg. '<a href="index.html">Home Page</a>'.

        HTML tags that occurs within the normal flow of text will not be removed,
        eg. 'This is a link to the <a href="index.html">Home Page</a>.'
        """
        text = text.strip()

        # If all that is left is PHP, return ""
        if self._only_php_re.match(text):
            return ""

        enclosed = self._enclosing_tag_re.findall(text)
        if len(enclosed) == 1:
            # Peel off one enclosing tag pair and try again on the content.
            text = self.strip_html(enclosed[0])
        return text

    def has_translatable_content(self, text):
        """Check if the supplied HTML snippet has any content that needs to be translated."""
        text = text.strip()
        # A snippet holding a single charset declaration is not translatable.
        if len(re.findall('(?i).*(charset.*=.*)', text)) == 1:
            return False

        # TODO: Get a better way to find untranslatable entities.
        if text == '&nbsp;':
            return False

        # Lazily strip all PHP; (?s) so that multi-line PHP goes as well.
        remainder = re.sub(r'(?s)<\?.*?\?>', '', text).strip()
        # Strip all HTML tags
        remainder = re.sub(r'<[^>]*>', '', remainder).strip()
        return bool(remainder)

    #From here on below, follows the methods of the HTMLParser

    def startblock(self, tag):
        """Flush the current block and start a new one for *tag*."""
        self.addhtmlblock(self.currentblock)
        self.currentblock = ""
        self.currentcomment = ""
        self.currenttag = tag

    def endblock(self):
        """Flush the current block and leave block-collecting mode."""
        self.addhtmlblock(self.currentblock)
        self.currentblock = ""
        self.currentcomment = ""
        self.currenttag = None

    def handle_starttag(self, tag, attrs):
        """Open a new block for marking tags, else keep the tag as text.

        Values of attributes listed in includeattrs are stored as units.
        """
        newblock = tag in self.markingtags
        for attrname, attrvalue in attrs:
            if attrname in self.markingattrs:
                newblock = True
            if attrname in self.includeattrs:
                self.addhtmlblock(attrvalue)

        if newblock:
            self.startblock(tag)
        elif self.currenttag is not None:
            self.currentblock += self.get_starttag_text()

    def handle_startendtag(self, tag, attrs):
        """Handle a self-closing tag: extract attributes, keep tag as text."""
        for attrname, attrvalue in attrs:
            if attrname in self.includeattrs:
                self.addhtmlblock(attrvalue)
        if self.currenttag is not None:
            self.currentblock += self.get_starttag_text()

    def handle_endtag(self, tag):
        """Close the block if *tag* ends it, else keep the tag as text."""
        if tag == self.currenttag:
            self.endblock()
        elif self.currenttag is not None:
            self.currentblock += '</%s>' % tag

    def handle_data(self, data):
        """Add *data* to the current block.

        Outside any block the data is dropped unless includeuntaggeddata
        is set, in which case an untagged block is started for it.
        """
        if self.currenttag is not None:
            self.currentblock += data
        elif self.includeuntaggeddata:
            self.startblock(None)
            self.currentblock += data

    def handle_charref(self, name):
        """Pass a numeric character reference through unchanged."""
        self.handle_data("&#%s;" % name)

    def handle_entityref(self, name):
        """Pass a named entity reference through unchanged."""
        self.handle_data("&%s;" % name)

    def handle_comment(self, data):
        """Collect HTML comments; they become the note of the next unit."""
        # we can place comments above the msgid as translator comments!
        if self.currentcomment == "":
            self.currentcomment = data
        else:
            self.currentcomment += '\n' + data

    def handle_pi(self, data):
        """Pass a processing instruction through as ordinary data."""
        self.handle_data("<?%s>" % data)
265
class POHTMLParser(htmlfile):
    """Plain subclass of htmlfile that adds no behaviour of its own.

    NOTE(review): presumably kept as an alternative name for htmlfile;
    confirm against the callers.
    """
268