1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
"""Module for parsing Gettext .mo files for translation.

The coding of .mo files was produced from U{Gettext documentation
<http://www.gnu.org/software/gettext/manual/gettext.html#MO-Files>},
Python's msgfmt.py and by observing and testing existing .mo files in the wild.

The hash algorithm is implemented for MO files, which should result in
faster access to the MO file.  The hash is optional for Gettext
and is not needed for reading or writing MO files; in this implementation
it is always on and sometimes produces different results from Gettext
in very small files.
"""
43
44 from translate.storage import base
45 from translate.storage import po
46 from translate.storage import poheader
47 from translate.misc.multistring import multistring
48 import struct
49 import array
50 import re
51
# Magic number stored in the first four bytes of an MO file.  It is written
# without the Python 2 "L" suffix: the value is the same, and the literal
# stays valid on interpreters that no longer accept long-integer suffixes.
MO_MAGIC_NUMBER = 0x950412de
53
55 """Helper to unpack Gettext MO files into a Python string"""
56 f = open(filename)
57 s = f.read()
58 print "\\x%02x"*len(s) % tuple(map(ord, s))
59 f.close()
60
62 c0 = (result >> 0) & 0xff
63 c1 = (result >> 8) & 0xff
64 c2 = (result >> 16) & 0xff
65 c3 = (result >> 24) & 0xff
66
67 return (c0 << 24) | (c1 << 16) | (c2 << 8) | c3
68
def hashpjw(str_param):
    """Return the 32-bit PJW hash of a string.

    The hash is accumulated four bits per character.  Whenever one of the
    top four bits of the 32-bit word is set, those bits are folded back
    into bits 4-7 and then cleared.

    The running value is truncated to HASHWORDBITS bits after every
    character.  Python integers are unbounded, so without the truncation a
    carry out of bit 31 would be kept, the result could exceed 32 bits, and
    the value would keep growing with the length of the input.

    @param str_param: the string to hash; one-character items are converted
        with ord(), integer items (as produced by iterating Python 3 bytes)
        are used as they are
    @return: the hash value, always in the range 0 .. 2**32 - 1
    """
    HASHWORDBITS = 32
    word_mask = (1 << HASHWORDBITS) - 1
    top_nibble = 0xf << (HASHWORDBITS - 4)
    hval = 0
    for char in str_param:
        code = char if isinstance(char, int) else ord(char)
        hval = ((hval << 4) + code) & word_mask
        g = hval & top_nibble
        if g != 0:
            hval ^= g >> (HASHWORDBITS - 8)
            hval ^= g
    return hval
82
def get_next_prime_number(start):
    """Return the smallest prime number that is greater than or equal to start.

    @param start: integer at which the search begins
    @return: the first prime found at or above start (2 for any start below 2)
    """

    def is_prime(num):
        """Tell whether num is prime, by trial division up to its square root."""
        if num < 2:
            return False
        if num < 4:
            # 2 and 3
            return True
        if num % 2 == 0:
            return False
        # Only odd divisors remain to be tried; a composite number always
        # has a divisor no larger than its square root.
        divider = 3
        while divider * divider <= num:
            if num % divider == 0:
                return False
            divider += 2
        return True

    candidate = start
    while not is_prime(candidate):
        candidate += 1
    return candidate
101
102
103 -class mounit(base.TranslationUnit):
104 """A class representing a .mo translation message."""
109
def getcontext(self):
    """Return the message context as one string, or None when it is unset."""
    context_parts = self.msgctxt
    return None if context_parts is None else "".join(context_parts)
116
# NOTE(review): the "def" line of this method was missing from the reviewed
# listing; the name is assumed from the TranslationUnit API -- confirm.
def isheader(self):
    """Is this a header entry?

    @return: True when the unit's source is the empty string
    """
    return self.source == u""
120
# NOTE(review): the "def" line of this method was missing from the reviewed
# listing; the name is assumed from the TranslationUnit API -- confirm.
def istranslatable(self):
    """Is this message translatable?

    @return: True when the unit has a non-empty source
    """
    return bool(self.source)
124
125 -class mofile(base.TranslationStore, poheader.poheader):
126 """A class representing a .mo file."""
127 UnitClass = mounit
128 Name = _("Gettext MO file")
129 Mimetypes = ["application/x-gettext-catalog", "application/x-mo"]
130 Extensions = ["mo", "gmo"]
131 _binary = True
132
139
# NOTE(review): the "def" line of this method was missing from the reviewed
# listing; "__str__" is assumed from the docstring -- confirm.
def __str__(self):
    """Output a string representation of the MO data file.

    Only units with a non-empty target are written.  The key of a unit is
    its msgidcomments followed by its source, prefixed by the message
    context and an EOT byte ("\\x04") when a context is set, and encoded
    as UTF-8.  The strings of a multistring are joined with NUL bytes.

    The output is laid out as: a header of seven 32-bit fields, the table
    of (length, offset) pairs for the keys, the same table for the values,
    the hash table, the NUL-terminated keys and the NUL-terminated values.
    The header and both tables are packed in native byte order.

    @return: the contents of the MO file
    """

    def add_to_hash_table(string, i):
        """Store key number i in the first free hash slot probed for string."""
        hash_value = hashpjw(string)
        cursor = hash_value % hash_size
        first_cursor = cursor
        # The probe step is derived from the hash value as well.
        increment = 1 + (hash_value % (hash_size - 2))
        # Zero marks a free slot, hence entries are stored as i + 1.
        while hash_table[cursor] != 0:
            cursor = (cursor + increment) % hash_size
            assert cursor != first_cursor
        hash_table[cursor] = i + 1

    # The table is sized from the number of units (4/3 of it, rounded up to
    # a prime) and never has fewer than three slots, because the probe step
    # above is computed modulo hash_size - 2.
    hash_size = get_next_prime_number(len(self.units) * 4 // 3)
    if hash_size <= 2:
        hash_size = 3

    messages = {}
    for unit in self.units:
        if isinstance(unit.source, multistring):
            source = "".join(unit.msgidcomments) + "\0".join(unit.source.strings)
        else:
            source = "".join(unit.msgidcomments) + unit.source
        if unit.msgctxt:
            source = "".join(unit.msgctxt) + "\x04" + source
        if isinstance(unit.target, multistring):
            target = "\0".join(unit.target.strings)
        else:
            target = unit.target
        if unit.target:
            messages[source.encode("utf-8")] = target

    hash_table = array.array("I", [0] * hash_size)
    # The keys are written in sorted order.
    keys = sorted(messages)

    # Collect the pieces in lists and join them once at the end instead of
    # growing the strings with repeated concatenation.
    offsets = []
    id_parts = []
    str_parts = []
    ids_length = 0
    strs_length = 0
    for i, key in enumerate(keys):
        add_to_hash_table(key, i)
        string = messages[key]
        if isinstance(string, unicode):
            string = string.encode('utf-8')
        offsets.append((ids_length, len(key), strs_length, len(string)))
        id_parts.append(key + '\0')
        str_parts.append(string + '\0')
        ids_length += len(key) + 1
        strs_length += len(string) + 1
    ids = ''.join(id_parts)
    strs = ''.join(str_parts)

    header_size = 7 * 4
    # The keys start right after the header, the two offset tables (two
    # 4-byte integers per key in each of them) and the hash table.
    keystart = header_size + 16 * len(keys) + hash_size * 4
    # The values follow the keys.
    valuestart = keystart + len(ids)
    koffsets = []
    voffsets = []
    # Each entry of an offset table is (length, absolute offset in the file).
    for key_offset, key_length, str_offset, str_length in offsets:
        koffsets.extend((key_length, key_offset + keystart))
        voffsets.extend((str_length, str_offset + valuestart))

    output = struct.pack("Iiiiiii",
                         MO_MAGIC_NUMBER,                   # magic number
                         0,                                 # file format version
                         len(keys),                         # number of entries
                         header_size,                       # start of the key table
                         header_size + len(keys) * 8,       # start of the value table
                         hash_size,                         # size of the hash table
                         header_size + 2 * (len(keys) * 8)) # start of the hash table
    if keys:
        output = output + array.array("i", koffsets + voffsets).tostring()
    output = output + hash_table.tostring()
    output = output + ids
    output = output + strs
    return output
223
# NOTE(review): the "def" line of this method was missing from the reviewed
# listing; the name "parse" is assumed from the docstring and the parameter
# name "input" is taken from the body -- confirm.
def parse(self, input):
    """Parse the given file or file source string.

    The byte order is detected from the magic number.  Every entry of the
    key table becomes one unit: the key is split at the first EOT byte
    ("\\x04") into context and source, and source and target are split at
    NUL bytes into multistrings.  Strings are decoded as UTF-8 until the
    unit with the empty source is met; if its target declares a charset,
    that encoding is used from then on.

    @param input: the contents of an MO file, or an object with a read()
        method that returns them; such an object is closed after reading
    @raise ValueError: if the magic number is not recognised or the file
        format version is greater than 1
    """
    if hasattr(input, 'name'):
        self.filename = input.name
    elif not getattr(self, 'filename', ''):
        self.filename = ''
    if hasattr(input, "read"):
        mosrc = input.read()
        input.close()
        input = mosrc

    # Read the magic number in both byte orders to find out how the file
    # was written.
    little, = struct.unpack("<L", input[:4])
    big, = struct.unpack(">L", input[:4])
    if little == MO_MAGIC_NUMBER:
        endian = "<"
    elif big == MO_MAGIC_NUMBER:
        endian = ">"
    else:
        raise ValueError("This is not an MO file")

    (magic, version, lenkeys, startkey, startvalue,
     sizehash, offsethash) = struct.unpack("%sLiiiiii" % endian, input[:(7*4)])
    if version > 1:
        raise ValueError("Unable to process MO files with versions > 1. This is a %d version MO file" % version)

    # Every table entry is a (length, offset) pair of 4-byte integers; the
    # format is compiled once instead of once per entry.
    entry = struct.Struct("%sii" % endian)
    encoding = 'UTF-8'
    for i in range(lenkeys):
        nextkey = startkey + (i * entry.size)
        nextvalue = startvalue + (i * entry.size)
        klength, koffset = entry.unpack(input[nextkey:nextkey + entry.size])
        vlength, voffset = entry.unpack(input[nextvalue:nextvalue + entry.size])
        source = input[koffset:koffset + klength]
        rawtarget = input[voffset:voffset + vlength]
        context = None
        if "\x04" in source:
            # Split at the first EOT only, so that a later EOT byte in the
            # message itself cannot break the unpacking.
            context, source = source.split("\x04", 1)
        # Still need to handle KDE comments
        source = multistring(source.split("\0"), encoding=encoding)
        if source == "":
            # The unit with the empty source is the header; it may name
            # the charset of the strings.
            charset = re.search("charset=([^\\s]+)", rawtarget)
            if charset:
                encoding = po.encodingToUse(charset.group(1))
        target = multistring(rawtarget.split("\0"), encoding=encoding)
        newunit = mounit(source)
        newunit.settarget(target)
        if context is not None:
            newunit.msgctxt.append(context)
        self.addunit(newunit)
267