1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
"""Module for parsing Gettext .mo files for translation.

The coding of .mo files was produced from U{Gettext documentation
<http://www.gnu.org/software/gettext/manual/gettext.html#MO-Files>},
Python's msgfmt.py and by observing and testing existing .mo files in the wild.

The hash algorithm is implemented for MO files; this should result in
faster access of the MO file. The hash is optional for Gettext
and is not needed for reading or writing MO files. In this implementation
it is always on, and for very small files it sometimes produces results
that differ from those of Gettext.
"""
43
44 import array
45 import re
46 import struct
47
48 from translate.misc.multistring import multistring
49 from translate.storage import base
50 from translate.storage import po
51 from translate.storage import poheader
52
# Magic number stored in the first four bytes of a Gettext MO file.  The
# reader compares it against the file in both byte orders to detect the
# file's endianness.  Written without the Python 2 "L" suffix: the value is
# identical, and the literal stays valid on interpreters without long literals.
MO_MAGIC_NUMBER = 0x950412de
54
55
# NOTE(review): the "def" line was missing from the listing this block was
# recovered from; the name and default argument are restored -- confirm.
def mounpack(filename='messages.mo'):
    """Helper to unpack Gettext MO files into a Python string.

    Reads *filename* and prints its content as a single line of
    backslash-x two-digit hex escapes, one escape per byte, so that the
    output can be pasted into a Python string literal.
    """
    # MO files are binary data, so read them in binary mode; the "with"
    # block closes the file even when reading fails.
    with open(filename, 'rb') as f:
        # bytearray yields integers on both Python 2 and Python 3.
        data = bytearray(f.read())
    print("\\x%02x" * len(data) % tuple(data))
62
63
# NOTE(review): the "def" line was missing from the listing this block was
# recovered from; the function name is restored -- confirm.
def my_swap4(result):
    """Return *result* with its four low-order bytes in reverse order.

    Only bits 0-31 of *result* are used; any higher bits are discarded.
    """
    c0 = (result >> 0) & 0xff   # least significant byte
    c1 = (result >> 8) & 0xff
    c2 = (result >> 16) & 0xff
    c3 = (result >> 24) & 0xff  # most significant byte

    return (c0 << 24) | (c1 << 16) | (c2 << 8) | c3
71
72
def hashpjw(str_param):
    """Compute the hash value of *str_param* used for the MO hash table.

    For every character the running value is shifted left by four bits and
    the character code is added.  Whenever any of bits 28-31 end up set,
    they are folded back into the low bits and then cleared.

    :param str_param: a string; byte strings (which iterate as integers
        on Python 3) and character strings are both accepted.
    :return: the hash value as a non-negative integer.
    """
    HASHWORDBITS = 32
    top_nibble = 0xf << (HASHWORDBITS - 4)
    hval = 0
    for char in str_param:
        # Iterating a Python 3 bytes object yields ints; everything else
        # yields one-character strings that need ord().
        code = char if isinstance(char, int) else ord(char)
        hval = (hval << 4) + code
        g = hval & top_nibble
        if g != 0:
            hval ^= g >> (HASHWORDBITS - 8)
            hval ^= g
    return hval
86
87
def get_next_prime_number(start):
    """Return the smallest prime number that is greater than or equal to
    *start*.

    :param start: integer at which the search begins.
    :return: the first prime ``p`` with ``p >= start`` (2 for any start
        below 2).
    """

    def is_prime(num):
        """Trial division by 2 and by odd numbers up to sqrt(num)."""
        if num < 2:
            return False
        if num < 4:
            # 2 and 3
            return True
        if num % 2 == 0:
            return False
        divider = 3
        # A composite number always has a divisor no larger than its
        # square root, so there is no need to test beyond that.
        while divider * divider <= num:
            if num % divider == 0:
                return False
            divider += 2
        return True

    candidate = start
    while not is_prime(candidate):
        candidate += 1
    return candidate
107
108
109 -class mounit(base.TranslationUnit):
110 """A class representing a .mo translation message."""
111
112 - def __init__(self, source=None, encoding=None):
117
def getcontext(self):
    """Get the message context (method of ``mounit``).

    :return: ``None`` when ``self.msgctxt`` is ``None``; otherwise the
        ``msgctxt`` fragments joined into a single string.
    """
    if self.msgctxt is None:
        return None
    return "".join(self.msgctxt)
124
# NOTE(review): the "def" line was missing from the listing this block was
# recovered from; the method name is restored -- confirm.
def isheader(self):
    """Is this a header entry?  (method of ``mounit``)

    :return: ``True`` when the unit's source is the empty string.
    """
    return self.source == u""
128
# NOTE(review): the "def" line was missing from the listing this block was
# recovered from; the method name is restored -- confirm.
def istranslatable(self):
    """Is this message translatable?  (method of ``mounit``)

    :return: ``True`` when the unit has a non-empty source.
    """
    return bool(self.source)
132
133
class mofile(poheader.poheader, base.TranslationStore):
    """A class representing a .mo file."""
    UnitClass = mounit
    Name = _("Gettext MO file")
    Mimetypes = ["application/x-gettext-catalog", "application/x-mo"]
    Extensions = ["mo", "gmo"]
    _binary = True

    # NOTE(review): the "def" lines of __init__, __str__ and parsestring were
    # missing from the listing this class was recovered from.  The signatures
    # are restored from the names used in the bodies -- confirm the defaults.
    def __init__(self, inputfile=None, unitclass=mounit):
        """Create the store and, when *inputfile* is given, parse it.

        :param inputfile: file object or MO data handed to
            :meth:`parsestring`; nothing is parsed when it is ``None``.
        :param unitclass: unit class stored as ``self.UnitClass`` and passed
            on to ``base.TranslationStore.__init__``.
        """
        self.UnitClass = unitclass
        base.TranslationStore.__init__(self, unitclass=unitclass)
        self.filename = ''
        # Used for decoding until a header unit announces another charset.
        self._encoding = "UTF-8"
        if inputfile is not None:
            self.parsestring(inputfile)

    def __str__(self):
        """Output a string representation of the MO data file.

        The data consists of a header of seven 32-bit words followed, when
        there is at least one message, by the key/value offset tables, the
        hash table, the NUL-terminated keys and the NUL-terminated values.
        Units for which ``istranslated()`` is false, or whose target is
        empty, are left out.
        """

        def add_to_hash_table(string, i):
            """Store entry number ``i + 1`` in the first free slot for *string*."""
            V = hashpjw(string)
            S = 3 if hash_size <= 2 else hash_size
            hash_cursor = V % S
            orig_hash_cursor = hash_cursor
            # Step width for probing when the slot is already taken.
            increment = 1 + (V % (S - 2))
            while True:
                if hash_table[hash_cursor] == 0:
                    # 0 marks a free slot, hence entries are stored 1-based.
                    hash_table[hash_cursor] = i + 1
                    break
                hash_cursor = (hash_cursor + increment) % S
                # Coming back to the start would mean the table is full.
                assert (hash_cursor != orig_hash_cursor)

        hash_size = get_next_prime_number(int((len(self.units) * 4) / 3))
        if hash_size <= 2:
            hash_size = 3

        # Map of encoded key (context + EOT + msgid/plural) -> target.
        MESSAGES = {}
        for unit in self.units:
            if not unit.istranslated():
                continue
            if isinstance(unit.source, multistring):
                source = "".join(unit.msgidcomments) + \
                         "\0".join(unit.source.strings)
            else:
                source = "".join(unit.msgidcomments) + unit.source
            if unit.msgctxt:
                source = "".join(unit.msgctxt) + "\x04" + source
            if isinstance(unit.target, multistring):
                target = "\0".join(unit.target.strings)
            else:
                target = unit.target
            if unit.target:
                MESSAGES[source.encode("utf-8")] = target

        hash_table = array.array("I", [0] * hash_size)
        # The keys are written in sorted order.
        keys = sorted(MESSAGES)

        # For each string we need its size and offset.  Each string is NUL
        # terminated; the NUL does not count into the size.  The pieces are
        # collected in lists and joined once instead of growing a string.
        offsets = []
        id_parts = []
        str_parts = []
        ids_length = 0
        strs_length = 0
        for i, key in enumerate(keys):
            add_to_hash_table(key, i)
            string = MESSAGES[key]
            if isinstance(string, unicode):
                string = string.encode('utf-8')
            offsets.append((ids_length, len(key), strs_length, len(string)))
            id_parts.append(key)
            id_parts.append('\0')
            str_parts.append(string)
            str_parts.append('\0')
            ids_length += len(key) + 1
            strs_length += len(string) + 1
        ids = ''.join(id_parts)
        strs = ''.join(str_parts)

        # Header (7 words), two index tables (2 words per entry each) and the
        # hash table come first; the keys follow, then the values.
        keystart = 7 * 4 + 16 * len(keys) + hash_size * 4
        valuestart = keystart + len(ids)

        # Each index entry holds the string length first, then its offset
        # from the start of the file.
        koffsets = []
        voffsets = []
        for o1, l1, o2, l2 in offsets:
            koffsets.extend((l1, o1 + keystart))
            voffsets.extend((l2, o2 + valuestart))
        offsets = koffsets + voffsets

        # No byte-order prefix: struct packs in the machine's native order.
        output = struct.pack("Iiiiiii",
                             MO_MAGIC_NUMBER,               # magic
                             0,                             # revision
                             len(keys),                     # number of entries
                             7 * 4,                         # start of key index
                             7 * 4 + len(keys) * 8,         # start of value index
                             hash_size,                     # size of hash table
                             7 * 4 + 2 * (len(keys) * 8))   # offset of hash table

        # Nothing beyond the header is written for an empty catalogue.
        if (len(keys) > 0):
            output = output + array.array("i", offsets).tostring()
            output = output + hash_table.tostring()
            output = output + ids
            output = output + strs
        return output

    def parsestring(self, input):
        """Parse the given file or file source string.

        :param input: MO data, or an object with a ``read()`` method that
            returns it.  Such an object is read completely and then closed.
            When it has a ``name`` attribute, that becomes ``self.filename``.
        :raises ValueError: if the data does not start with the MO magic
            number in either byte order.
        :raises base.ParseError: if the major revision of the file is 1 or
            higher.
        """
        if hasattr(input, 'name'):
            self.filename = input.name
        elif not getattr(self, 'filename', ''):
            self.filename = ''
        if hasattr(input, "read"):
            mosrc = input.read()
            input.close()
            input = mosrc

        # The magic number tells us the byte order the file was written in.
        little, = struct.unpack("<L", input[:4])
        big, = struct.unpack(">L", input[:4])
        if little == MO_MAGIC_NUMBER:
            endian = "<"
        elif big == MO_MAGIC_NUMBER:
            endian = ">"
        else:
            raise ValueError("This is not an MO file")

        # The revision is ONE 32-bit word: major number in the upper 16 bits,
        # minor number in the lower 16 bits.  Reading it as two 16-bit fields
        # would swap major and minor for little-endian files.
        magic, revision, lenkeys, startkey, startvalue, sizehash, \
        offsethash = struct.unpack("%sLLiiiii" % endian, input[:(7 * 4)])
        version_maj = revision >> 16
        version_min = revision & 0xffff
        if version_maj >= 1:
            raise base.ParseError("""Unable to process version %d.%d MO files""" % (version_maj, version_min))

        # Every index entry is a (length, offset) pair of 32-bit integers.
        entryformat = "%sii" % endian
        for i in range(lenkeys):
            nextkey = startkey + (i * 2 * 4)
            nextvalue = startvalue + (i * 2 * 4)
            klength, koffset = struct.unpack(entryformat,
                                             input[nextkey:nextkey + (2 * 4)])
            vlength, voffset = struct.unpack(entryformat,
                                             input[nextvalue:nextvalue + (2 * 4)])
            source = input[koffset:koffset + klength]
            rawtarget = input[voffset:voffset + vlength]

            context = None
            if "\x04" in source:
                # Only the first EOT separates context from msgid; limiting
                # the split keeps a later EOT byte from breaking the unpacking.
                context, source = source.split("\x04", 1)

            source = multistring(source.split("\0"), encoding=self._encoding)
            if source == "":
                # The entry with the empty msgid is the header; take the
                # charset from it for decoding this and all later targets.
                charset = re.search("charset=([^\\s]+)", rawtarget)
                if charset:
                    self._encoding = po.encodingToUse(charset.group(1))
            target = multistring(rawtarget.split("\0"),
                                 encoding=self._encoding)
            newunit = mounit(source)
            newunit.settarget(target)
            if context is not None:
                newunit.msgctxt.append(context)
            self.addunit(newunit)
289