1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 """Manage the Wordfast Translation Memory format
23
24 Wordfast TM format is the Translation Memory format used by the
25 U{Wordfast<http://www.wordfast.net/>} computer aided translation tool.
26
27 It is a bilingual base class derived format with L{WordfastTMFile}
28 and L{WordfastUnit} providing file and unit level access.
29
30 Wordfast tools
31 ==============
32 Wordfast is a computer aided translation tool. It is an application
33 built on top of Microsoft Word and is implemented as a rather
34 sophisticated set of macros. Understanding that helps us understand
35 many of the seemingly strange choices around this format including:
36 encoding, escaping and file naming.
37
38 Implementation
39 ==============
40 The implementation covers the full requirements of a Wordfast TM file.
41 The files are simple Tab Separated Value (TSV) files that can be read
42 by Microsoft Excel and other spreadsheet programs. They use the .txt
43 extension which does make it more difficult to automatically identify
44 such files.
45
46 The dialect of the TSV files is specified by L{WordfastDialect}.
47
48 Encoding
49 --------
50 The files are UTF-16 or ISO-8859-1 (Latin1) encoded. These choices
51 are most likely because Microsoft Word is the base editing tool for
52 Wordfast.
53
54 The format is tab separated so we are able to detect UTF-16 vs Latin-1
55 by searching for the occurrence of a UTF-16 tab character and then
56 continuing with the parsing.
57
58 Timestamps
59 ----------
60 L{WordfastTime} allows for the correct management of the Wordfast
61 YYYYMMDD~HHMMSS timestamps. However, timestamps on individual units are
62 not updated when edited.
63
64 Header
65 ------
66 L{WordfastHeader} provides header management support. The header
67 functionality is fully implemented through observing the behaviour of the
68 files in real use cases, input from the Wordfast programmers and
69 public documentation.
70
71 Escaping
72 --------
73 Wordfast TM implements a form of escaping that covers two aspects:
74 1. Placeable: bold, formatting, etc. These are left as is and ignored.
75 It is up to the editor and future placeable implementation to manage
76 these.
77 2. Escapes: items that may confuse Excel or translators are
78 escaped as &'XX;. These are fully implemented and are converted to
79 and from Unicode. By observing behaviour and reading documentation
80 we were able to observe all possible escapes. Unfortunately the
81 escaping differs slightly between the Windows and Mac versions. This
82 might cause errors in future.
83 Functions allow for L{conversion to Unicode<_wf_to_char>} and L{back to
84 Wordfast escapes<_char_to_wf>}.
85
86 Extended Attributes
87 -------------------
88 The last 4 columns allow users to define and manage extended attributes.
89 These are left as is and are not directly managed by our implementation.
90 """
91
92 import csv
93 import sys
94 import time
95 from translate.storage import base
96
# strftime/strptime pattern for Wordfast timestamps: YYYYMMDD~HHMMSS.
WF_TIMEFORMAT = "%Y%m%d~%H%M%S"
"""Time format used by Wordfast"""

# Column order of the first (header) row of a Wordfast TM file.
WF_FIELDNAMES_HEADER = [
    "date",
    "userlist",
    "tucount",
    "src-lang",
    "version",
    "target-lang",
    "license",
    "attr1list",
    "attr2list",
    "attr3list",
    "attr4list",
    "attr5list",
]
"""Field names for the Wordfast header"""

# Column order of every translation unit (TU) row after the header.
WF_FIELDNAMES = [
    "date",
    "user",
    "reuse",
    "src-lang",
    "source",
    "target-lang",
    "target",
    "attr1",
    "attr2",
    "attr3",
    "attr4",
]
"""Field names for a Wordfast TU"""

# One default per entry of WF_FIELDNAMES_HEADER.  Non-empty header values
# carry a leading "%".  "attr5list" is listed explicitly so that the defaults
# cover every declared header column; the empty string is the same value the
# csv writer would substitute for a missing key, so written files do not
# change.
WF_FIELDNAMES_HEADER_DEFAULTS = {
    "date": "%19000101~121212",
    "userlist": "%User ID,TT,TT Translate-Toolkit",
    "tucount": "%TU=00000001",
    "src-lang": "%EN-US",
    "version": "%Wordfast TM v.5.51w9/00",
    "target-lang": "",
    "license": "%---00000001",
    "attr1list": "",
    "attr2list": "",
    "attr3list": "",
    "attr4list": "",
    "attr5list": "",
}
"""Default or minimum header entries for a Wordfast file"""
119
120
121
122
123
# Ordered (escape, character) pairs.  Order matters: the ampersand entry must
# stay first, because every escape itself starts with "&" and the
# char -> escape conversion applies the pairs in sequence.  Several
# characters appear twice (once per group below); when converting
# char -> escape the first matching pair is the one that takes effect.
WF_ESCAPE_MAP = (
    # Codes in the first two groups match Windows-1252 byte values.
    ("&'26;", u"\u0026"),  # ampersand
    ("&'82;", u"\u201A"),  # single low-9 quotation mark
    ("&'85;", u"\u2026"),  # horizontal ellipsis
    ("&'91;", u"\u2018"),  # left single quotation mark
    ("&'92;", u"\u2019"),  # right single quotation mark
    ("&'93;", u"\u201C"),  # left double quotation mark
    ("&'94;", u"\u201D"),  # right double quotation mark
    ("&'96;", u"\u2013"),  # en dash
    ("&'97;", u"\u2014"),  # em dash
    ("&'99;", u"\u2122"),  # trade mark sign

    ("&'A0;", u"\u00A0"),  # no-break space
    ("&'A9;", u"\u00A9"),  # copyright sign
    ("&'AE;", u"\u00AE"),  # registered sign
    ("&'BC;", u"\u00BC"),  # vulgar fraction one quarter
    ("&'BD;", u"\u00BD"),  # vulgar fraction one half
    ("&'BE;", u"\u00BE"),  # vulgar fraction three quarters

    # Codes in this group match Mac OS Roman byte values -- presumably the
    # escapes written by the Mac version of Wordfast; verify before relying
    # on that.
    ("&'A8;", u"\u00AE"),  # registered sign
    ("&'AA;", u"\u2122"),  # trade mark sign
    ("&'C7;", u"\u00AB"),  # left-pointing double angle quotation mark
    ("&'C8;", u"\u00BB"),  # right-pointing double angle quotation mark
    ("&'C9;", u"\u2026"),  # horizontal ellipsis
    ("&'CA;", u"\u00A0"),  # no-break space
    ("&'D0;", u"\u2013"),  # en dash
    ("&'D1;", u"\u2014"),  # em dash
    ("&'D2;", u"\u201C"),  # left double quotation mark
    ("&'D3;", u"\u201D"),  # right double quotation mark
    ("&'D4;", u"\u2018"),  # left single quotation mark
    ("&'D5;", u"\u2019"),  # right single quotation mark
    ("&'E2;", u"\u201A"),  # single low-9 quotation mark
    ("&'E3;", u"\u201E"),  # double low-9 quotation mark
)
"""Mapping of Wordfast &'XX; escapes to correct Unicode characters"""
161
# Two-byte sequence 0x00 0x09.  The parser searches the first line of the raw
# input for it: if present the file is decoded as UTF-16, otherwise as
# ISO-8859-1.
TAB_UTF16 = "\x00\x09"
"""The tab \\t character as it would appear in UTF-16 encoding"""
164
166 """Char -> Wordfast &'XX; escapes
167
168 Full roundtripping is not possible because of the escaping of NEWLINE \\n
169 and TAB \\t"""
170
171 if string:
172 for code, char in WF_ESCAPE_MAP:
173 string = string.replace(char.encode('utf-8'), code)
174 string = string.replace("\n", "\\n").replace("\t", "\\t")
175 return string
176
184
198 csv.register_dialect("wordfast", WordfastDialect)
199
201 """Manages time stamps in the Wordfast format of YYYYMMDD~hhmmss"""
203 self._time = None
204 if not newtime:
205 self.time = None
206 elif isinstance(newtime, basestring):
207 self.timestring = newtime
208 elif isinstance(newtime, time.struct_time):
209 self.time = newtime
210
212 """Get the time in the Wordfast time format"""
213 if not self._time:
214 return None
215 else:
216 return time.strftime(WF_TIMEFORMAT, self._time)
217
219 """Set the struct_time object using a Wordfast time formatted string
220
221 @param timestring: A Wordfast time string (YYYYMMDD~hhmmss)
222 @type timestring: String
223 """
224 self._time = time.strptime(timestring, WF_TIMEFORMAT)
225 timestring = property(get_timestring, set_timestring)
226
228 """Get the struct_time object"""
229 return self._time
230
232 """Set the struct_time object
233
234 @param newtime: a new time object
235 @type newtime: time.struct_time
236 """
237 if newtime and isinstance(newtime, time.struct_time):
238 self._time = newtime
239 else:
240 self._time = None
241 time = property(get_time, set_time)
242
248
250 """A wordfast translation memory header"""
257
263
265 """Get the header dictionary"""
266 return self._header_dict
267
269 self._header_dict = newheader
270 header = property(getheader, setheader)
271
273 self._header_dict['target-lang'] = '%%%s' % newlang
274 targetlang = property(None, settargetlang)
275
277 self._header_dict['tucount'] = '%%TU=%08d' % count
278 tucount = property(None, settucount)
279
281 """A Wordfast translation memory unit"""
287
291
293 """Get the dictionary of values for a Wordfast line"""
294 return self._dict
295
297 """Set the dictionary of values for a Wordfast line
298
299 @param newdict: a new dictionary with Wordfast line elements
300 @type newdict: Dict
301 """
302
303 self._dict = newdict
304 dict = property(getdict, setdict)
305
307 if self._dict.get(key, None) is None:
308 return None
309 elif self._dict[key]:
310 return _wf_to_char(self._dict[key]).decode('utf-8')
311 else:
312 return ""
313
315 if newvalue is None:
316 self._dict[key] = None
317 if isinstance(newvalue, unicode):
318 newvalue = newvalue.encode('utf-8')
319 newvalue = _char_to_wf(newvalue)
320 if not key in self._dict or newvalue != self._dict[key]:
321 self._dict[key] = newvalue
322 self._update_timestamp()
323
326
329 source = property(getsource, setsource)
330
333
336 target = property(gettarget, settarget)
337
339 self._dict['target-lang'] = newlang
340 targetlang = property(None, settargetlang)
341
343 return str(self._dict)
344
346 if not self._dict.get('source', None):
347 return False
348 return bool(self._dict.get('target', None))
349
350
352 """A Wordfast translation memory file"""
353 Name = _("Wordfast Translation Memory")
354 Mimetypes = ["application/x-wordfast"]
355 Extensions = ["txt"]
357 """construct a Wordfast TM, optionally reading in from inputfile."""
358 self.UnitClass = unitclass
359 base.TranslationStore.__init__(self, unitclass=unitclass)
360 self.filename = ''
361 self.header = WordfastHeader()
362 self._encoding = 'iso-8859-1'
363 if inputfile is not None:
364 self.parse(inputfile)
365
367 """parses the given file or file source string"""
368 if hasattr(input, 'name'):
369 self.filename = input.name
370 elif not getattr(self, 'filename', ''):
371 self.filename = ''
372 if hasattr(input, "read"):
373 tmsrc = input.read()
374 input.close()
375 input = tmsrc
376 if TAB_UTF16 in input.split("\n")[0]:
377 self._encoding = 'utf-16'
378 else:
379 self._encoding = 'iso-8859-1'
380 try:
381 input = input.decode(self._encoding).encode('utf-8')
382 except:
383 raise ValueError("Wordfast files are either UTF-16 (UCS2) or ISO-8859-1 encoded")
384 for header in csv.DictReader(input.split("\n")[:1], fieldnames=WF_FIELDNAMES_HEADER, dialect="wordfast"):
385 self.header = WordfastHeader(header)
386 lines = csv.DictReader(input.split("\n")[1:], fieldnames=WF_FIELDNAMES, dialect="wordfast")
387 for line in lines:
388 newunit = WordfastUnit()
389 newunit.dict = line
390 self.addunit(newunit)
391
393 output = csv.StringIO()
394 header_output = csv.StringIO()
395 writer = csv.DictWriter(output, fieldnames=WF_FIELDNAMES, dialect="wordfast")
396 unit_count = 0
397 for unit in self.units:
398 if unit.istranslated():
399 unit_count += 1
400 writer.writerow(unit.dict)
401 if unit_count == 0:
402 return ""
403 output.reset()
404 self.header.tucount = unit_count
405 outheader = csv.DictWriter(header_output, fieldnames=WF_FIELDNAMES_HEADER, dialect="wordfast")
406 outheader.writerow(self.header.header)
407 header_output.reset()
408 decoded = "".join(header_output.readlines() + output.readlines()).decode('utf-8')
409 try:
410 return decoded.encode(self._encoding)
411 except UnicodeEncodeError:
412 return decoded.encode('utf-16')
413