1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21 """Manage the Wordfast Translation Memory format
22
23 Wordfast TM format is the Translation Memory format used by the
24 U{Wordfast<http://www.wordfast.net/>} computer aided translation tool.
25
26 It is a bilingual base class derived format with L{WordfastTMFile}
27 and L{WordfastUnit} providing file and unit level access.
28
29 Wordfast tools
30 ==============
31 Wordfast is a computer aided translation tool. It is an application
32 built on top of Microsoft Word and is implemented as a rather
33 sophisticated set of macros. Understanding that helps us understand
34 many of the seemingly strange choices around this format including:
35 encoding, escaping and file naming.
36
37 Implementation
38 ==============
39 The implementation covers the full requirements of a Wordfast TM file.
40 The files are simple Tab Separated Value (TSV) files that can be read
41 by Microsoft Excel and other spreadsheet programs. They use the .txt
42 extension which does make it more difficult to automatically identify
43 such files.
44
45 The dialect of the TSV files is specified by L{WordfastDialect}.
46
47 Encoding
48 --------
49 The files are UTF-16 or ISO-8859-1 (Latin1) encoded. These choices
50 are most likely because Microsoft Word is the base editing tool for
51 Wordfast.
52
53 The format is tab separated so we are able to detect UTF-16 vs Latin-1
54 by searching for the occurrence of a UTF-16 tab character and then
55 continuing with the parsing.
56
57 Timestamps
58 ----------
59 L{WordfastTime} allows for the correct management of the Wordfast
60 YYYYMMDD~HHMMSS timestamps. However, timestamps on individual units are
61 not updated when edited.
62
63 Header
64 ------
65 L{WordfastHeader} provides header management support. The header
66 functionality is fully implemented through observing the behaviour of the
67 files in real use cases, input from the Wordfast programmers and
68 public documentation.
69
70 Escaping
71 --------
72 Wordfast TM implements a form of escaping that covers two aspects:
73 1. Placeable: bold, formatting, etc. These are left as is and ignored.
74 It is up to the editor and future placeable implementation to manage
75 these.
76 2. Escapes: items that may confuse Excel or translators are
77 escaped as &'XX;. These are fully implemented and are converted to
78 and from Unicode. By observing behaviour and reading documentation
79 we were able to observe all possible escapes. Unfortunately the
80 escaping differs slightly between the Windows and Mac versions. This
81 might cause errors in future.
82 Functions allow for L{conversion to Unicode<_wf_to_char>} and L{back to
83 Wordfast escapes<_char_to_wf>}.
84
85 Extended Attributes
86 -------------------
87 The last 4 columns allow users to define and manage extended attributes.
88 These are left as is and are not directly managed by our implementation.
89 """
90
91 import csv
92 import sys
93 import time
94 from translate.storage import base
95
WF_TIMEFORMAT = "%Y%m%d~%H%M%S"
"""Time format used by Wordfast"""

# Column order of the first (header) line of a Wordfast TM file.
WF_FIELDNAMES_HEADER = [
    "date", "userlist", "tucount", "src-lang", "version",
    "target-lang", "license", "attr1list", "attr2list",
    "attr3list", "attr4list", "attr5list",
]
"""Field names for the Wordfast header"""

# Column order of every translation unit line that follows the header.
WF_FIELDNAMES = [
    "date", "user", "reuse", "src-lang", "source", "target-lang",
    "target", "attr1", "attr2", "attr3", "attr4",
]
"""Field names for a Wordfast TU"""

# One default per entry of WF_FIELDNAMES_HEADER.  "attr5list" is listed
# there, so it gets an (empty) default as well; this keeps the two key sets
# in step instead of relying on the csv writer to fill in the missing column.
WF_FIELDNAMES_HEADER_DEFAULTS = {
    "date": "%19000101~121212",
    "userlist": "%User ID,TT,TT Translate-Toolkit",
    "tucount": "%TU=00000001",
    "src-lang": "%EN-US",
    "version": "%Wordfast TM v.5.51w9/00",
    "target-lang": "",
    "license": "%---00000001",
    "attr1list": "",
    "attr2list": "",
    "attr3list": "",
    "attr4list": "",
    "attr5list": "",
}
"""Default or minimum header entries for a Wordfast file"""
121
122
123
124
125
# Each entry pairs a Wordfast escape "&'XX;" with the Unicode character it
# stands for.  XX is a two-digit hexadecimal byte value.
WF_ESCAPE_MAP = (
    # The ampersand must stay first: the char -> escape pass applies the
    # entries in order, and every escape it inserts itself contains "&".
    ("&'26;", u"\u0026"),  # AMPERSAND

    # XX is the character's byte value in Windows-1252
    # (presumably the escapes written by the Windows version -- confirm).
    ("&'82;", u"\u201A"),  # SINGLE LOW-9 QUOTATION MARK
    ("&'85;", u"\u2026"),  # HORIZONTAL ELLIPSIS
    ("&'91;", u"\u2018"),  # LEFT SINGLE QUOTATION MARK
    ("&'92;", u"\u2019"),  # RIGHT SINGLE QUOTATION MARK
    ("&'93;", u"\u201C"),  # LEFT DOUBLE QUOTATION MARK
    ("&'94;", u"\u201D"),  # RIGHT DOUBLE QUOTATION MARK
    ("&'96;", u"\u2013"),  # EN DASH
    ("&'97;", u"\u2014"),  # EM DASH
    ("&'99;", u"\u2122"),  # TRADE MARK SIGN

    # Still Windows-1252 byte values; in this range they coincide with
    # ISO-8859-1.
    ("&'A0;", u"\u00A0"),  # NO-BREAK SPACE
    ("&'A9;", u"\u00A9"),  # COPYRIGHT SIGN
    ("&'AE;", u"\u00AE"),  # REGISTERED SIGN
    ("&'BC;", u"\u00BC"),  # VULGAR FRACTION ONE QUARTER
    ("&'BD;", u"\u00BD"),  # VULGAR FRACTION ONE HALF
    ("&'BE;", u"\u00BE"),  # VULGAR FRACTION THREE QUARTERS

    # XX is the character's byte value in Mac Roman
    # (presumably the escapes written by the Mac version -- confirm).
    # Most of these characters already appear above, so when converting
    # characters to escapes the earlier entry is the one that gets used.
    ("&'A8;", u"\u00AE"),  # REGISTERED SIGN
    ("&'AA;", u"\u2122"),  # TRADE MARK SIGN
    ("&'C7;", u"\u00AB"),  # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
    ("&'C8;", u"\u00BB"),  # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
    ("&'C9;", u"\u2026"),  # HORIZONTAL ELLIPSIS
    ("&'CA;", u"\u00A0"),  # NO-BREAK SPACE
    ("&'D0;", u"\u2013"),  # EN DASH
    ("&'D1;", u"\u2014"),  # EM DASH
    ("&'D2;", u"\u201C"),  # LEFT DOUBLE QUOTATION MARK
    ("&'D3;", u"\u201D"),  # RIGHT DOUBLE QUOTATION MARK
    ("&'D4;", u"\u2018"),  # LEFT SINGLE QUOTATION MARK
    ("&'D5;", u"\u2019"),  # RIGHT SINGLE QUOTATION MARK
    ("&'E2;", u"\u201A"),  # SINGLE LOW-9 QUOTATION MARK
    ("&'E3;", u"\u201E"),  # DOUBLE LOW-9 QUOTATION MARK
)
"""Mapping of Wordfast &'XX; escapes to correct Unicode characters"""

# NUL followed by TAB is U+0009 in big-endian UTF-16.  The same byte pair
# also occurs in little-endian text whenever the character before the tab is
# below U+0100, because that character's high byte is NUL.
TAB_UTF16 = "\x00\x09"
"""The tab \\t character as it would appear in UTF-16 encoding"""
169
170
172 """Char -> Wordfast &'XX; escapes
173
174 Full roundtripping is not possible because of the escaping of
175 NEWLINE \\n and TAB \\t"""
176
177
178 if string:
179 for code, char in WF_ESCAPE_MAP:
180 string = string.replace(char.encode('utf-8'), code)
181 string = string.replace("\n", "\\n").replace("\t", "\\t")
182 return string
183
184
192
193
208 csv.register_dialect("wordfast", WordfastDialect)
209
210
212 """Manages time stamps in the Wordfast format of YYYYMMDD~hhmmss"""
213
215 self._time = None
216 if not newtime:
217 self.time = None
218 elif isinstance(newtime, basestring):
219 self.timestring = newtime
220 elif isinstance(newtime, time.struct_time):
221 self.time = newtime
222
224 """Get the time in the Wordfast time format"""
225 if not self._time:
226 return None
227 else:
228 return time.strftime(WF_TIMEFORMAT, self._time)
229
231 """Set the time_struct object using a Wordfast time formatted string
232
233 @param timestring: A Wordfast time string (YYYYMMDD~hhmmss)
234 @type timestring: String
235 """
236 self._time = time.strptime(timestring, WF_TIMEFORMAT)
237 timestring = property(get_timestring, set_timestring)
238
240 """Get the time_struct object"""
241 return self._time
242
244 """Set the time_struct object
245
246 @param newtime: a new time object
247 @type newtime: time.time_struct
248 """
249 if newtime and isinstance(newtime, time.struct_time):
250 self._time = newtime
251 else:
252 self._time = None
253 time = property(get_time, set_time)
254
260
261
263 """A wordfast translation memory header"""
264
271
278
280 """Get the header dictionary"""
281 return self._header_dict
282
284 self._header_dict = newheader
285 header = property(getheader, setheader)
286
288 self._header_dict['target-lang'] = '%%%s' % newlang
289 targetlang = property(None, settargetlang)
290
292 self._header_dict['tucount'] = '%%TU=%08d' % count
293 tucount = property(None, settucount)
294
295
297 """A Wordfast translation memory unit"""
298
304
308
310 """Get the dictionary of values for a Wordfast line"""
311 return self._dict
312
314 """Set the dictionary of values for a Wordfast line
315
316 @param newdict: a new dictionary with Wordfast line elements
317 @type newdict: Dict
318 """
319
320 self._dict = newdict
321 dict = property(getdict, setdict)
322
324 if self._dict.get(key, None) is None:
325 return None
326 elif self._dict[key]:
327 return _wf_to_char(self._dict[key]).decode('utf-8')
328 else:
329 return ""
330
332 if newvalue is None:
333 self._dict[key] = None
334 if isinstance(newvalue, unicode):
335 newvalue = newvalue.encode('utf-8')
336 newvalue = _char_to_wf(newvalue)
337 if not key in self._dict or newvalue != self._dict[key]:
338 self._dict[key] = newvalue
339 self._update_timestamp()
340
343
347 source = property(getsource, setsource)
348
351
355 target = property(gettarget, settarget)
356
358 self._dict['target-lang'] = newlang
359 targetlang = property(None, settargetlang)
360
362 return str(self._dict)
363
365 if not self._dict.get('source', None):
366 return False
367 return bool(self._dict.get('target', None))
368
369
371 """A Wordfast translation memory file"""
372 Name = _("Wordfast Translation Memory")
373 Mimetypes = ["application/x-wordfast"]
374 Extensions = ["txt"]
375
377 """construct a Wordfast TM, optionally reading in from inputfile."""
378 self.UnitClass = unitclass
379 base.TranslationStore.__init__(self, unitclass=unitclass)
380 self.filename = ''
381 self.header = WordfastHeader()
382 self._encoding = 'iso-8859-1'
383 if inputfile is not None:
384 self.parse(inputfile)
385
387 """parses the given file or file source string"""
388 if hasattr(input, 'name'):
389 self.filename = input.name
390 elif not getattr(self, 'filename', ''):
391 self.filename = ''
392 if hasattr(input, "read"):
393 tmsrc = input.read()
394 input.close()
395 input = tmsrc
396 if TAB_UTF16 in input.split("\n")[0]:
397 self._encoding = 'utf-16'
398 else:
399 self._encoding = 'iso-8859-1'
400 try:
401 input = input.decode(self._encoding).encode('utf-8')
402 except:
403 raise ValueError("Wordfast files are either UTF-16 (UCS2) or ISO-8859-1 encoded")
404 for header in csv.DictReader(input.split("\n")[:1],
405 fieldnames=WF_FIELDNAMES_HEADER,
406 dialect="wordfast"):
407 self.header = WordfastHeader(header)
408 lines = csv.DictReader(input.split("\n")[1:],
409 fieldnames=WF_FIELDNAMES,
410 dialect="wordfast")
411 for line in lines:
412 newunit = WordfastUnit()
413 newunit.dict = line
414 self.addunit(newunit)
415
417 output = csv.StringIO()
418 header_output = csv.StringIO()
419 writer = csv.DictWriter(output, fieldnames=WF_FIELDNAMES,
420 dialect="wordfast")
421 unit_count = 0
422 for unit in self.units:
423 if unit.istranslated():
424 unit_count += 1
425 writer.writerow(unit.dict)
426 if unit_count == 0:
427 return ""
428 output.reset()
429 self.header.tucount = unit_count
430 outheader = csv.DictWriter(header_output,
431 fieldnames=WF_FIELDNAMES_HEADER,
432 dialect="wordfast")
433 outheader.writerow(self.header.header)
434 header_output.reset()
435 decoded = "".join(header_output.readlines() + output.readlines()).decode('utf-8')
436 try:
437 return decoded.encode(self._encoding)
438 except UnicodeEncodeError:
439 return decoded.encode('utf-16')
440