Package translate :: Package storage :: Module wordfast
[hide private]
[frames] | no frames]

Source Code for Module translate.storage.wordfast

  1  #!/usr/bin/env python 
  2  # -*- coding: utf-8 -*- 
  3  # 
  4  # Copyright 2007-2010 Zuza Software Foundation 
  5  # 
  6  # This file is part of the Translate Toolkit. 
  7  # 
  8  # This program is free software; you can redistribute it and/or modify 
  9  # it under the terms of the GNU General Public License as published by 
 10  # the Free Software Foundation; either version 2 of the License, or 
 11  # (at your option) any later version. 
 12  # 
 13  # This program is distributed in the hope that it will be useful, 
 14  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 15  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 16  # GNU General Public License for more details. 
 17  # 
 18  # You should have received a copy of the GNU General Public License 
 19  # along with this program; if not, see <http://www.gnu.org/licenses/>. 
 20   
 21  """Manage the Wordfast Translation Memory format 
 22   
 23     Wordfast TM format is the Translation Memory format used by the 
 24     U{Wordfast<http://www.wordfast.net/>} computer aided translation tool. 
 25   
 26     It is a bilingual base class derived format with L{WordfastTMFile} 
 27     and L{WordfastUnit} providing file and unit level access. 
 28   
 29     Wordfast tools 
 30     ============== 
 31     Wordfast is a computer aided translation tool.  It is an application 
 32     built on top of Microsoft Word and is implemented as a rather 
 33     sophisticated set of macros.  Understanding that helps us understand 
 34     many of the seemingly strange choices around this format including: 
 35     encoding, escaping and file naming. 
 36   
 37     Implementation 
 38     ============== 
 39     The implementation covers the full requirements of a Wordfast TM file. 
 40     The files are simple Tab Separated Value (TSV) files that can be read 
 41     by Microsoft Excel and other spreadsheet programs.  They use the .txt 
 42     extension which does make it more difficult to automatically identify 
 43     such files. 
 44   
 45     The dialect of the TSV files is specified by L{WordfastDialect}. 
 46   
 47     Encoding 
 48     -------- 
 49     The files are UTF-16 or ISO-8859-1 (Latin1) encoded.  These choices 
 50     are most likely because Microsoft Word is the base editing tool for 
 51     Wordfast. 
 52   
 53     The format is tab separated so we are able to detect UTF-16 vs Latin-1 
 54     by searching for the occurrence of a UTF-16 tab character and then 
 55     continuing with the parsing. 
 56   
 57     Timestamps 
 58     ---------- 
 59     L{WordfastTime} allows for the correct management of the Wordfast 
 60     YYYYMMDD~HHMMSS timestamps.  The timestamp of an individual unit is 
 61     refreshed when its source or target text is changed through the unit. 
 62   
 63     Header 
 64     ------ 
 65     L{WordfastHeader} provides header management support.  The header 
 66     functionality is fully implemented through observing the behaviour of the 
 67     files in real use cases, input from the Wordfast programmers and 
 68     public documentation. 
 69   
 70     Escaping 
 71     -------- 
 72     Wordfast TM implements a form of escaping that covers two aspects: 
 73       1. Placeable: bold, formatting, etc.  These are left as is and ignored. 
 74          It is up to the editor and future placeable implementation to manage 
 75          these. 
 76       2. Escapes: items that may confuse Excel or translators are 
 77          escaped as &'XX;. These are fully implemented and are converted to 
 78          and from Unicode.  By observing behaviour and reading documentation 
 79          we were able to observe all possible escapes. Unfortunately the 
 80          escaping differs slightly between Windows and Mac version.  This 
 81          might cause errors in future. 
 82     Functions allow for L{conversion to Unicode<_wf_to_char>} and L{back to 
 83     Wordfast escapes<_char_to_wf>}. 
 84   
 85     Extended Attributes 
 86     ------------------- 
 87     The last 4 columns allow users to define and manage extended attributes. 
 88     These are left as is and are not directly managed by our implementation. 
 89  """ 
 90   
 91  import csv 
 92  import sys 
 93  import time 
 94  from translate.storage import base 
 95   
 96  WF_TIMEFORMAT = "%Y%m%d~%H%M%S" 
 97  """Time format used by Wordfast""" 
 98   
 99  WF_FIELDNAMES_HEADER = ["date", "userlist", "tucount", "src-lang", "version", 
100                          "target-lang", "license", "attr1list", "attr2list", 
101                          "attr3list", "attr4list", "attr5list"] 
102  """Field names for the Wordfast header""" 
103   
104  WF_FIELDNAMES = ["date", "user", "reuse", "src-lang", "source", "target-lang", 
105                   "target", "attr1", "attr2", "attr3", "attr4"] 
106  """Field names for a Wordfast TU""" 
107   
108  WF_FIELDNAMES_HEADER_DEFAULTS = { 
109  "date": "%19000101~121212", 
110  "userlist": "%User ID,TT,TT Translate-Toolkit", 
111  "tucount": "%TU=00000001", 
112  "src-lang": "%EN-US", 
113  "version": "%Wordfast TM v.5.51w9/00", 
114  "target-lang": "", 
115  "license": "%---00000001", 
116  "attr1list": "", 
117  "attr2list": "", 
118  "attr3list": "", 
119  "attr4list": ""} 
120  """Default or minimum header entries for a Wordfast file""" 
121   
# TODO Needs validation.  The following need to be checked against a WF TM file
# to ensure that the correct Unicode values have been chosen for the characters.
# For now these look correct and have been taken from Windows CP1252 and
# Macintosh code points found for the respective character sets on Linux.
#
# Each entry is an (escape, character) pair.  The order is significant: the
# escaping code walks this table from top to bottom, so the ampersand has to
# stay first.  Several characters appear twice (once with the Windows code and
# once with the Mac code).
WF_ESCAPE_MAP = (
    ("&'26;", u"\u0026"),  # & - Ampersand (must be first to prevent
                           #     escaping of escapes)
    ("&'82;", u"\u201A"),  # ‚ - Single low-9 quotation mark
    ("&'85;", u"\u2026"),  # … - Ellipsis
    ("&'91;", u"\u2018"),  # ‘ - left single quotation mark
    ("&'92;", u"\u2019"),  # ’ - right single quotation mark
    ("&'93;", u"\u201C"),  # “ - left double quotation mark
    ("&'94;", u"\u201D"),  # ” - right double quotation mark
    ("&'96;", u"\u2013"),  # – - en dash (validate)
    ("&'97;", u"\u2014"),  # — - em dash (validate)
    ("&'99;", u"\u2122"),  # ™ - Trade mark
    # Windows only
    ("&'A0;", u"\u00A0"),  # (NBSP) - Non breaking space
    ("&'A9;", u"\u00A9"),  # © - Copyright
    ("&'AE;", u"\u00AE"),  # ® - Registered
    ("&'BC;", u"\u00BC"),  # ¼
    ("&'BD;", u"\u00BD"),  # ½
    ("&'BE;", u"\u00BE"),  # ¾
    # Mac only
    ("&'A8;", u"\u00AE"),  # ® - Registered
    ("&'AA;", u"\u2122"),  # ™ - Trade mark
    ("&'C7;", u"\u00AB"),  # « - Left-pointing double angle quotation mark
    ("&'C8;", u"\u00BB"),  # » - Right-pointing double angle quotation mark
    ("&'C9;", u"\u2026"),  # … - Horizontal Ellipsis
    ("&'CA;", u"\u00A0"),  # (NBSP) - Non breaking space
    ("&'D0;", u"\u2013"),  # – - en dash (validate)
    ("&'D1;", u"\u2014"),  # — - em dash (validate)
    ("&'D2;", u"\u201C"),  # “ - left double quotation mark
    ("&'D3;", u"\u201D"),  # ” - right double quotation mark
    ("&'D4;", u"\u2018"),  # ‘ - left single quotation mark
    ("&'D5;", u"\u2019"),  # ’ - right single quotation mark
    ("&'E2;", u"\u201A"),  # ‚ - Single low-9 quotation mark
    ("&'E3;", u"\u201E"),  # „ - Double low-9 quotation mark
    # Other markers
    #("&'B;", u"\n"), # Soft-break - XXX creates a problem with
                      # roundtripping could also be represented
                      # by \u2028
)
"""Mapping of Wordfast &'XX; escapes to correct Unicode characters"""

# Byte pair searched for in the first line of a file to decide whether the
# file is UTF-16 encoded.
TAB_UTF16 = "\x00\x09"
"""The tab \\t character as it would appear in UTF-16 encoding"""
169   
170   
def _char_to_wf(string):
    """Replace special characters by their Wordfast &'XX; escapes.

    Every character listed in WF_ESCAPE_MAP is replaced by its escape code
    (the map is walked in order, so the first code listed for a character is
    the one that gets used), after which NEWLINE and TAB are written as the
    two-character sequences \\n and \\t.  Because of that last step a full
    roundtrip is not possible.

    @param string: UTF-8 encoded text; empty values and None are returned
        untouched
    @return: the escaped text
    """
    # FIXME there is no platform check to ensure that we use Mac encodings
    # when running on a Mac
    if not string:
        return string
    for escape, character in WF_ESCAPE_MAP:
        string = string.replace(character.encode('utf-8'), escape)
    string = string.replace("\n", "\\n")
    string = string.replace("\t", "\\t")
    return string
def _wf_to_char(string):
    """Wordfast &'XX; escapes -> Char

    Replaces every escape code listed in WF_ESCAPE_MAP by the UTF-8 encoded
    character it stands for and turns the two-character sequences \\n and
    \\t back into NEWLINE and TAB.

    @param string: UTF-8 encoded text containing Wordfast escapes; empty
        values and None are returned untouched
    @return: the unescaped text
    """
    if string:
        # The ampersand escape is the first entry of WF_ESCAPE_MAP and has to
        # be undone last.  Resolving it first would turn an escaped literal
        # such as "&'26;'85;" into "&'85;", which the following iterations
        # would then wrongly decode as an ellipsis.
        for code, char in reversed(WF_ESCAPE_MAP):
            string = string.replace(code, char.encode('utf-8'))
        string = string.replace("\\n", "\n").replace("\\t", "\t")
    return string
class WordfastDialect(csv.Dialect):
    """CSV dialect of the TAB-delimited files generated by Wordfast.

    Fields are separated by tabs, rows end in CRLF and nothing is quoted.
    """
    lineterminator = "\r\n"
    delimiter = "\t"
    if sys.version_info >= (2, 5, 0):
        quoting = csv.QUOTE_NONE
    else:
        # The csv module of Python < 2.5 needs every property to be defined.
        # Wordfast does not quote anything, but since \t is escaped by
        # _char_to_wf anyway, minimal quoting should not be a problem.
        quoting = csv.QUOTE_MINIMAL
        quotechar = '"'
        escapechar = None
        skipinitialspace = False
        doublequote = False


# Make the dialect available to csv readers and writers as "wordfast".
csv.register_dialect("wordfast", WordfastDialect)
class WordfastTime(object):
    """Wrapper around a time.struct_time that reads and writes the Wordfast
    timestamp format YYYYMMDD~hhmmss"""

    def __init__(self, newtime=None):
        """Create a timestamp.

        @param newtime: a Wordfast time string, a time.struct_time, or a
            false value for "no time set"; any other value leaves the time
            unset
        """
        self._time = None
        if not newtime:
            self.time = None
            return
        if isinstance(newtime, basestring):
            self.timestring = newtime
            return
        if isinstance(newtime, time.struct_time):
            self.time = newtime

    def get_timestring(self):
        """Return the time formatted as a Wordfast time string, or None when
        no time is set"""
        if self._time:
            return time.strftime(WF_TIMEFORMAT, self._time)
        return None

    def set_timestring(self, timestring):
        """Set the time from a Wordfast formatted time string

        @param timestring: A Wordfast time string (YYYYMMDD~hhmmss)
        @type timestring: String
        """
        parsed = time.strptime(timestring, WF_TIMEFORMAT)
        self._time = parsed
    timestring = property(get_timestring, set_timestring)

    def get_time(self):
        """Return the stored time.struct_time (None when unset)"""
        return self._time

    def set_time(self, newtime):
        """Store a new time.struct_time

        Anything that is not a time.struct_time resets the time to None.

        @param newtime: a new time object
        @type newtime: time.struct_time
        """
        usable = newtime and isinstance(newtime, time.struct_time)
        self._time = newtime if usable else None
    time = property(get_time, set_time)

    def __str__(self):
        # An unset time is rendered as the empty string.
        return self.timestring or ""
class WordfastHeader(object):
    """A wordfast translation memory header

    The header is kept as a dictionary whose values carry the leading "%"
    that the setters below put in front of them.
    """

    def __init__(self, header=None):
        """Create a header.

        @param header: a dictionary of header fields; when it is empty or
            None a default header is created.  Values that are neither
            false nor a dict are ignored and leave the header empty.
        """
        # Start from an empty dict (not a list) so that the item assignments
        # done by the setters work even when no usable header was passed in.
        self._header_dict = {}
        if not header:
            self.header = self._create_default_header()
        elif isinstance(header, dict):
            self.header = header

    def _create_default_header(self):
        """Create a default Wordfast header with the date set to the current
        time"""
        # Work on a copy: the module-level defaults must not be modified,
        # otherwise every default header would share (and change) one dict.
        defaultheader = dict(WF_FIELDNAMES_HEADER_DEFAULTS)
        defaultheader['date'] = '%%%s' % WordfastTime(time.localtime()).timestring
        return defaultheader

    def getheader(self):
        """Get the header dictionary"""
        return self._header_dict

    def setheader(self, newheader):
        """Replace the header dictionary

        @param newheader: the new header fields
        @type newheader: Dict
        """
        self._header_dict = newheader
    header = property(getheader, setheader)

    def settargetlang(self, newlang):
        """Store the target language, prefixed with "%", in the header"""
        self._header_dict['target-lang'] = '%%%s' % newlang
    targetlang = property(None, settargetlang)

    def settucount(self, count):
        """Store the unit count in the header as %TU= followed by the
        number zero padded to 8 digits"""
        self._header_dict['tucount'] = '%%TU=%08d' % count
    tucount = property(None, settucount)
class WordfastUnit(base.TranslationUnit):
    """A single translation unit (one row) of a Wordfast translation memory

    The row is held as a dictionary of field name -> value, with the source
    and target texts stored in their escaped Wordfast form.
    """

    def __init__(self, source=None):
        """Create a unit, optionally initialised with a source text"""
        self._dict = {}
        if source:
            self.source = source
        super(WordfastUnit, self).__init__(source)

    def _update_timestamp(self):
        """Set the unit's date field to the current local time"""
        now = WordfastTime(time.localtime())
        self._dict['date'] = now.timestring

    def getdict(self):
        """Return the dictionary holding the fields of this Wordfast line"""
        return self._dict

    def setdict(self, newdict):
        """Replace the dictionary holding the fields of this Wordfast line

        @param newdict: a new dictionary with Wordfast line elements
        @type newdict: Dict
        """
        # TODO First check that the values are OK
        self._dict = newdict
    dict = property(getdict, setdict)

    def _get_source_or_target(self, key):
        """Return the unescaped text of field key as unicode.

        None is returned when the field is missing or None, the empty
        string when it is empty.
        """
        raw = self._dict.get(key, None)
        if raw is None:
            return None
        if not raw:
            return ""
        return _wf_to_char(raw).decode('utf-8')

    def _set_source_or_target(self, key, newvalue):
        """Escape newvalue and store it in field key.

        The timestamp is refreshed only when the field did not exist yet or
        its stored value actually changes.
        """
        if newvalue is None:
            self._dict[key] = None
        if isinstance(newvalue, unicode):
            newvalue = newvalue.encode('utf-8')
        escaped = _char_to_wf(newvalue)
        is_new = key not in self._dict
        if is_new or escaped != self._dict[key]:
            self._dict[key] = escaped
            self._update_timestamp()

    def getsource(self):
        """Return the source text"""
        return self._get_source_or_target('source')

    def setsource(self, newsource):
        """Set the source text, discarding the cached rich source"""
        self._rich_source = None
        return self._set_source_or_target('source', newsource)
    source = property(getsource, setsource)

    def gettarget(self):
        """Return the target text"""
        return self._get_source_or_target('target')

    def settarget(self, newtarget):
        """Set the target text, discarding the cached rich target"""
        self._rich_target = None
        return self._set_source_or_target('target', newtarget)
    target = property(gettarget, settarget)

    def settargetlang(self, newlang):
        """Store the target language of this unit"""
        self._dict['target-lang'] = newlang
    targetlang = property(None, settargetlang)

    def __str__(self):
        return str(self._dict)

    def istranslated(self):
        """Return True only when both source and target are non-empty"""
        has_source = bool(self._dict.get('source', None))
        return has_source and bool(self._dict.get('target', None))
class WordfastTMFile(base.TranslationStore):
    """A Wordfast translation memory file"""
    Name = _("Wordfast Translation Memory")
    Mimetypes = ["application/x-wordfast"]
    Extensions = ["txt"]

    def __init__(self, inputfile=None, unitclass=WordfastUnit):
        """construct a Wordfast TM, optionally reading in from inputfile.

        @param inputfile: a file object or the raw file content to parse
        @param unitclass: the class used to create the units of this store
        """
        self.UnitClass = unitclass
        base.TranslationStore.__init__(self, unitclass=unitclass)
        self.filename = ''
        self.header = WordfastHeader()
        self._encoding = 'iso-8859-1'
        if inputfile is not None:
            self.parse(inputfile)

    def parse(self, input):
        """Parse the given file or file source string.

        The first line is read as the header, every following line becomes
        a unit of this store.

        @param input: a file object (it is read and then closed) or the raw
            content of a Wordfast TM file
        @raise ValueError: when the content cannot be decoded with the
            detected encoding
        """
        if hasattr(input, 'name'):
            self.filename = input.name
        elif not getattr(self, 'filename', ''):
            self.filename = ''
        if hasattr(input, "read"):
            tmsrc = input.read()
            input.close()
            input = tmsrc
        # The header line always contains tabs, so a UTF-16 encoded tab in
        # the first line tells the two supported encodings apart.
        if TAB_UTF16 in input.split("\n")[0]:
            self._encoding = 'utf-16'
        else:
            self._encoding = 'iso-8859-1'
        try:
            input = input.decode(self._encoding).encode('utf-8')
        except UnicodeError:
            # Only translate real encoding problems; a bare except would
            # also swallow KeyboardInterrupt and SystemExit.
            raise ValueError("Wordfast files are either UTF-16 (UCS2) or ISO-8859-1 encoded")
        lines = input.split("\n")
        for header in csv.DictReader(lines[:1],
                                     fieldnames=WF_FIELDNAMES_HEADER,
                                     dialect="wordfast"):
            self.header = WordfastHeader(header)
        for line in csv.DictReader(lines[1:],
                                   fieldnames=WF_FIELDNAMES,
                                   dialect="wordfast"):
            # Honour the unit class this store was constructed with.
            newunit = self.UnitClass()
            newunit.dict = line
            self.addunit(newunit)

    def __str__(self):
        """Serialise the store: the header line followed by one line per
        translated unit.

        Untranslated units are left out; when nothing is translated the
        empty string is returned.  The result is encoded in the encoding of
        the store, falling back to UTF-16 when a character cannot be
        represented in it.
        """
        # NOTE(review): csv.StringIO is the StringIO name the Python 2 csv
        # module happens to import; it is kept to avoid a new import here.
        output = csv.StringIO()
        writer = csv.DictWriter(output, fieldnames=WF_FIELDNAMES,
                                dialect="wordfast")
        unit_count = 0
        for unit in self.units:
            if unit.istranslated():
                unit_count += 1
                writer.writerow(unit.dict)
        if unit_count == 0:
            return ""
        # The unit count in the header is only known once the units have
        # been written.
        self.header.tucount = unit_count
        header_output = csv.StringIO()
        outheader = csv.DictWriter(header_output,
                                   fieldnames=WF_FIELDNAMES_HEADER,
                                   dialect="wordfast")
        outheader.writerow(self.header.header)
        # getvalue() returns the whole buffer whatever the current position
        # is and, unlike reset(), exists on every StringIO implementation.
        decoded = (header_output.getvalue() + output.getvalue()).decode('utf-8')
        try:
            return decoded.encode(self._encoding)
        except UnicodeEncodeError:
            return decoded.encode('utf-16')