Package translate :: Package storage :: Module tmdb
[hide private]
[frames] | [no frames]

Source Code for Module translate.storage.tmdb

  1  #!/usr/bin/env python 
  2  # -*- coding: utf-8 -*- 
  3  # 
  4  # Copyright 2009 Zuza Software Foundation 
  5  # 
  6  # This file is part of translate. 
  7  # 
  8  # translate is free software; you can redistribute it and/or modify 
  9  # it under the terms of the GNU General Public License as published by 
 10  # the Free Software Foundation; either version 2 of the License, or 
 11  # (at your option) any later version. 
 12  # 
 13  # translate is distributed in the hope that it will be useful, 
 14  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 15  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 16  # GNU General Public License for more details. 
 17  # 
 18  # You should have received a copy of the GNU General Public License 
 19  # along with translate; if not, write to the Free Software 
 20  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
 21   
 22  """Module to provide a translation memory database.""" 
 23  import math 
 24  import time 
 25  import logging 
 26  import re 
 27  import threading 
 28   
 29  try: 
 30      from sqlite3 import dbapi2 
 31  except ImportError: 
 32      from pysqlite2 import dbapi2 
 33   
 34  from translate.search.lshtein import LevenshteinComparer 
 35  from translate.lang import data 
 36   
 37   
 38  STRIP_REGEXP = re.compile("\W", re.UNICODE) 
 39   
class LanguageError(Exception):
    """Raised when a unit is added without a usable source or target
    language code."""

    def __init__(self, value):
        # keep the offending value around for callers that want to inspect it
        self.value = value

    def __str__(self):
        # render whatever was stored, string or not
        return str(self.value)
46 47
class TMDB(object):
    """Sqlite-backed translation memory database.

    Stores source/target string pairs per language pair and returns fuzzy
    (Levenshtein-based) translation suggestions, optionally accelerated by
    an fts3 fulltext index when the sqlite build supports it.
    """

    # Maps db_file -> {thread: (connection, cursor)} so that instances
    # opening the same database file share per-thread connections.
    _tm_dbs = {}

    def __init__(self, db_file, max_candidates=3, min_similarity=75, max_length=1000):
        # max_candidates: most suggestions translate_unit() will return
        # min_similarity: minimum similarity percentage for a match (0-100)
        # max_length: longest string the Levenshtein comparer will handle

        self.max_candidates = max_candidates
        self.min_similarity = min_similarity
        self.max_length = max_length

        self.db_file = db_file
        # share connections to same database file between different instances
        if db_file not in self._tm_dbs:
            self._tm_dbs[db_file] = {}
        self._tm_db = self._tm_dbs[db_file]

        #FIXME: do we want to do any checks before we initialize the DB?
        self.init_database()
        self.fulltext = False
        self.init_fulltext()

        self.comparer = LevenshteinComparer(self.max_length)

        self.preload_db()

    def _get_connection(self, index):
        """Return element ``index`` of the calling thread's
        ``(connection, cursor)`` pair, creating the pair on first use.

        sqlite connections must not be shared across threads, hence the
        per-thread cache keyed on the current thread object.
        """
        current_thread = threading.currentThread()
        if current_thread not in self._tm_db:
            connection = dbapi2.connect(self.db_file)
            cursor = connection.cursor()
            self._tm_db[current_thread] = (connection, cursor)
        return self._tm_db[current_thread][index]

    # Thread-local accessors backed by _get_connection(): index 0 is the
    # connection, index 1 its cursor.
    connection = property(lambda self: self._get_connection(0))
    cursor = property(lambda self: self._get_connection(1))

    def init_database(self):
        """creates database tables and indices"""

        script = """
CREATE TABLE IF NOT EXISTS sources (
    sid INTEGER PRIMARY KEY AUTOINCREMENT,
    text VARCHAR NOT NULL,
    context VARCHAR DEFAULT NULL,
    lang VARCHAR NOT NULL,
    length INTEGER NOT NULL
);
CREATE INDEX IF NOT EXISTS sources_context_idx ON sources (context);
CREATE INDEX IF NOT EXISTS sources_lang_idx ON sources (lang);
CREATE INDEX IF NOT EXISTS sources_length_idx ON sources (length);
CREATE UNIQUE INDEX IF NOT EXISTS sources_uniq_idx ON sources (text, context, lang);

CREATE TABLE IF NOT EXISTS targets (
    tid INTEGER PRIMARY KEY AUTOINCREMENT,
    sid INTEGER NOT NULL,
    text VARCHAR NOT NULL,
    lang VARCHAR NOT NULL,
    time INTEGER DEFAULT NULL,
    FOREIGN KEY (sid) references sources(sid)
);
CREATE INDEX IF NOT EXISTS targets_sid_idx ON targets (sid);
CREATE INDEX IF NOT EXISTS targets_lang_idx ON targets (lang);
CREATE INDEX IF NOT EXISTS targets_time_idx ON targets (time);
CREATE UNIQUE INDEX IF NOT EXISTS targets_uniq_idx ON targets (sid, text, lang);
"""

        try:
            self.cursor.executescript(script)
            self.connection.commit()
        except:
            # undo any partial schema change before re-raising
            self.connection.rollback()
            raise

    def init_fulltext(self):
        """detects if fts3 fulltext indexing module exists, initializes fulltext table if it does"""

        #HACKISH: no better way to detect fts3 support except trying to construct a dummy table?!
        try:
            script = """
DROP TABLE IF EXISTS test_for_fts3;
CREATE VIRTUAL TABLE test_for_fts3 USING fts3;
DROP TABLE test_for_fts3;
"""
            self.cursor.executescript(script)
            logging.debug("fts3 supported")
            # for some reason CREATE VIRTUAL TABLE doesn't support IF NOT EXISTS syntax
            # check if fulltext index table exists manually
            self.cursor.execute("SELECT name FROM sqlite_master WHERE name = 'fulltext'")
            if not self.cursor.fetchone():
                # create fulltext index table, and index all strings in sources
                script = """
CREATE VIRTUAL TABLE fulltext USING fts3(text);
"""
                logging.debug("fulltext table not exists, creating")
                self.cursor.executescript(script)
                logging.debug("created fulltext table")
            else:
                logging.debug("fulltext table already exists")

            # create triggers that would sync sources table with fulltext index
            # (the INSERT first backfills any sources rows not yet indexed)
            script = """
INSERT INTO fulltext (rowid, text) SELECT sid, text FROM sources WHERE sid NOT IN (SELECT rowid FROM fulltext);
CREATE TRIGGER IF NOT EXISTS sources_insert_trig AFTER INSERT ON sources FOR EACH ROW
BEGIN
    INSERT INTO fulltext (docid, text) VALUES (NEW.sid, NEW.text);
END;
CREATE TRIGGER IF NOT EXISTS sources_update_trig AFTER UPDATE OF text ON sources FOR EACH ROW
BEGIN
    UPDATE fulltext SET text = NEW.text WHERE docid = NEW.sid;
END;
CREATE TRIGGER IF NOT EXISTS sources_delete_trig AFTER DELETE ON sources FOR EACH ROW
BEGIN
    DELETE FROM fulltext WHERE docid = OLD.sid;
END;
"""
            self.cursor.executescript(script)
            self.connection.commit()
            logging.debug("created fulltext triggers")
            self.fulltext = True

        except dbapi2.OperationalError, e:
            # fts3 module not available in this sqlite build: fall back to
            # plain (non-fulltext) matching and drop any stale triggers that
            # reference the absent fulltext table.
            self.fulltext = False
            logging.debug("failed to initialize fts3 support: " + str(e))
            script = """
DROP TRIGGER IF EXISTS sources_insert_trig;
DROP TRIGGER IF EXISTS sources_update_trig;
DROP TRIGGER IF EXISTS sources_delete_trig;
"""
            self.cursor.executescript(script)

    def preload_db(self):
        """ugly hack to force caching of sqlite db file in memory for
        improved performance"""
        # COUNT(*) over the join touches every page of both tables, pulling
        # them into the OS/sqlite page cache
        if self.fulltext:
            query = """SELECT COUNT(*) FROM sources s JOIN fulltext f ON s.sid = f.docid JOIN targets t on s.sid = t.sid"""
        else:
            query = """SELECT COUNT(*) FROM sources s JOIN targets t on s.sid = t.sid"""
        self.cursor.execute(query)
        (numrows,) = self.cursor.fetchone()
        logging.debug("tmdb has %d records" % numrows)
        return numrows

    def add_unit(self, unit, source_lang=None, target_lang=None, commit=True):
        """inserts unit in the database"""
        #TODO: is that really the best way to handle unspecified
        # source and target languages? what about conflicts between
        # unit attributes and passed arguments
        # languages carried on the unit take precedence over the arguments
        if unit.getsourcelanguage():
            source_lang = unit.getsourcelanguage()
        if unit.gettargetlanguage():
            target_lang = unit.gettargetlanguage()

        if not source_lang:
            raise LanguageError("undefined source language")
        if not target_lang:
            raise LanguageError("undefined target language")

        unitdict = {"source" : unit.source,
                    "target" : unit.target,
                    "context": unit.getcontext()
                    }
        self.add_dict(unitdict, source_lang, target_lang, commit)

    def add_dict(self, unit, source_lang, target_lang, commit=True):
        """inserts units represented as dictionaries in database

        ``unit`` must have "source", "target" and "context" keys.
        Duplicate sources/targets are deduplicated via the unique indices.
        """
        source_lang = data.normalize_code(source_lang)
        target_lang = data.normalize_code(target_lang)
        try:
            try:
                self.cursor.execute("INSERT INTO sources (text, context, lang, length) VALUES(?, ?, ?, ?)",
                                    (unit["source"],
                                     unit["context"],
                                     source_lang,
                                     len(unit["source"])))
                sid = self.cursor.lastrowid
            except dbapi2.IntegrityError:
                # source string already exists in db, run query to find sid
                self.cursor.execute("SELECT sid FROM sources WHERE text=? AND context=? and lang=?",
                                    (unit["source"],
                                     unit["context"],
                                     source_lang))
                sid = self.cursor.fetchone()
                (sid,) = sid
            try:
                #FIXME: get time info from translation store
                #FIXME: do we need so store target length?
                self.cursor.execute("INSERT INTO targets (sid, text, lang, time) VALUES (?, ?, ?, ?)",
                                    (sid,
                                     unit["target"],
                                     target_lang,
                                     int(time.time())))
            except dbapi2.IntegrityError:
                # target string already exists in db, do nothing
                pass

            if commit:
                self.connection.commit()
        except:
            if commit:
                self.connection.rollback()
            raise

    def add_store(self, store, source_lang, target_lang, commit=True):
        """insert all units in store in database

        Only translatable, translated units are added; returns the count.
        """
        count = 0
        for unit in store.units:
            if unit.istranslatable() and unit.istranslated():
                self.add_unit(unit, source_lang, target_lang, commit=False)
                count += 1
        # single commit at the end keeps bulk inserts fast
        if commit:
            self.connection.commit()
        return count

    def add_list(self, units, source_lang, target_lang, commit=True):
        """insert all units in list into the database, units are
        represented as dictionaries

        Returns the number of units inserted.
        """
        count = 0
        for unit in units:
            self.add_dict(unit, source_lang, target_lang, commit=False)
            count += 1
        # single commit at the end keeps bulk inserts fast
        if commit:
            self.connection.commit()
        return count

    def translate_unit(self, unit_source, source_langs, target_langs):
        """return TM suggestions for unit_source

        Results are dicts with "source", "target", "context" and "quality"
        keys, sorted by descending quality and capped at max_candidates.
        """
        if isinstance(unit_source, str):
            unit_source = unicode(unit_source, "utf-8")
        if isinstance(source_langs, list):
            source_langs = [data.normalize_code(lang) for lang in source_langs]
            source_langs = ','.join(source_langs)
        else:
            source_langs = data.normalize_code(source_langs)
        if isinstance(target_langs, list):
            target_langs = [data.normalize_code(lang) for lang in target_langs]
            target_langs = ','.join(target_langs)
        else:
            target_langs = data.normalize_code(target_langs)
        # NOTE(review): a comma-joined language list is bound to a single
        # "IN (?)" placeholder below, so it is compared as one literal string;
        # confirm multi-language lookups behave as intended.

        # candidates outside these length bounds can never reach min_similarity
        minlen = min_levenshtein_length(len(unit_source), self.min_similarity)
        maxlen = max_levenshtein_length(len(unit_source), self.min_similarity, self.max_length)

        # split source into words, remove punctuation and special
        # chars, keep words that are at least 3 chars long
        unit_words = STRIP_REGEXP.sub(' ', unit_source).split()
        unit_words = filter(lambda word: len(word) > 2, unit_words)

        if self.fulltext and len(unit_words) > 3:
            logging.debug("fulltext matching")
            query = """SELECT s.text, t.text, s.context, s.lang, t.lang FROM sources s JOIN targets t ON s.sid = t.sid JOIN fulltext f ON s.sid = f.docid
WHERE s.lang IN (?) AND t.lang IN (?) AND s.length BETWEEN ? AND ?
AND fulltext MATCH ?"""
            # fts3 query: any of the significant words may match
            search_str = " OR ".join(unit_words)
            self.cursor.execute(query, (source_langs, target_langs, minlen, maxlen, search_str))
        else:
            logging.debug("nonfulltext matching")
            query = """SELECT s.text, t.text, s.context, s.lang, t.lang FROM sources s JOIN targets t ON s.sid = t.sid
WHERE s.lang IN (?) AND t.lang IN (?)
AND s.length >= ? AND s.length <= ?"""
            self.cursor.execute(query, (source_langs, target_langs, minlen, maxlen))

        results = []
        for row in self.cursor:
            result = {}
            result['source'] = row[0]
            result['target'] = row[1]
            result['context'] = row[2]
            # exact similarity scoring; SQL above only pre-filters by length
            result['quality'] = self.comparer.similarity(unit_source, result['source'], self.min_similarity)
            if result['quality'] >= self.min_similarity:
                results.append(result)
        results.sort(key=lambda match: match['quality'], reverse=True)
        results = results[:self.max_candidates]
        logging.debug("results: %s", unicode(results))
        return results
def min_levenshtein_length(length, min_similarity):
    """Shortest candidate length that could still reach ``min_similarity``
    percent similarity against a string of ``length`` characters (never
    below 2)."""
    lower_bound = length * (min_similarity / 100.0)
    if lower_bound < 2:
        lower_bound = 2
    return math.ceil(lower_bound)
325
def max_levenshtein_length(length, min_similarity, max_length):
    """Longest candidate length that could still reach ``min_similarity``
    percent similarity against a string of ``length`` characters, capped
    at ``max_length``."""
    upper_bound = length / (min_similarity / 100.0)
    if upper_bound > max_length:
        upper_bound = max_length
    return math.floor(upper_bound)