1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 """Module to provide a translation memory database."""
23 import math
24 import time
25 import logging
26 import re
27 import threading
28
29 try:
30 from sqlite3 import dbapi2
31 except ImportError:
32 from pysqlite2 import dbapi2
33
34 from translate.search.lshtein import LevenshteinComparer
35 from translate.lang import data
36
37
# Matches any single non-word character (Unicode-aware); used to split
# source text into words for fulltext matching.  Raw string avoids the
# invalid "\W" string escape (a warning/error on modern Pythons).
STRIP_REGEXP = re.compile(r"\W", re.UNICODE)
39
43
45 return str(self.value)
46
47
    # Shared across all instances: maps db_file -> {thread: (connection, cursor)}
    # so that every thread gets its own sqlite connection per database file.
    _tm_dbs = {}
50 - def __init__(self, db_file, max_candidates=3, min_similarity=75, max_length=1000):
51
52 self.max_candidates = max_candidates
53 self.min_similarity = min_similarity
54 self.max_length = max_length
55
56 self.db_file = db_file
57
58 if db_file not in self._tm_dbs:
59 self._tm_dbs[db_file] = {}
60 self._tm_db = self._tm_dbs[db_file]
61
62
63 self.init_database()
64 self.fulltext = False
65 self.init_fulltext()
66
67 self.comparer = LevenshteinComparer(self.max_length)
68
69 self.preload_db()
70
72 current_thread = threading.currentThread()
73 if current_thread not in self._tm_db:
74 connection = dbapi2.connect(self.db_file)
75 cursor = connection.cursor()
76 self._tm_db[current_thread] = (connection, cursor)
77 return self._tm_db[current_thread][index]
78
79 connection = property(lambda self: self._get_connection(0))
80 cursor = property(lambda self: self._get_connection(1))
81
82
84 """creates database tables and indices"""
85
86 script = """
87 CREATE TABLE IF NOT EXISTS sources (
88 sid INTEGER PRIMARY KEY AUTOINCREMENT,
89 text VARCHAR NOT NULL,
90 context VARCHAR DEFAULT NULL,
91 lang VARCHAR NOT NULL,
92 length INTEGER NOT NULL
93 );
94 CREATE INDEX IF NOT EXISTS sources_context_idx ON sources (context);
95 CREATE INDEX IF NOT EXISTS sources_lang_idx ON sources (lang);
96 CREATE INDEX IF NOT EXISTS sources_length_idx ON sources (length);
97 CREATE UNIQUE INDEX IF NOT EXISTS sources_uniq_idx ON sources (text, context, lang);
98
99 CREATE TABLE IF NOT EXISTS targets (
100 tid INTEGER PRIMARY KEY AUTOINCREMENT,
101 sid INTEGER NOT NULL,
102 text VARCHAR NOT NULL,
103 lang VARCHAR NOT NULL,
104 time INTEGER DEFAULT NULL,
105 FOREIGN KEY (sid) references sources(sid)
106 );
107 CREATE INDEX IF NOT EXISTS targets_sid_idx ON targets (sid);
108 CREATE INDEX IF NOT EXISTS targets_lang_idx ON targets (lang);
109 CREATE INDEX IF NOT EXISTS targets_time_idx ON targets (time);
110 CREATE UNIQUE INDEX IF NOT EXISTS targets_uniq_idx ON targets (sid, text, lang);
111 """
112
113 try:
114 self.cursor.executescript(script)
115 self.connection.commit()
116 except:
117 self.connection.rollback()
118 raise
119
    def init_fulltext(self):
        """detects if fts3 fulltext indexing module exists, initializes fulltext table if it does"""

        try:
            # Probe for fts3: creating a scratch virtual table raises
            # OperationalError when the module is not compiled in.
            script = """
            DROP TABLE IF EXISTS test_for_fts3;
            CREATE VIRTUAL TABLE test_for_fts3 USING fts3;
            DROP TABLE test_for_fts3;
            """
            self.cursor.executescript(script)
            logging.debug("fts3 supported")

            # Create the fulltext index table only if it does not exist yet.
            self.cursor.execute("SELECT name FROM sqlite_master WHERE name = 'fulltext'")
            if not self.cursor.fetchone():
                script = """
                CREATE VIRTUAL TABLE fulltext USING fts3(text);
                """
                logging.debug("fulltext table not exists, creating")
                self.cursor.executescript(script)
                logging.debug("created fulltext table")
            else:
                logging.debug("fulltext table already exists")

            # Backfill sources rows missing from the index, then install
            # triggers keeping the index in sync with the sources table.
            script = """
            INSERT INTO fulltext (rowid, text) SELECT sid, text FROM sources WHERE sid NOT IN (SELECT rowid FROM fulltext);
            CREATE TRIGGER IF NOT EXISTS sources_insert_trig AFTER INSERT ON sources FOR EACH ROW
            BEGIN
                INSERT INTO fulltext (docid, text) VALUES (NEW.sid, NEW.text);
            END;
            CREATE TRIGGER IF NOT EXISTS sources_update_trig AFTER UPDATE OF text ON sources FOR EACH ROW
            BEGIN
                UPDATE fulltext SET text = NEW.text WHERE docid = NEW.sid;
            END;
            CREATE TRIGGER IF NOT EXISTS sources_delete_trig AFTER DELETE ON sources FOR EACH ROW
            BEGIN
                DELETE FROM fulltext WHERE docid = OLD.sid;
            END;
            """
            self.cursor.executescript(script)
            self.connection.commit()
            logging.debug("created fulltext triggers")
            self.fulltext = True

        except dbapi2.OperationalError, e:
            # fts3 unavailable: fall back to plain matching and remove any
            # triggers left behind by a previous fts3-enabled run.
            self.fulltext = False
            logging.debug("failed to initialize fts3 support: " + str(e))
            script = """
            DROP TRIGGER IF EXISTS sources_insert_trig;
            DROP TRIGGER IF EXISTS sources_update_trig;
            DROP TRIGGER IF EXISTS sources_delete_trig;
            """
            self.cursor.executescript(script)
176
178 """ugly hack to force caching of sqlite db file in memory for
179 improved performance"""
180 if self.fulltext:
181 query = """SELECT COUNT(*) FROM sources s JOIN fulltext f ON s.sid = f.docid JOIN targets t on s.sid = t.sid"""
182 else:
183 query = """SELECT COUNT(*) FROM sources s JOIN targets t on s.sid = t.sid"""
184 self.cursor.execute(query)
185 (numrows,) = self.cursor.fetchone()
186 logging.debug("tmdb has %d records" % numrows)
187 return numrows
188
189 - def add_unit(self, unit, source_lang=None, target_lang=None, commit=True):
209
    def add_dict(self, unit, source_lang, target_lang, commit=True):
        """inserts units represented as dictionaries in database"""
        # unit is expected to carry "source", "context" and "target" keys.
        source_lang = data.normalize_code(source_lang)
        target_lang = data.normalize_code(target_lang)
        try:
            try:
                self.cursor.execute("INSERT INTO sources (text, context, lang, length) VALUES(?, ?, ?, ?)",
                                    (unit["source"],
                                     unit["context"],
                                     source_lang,
                                     len(unit["source"])))
                sid = self.cursor.lastrowid
            except dbapi2.IntegrityError:
                # Source already exists (unique index on text/context/lang);
                # fetch its sid instead of inserting a duplicate.
                self.cursor.execute("SELECT sid FROM sources WHERE text=? AND context=? and lang=?",
                                    (unit["source"],
                                     unit["context"],
                                     source_lang))
                sid = self.cursor.fetchone()
                (sid,) = sid
            try:
                # Record the translation with the current timestamp.
                self.cursor.execute("INSERT INTO targets (sid, text, lang, time) VALUES (?, ?, ?, ?)",
                                    (sid,
                                     unit["target"],
                                     target_lang,
                                     int(time.time())))
            except dbapi2.IntegrityError:
                # Identical target already stored for this source; nothing to do.
                pass

            if commit:
                self.connection.commit()
        except:
            # Only roll back when this call owns the transaction; batched
            # callers (commit=False) handle rollback themselves.
            if commit:
                self.connection.rollback()
            raise
249 - def add_store(self, store, source_lang, target_lang, commit=True):
259
260 - def add_list(self, units, source_lang, target_lang, commit=True):
261 """insert all units in list into the database, units are
262 represented as dictionaries"""
263 count = 0
264 for unit in units:
265 self.add_dict(unit, source_lang, target_lang, commit=False)
266 count += 1
267 if commit:
268 self.connection.commit()
269 return count
270
272 """return TM suggestions for unit_source"""
273 if isinstance(unit_source, str):
274 unit_source = unicode(unit_source, "utf-8")
275 if isinstance(source_langs, list):
276 source_langs = [data.normalize_code(lang) for lang in source_langs]
277 source_langs = ','.join(source_langs)
278 else:
279 source_langs = data.normalize_code(source_langs)
280 if isinstance(target_langs, list):
281 target_langs = [data.normalize_code(lang) for lang in target_langs]
282 target_langs = ','.join(target_langs)
283 else:
284 target_langs = data.normalize_code(target_langs)
285
286 minlen = min_levenshtein_length(len(unit_source), self.min_similarity)
287 maxlen = max_levenshtein_length(len(unit_source), self.min_similarity, self.max_length)
288
289
290
291 unit_words = STRIP_REGEXP.sub(' ', unit_source).split()
292 unit_words = filter(lambda word: len(word) > 2, unit_words)
293
294 if self.fulltext and len(unit_words) > 3:
295 logging.debug("fulltext matching")
296 query = """SELECT s.text, t.text, s.context, s.lang, t.lang FROM sources s JOIN targets t ON s.sid = t.sid JOIN fulltext f ON s.sid = f.docid
297 WHERE s.lang IN (?) AND t.lang IN (?) AND s.length BETWEEN ? AND ?
298 AND fulltext MATCH ?"""
299 search_str = " OR ".join(unit_words)
300 self.cursor.execute(query, (source_langs, target_langs, minlen, maxlen, search_str))
301 else:
302 logging.debug("nonfulltext matching")
303 query = """SELECT s.text, t.text, s.context, s.lang, t.lang FROM sources s JOIN targets t ON s.sid = t.sid
304 WHERE s.lang IN (?) AND t.lang IN (?)
305 AND s.length >= ? AND s.length <= ?"""
306 self.cursor.execute(query, (source_langs, target_langs, minlen, maxlen))
307
308 results = []
309 for row in self.cursor:
310 result = {}
311 result['source'] = row[0]
312 result['target'] = row[1]
313 result['context'] = row[2]
314 result['quality'] = self.comparer.similarity(unit_source, result['source'], self.min_similarity)
315 if result['quality'] >= self.min_similarity:
316 results.append(result)
317 results.sort(key=lambda match: match['quality'], reverse=True)
318 results = results[:self.max_candidates]
319 logging.debug("results: %s", unicode(results))
320 return results
321
322
def min_levenshtein_length(length, min_similarity):
    """Shortest candidate length that can still reach *min_similarity*
    percent similarity with a string of the given *length* (never below 2)."""
    return math.ceil(max(length * (min_similarity/100.0), 2))
325
def max_levenshtein_length(length, min_similarity, max_length):
    """Longest candidate length that can still reach *min_similarity*
    percent similarity with a string of the given *length*, capped at
    *max_length*."""
    return math.floor(min(length / (min_similarity/100.0), max_length))
328