1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21 """Class to perform translation memory matching from a store of translation units"""
22
23 import heapq
24 import re
25
26 from translate.search import lshtein
27 from translate.search import terminology
28 from translate.storage import base
29 from translate.storage import po
30 from translate.misc.multistring import multistring
31
32
def sourcelen(unit):
    """Returns the length of the source string.

    Used as the sort key for the candidate list so that candidates can be
    range-limited by source length during matching.
    """
    return len(unit.source)
36
37
class matcher(object):
    """A class that will do matching and store configuration for the matching process"""

    # Candidates are kept sorted by source length, shortest first;
    # subclasses may reverse this (see the terminology matcher).
    sort_reverse = False

    def __init__(self, store, max_candidates=10, min_similarity=75, max_length=70, comparer=None, usefuzzy=False):
        """Builds a matcher based on the given translation memory store.

        @param store: a translation store (or list of stores) to match against
        @param max_candidates: the maximum number of candidates that should be
        assembled
        @param min_similarity: the minimum similarity that must be attained to
        be included in the result
        @param max_length: units with a source longer than this are ignored
        @param comparer: an optional Comparer with a similarity() function
        @param usefuzzy: whether fuzzy candidates are considered usable
        """
        if comparer is None:
            comparer = lshtein.LevenshteinComparer(max_length)
        self.comparer = comparer
        self.setparameters(max_candidates, min_similarity, max_length)
        self.usefuzzy = usefuzzy
        self.inittm(store)
        # When True (default), buildunits() records the match quality as a
        # percentage note on each returned unit.
        self.addpercentage = True
54
69
70 - def inittm(self, stores, reverse=False):
71 """Initialises the memory for later use. We use simple base units for
72 speedup."""
73
74 self.existingunits = {}
75 self.candidates = base.TranslationStore()
76
77 if isinstance(stores, base.TranslationStore):
78 stores = [stores]
79 for store in stores:
80 self.extendtm(store.units, store=store, sort=False)
81 self.candidates.units.sort(key=sourcelen, reverse=self.sort_reverse)
82
83
84
85 - def extendtm(self, units, store=None, sort=True):
86 """Extends the memory with extra unit(s).
87
88 @param units: The units to add to the TM.
89 @param store: Optional store from where some metadata can be retrieved
90 and associated with each unit.
91 @param sort: Optional parameter that can be set to False to supress
92 sorting of the candidates list. This should probably only be used in
93 inittm().
94 """
95 if isinstance(units, base.TranslationUnit):
96 units = [units]
97 candidates = filter(self.usable, units)
98 for candidate in candidates:
99 simpleunit = base.TranslationUnit("")
100
101
102 if isinstance(candidate.source, multistring):
103 if len(candidate.source.strings) > 1:
104 simpleunit.orig_source = candidate.source
105 simpleunit.orig_target = candidate.target
106 simpleunit.source = unicode(candidate.source)
107 simpleunit.target = unicode(candidate.target)
108 else:
109 simpleunit.source = candidate.source
110 simpleunit.target = candidate.target
111
112
113
114
115 simpleunit.addnote(candidate.getnotes(origin="translator"))
116 simpleunit.fuzzy = candidate.isfuzzy()
117 self.candidates.units.append(simpleunit)
118 if sort:
119 self.candidates.units.sort(key=sourcelen, reverse=self.sort_reverse)
120
121 - def setparameters(self, max_candidates=10, min_similarity=75, max_length=70):
122 """Sets the parameters without reinitialising the tm. If a parameter
123 is not specified, it is set to the default, not ignored"""
124 self.MAX_CANDIDATES = max_candidates
125 self.MIN_SIMILARITY = min_similarity
126 self.MAX_LENGTH = max_length
127
129 """Calculates a length beyond which we are not interested.
130 The extra fat is because we don't use plain character distance only."""
131 return min(len(text) / (min_similarity/100.0), self.MAX_LENGTH)
132
134 """Calculates the minimum length we are interested in.
135 The extra fat is because we don't use plain character distance only."""
136 return max(len(text) * (min_similarity/100.0), 1)
137
139 """Returns a list of possible matches for given source text.
140
141 @type text: String
142 @param text: The text that will be search for in the translation memory
143 @rtype: list
144 @return: a list of units with the source and target strings from the
145 translation memory. If self.addpercentage is true (default) the match
146 quality is given as a percentage in the notes.
147 """
148 bestcandidates = [(0.0, None)]*self.MAX_CANDIDATES
149
150
151 min_similarity = self.MIN_SIMILARITY
152
153
154
155
156
157
158
159 startlength = self.getstartlength(min_similarity, text)
160 startindex = 0
161 endindex = len(self.candidates.units)
162 while startindex < endindex:
163 mid = (startindex + endindex) // 2
164 if sourcelen(self.candidates.units[mid]) < startlength:
165 startindex = mid + 1
166 else:
167 endindex = mid
168
169
170 stoplength = self.getstoplength(min_similarity, text)
171 lowestscore = 0
172
173 for candidate in self.candidates.units[startindex:]:
174 cmpstring = candidate.source
175 if len(cmpstring) > stoplength:
176 break
177 similarity = self.comparer.similarity(text, cmpstring, min_similarity)
178 if similarity < min_similarity:
179 continue
180 if similarity > lowestscore:
181 heapq.heapreplace(bestcandidates, (similarity, candidate))
182 lowestscore = bestcandidates[0][0]
183 if lowestscore >= 100:
184 break
185 if min_similarity < lowestscore:
186 min_similarity = lowestscore
187 stoplength = self.getstoplength(min_similarity, text)
188
189
190 def notzero(item):
191 score = item[0]
192 return score != 0
193 bestcandidates = filter(notzero, bestcandidates)
194
195 bestcandidates.sort(reverse=True)
196 return self.buildunits(bestcandidates)
197
199 """Builds a list of units conforming to base API, with the score in the comment"""
200 units = []
201 for score, candidate in candidates:
202 if hasattr(candidate, "orig_source"):
203 candidate.source = candidate.orig_source
204 candidate.target = candidate.orig_target
205 newunit = po.pounit(candidate.source)
206 newunit.target = candidate.target
207 newunit.markfuzzy(candidate.fuzzy)
208 candidatenotes = candidate.getnotes().strip()
209 if candidatenotes:
210 newunit.addnote(candidatenotes)
211 if self.addpercentage:
212 newunit.addnote("%d%%" % score)
213 units.append(newunit)
214 return units
215
216
217
218
219
220
221
222
223
# (pattern, replacement) pairs applied with re.subn() so that slightly
# different forms of a term still match existing terminology entries.
# Raw strings keep the regex escapes (\s) from being treated as (invalid)
# string escapes.
ignorepatterns = [
    (r"y\s*$", "ie"),  # category/categories, identity/identities
    (r"[\s-]+", ""),   # down time / downtime
    ("-", " "),        # pre-order / pre order
    (" ", "-"),        # pre order / pre-order
]

# Matches a trailing disambiguating context such as " (verb)" on a term.
context_re = re.compile(r"\s+\(.*\)\s*$")
232
class terminologymatcher(matcher):
    """A matcher with settings specifically for terminology matching"""

    # Longest terms first, so longer terms are preferred over substrings.
    sort_reverse = True

    def __init__(self, store, max_candidates=10, min_similarity=75, max_length=500, comparer=None):
        """Builds a terminology matcher over the given store(s).

        Note that min_similarity is deliberately passed to matcher.__init__
        as a low fixed value (10): the terminology comparer reports
        presence/absence of terms rather than a graded similarity.
        """
        if comparer is None:
            comparer = terminology.TerminologyComparer(max_length)
        matcher.__init__(self, store, max_candidates, min_similarity=10, max_length=max_length, comparer=comparer)
        # Terminology matches carry no meaningful percentage.
        self.addpercentage = False
        # Maps matched source terms to their position info (see matches()).
        self.match_info = {}
244
246 """Normal initialisation, but convert all source strings to lower case"""
247 matcher.inittm(self, store)
248 extras = []
249 for unit in self.candidates.units:
250 source = unit.source = context_re.sub("", unit.source).lower()
251 for ignorepattern in ignorepatterns:
252 (newterm, occurrences) = re.subn(ignorepattern[0], ignorepattern[1], source)
253 if occurrences:
254 new_unit = type(unit).buildfromunit(unit)
255 new_unit.source = newterm
256
257 unit.markfuzzy()
258 extras.append(new_unit)
259 self.candidates.units.sort(key=sourcelen, reverse=self.sort_reverse)
260 if extras:
261
262
263 self.extendtm(extras, sort=False)
264
269
274
276 """Returns whether this translation unit is usable for terminology."""
277 if not unit.istranslated():
278 return False
279 l = len(context_re.sub("", unit.source))
280 return l <= self.MAX_LENGTH and l >= self.getstartlength(None, None)
281
283 """Normal matching after converting text to lower case. Then replace
284 with the original unit to retain comments, etc."""
285 text = text.lower()
286 comparer = self.comparer
287 comparer.match_info = {}
288 matches = []
289 known = set()
290 for cand in self.candidates.units:
291 if (cand.source, cand.target) in known:
292 continue
293 source = cand.source
294 if comparer.similarity(text, source, self.MIN_SIMILARITY):
295 self.match_info[source] = {'pos': comparer.match_info[source]['pos']}
296 matches.append(cand)
297 known.add((cand.source, cand.target))
298 return matches
299
300
301
306
308 """extracts match quality from po comments"""
309 quality = re.search('([0-9]+)%', comment)
310 if quality:
311 return quality.group(1)
312