Package translate :: Package search :: Module match
[hide private]
[frames] | [no frames]

Source Code for Module translate.search.match

  1  #!/usr/bin/env python 
  2  # -*- coding: utf-8 -*- 
  3  # 
  4  # Copyright 2006-2009 Zuza Software Foundation 
  5  # 
  6  # This file is part of the Translate Toolkit. 
  7  # 
  8  # This program is free software; you can redistribute it and/or modify 
  9  # it under the terms of the GNU General Public License as published by 
 10  # the Free Software Foundation; either version 2 of the License, or 
 11  # (at your option) any later version. 
 12  # 
 13  # This program is distributed in the hope that it will be useful, 
 14  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 15  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 16  # GNU General Public License for more details. 
 17  # 
 18  # You should have received a copy of the GNU General Public License 
 19  # along with this program; if not, see <http://www.gnu.org/licenses/>. 
 20   
 21  """Class to perform translation memory matching from a store of translation units""" 
 22   
 23  import heapq 
 24  import re 
 25   
 26  from translate.search import lshtein 
 27  from translate.search import terminology 
 28  from translate.storage import base 
 29  from translate.storage import po 
 30  from translate.misc.multistring import multistring 
 31   
 32   
def sourcelen(unit):
    """Return the character length of the unit's source string.

    Used as the sort key for ordering translation-memory candidates by
    source length.
    """
    source = unit.source
    return len(source)
36 37
class matcher(object):
    """A class that will do matching and store configuration for the matching process"""

    # Candidates are kept sorted by source length; subclasses may reverse
    # the order (see terminologymatcher).
    sort_reverse = False

    def __init__(self, store, max_candidates=10, min_similarity=75, max_length=70, comparer=None, usefuzzy=False):
        """max_candidates is the maximum number of candidates that should be assembled,
        min_similarity is the minimum similarity that must be attained to be included in
        the result, comparer is an optional Comparer with similarity() function

        @param store: a translation store (or list of stores) used to seed the TM
        @param usefuzzy: if True, fuzzy units are also accepted into the TM
        """
        if comparer is None:
            comparer = lshtein.LevenshteinComparer(max_length)
        self.comparer = comparer
        self.setparameters(max_candidates, min_similarity, max_length)
        self.usefuzzy = usefuzzy
        self.inittm(store)
        # When True (default), matches() appends the match quality as a
        # "NN%" note on each returned unit.
        self.addpercentage = True

    def usable(self, unit):
        """Returns whether this translation unit is usable for TM"""
        #TODO: We might want to consider more attributes, such as approved, reviewed, etc.
        source = unit.source
        target = unit.target
        if source and target and (self.usefuzzy or not unit.isfuzzy()):
            # Sources of a single character are too noisy to match against.
            if len(source) < 2:
                return False
            # Reject exact duplicates (same source AND same target) ...
            if source in self.existingunits and self.existingunits[source] == target:
                return False
            else:
                # ... but record this pair so later duplicates are dropped.
                # NOTE(review): a repeated source with a *different* target is
                # accepted, and overwrites the recorded target here.
                self.existingunits[source] = target
                return True
        return False

    def inittm(self, stores, reverse=False):
        """Initialises the memory for later use. We use simple base units for
        speedup."""
        # reverse is deprecated - just use self.sort_reverse
        self.existingunits = {}
        self.candidates = base.TranslationStore()

        if isinstance(stores, base.TranslationStore):
            stores = [stores]
        for store in stores:
            # sort=False: sort once at the end instead of per store.
            self.extendtm(store.units, store=store, sort=False)
        # Length-sorted candidates enable the binary search in matches().
        self.candidates.units.sort(key=sourcelen, reverse=self.sort_reverse)
        # print "TM initialised with %d candidates (%d to %d characters long)" % \
        #        (len(self.candidates.units), len(self.candidates.units[0].source), len(self.candidates.units[-1].source))

    def extendtm(self, units, store=None, sort=True):
        """Extends the memory with extra unit(s).

        @param units: The units to add to the TM.
        @param store: Optional store from where some metadata can be retrieved
        and associated with each unit.
        @param sort: Optional parameter that can be set to False to supress
        sorting of the candidates list. This should probably only be used in
        inittm().
        """
        if isinstance(units, base.TranslationUnit):
            units = [units]
        # NOTE(review): usable() mutates self.existingunits as a side effect,
        # so the filter both selects and deduplicates in one pass.
        candidates = filter(self.usable, units)
        for candidate in candidates:
            simpleunit = base.TranslationUnit("")
            # We need to ensure that we don't pass multistrings futher, since
            # some modules (like the native Levenshtein) can't use it.
            if isinstance(candidate.source, multistring):
                if len(candidate.source.strings) > 1:
                    # Keep the plural-aware originals so buildunits() can
                    # restore them in the returned match.
                    simpleunit.orig_source = candidate.source
                    simpleunit.orig_target = candidate.target
                # Python 2 API: flatten to a plain unicode string.
                simpleunit.source = unicode(candidate.source)
                simpleunit.target = unicode(candidate.target)
            else:
                simpleunit.source = candidate.source
                simpleunit.target = candidate.target
            # If we now only get translator comments, we don't get programmer
            # comments in TM suggestions (in Pootle, for example). If we get all
            # notes, pot2po adds all previous comments as translator comments
            # in the new po file
            simpleunit.addnote(candidate.getnotes(origin="translator"))
            simpleunit.fuzzy = candidate.isfuzzy()
            self.candidates.units.append(simpleunit)
        if sort:
            self.candidates.units.sort(key=sourcelen, reverse=self.sort_reverse)

    def setparameters(self, max_candidates=10, min_similarity=75, max_length=70):
        """Sets the parameters without reinitialising the tm. If a parameter
        is not specified, it is set to the default, not ignored"""
        self.MAX_CANDIDATES = max_candidates
        self.MIN_SIMILARITY = min_similarity
        self.MAX_LENGTH = max_length

    def getstoplength(self, min_similarity, text):
        """Calculates a length beyond which we are not interested.
        The extra fat is because we don't use plain character distance only."""
        # e.g. min_similarity=75 allows candidates up to len(text)/0.75,
        # capped by the configured MAX_LENGTH.
        return min(len(text) / (min_similarity/100.0), self.MAX_LENGTH)

    def getstartlength(self, min_similarity, text):
        """Calculates the minimum length we are interested in.
        The extra fat is because we don't use plain character distance only."""
        return max(len(text) * (min_similarity/100.0), 1)

    def matches(self, text):
        """Returns a list of possible matches for given source text.

        @type text: String
        @param text: The text that will be search for in the translation memory
        @rtype: list
        @return: a list of units with the source and target strings from the
        translation memory. If self.addpercentage is true (default) the match
        quality is given as a percentage in the notes.
        """
        # A fixed-size min-heap of (score, candidate); the worst kept score
        # is always at index 0. Pre-filled with zero-score placeholders.
        bestcandidates = [(0.0, None)]*self.MAX_CANDIDATES
        #We use self.MIN_SIMILARITY, but if we already know we have max_candidates
        #that are better, we can adjust min_similarity upwards for speedup
        min_similarity = self.MIN_SIMILARITY

        # We want to limit our search in self.candidates, so we want to ignore
        # all units with a source string that is too short or too long. We use
        # a binary search to find the shortest string, from where we start our
        # search in the candidates.

        # minimum source string length to be considered
        startlength = self.getstartlength(min_similarity, text)
        startindex = 0
        endindex = len(self.candidates.units)
        # Binary search for the first candidate with source length >= startlength.
        # NOTE(review): assumes candidates are sorted ascending by length
        # (i.e. sort_reverse is False, as on this class).
        while startindex < endindex:
            mid = (startindex + endindex) // 2
            if sourcelen(self.candidates.units[mid]) < startlength:
                startindex = mid + 1
            else:
                endindex = mid

        # maximum source string length to be considered
        stoplength = self.getstoplength(min_similarity, text)
        lowestscore = 0

        for candidate in self.candidates.units[startindex:]:
            cmpstring = candidate.source
            # Sorted by length, so everything beyond stoplength can be skipped.
            if len(cmpstring) > stoplength:
                break
            similarity = self.comparer.similarity(text, cmpstring, min_similarity)
            if similarity < min_similarity:
                continue
            if similarity > lowestscore:
                # Evict the current worst and keep this better candidate.
                # NOTE(review): ties on similarity compare the candidate
                # objects themselves (Python 2 tuple comparison).
                heapq.heapreplace(bestcandidates, (similarity, candidate))
                lowestscore = bestcandidates[0][0]
                # A perfect match for every slot: nothing can improve the heap.
                if lowestscore >= 100:
                    break
            if min_similarity < lowestscore:
                # Raise the bar (and shrink the search window) now that the
                # heap is full of better candidates.
                min_similarity = lowestscore
                stoplength = self.getstoplength(min_similarity, text)

        #Remove the empty ones:
        def notzero(item):
            score = item[0]
            return score != 0
        # NOTE(review): relies on Python 2 filter() returning a list
        # (sort() is called on the result below).
        bestcandidates = filter(notzero, bestcandidates)
        #Sort for use as a general list, and reverse so the best one is at index 0
        bestcandidates.sort(reverse=True)
        return self.buildunits(bestcandidates)

    def buildunits(self, candidates):
        """Builds a list of units conforming to base API, with the score in the comment"""
        units = []
        for score, candidate in candidates:
            # Restore the plural-aware multistrings saved in extendtm().
            if hasattr(candidate, "orig_source"):
                candidate.source = candidate.orig_source
                candidate.target = candidate.orig_target
            newunit = po.pounit(candidate.source)
            newunit.target = candidate.target
            newunit.markfuzzy(candidate.fuzzy)
            candidatenotes = candidate.getnotes().strip()
            if candidatenotes:
                newunit.addnote(candidatenotes)
            if self.addpercentage:
                # Match quality as a note, e.g. "85%".
                newunit.addnote("%d%%" % score)
            units.append(newunit)
        return units
215 216 217 # We don't want to miss certain forms of words that only change a little 218 # at the end. Now we are tying this code to English, but it should serve 219 # us well. For example "category" should be found in "categories", 220 # "copy" should be found in "copied" 221 # 222 # The tuples define a regular expression to search for, and with what it 223 # should be replaced. 224 ignorepatterns = [ 225 ("y\s*$", "ie"), #category/categories, identify/identifies, apply/applied 226 ("[\s-]+", ""), #down time / downtime, pre-order / preorder 227 ("-", " "), #pre-order / pre order 228 (" ", "-"), #pre order / pre-order 229 ] 230 231 context_re = re.compile("\s+\(.*\)\s*$") 232
class terminologymatcher(matcher):
    """A matcher with settings specifically for terminology matching"""

    # Longest terms first, so altered/derived forms appended later are
    # considered last (see inittm/extendtm).
    sort_reverse = True

    def __init__(self, store, max_candidates=10, min_similarity=75, max_length=500, comparer=None):
        """Initialise a terminology matcher.

        NOTE(review): min_similarity is accepted for signature compatibility
        but a fixed value of 10 is passed to the base class.
        """
        if comparer is None:
            comparer = terminology.TerminologyComparer(max_length)
        matcher.__init__(self, store, max_candidates, min_similarity=10, max_length=max_length, comparer=comparer)
        # Terminology matches carry no percentage note.
        self.addpercentage = False
        # Maps matched source term -> {'pos': ...} as reported by the comparer.
        self.match_info = {}

    def inittm(self, store):
        """Normal initialisation, but convert all source strings to lower case"""
        matcher.inittm(self, store)
        extras = []
        for unit in self.candidates.units:
            # Strip any "(context)" suffix and lower-case in place.
            source = unit.source = context_re.sub("", unit.source).lower()
            for ignorepattern in ignorepatterns:
                # Generate an alternate form (e.g. "category" -> "categorie")
                # so inflected variants are still found.
                (newterm, occurrences) = re.subn(ignorepattern[0], ignorepattern[1], source)
                if occurrences:
                    new_unit = type(unit).buildfromunit(unit)
                    new_unit.source = newterm
                    # We mark it fuzzy to indicate that it isn't pristine
                    unit.markfuzzy()
                    extras.append(new_unit)
        self.candidates.units.sort(key=sourcelen, reverse=self.sort_reverse)
        if extras:
            # We don't sort, so that the altered forms are at the back and
            # considered last.
            self.extendtm(extras, sort=False)

    def getstartlength(self, min_similarity, text):
        # Reduce false matches by not working with terms of two
        # characters or less
        return 3

    def getstoplength(self, min_similarity, text):
        # Let's ignore terms with more than 30 characters. Perhaps someone
        # gave a file with normal (long) translations
        return 30

    def usable(self, unit):
        """Returns whether this translation unit is usable for terminology."""
        if not unit.istranslated():
            return False
        # Length of the source with any "(context)" suffix removed.
        l = len(context_re.sub("", unit.source))
        return l <= self.MAX_LENGTH and l >= self.getstartlength(None, None)

    def matches(self, text):
        """Normal matching after converting text to lower case. Then replace
        with the original unit to retain comments, etc."""
        text = text.lower()
        comparer = self.comparer
        # Reset per-call position info collected by the comparer.
        comparer.match_info = {}
        matches = []
        known = set()
        for cand in self.candidates.units:
            # Skip duplicates produced by the alternate forms in inittm().
            if (cand.source, cand.target) in known:
                continue
            source = cand.source
            if comparer.similarity(text, source, self.MIN_SIMILARITY):
                # Record where in the text the term was found.
                self.match_info[source] = {'pos': comparer.match_info[source]['pos']}
                matches.append(cand)
                known.add((cand.source, cand.target))
        return matches
299 300 301 # utility functions used by virtaal and tmserver to convert matching units in easily marshallable dictionaries
def unit2dict(unit):
    """Convert a pounit to a simple dict structure for use over the web."""
    result = {"source": unit.source, "target": unit.target}
    result["quality"] = _parse_quality(unit.getnotes())
    result["context"] = unit.getcontext()
    return result
306
307 -def _parse_quality(comment):
308 """extracts match quality from po comments""" 309 quality = re.search('([0-9]+)%', comment) 310 if quality: 311 return quality.group(1)
312