Package translate :: Package tools :: Module pogrep
[hide private]
[frames | no frames]

Source Code for Module translate.tools.pogrep

  1  #!/usr/bin/env python 
  2  # -*- coding: utf-8 -*- 
  3  #  
  4  # Copyright 2002-2008 Zuza Software Foundation 
  5  #  
  6  # This file is part of translate. 
  7  # 
  8  # translate is free software; you can redistribute it and/or modify 
  9  # it under the terms of the GNU General Public License as published by 
 10  # the Free Software Foundation; either version 2 of the License, or 
 11  # (at your option) any later version. 
 12  #  
 13  # translate is distributed in the hope that it will be useful, 
 14  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 15  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 16  # GNU General Public License for more details. 
 17  # 
 18  # You should have received a copy of the GNU General Public License 
 19  # along with translate; if not, write to the Free Software 
 20  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
 21   
 22  """Grep XLIFF, Gettext PO and TMX localization files 
 23   
 24  Matches are output to snippet files of the same type which can then be reviewed  
 25  and later merged using pomerge 
 26   
 27  See: http://translate.sourceforge.net/wiki/toolkit/pogrep for examples and 
 28  usage instructions 
 29  """ 
 30   
 31  from translate.storage import factory 
 32  from translate.storage.poheader import poheader 
 33  from translate.misc import optrecurse 
 34  from translate.misc.multistring import multistring 
 35  from translate.lang import data 
 36  import re 
 37  import locale 
 38   
 39   
class GrepMatch(object):
    """Just a small data structure that represents a search match.

    Attributes:
        unit: the unit object in which the match was found.
        part: name of the matched part of the unit; the accessors below
            recognise 'target', 'source', 'notes' and 'locations'.
        part_n: index of the matched string within that part.
        start: offset in the matched string where the match begins.
        end: offset in the matched string where the match ends.
    """

    # INITIALIZERS #
    def __init__(self, unit, part='target', part_n=0, start=0, end=0):
        self.unit = unit
        self.part = part
        self.part_n = part_n
        self.start = start
        self.end = end

    # ACCESSORS #
    def get_getter(self):
        """Return a zero-argument callable that fetches the matched string.

        The value is looked up each time the callable is invoked, so it
        reflects later changes to the unit.  Returns None when ``part`` is
        not one of the recognised part names.
        """
        if self.part == 'target':
            if self.unit.hasplural():
                def getter():
                    return self.unit.target.strings[self.part_n]
            else:
                def getter():
                    return self.unit.target
            return getter
        elif self.part == 'source':
            if self.unit.hasplural():
                def getter():
                    return self.unit.source.strings[self.part_n]
            else:
                def getter():
                    return self.unit.source
            return getter
        elif self.part == 'notes':
            def getter():
                return self.unit.getnotes()[self.part_n]
            return getter
        elif self.part == 'locations':
            def getter():
                return self.unit.getlocations()[self.part_n]
            return getter
        return None

    def get_setter(self):
        """Return a one-argument callable that replaces the matched string.

        Only the 'target' part can be written; for every other part this
        returns None.
        """
        if self.part == 'target':
            if self.unit.hasplural():
                def setter(value):
                    # Replace just the matched plural form, then assign the
                    # whole list back to the unit's target.
                    strings = self.unit.target.strings
                    strings[self.part_n] = value
                    self.unit.target = strings
            else:
                def setter(value):
                    self.unit.target = value
            return setter
        return None

    # SPECIAL METHODS #
    def __str__(self):
        """Describe the match, quoting it with up to two characters of
        surrounding context on each side."""
        text = self.get_getter()()
        # Clamp the lower bound so that a match near the beginning keeps its
        # first characters and a negative index never wraps around to the
        # end of the string.  Slicing already clamps the upper bound.
        matchpart = text[max(self.start - 2, 0):self.end + 2]
        return '<GrepMatch "%s" part=%s[%d] start=%d end=%d>' % (matchpart, self.part, self.part_n, self.start, self.end)

    def __repr__(self):
        return str(self)
98
def real_index(string, nfc_index):
    """Calculate the real index in the unnormalized string that corresponds to
    the index nfc_index in the normalized string.

    The search starts with a prefix of ``nfc_index`` characters and grows it
    one character at a time until its normalized form is longer than
    ``nfc_index``; the prefix length just before that point is the answer.
    If the whole string is consumed first, ``len(string)`` is returned.
    """
    length = nfc_index
    max_length = len(string)
    while len(data.normalize(string[:length])) <= nfc_index:
        # Use ">=" rather than "==": when nfc_index is already larger than
        # len(string) (possible if normalization makes the text longer -
        # NOTE(review): depends on what data.normalize does, confirm), an
        # equality test would never be true and the loop would never end.
        if length >= max_length:
            return max_length
        length += 1
    return length - 1
109 110
def find_matches(unit, part, strings, re_search):
    """Return a list of GrepMatch objects, one for every place where
    re_search matches in one of the given strings.

    Empty strings are skipped.  The pattern is run against the normalized
    form of each string and the reported offsets are mapped back to the
    original string with real_index().
    """
    found = []
    for part_n, text in enumerate(strings):
        if text:
            hits = re_search.finditer(data.normalize(text))
            found.extend(
                GrepMatch(unit, part=part, part_n=part_n,
                          start=real_index(text, hit.start()),
                          end=real_index(text, hit.end()))
                for hit in hits)
    return found
123
class GrepFilter(object):
    """Selects translation units whose parts contain a search string or
    match a regular expression."""

    def __init__(self, searchstring, searchparts, ignorecase=False, useregexp=False,
                 invertmatch=False, accelchar=None, encoding='utf-8',
                 max_matches=0):
        """Builds a grep filter for the given search string.

        :param searchstring: the text (or regular expression) to look for;
            non-unicode input is decoded using ``encoding``.
        :param searchparts: names of the unit parts to search ('source',
            'target', 'notes', 'locations' and the older 'msgid', 'msgstr',
            'comment'); if empty, source and target are searched.
        :param ignorecase: ignore case distinctions.
        :param useregexp: treat ``searchstring`` as a regular expression.
        :param invertmatch: select what does *not* match (used by matches()).
        :param accelchar: accelerator character removed before matching
            (used by matches()).
        :param encoding: encoding used to decode a non-unicode searchstring.
        :param max_matches: if non-zero, getmatches() raises once more than
            this many matches have been collected.
        """
        if isinstance(searchstring, unicode):
            self.searchstring = searchstring
        else:
            self.searchstring = searchstring.decode(encoding)
        self.searchstring = data.normalize(self.searchstring)
        if searchparts:
            # For now we still support the old terminology, except for the old 'source'
            # which has a new meaning now.
            self.search_source = ('source' in searchparts) or ('msgid' in searchparts)
            self.search_target = ('target' in searchparts) or ('msgstr' in searchparts)
            self.search_notes = ('notes' in searchparts) or ('comment' in searchparts)
            self.search_locations = 'locations' in searchparts
        else:
            self.search_source = True
            self.search_target = True
            self.search_notes = False
            self.search_locations = False
        self.ignorecase = ignorecase
        self.useregexp = useregexp
        if self.useregexp:
            # Never lower-case the source of a regular expression: that would
            # turn escapes such as \S, \W, \D and \B into their opposites.
            # Case-insensitivity is requested from the regex engine instead
            # (with UNICODE so that non-ASCII letters are folded as well).
            flags = 0
            if self.ignorecase:
                flags = re.IGNORECASE | re.UNICODE
            self.searchpattern = re.compile(self.searchstring, flags)
        elif self.ignorecase:
            # Plain substring search: matches() lower-cases the text under
            # test, so the needle has to be lower-case too.
            self.searchstring = self.searchstring.lower()
        self.invertmatch = invertmatch
        self.accelchar = accelchar
        self.max_matches = max_matches

    def matches(self, teststr):
        """Tell whether teststr matches the search criteria.

        Returns False for None.  Otherwise the result is truthy or falsy
        (a regex match object may be returned when searching by regular
        expression without inversion).
        """
        if teststr is None:
            return False
        teststr = data.normalize(teststr)
        if self.ignorecase:
            teststr = teststr.lower()
        if self.accelchar:
            # Literal replacement, so an accelerator that happens to be a
            # regex metacharacter cannot be misinterpreted.  A doubled
            # accelerator becomes "#", a single one is dropped.
            teststr = teststr.replace(self.accelchar + self.accelchar, "#")
            teststr = teststr.replace(self.accelchar, "")
        if self.useregexp:
            found = self.searchpattern.search(teststr)
        else:
            found = teststr.find(self.searchstring) != -1
        if self.invertmatch:
            found = not found
        return found

    def filterunit(self, unit):
        """Runs the filter on one unit; returns True if any of the selected
        parts matches, False otherwise (header units never match)."""
        if unit.isheader():
            return False

        if self.search_source:
            if isinstance(unit.source, multistring):
                strings = unit.source.strings
            else:
                strings = [unit.source]
            for string in strings:
                if self.matches(string):
                    return True

        if self.search_target:
            if isinstance(unit.target, multistring):
                strings = unit.target.strings
            else:
                strings = [unit.target]
            for string in strings:
                if self.matches(string):
                    return True

        if self.search_notes:
            if self.matches(unit.getnotes()):
                return True
        if self.search_locations:
            if self.matches(u" ".join(unit.getlocations())):
                return True
        return False

    def filterfile(self, thefile):
        """Runs the filter on a translation file object.

        Returns a new store of the same type as ``thefile`` holding the
        units for which filterunit() is true, with the source and target
        language copied over.  For stores that are poheader instances the
        header is updated from the values parsed out of the original.
        """
        thenewfile = type(thefile)()
        thenewfile.setsourcelanguage(thefile.sourcelanguage)
        thenewfile.settargetlanguage(thefile.targetlanguage)
        for unit in thefile.units:
            if self.filterunit(unit):
                thenewfile.addunit(unit)

        if isinstance(thenewfile, poheader):
            thenewfile.updateheader(add=True, **thefile.parseheader())
        return thenewfile

    def getmatches(self, units):
        """Collect every individual match in the given units.

        Returns a tuple ``(matches, indexes)``: the list of GrepMatch
        objects found and the list of positions (within ``units``) of the
        units that produced at least one match.  An empty search string
        gives two empty lists.  The compiled pattern is kept on
        ``self.re_search``.

        Raises Exception when ``max_matches`` is non-zero and is exceeded.
        """
        if not self.searchstring:
            return [], []

        searchstring = self.searchstring
        # The pattern is always a unicode string, so re.LOCALE must not be
        # combined with re.UNICODE here: it takes precedence over UNICODE in
        # Python 2 and is rejected for text patterns by Python 3.
        flags = re.MULTILINE | re.UNICODE

        if self.ignorecase:
            flags |= re.IGNORECASE
        if not self.useregexp:
            searchstring = re.escape(searchstring)
        self.re_search = re.compile(u'(%s)' % (searchstring), flags)

        matches = []
        indexes = []

        for index, unit in enumerate(units):
            old_length = len(matches)

            if self.search_target:
                if unit.hasplural():
                    targets = unit.target.strings
                else:
                    targets = [unit.target]
                matches.extend(find_matches(unit, 'target', targets, self.re_search))
            if self.search_source:
                if unit.hasplural():
                    sources = unit.source.strings
                else:
                    sources = [unit.source]
                matches.extend(find_matches(unit, 'source', sources, self.re_search))
            if self.search_notes:
                # NOTE(review): getnotes() is iterated here as a sequence of
                # strings, whereas filterunit() passes it to matches() as a
                # single string - confirm which one is intended.
                matches.extend(find_matches(unit, 'notes', unit.getnotes(), self.re_search))

            if self.search_locations:
                matches.extend(find_matches(unit, 'locations', unit.getlocations(), self.re_search))

            # A search for a single letter or an all-inclusive regular
            # expression could give enough results to cause performance
            # problems. The answer is probably not very useful at this scale.
            if self.max_matches and len(matches) > self.max_matches:
                raise Exception("Too many matches found")

            if len(matches) > old_length:
                indexes.append(index)

        return matches, indexes
264
class GrepOptionParser(optrecurse.RecursiveOptionParser):
    """Option parser for the grep tool: the first free-standing argument is
    the search string, the remaining ones are input/output paths."""

    def parse_args(self, args=None, values=None):
        """Parse the command line, working out the search string and any
        implicit input/output arguments."""
        base_parser = optrecurse.optparse.OptionParser
        (options, args) = base_parser.parse_args(self, args, values)
        # The search string always comes first.
        if args:
            options.searchstring, args = args[0], args[1:]
        else:
            self.error("At least one argument must be given for the search string")
        # Free-standing arguments fill in whichever of input/output was not
        # given explicitly: inputs first, the last argument as output.
        if args and not options.input:
            if options.output:
                options.input, args = args, []
            else:
                options.input, args = args[:-1], args[-1:]
        if args and not options.output:
            options.output, args = args[-1], args[:-1]
        if args:
            self.error("You have used an invalid combination of --input, --output and freestanding args")
        # A one-element input list is unwrapped to the element itself.
        given_input = options.input
        if isinstance(given_input, list) and len(given_input) == 1:
            options.input = given_input[0]
        return (options, args)

    def set_usage(self, usage=None):
        """Set the usage string; when none is given, build it from the
        usage string of every registered option."""
        if usage is not None:
            super(GrepOptionParser, self).set_usage(usage)
            return
        option_usages = [self.getusagestring(opt) for opt in self.option_list]
        self.usage = "%prog searchstring " + " ".join(option_usages)

    def run(self):
        """Parse the arguments, build the GrepFilter and hand over to
        recursiveprocess with the resulting options."""
        (options, args) = self.parse_args()
        options.inputformats = self.inputformats
        options.outputoptions = self.outputoptions
        # The search string comes from the command line, so it is decoded
        # with the locale's preferred encoding.
        options.checkfilter = GrepFilter(
            options.searchstring,
            options.searchparts,
            options.ignorecase,
            options.useregexp,
            options.invertmatch,
            options.accelchar,
            locale.getpreferredencoding())
        self.usepsyco(options)
        self.recursiveprocess(options)
307
def rungrep(inputfile, outputfile, templatefile, checkfilter):
    """Read inputfile, filter it with checkfilter and write the result to
    outputfile.

    Returns True if something was written, False if the filtered store was
    empty (in which case nothing is written).  templatefile is not used.
    """
    filtered = checkfilter.filterfile(factory.getobject(inputfile))
    if not filtered.isempty():
        outputfile.write(str(filtered))
        return True
    return False
316
def cmdlineparser():
    """Build and return the GrepOptionParser with all grep options
    registered."""
    # Every supported extension is written back out as the same format;
    # input without a recognised format falls back to "po".
    formats = {}
    for extension in ("po", "pot", "mo", "gmo", "tmx", "xliff", "xlf", "xlff"):
        formats[extension] = (extension, rungrep)
    formats[None] = ("po", rungrep)

    part_choices = ["source", "target", "notes", "locations", "msgid", "msgstr", "comment"]
    accelerator_choices = ["&", "_", "~"]

    parser = GrepOptionParser(formats)
    parser.add_option(
        "", "--search", dest="searchparts",
        action="append", type="choice", choices=part_choices,
        metavar="SEARCHPARTS",
        help="searches the given parts (source, target, notes and locations)")
    parser.add_option(
        "-I", "--ignore-case", dest="ignorecase",
        action="store_true", default=False,
        help="ignore case distinctions")
    parser.add_option(
        "-e", "--regexp", dest="useregexp",
        action="store_true", default=False,
        help="use regular expression matching")
    parser.add_option(
        "-v", "--invert-match", dest="invertmatch",
        action="store_true", default=False,
        help="select non-matching lines")
    parser.add_option(
        "", "--accelerator", dest="accelchar",
        action="store", type="choice", choices=accelerator_choices,
        metavar="ACCELERATOR",
        help="ignores the given accelerator when matching")
    parser.set_usage()
    parser.passthrough.append('checkfilter')
    parser.description = __doc__
    return parser
340
def main():
    """Command line entry point: build the option parser and run it."""
    cmdlineparser().run()
# Run the tool only when executed as a script, not when imported.
if __name__ == "__main__":
    main()