Source Code for Module translate.tools.pogrep

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright 2002-2008 Zuza Software Foundation
#
# This file is part of translate.
#
# translate is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# translate is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with translate; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

"""Grep XLIFF, Gettext PO and TMX localization files.

Matches are output to snippet files of the same type, which can then be
reviewed and later merged using pomerge.

See http://translate.sourceforge.net/wiki/toolkit/pogrep for examples and
usage instructions.
"""

from translate.storage import factory
from translate.misc import optrecurse
from translate.misc.multistring import multistring
from translate.lang import data
import re
import locale


class GrepMatch(object):
    """Just a small data structure that represents a search match."""

    # INITIALIZERS #
    def __init__(self, unit, part='target', part_n=0, start=0, end=0):
        self.unit = unit
        self.part = part
        self.part_n = part_n
        self.start = start
        self.end = end

    # ACCESSORS #
    def get_getter(self):
        if self.part == 'target':
            if self.unit.hasplural():
                getter = lambda: self.unit.target.strings[self.part_n]
            else:
                getter = lambda: self.unit.target
            return getter
        elif self.part == 'source':
            if self.unit.hasplural():
                getter = lambda: self.unit.source.strings[self.part_n]
            else:
                getter = lambda: self.unit.source
            return getter
        elif self.part == 'notes':
            def getter():
                return self.unit.getnotes()[self.part_n]
            return getter
        elif self.part == 'locations':
            def getter():
                return self.unit.getlocations()[self.part_n]
            return getter

    def get_setter(self):
        if self.part == 'target':
            if self.unit.hasplural():
                def setter(value):
                    strings = self.unit.target.strings
                    strings[self.part_n] = value
                    self.unit.target = strings
            else:
                def setter(value):
                    self.unit.target = value
            return setter

    # SPECIAL METHODS #
    def __str__(self):
        start, end = self.start, self.end
        if start < 3:
            start = 3
        if end > len(self.get_getter()()) - 3:
            end = len(self.get_getter()()) - 3
        matchpart = self.get_getter()()[start-2:end+2]
        return '<GrepMatch "%s" part=%s[%d] start=%d end=%d>' % (matchpart, self.part, self.part_n, self.start, self.end)

    def __repr__(self):
        return str(self)


def real_index(string, nfc_index):
    """Calculate the real index in the unnormalized string that corresponds to
    the index nfc_index in the normalized string."""
    length = nfc_index
    max_length = len(string)
    while len(data.normalize(string[:length])) <= nfc_index:
        if length == max_length:
            return length
        length += 1
    return length - 1
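
# Illustrative note (not part of the original module), assuming that
# data.normalize() performs NFC normalization: the string u'a\u030abc'
# ("a" + combining ring above + "bc") normalizes to u'\xe5bc', i.e. "åbc".
# Index 1 of the normalized string is "b", which sits at index 2 of the raw
# string, so real_index(u'a\u030abc', 1) returns 2.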

def find_matches(unit, part, strings, re_search):
    """Return the GrepMatch objects where re_search matches in strings."""
    matches = []
    for part_n, string in enumerate(strings):
        if not string:
            continue
        normalized = data.normalize(string)
        for matchobj in re_search.finditer(normalized):
            start = real_index(string, matchobj.start())
            end = real_index(string, matchobj.end())
            matches.append(GrepMatch(unit, part=part, part_n=part_n, start=start, end=end))
    return matches


class GrepFilter:

    def __init__(self, searchstring, searchparts, ignorecase=False, useregexp=False,
                 invertmatch=False, accelchar=None, encoding='utf-8', includeheader=False,
                 max_matches=0):
        """builds a checkfilter using the given checker"""
        if isinstance(searchstring, unicode):
            self.searchstring = searchstring
        else:
            self.searchstring = searchstring.decode(encoding)
        self.searchstring = data.normalize(self.searchstring)
        if searchparts:
            # For now we still support the old terminology, except for the old 'source'
            # which has a new meaning now.
            self.search_source = ('source' in searchparts) or ('msgid' in searchparts)
            self.search_target = ('target' in searchparts) or ('msgstr' in searchparts)
            self.search_notes = ('notes' in searchparts) or ('comment' in searchparts)
            self.search_locations = 'locations' in searchparts
        else:
            self.search_source = True
            self.search_target = True
            self.search_notes = False
            self.search_locations = False
        self.ignorecase = ignorecase
        if self.ignorecase:
            self.searchstring = self.searchstring.lower()
        self.useregexp = useregexp
        if self.useregexp:
            self.searchpattern = re.compile(self.searchstring)
        self.invertmatch = invertmatch
        self.accelchar = accelchar
        self.includeheader = includeheader
        self.max_matches = max_matches

    def matches(self, teststr):
        if teststr is None:
            return False
        teststr = data.normalize(teststr)
        if self.ignorecase:
            teststr = teststr.lower()
        if self.accelchar:
            teststr = re.sub(self.accelchar + self.accelchar, "#", teststr)
            teststr = re.sub(self.accelchar, "", teststr)
        if self.useregexp:
            found = self.searchpattern.search(teststr)
        else:
            found = teststr.find(self.searchstring) != -1
        if self.invertmatch:
            found = not found
        return found

    def filterunit(self, unit):
        """runs filters on an element"""
        if unit.isheader(): return []

        if self.search_source:
            if isinstance(unit.source, multistring):
                strings = unit.source.strings
            else:
                strings = [unit.source]
            for string in strings:
                if self.matches(string):
                    return True

        if self.search_target:
            if isinstance(unit.target, multistring):
                strings = unit.target.strings
            else:
                strings = [unit.target]
            for string in strings:
                if self.matches(string):
                    return True

        if self.search_notes:
            if self.matches(unit.getnotes()):
                return True
        if self.search_locations:
            if self.matches(u" ".join(unit.getlocations())):
                return True
        return False

    def filterfile(self, thefile):
        """runs filters on a translation file object"""
        thenewfile = type(thefile)()
        thenewfile.setsourcelanguage(thefile.sourcelanguage)
        thenewfile.settargetlanguage(thefile.targetlanguage)
        for unit in thefile.units:
            if self.filterunit(unit):
                thenewfile.addunit(unit)
        if self.includeheader and len(thenewfile.units) > 0:
            if thefile.units[0].isheader():
                thenewfile.units.insert(0, thefile.units[0])
            else:
                thenewfile.units.insert(0, thenewfile.makeheader())
        return thenewfile

    def getmatches(self, units):
        if not self.searchstring:
            return [], []

        searchstring = self.searchstring
        flags = re.LOCALE | re.MULTILINE | re.UNICODE

        if self.ignorecase:
            flags |= re.IGNORECASE
        if not self.useregexp:
            searchstring = re.escape(searchstring)
        self.re_search = re.compile(u'(%s)' % (searchstring), flags)

        matches = []
        indexes = []

        for index, unit in enumerate(units):
            old_length = len(matches)

            if self.search_target:
                if unit.hasplural():
                    targets = unit.target.strings
                else:
                    targets = [unit.target]
                matches.extend(find_matches(unit, 'target', targets, self.re_search))
            if self.search_source:
                if unit.hasplural():
                    sources = unit.source.strings
                else:
                    sources = [unit.source]
                matches.extend(find_matches(unit, 'source', sources, self.re_search))
            if self.search_notes:
                matches.extend(find_matches(unit, 'notes', unit.getnotes(), self.re_search))
            if self.search_locations:
                matches.extend(find_matches(unit, 'locations', unit.getlocations(), self.re_search))

            # A search for a single letter or an all-inclusive regular
            # expression could give enough results to cause performance
            # problems. The answer is probably not very useful at this scale.
            if self.max_matches and len(matches) > self.max_matches:
                raise Exception("Too many matches found")

            if len(matches) > old_length:
                old_length = len(matches)
                indexes.append(index)

        return matches, indexes


class GrepOptionParser(optrecurse.RecursiveOptionParser):
    """a specialized Option Parser for the grep tool..."""

    def parse_args(self, args=None, values=None):
        """parses the command line options, handling implicit input/output args"""
        (options, args) = optrecurse.optparse.OptionParser.parse_args(self, args, values)
        # some intelligence as to what reasonable people might give on the command line
        if args:
            options.searchstring = args[0]
            args = args[1:]
        else:
            self.error("At least one argument must be given for the search string")
        if args and not options.input:
            if not options.output:
                options.input = args[:-1]
                args = args[-1:]
            else:
                options.input = args
                args = []
        if args and not options.output:
            options.output = args[-1]
            args = args[:-1]
        if args:
            self.error("You have used an invalid combination of --input, --output and freestanding args")
        if isinstance(options.input, list) and len(options.input) == 1:
            options.input = options.input[0]
        return (options, args)

    def set_usage(self, usage=None):
        """sets the usage string - if usage not given, uses getusagestring for each option"""
        if usage is None:
            self.usage = "%prog searchstring " + " ".join([self.getusagestring(option) for option in self.option_list])
        else:
            super(GrepOptionParser, self).set_usage(usage)

    def run(self):
        """parses the arguments, and runs recursiveprocess with the resulting options"""
        (options, args) = self.parse_args()
        options.inputformats = self.inputformats
        options.outputoptions = self.outputoptions
        options.checkfilter = GrepFilter(options.searchstring, options.searchparts,
                                         options.ignorecase, options.useregexp,
                                         options.invertmatch, options.accelchar,
                                         locale.getpreferredencoding(), options.includeheader)
        self.usepsyco(options)
        self.recursiveprocess(options)


def rungrep(inputfile, outputfile, templatefile, checkfilter):
    """reads in inputfile, filters using checkfilter, writes to outputfile"""
    fromfile = factory.getobject(inputfile)
    tofile = checkfilter.filterfile(fromfile)
    if tofile.isempty():
        return False
    outputfile.write(str(tofile))
    return True


def cmdlineparser():
    formats = {"po": ("po", rungrep), "pot": ("pot", rungrep),
               "mo": ("mo", rungrep), "gmo": ("gmo", rungrep),
               "tmx": ("tmx", rungrep),
               "xliff": ("xliff", rungrep), "xlf": ("xlf", rungrep), "xlff": ("xlff", rungrep),
               None: ("po", rungrep)}
    parser = GrepOptionParser(formats)
    parser.add_option("", "--search", dest="searchparts",
                      action="append", type="choice",
                      choices=["source", "target", "notes", "locations", "msgid", "msgstr", "comment"],
                      metavar="SEARCHPARTS", help="searches the given parts (source, target, notes and locations)")
    parser.add_option("-I", "--ignore-case", dest="ignorecase",
                      action="store_true", default=False, help="ignore case distinctions")
    parser.add_option("-e", "--regexp", dest="useregexp",
                      action="store_true", default=False, help="use regular expression matching")
    parser.add_option("-v", "--invert-match", dest="invertmatch",
                      action="store_true", default=False, help="select non-matching lines")
    parser.add_option("", "--accelerator", dest="accelchar",
                      action="store", type="choice", choices=["&", "_", "~"],
                      metavar="ACCELERATOR", help="ignores the given accelerator when matching")
    parser.add_option("", "--header", dest="includeheader",
                      action="store_true", default=False,
                      help="include a PO header in the output")
    parser.set_usage()
    parser.passthrough.append('checkfilter')
    parser.description = __doc__
    return parser


def main():
    parser = cmdlineparser()
    parser.run()


if __name__ == '__main__':
    main()
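
A minimal usage sketch (an illustration, not part of the module itself): it mirrors what rungrep() does above, loading a store, filtering it with a GrepFilter, and writing the matching units out as a snippet file. The search string, search parts and file names are hypothetical, and factory.getobject() is assumed to accept a file name as well as a file object.

    from translate.storage import factory
    from translate.tools.pogrep import GrepFilter

    # Look for "Save" in source and target text, ignoring case
    # (the programmatic equivalent of --search and --ignore-case).
    checkfilter = GrepFilter(u"Save", ["source", "target"], ignorecase=True)

    # Load a store, keep only matching units, and write them to a snippet file.
    store = factory.getobject("messages.po")
    matches = checkfilter.filterfile(store)
    if not matches.isempty():
        open("matches.po", "w").write(str(matches))

From the command line the rough equivalent would be something like "pogrep --search=source --search=target --ignore-case Save messages.po matches.po", following the usage string built by GrepOptionParser.set_usage().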