Package translate :: Package search :: Package indexing :: Module XapianIndexer
[hide private]
[frames | no frames]

Source Code for Module translate.search.indexing.XapianIndexer

  1  #!/usr/bin/env python 
  2  # -*- coding: utf-8 -*- 
  3  # 
  4  # Copyright 2008-2009 Zuza Software Foundation 
  5  #  
  6  # This file is part of translate. 
  7  # 
  8  # translate is free software; you can redistribute it and/or modify 
  9  # it under the terms of the GNU General Public License as published by 
 10  # the Free Software Foundation; either version 2 of the License, or 
 11  # (at your option) any later version. 
 12  #  
 13  # translate is distributed in the hope that it will be useful, 
 14  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 15  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 16  # GNU General Public License for more details. 
 17  # 
 18  # You should have received a copy of the GNU General Public License 
 19  # along with translate; if not, write to the Free Software 
 20  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
 21  # 
 22   
 23  """ 
 24  Interface to the Xapian indexing engine for the Translate Toolkit 
 25   
 26  Xapian v1.0 or higher is supported. 
 27   
 28  If you are interested in writing an interface for Xapian 0.x, then 
 29  you should checkout the following:: 
 30      svn export -r 7235 https://translate.svn.sourceforge.net/svnroot/translate/src/branches/translate-search-indexer-generic-merging/translate/search/indexer/ 
 31  It is not completely working, but it should give you a good start. 
 32  """ 
 33   
 34  __revision__ = "$Id: XapianIndexer.py 13411 2009-11-30 20:51:48Z alaaosh $" 
 35   
 36  # xapian module versions before 1.0.13 hang apache under mod_python 
 37  import sys 
 38  import re 
 39   
 40  # detect if running under apache 
 41  if 'apache' in sys.modules or '_apache' in sys.modules or 'mod_wsgi' in sys.modules: 
42 - def _str2version(version):
43 return [int(i) for i in version.split('.')]
44 45 import subprocess 46 # even checking xapian version leads to deadlock under apache, must figure version from command line 47 try: 48 command = subprocess.Popen(['xapian-check', '--version'], stdout=subprocess.PIPE) 49 stdout, stderr = command.communicate() 50 if _str2version(re.match('.*([0-9]+\.[0-9]+\.[0-9]+).*', stdout).groups()[0]) < [1, 0, 13]: 51 raise ImportError("Running under apache, can't load xapain") 52 except: 53 #FIXME: report is xapian-check command is missing? 54 raise ImportError("Running under apache, can't load xapian") 55 56 import CommonIndexer 57 import xapian 58 import os 59 60
def is_available():
    """report whether the loaded xapian module can be used by this indexer

    @return: True if the major version reported by the xapian module is
        greater than zero
    @rtype: bool
    """
    major = xapian.major_version()
    return major > 0
# Xapian restricts the length of term strings, see
# http://osdir.com/ml/search.xapian.general/2006-11/msg00210.html
# That thread describes a maximum of around 240 - we need less anyway.
_MAX_TERM_LENGTH = 128


class XapianDatabase(CommonIndexer.CommonDatabase):
    """interface to the xapian (http://xapian.org) indexer

    The database handle is kept in self.database. It is switched between a
    writable and a read-only handle by L{_prepare_database} and L{flush}.
    """

    QUERY_TYPE = xapian.Query
    INDEX_DIRECTORY_NAME = "xapian"

    def __init__(self, basedir, analyzer=None, create_allowed=True):
        """initialize or open a xapian database

        @raise ValueError: the given location exists, but the database type
            is incompatible (e.g. created by a different indexing engine)
        @raise OSError: the database failed to initialize

        @param basedir: the parent directory of the database
        @type basedir: str
        @param analyzer: bitwise combination of possible analyzer flags
            to be used as the default analyzer for this database. Leave it
            empty to use the system default analyzer (self.ANALYZER_DEFAULT).
            see self.ANALYZER_TOKENIZE, self.ANALYZER_PARTIAL, ...
        @type analyzer: int
        @param create_allowed: create the database, if necessary; default: True
        @type create_allowed: bool
        """
        # call the __init__ function of our parent
        # (self.location is read below - presumably it is set there)
        super(XapianDatabase, self).__init__(basedir, analyzer=analyzer,
                create_allowed=create_allowed)
        if os.path.exists(self.location):
            # try to open an existing database
            try:
                self.database = xapian.WritableDatabase(self.location,
                        xapian.DB_OPEN)
            except xapian.DatabaseOpeningError as err_msg:
                raise ValueError("Indexer: failed to open xapian database "
                        "(%s) - maybe it is not a xapian database: %s"
                        % (self.location, err_msg))
        else:
            # create a new database
            if not create_allowed:
                raise OSError("Indexer: skipping database creation")
            # create the parent directory if it does not exist
            parent_path = os.path.dirname(self.location)
            try:
                if not os.path.isdir(parent_path):
                    # recursively create all directories up to parent_path
                    os.makedirs(parent_path)
            except (IOError, OSError) as err_msg:
                # os.makedirs reports failures as OSError - catching only
                # IOError would never add the explanation below
                raise OSError("Indexer: failed to create the parent "
                        "directory (%s) of the indexing database: %s"
                        % (parent_path, err_msg))
            try:
                self.database = xapian.WritableDatabase(self.location,
                        xapian.DB_CREATE_OR_OPEN)
            except xapian.DatabaseOpeningError as err_msg:
                raise OSError("Indexer: failed to open or create a xapian "
                        "database (%s): %s" % (self.location, err_msg))

    def flush(self, optimize=False):
        """force to write the current changes to disk immediately

        Afterwards the database handle is dropped and reopened read-only.

        @param optimize: ignored for xapian
        @type optimize: bool
        """
        # write changes to disk (only if database is read-write)
        if isinstance(self.database, xapian.WritableDatabase):
            self.database.flush()
        # free the database to remove locks - this is a xapian-specific issue
        self.database = None
        # reopen it as read-only
        self._prepare_database()

    def _create_query_for_query(self, query):
        """generate a query based on an existing query object

        basically this function should just create a copy of the original

        @param query: the original query object
        @type query: xapian.Query
        @return: the resulting query object
        @rtype: xapian.Query
        """
        # create a copy of the original query
        return xapian.Query(query)

    def _create_query_for_string(self, text, require_all=True,
            analyzer=None):
        """generate a query for a plain term of a string query

        basically this function parses the string and returns the resulting
        query

        @param text: the query string
        @type text: str
        @param require_all: boolean operator
            (True -> AND (default) / False -> OR)
        @type require_all: bool
        @param analyzer: Define query options (partial matching, exact
            matching, tokenizing, ...) as bitwise combinations of
            CommonIndexer.ANALYZER_???.
            If analyzer is None (default), then the default analyzer of this
            database (self.analyzer) is used.
        @type analyzer: int
        @return: resulting query object
        @rtype: xapian.Query
        """
        qp = xapian.QueryParser()
        qp.set_database(self.database)
        if require_all:
            qp.set_default_op(xapian.Query.OP_AND)
        else:
            qp.set_default_op(xapian.Query.OP_OR)
        if analyzer is None:
            analyzer = self.analyzer
        if analyzer & self.ANALYZER_PARTIAL > 0:
            match_flags = xapian.QueryParser.FLAG_PARTIAL
            return qp.parse_query(text, match_flags)
        elif analyzer == self.ANALYZER_EXACT:
            # exact matching - the text is used as a single term without
            # passing through the query parser
            return xapian.Query(text)
        else:
            # everything else (not partial and not exact)
            match_flags = 0
            return qp.parse_query(text, match_flags)

    def _create_query_for_field(self, field, value, analyzer=None):
        """generate a field query

        this functions creates a field->value query

        The upper-cased field name is used as the term prefix.

        @param field: the fieldname to be used
        @type field: str
        @param value: the wanted value of the field
        @type value: str
        @param analyzer: Define query options (partial matching, exact
            matching, tokenizing, ...) as bitwise combinations of
            CommonIndexer.ANALYZER_???.
            If analyzer is None (default), then the default analyzer of this
            database (self.analyzer) is used.
        @type analyzer: int
        @return: the resulting query object
        @rtype: xapian.Query
        """
        if analyzer is None:
            analyzer = self.analyzer
        if analyzer == self.ANALYZER_EXACT:
            # exact matching -> keep special characters
            return xapian.Query("%s%s" % (field.upper(), value))
        # other queries need a parser object
        qp = xapian.QueryParser()
        qp.set_database(self.database)
        if analyzer & self.ANALYZER_PARTIAL > 0:
            # partial matching
            match_flags = xapian.QueryParser.FLAG_PARTIAL
        else:
            # everything else (not partial and not exact)
            match_flags = 0
        return qp.parse_query(value, match_flags, field.upper())

    def _create_query_combined(self, queries, require_all=True):
        """generate a combined query

        @param queries: list of the original queries
        @type queries: list of xapian.Query
        @param require_all: boolean operator
            (True -> AND (default) / False -> OR)
        @type require_all: bool
        @return: the resulting combined query object
        @rtype: xapian.Query
        """
        if require_all:
            query_op = xapian.Query.OP_AND
        else:
            query_op = xapian.Query.OP_OR
        return xapian.Query(query_op, queries)

    def _create_empty_document(self):
        """create an empty document to be filled and added to the index later

        @return: the new document object
        @rtype: xapian.Document
        """
        return xapian.Document()

    def _add_plain_term(self, document, term, tokenize=True):
        """add a term to a document

        @param document: the document to be changed
        @type document: xapian.Document
        @param term: a single term to be added
        @type term: str
        @param tokenize: should the term be tokenized automatically
        @type tokenize: bool
        """
        if tokenize:
            term_gen = xapian.TermGenerator()
            term_gen.set_document(document)
            term_gen.index_text(term)
        else:
            # untokenized terms are stored as they are - only their length
            # is limited
            document.add_term(_truncate_term_length(term))

    def _add_field_term(self, document, field, term, tokenize=True):
        """add a field term to a document

        The upper-cased field name is used as the prefix of the term.

        @param document: the document to be changed
        @type document: xapian.Document
        @param field: name of the field
        @type field: str
        @param term: term to be associated to the field
        @type term: str
        @param tokenize: should the term be tokenized automatically
        @type tokenize: bool
        """
        if tokenize:
            term_gen = xapian.TermGenerator()
            term_gen.set_document(document)
            term_gen.index_text(term, 1, field.upper())
        else:
            document.add_term(_truncate_term_length("%s%s" % \
                    (field.upper(), term)))

    def _add_document_to_index(self, document):
        """add a prepared document to the index database

        @param document: the document to be added
        @type document: xapian.Document
        """
        # open the database for writing
        self._prepare_database(writable=True)
        self.database.add_document(document)

    def begin_transaction(self):
        """begin a transaction

        Xapian supports transactions to group multiple database modifications.
        This avoids intermediate flushing and therefore increases performance.
        """
        self._prepare_database(writable=True)
        self.database.begin_transaction()

    def cancel_transaction(self):
        """cancel an ongoing transaction

        no changes since the last execution of 'begin_transaction' are written
        """
        self._prepare_database(writable=True)
        self.database.cancel_transaction()

    def commit_transaction(self):
        """submit the changes of an ongoing transaction

        all changes since the last execution of 'begin_transaction' are written
        """
        self._prepare_database(writable=True)
        self.database.commit_transaction()

    def get_query_result(self, query):
        """return an object containing the results of a query

        @param query: a pre-compiled xapian query
        @type query: xapian.Query
        @return: an object that allows access to the results
        @rtype: XapianEnquire
        """
        enquire = xapian.Enquire(self.database)
        enquire.set_query(query)
        return XapianEnquire(enquire)

    def delete_document_by_id(self, docid):
        """delete a specified document

        @param docid: the document ID to be deleted
        @type docid: int
        @return: True if the document was deleted, False if xapian did not
            find a document with this ID
        @rtype: bool
        """
        # open the database for writing
        self._prepare_database(writable=True)
        try:
            self.database.delete_document(docid)
            return True
        except xapian.DocNotFoundError:
            return False

    def search(self, query, fieldnames):
        """return a list of the contents of specified fields for all matches of
        a query

        @param query: the query to be issued
        @type query: xapian.Query
        @param fieldnames: the name(s) of a field of the document content
        @type fieldnames: string | list of strings
        @return: a list of dicts containing the specified field(s)
        @rtype: list of dicts
        """
        result = []
        # a single field name is handled like a list with one item
        if isinstance(fieldnames, basestring):
            fieldnames = [fieldnames]
        # '_extract_fieldvalues' appends one dict per match to 'result'
        self._walk_matches(query, _extract_fieldvalues, (result, fieldnames))
        return result

    def _prepare_database(self, writable=False):
        """reopen the database as read-only or as writable if necessary

        this fixes a xapian specific issue regarding open locks for
        writable databases

        @param writable: True for opening a writable database
        @type writable: bool
        """
        if writable and (not isinstance(self.database,
                xapian.WritableDatabase)):
            self.database = xapian.WritableDatabase(self.location,
                    xapian.DB_OPEN)
        elif not writable and (not isinstance(self.database, xapian.Database)):
            # an existing handle is kept - only a missing one (e.g. after
            # 'flush') is replaced by a read-only database
            self.database = xapian.Database(self.location)


class XapianEnquire(CommonIndexer.CommonEnquire):
    """access to the set of matches of a xapian enquire object
    """

    def get_matches(self, start, number):
        """fetch a window of matches of the query behind self.enquire

        @param start: index of the first match to return (starting from zero)
        @type start: int
        @param number: the number of matching entries to return
        @type number: int
        @return: the number of returned matches, the estimated number of
            available matches and the matches themselves
        @rtype: tuple of (returned number, available number, matches)
            "matches" is a list of dictionaries with the keys::
                ["rank", "percent", "document", "docid"]
        """
        mset = self.enquire.get_mset(start, number)
        # turn every item of the match set into a plain dictionary
        entries = [
            {
                "rank": hit[xapian.MSET_RANK],
                "docid": hit[xapian.MSET_DID],
                "percent": hit[xapian.MSET_PERCENT],
                "document": hit[xapian.MSET_DOCUMENT],
            }
            for hit in mset
        ]
        return (mset.size(), mset.get_matches_estimated(), entries)


def _truncate_term_length(term, taken=0):
    """truncate a term string to the maximum length allowed for xapian terms

    Terms that fit into the limit are returned unchanged. Longer terms are
    cut to exactly (_MAX_TERM_LENGTH - taken) characters.

    @param term: the value of the term, that should be truncated
    @type term: str
    @param taken: since a term consists of the name of the term and its
        actual value, this additional parameter can be used to reduce the
        maximum count of possible characters
    @type taken: int
    @return: the truncated string (empty if 'taken' uses up the whole limit)
    @rtype: str
    """
    # never let the limit drop below zero: a negative slice bound would cut
    # characters off the end of the term instead of limiting its length
    max_length = max(_MAX_TERM_LENGTH - taken, 0)
    return term[:max_length]

def _extract_fieldvalues(match, result_and_fieldnames):
    """add a dict of field values to a list

    usually this function should be used together with '_walk_matches'
    for traversing a list of matches

    A term belongs to a field if it starts with the upper-cased field name
    followed by a character that is not an uppercase ASCII letter. The field
    name None selects the terms that do not start with an uppercase ASCII
    letter; these are stored under the key None.

    @param match: a single match object; its "document" item must offer
        a 'termlist' method
    @type match: dict
    @param result_and_fieldnames: a pair of (result, fieldnames):
        - result: the resulting dict will be added to this list
        - fieldnames: the names of the fields to be added to the dict
    @type result_and_fieldnames: tuple of (list of dict, list of str)
    """
    result, fieldnames = result_and_fieldnames
    # prepare one (fieldname, prefix length, pattern) triple per field once
    # instead of rebuilding the patterns for every single term
    plain_term = re.compile("[^A-Z]")
    matchers = []
    for fname in fieldnames:
        if fname is None:
            # terms without a field are returned completely
            matchers.append((fname, 0, plain_term))
        else:
            prefix = str(fname).upper()
            # the field name is escaped, as it is no regular expression
            matchers.append((fname, len(prefix),
                    re.compile("%s[^A-Z]" % re.escape(prefix))))
    # fill the dict
    item_fields = {}
    for term in match["document"].termlist():
        for fname, prefix_length, pattern in matchers:
            if pattern.match(term.term):
                # we found a matching field/term - strip the field prefix
                item_fields.setdefault(fname, []).append(
                        term.term[prefix_length:])
    result.append(item_fields)