Package translate :: Package search :: Package indexing :: Module PyLuceneIndexer
[hide private]
[frames] | [no frames]

Source Code for Module translate.search.indexing.PyLuceneIndexer

  1  # -*- coding: utf-8 -*- 
  2  # 
  3  # Copyright 2008 Zuza Software Foundation 
  4  #  
  5  # This file is part of translate. 
  6  # 
  7  # translate is free software; you can redistribute it and/or modify 
  8  # it under the terms of the GNU General Public License as published by 
  9  # the Free Software Foundation; either version 2 of the License, or 
 10  # (at your option) any later version. 
 11  #  
 12  # translate is distributed in the hope that it will be useful, 
 13  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 14  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 15  # GNU General Public License for more details. 
 16  # 
 17  # You should have received a copy of the GNU General Public License 
 18  # along with translate; if not, write to the Free Software 
 19  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
 20  # 
 21   
 22   
 23  """ 
 24  interface for the PyLucene (v2.x) indexing engine 
 25   
 26  take a look at PyLuceneIndexer1.py for the PyLucene v1.x interface 
 27  """ 
 28   
 29  __revision__ = "$Id: PyLuceneIndexer.py 13070 2009-11-13 16:47:01Z alaaosh $" 
 30   
import logging
import os
import re
import tempfile
import time

import CommonIndexer
 36   
 37  # try to import the PyLucene package (with the two possible names) 
 38  # remember the type of the detected package (compiled with jcc (>=v2.3) or 
 39  # with gcj (<=v2.2) 
 40  try: 
 41      import PyLucene 
 42      _COMPILER = 'gcj' 
 43  except ImportError: 
 44      # if this fails, then there is no pylucene installed 
 45      import lucene 
 46      PyLucene = lucene 
 47      PyLucene.initVM(PyLucene.CLASSPATH) 
 48      _COMPILER = 'jcc' 
 49   
 50   
 51  UNNAMED_FIELD_NAME = "FieldWithoutAName" 
 52  MAX_FIELD_SIZE = 1048576 
 53   
 54   
def is_available():
    """check if the PyLucene (v2.x) indexing engine is usable

    @return: True if a PyLucene v2.x installation was detected
    @rtype: bool
    """
    detected_version = _get_pylucene_version()
    return detected_version == 2
57 58
class PyLuceneDatabase(CommonIndexer.CommonDatabase):
    """manage and use a pylucene indexing database"""

    # the type of query objects produced and accepted by this backend
    QUERY_TYPE = PyLucene.Query
    # name of the index directory; presumably combined with "basedir" by
    # CommonIndexer.CommonDatabase to form self.location - confirm there
    INDEX_DIRECTORY_NAME = "lucene"
65 - def __init__(self, basedir, analyzer=None, create_allowed=True):
66 """initialize or open an indexing database 67 68 Any derived class must override __init__. 69 70 @raise ValueError: the given location exists, but the database type 71 is incompatible (e.g. created by a different indexing engine) 72 @raise OSError: the database failed to initialize 73 74 @param basedir: the parent directory of the database 75 @type basedir: str 76 @param analyzer: bitwise combination of possible analyzer flags 77 to be used as the default analyzer for this database. Leave it empty 78 to use the system default analyzer (self.ANALYZER_DEFAULT). 79 see self.ANALYZER_TOKENIZE, self.ANALYZER_PARTIAL, ... 80 @type analyzer: int 81 @param create_allowed: create the database, if necessary; default: True 82 @type create_allowed: bool 83 """ 84 super(PyLuceneDatabase, self).__init__(basedir, analyzer=analyzer, 85 create_allowed=create_allowed) 86 self.pyl_analyzer = PyLucene.StandardAnalyzer() 87 self.writer = None 88 self.reader = None 89 self.index_version = None 90 try: 91 # try to open an existing database 92 tempreader = PyLucene.IndexReader.open(self.location) 93 tempreader.close() 94 except PyLucene.JavaError, err_msg: 95 # Write an error out, in case this is a real problem instead of an absence of an index 96 # TODO: turn the following two lines into debug output 97 #errorstr = str(e).strip() + "\n" + self.errorhandler.traceback_str() 98 #DEBUG_FOO("could not open index, so going to create: " + errorstr) 99 # Create the index, so we can open cached readers on it 100 if not create_allowed: 101 raise OSError("Indexer: skipping database creation") 102 try: 103 # create the parent directory if it does not exist 104 parent_path = os.path.dirname(self.location) 105 if not os.path.isdir(parent_path): 106 # recursively create all directories up to parent_path 107 os.makedirs(parent_path) 108 except IOError, err_msg: 109 raise OSError("Indexer: failed to create the parent " \ 110 + "directory (%s) of the indexing database: %s" \ 111 % (parent_path, 
err_msg)) 112 try: 113 tempwriter = PyLucene.IndexWriter(self.location, 114 self.pyl_analyzer, True) 115 tempwriter.close() 116 except PyLucene.JavaError, err_msg: 117 raise OSError("Indexer: failed to open or create a Lucene" \ 118 + " database (%s): %s" % (self.location, err_msg)) 119 # the indexer is initialized - now we prepare the searcher 120 # windows file locking seems inconsistent, so we try 10 times 121 numtries = 0 122 #self.dir_lock.acquire(blocking=True) 123 # read "self.reader", "self.indexVersion" and "self.searcher" 124 try: 125 while numtries < 10: 126 try: 127 self.reader = PyLucene.IndexReader.open(self.location) 128 self.indexVersion = self.reader.getCurrentVersion( 129 self.location) 130 self.searcher = PyLucene.IndexSearcher(self.reader) 131 break 132 except PyLucene.JavaError, e: 133 # store error message for possible later re-raise (below) 134 lock_error_msg = e 135 time.sleep(0.01) 136 numtries += 1 137 else: 138 # locking failed for 10 times 139 raise OSError("Indexer: failed to lock index database" \ 140 + " (%s)" % lock_error_msg) 141 finally: 142 pass 143 # self.dir_lock.release() 144 # initialize the searcher and the reader 145 self._index_refresh()
146
def __del__(self):
    """remove the lock and close the writer after losing the last reference"""
    self._writer_close()
150
def flush(self, optimize=False):
    """flush the content of the database - to force changes to be written
    to disk

    some databases also support index optimization

    @param optimize: should the index be optimized if possible?
    @type optimize: bool
    """
    if not self._writer_is_open():
        # no write access is open - there is nothing to flush
        return
    try:
        if optimize:
            self.writer.optimize()
    finally:
        # close the database even if optimizing failed
        self._writer_close()
    # the reader/searcher needs an update, too
    self._index_refresh()
169
def _create_query_for_query(self, query):
    """generate a query based on an existing query object

    basically this function should just create a copy of the original

    @param query: the original query object
    @type query: PyLucene.Query
    @return: resulting query object
    @rtype: PyLucene.Query
    """
    # TODO: a deep copy or a clone would be safer
    # somehow not working (returns "null"): copy.deepcopy(query)
    return query
183
def _create_query_for_string(self, text, require_all=True,
        analyzer=None):
    """generate a query for a plain term of a string query

    basically this function parses the string and returns the resulting
    query

    @param text: the query string
    @type text: str
    @param require_all: boolean operator
        (True -> AND (default) / False -> OR)
    @type require_all: bool
    @param analyzer: the analyzer to be used
        possible analyzers are:
         - L{CommonDatabase.ANALYZER_TOKENIZE}
           the field value is splitted to be matched word-wise
         - L{CommonDatabase.ANALYZER_PARTIAL}
           the field value must start with the query string
         - L{CommonDatabase.ANALYZER_EXACT}
           keep special characters and the like
    @type analyzer: bool
    @return: resulting query object
    @rtype: PyLucene.Query
    """
    if analyzer is None:
        analyzer = self.analyzer
    if analyzer == self.ANALYZER_EXACT:
        # exact matching: keep special characters and the like
        analyzer_obj = PyLucene.KeywordAnalyzer()
    else:
        # wildcards in the text would confuse the query parser
        text = _escape_term_value(text)
        analyzer_obj = PyLucene.StandardAnalyzer()
    parser = PyLucene.QueryParser(UNNAMED_FIELD_NAME, analyzer_obj)
    if analyzer & self.ANALYZER_PARTIAL > 0:
        # PyLucene uses explicit wildcards for partial matching
        text += "*"
    if require_all:
        parser.setDefaultOperator(parser.Operator.AND)
    else:
        parser.setDefaultOperator(parser.Operator.OR)
    return parser.parse(text)
224
def _create_query_for_field(self, field, value, analyzer=None):
    """generate a field query

    this functions creates a field->value query

    @param field: the fieldname to be used
    @type field: str
    @param value: the wanted value of the field
    @type value: str
    @param analyzer: the analyzer to be used
        possible analyzers are:
         - L{CommonDatabase.ANALYZER_TOKENIZE}
           the field value is splitted to be matched word-wise
         - L{CommonDatabase.ANALYZER_PARTIAL}
           the field value must start with the query string
         - L{CommonDatabase.ANALYZER_EXACT}
           keep special characters and the like
    @type analyzer: bool
    @return: resulting query object
    @rtype: PyLucene.Query
    """
    if analyzer is None:
        analyzer = self.analyzer
    if analyzer == self.ANALYZER_EXACT:
        # exact matching: keep special characters and the like
        analyzer_obj = PyLucene.KeywordAnalyzer()
    else:
        # wildcards in the value would confuse the query parser
        value = _escape_term_value(value)
        analyzer_obj = PyLucene.StandardAnalyzer()
    if analyzer & self.ANALYZER_PARTIAL > 0:
        # PyLucene uses explicit wildcards for partial matching
        value += "*"
    return PyLucene.QueryParser(field, analyzer_obj).parse(value)
258
def _create_query_combined(self, queries, require_all=True):
    """generate a combined query

    @param queries: list of the original queries
    @type queries: list of PyLucene.Query
    @param require_all: boolean operator
        (True -> AND (default) / False -> OR)
    @type require_all: bool
    @return: the resulting combined query object
    @rtype: PyLucene.Query
    """
    # the "occur" flag is the same for all clauses - compute it only once
    occur_flag = _occur(require_all, False)
    combined = PyLucene.BooleanQuery()
    for sub_query in queries:
        combined.add(PyLucene.BooleanClause(sub_query, occur_flag))
    return combined
275
def _create_empty_document(self):
    """create an empty document to be filled and added to the index later

    @return: the new document object
    @rtype: PyLucene.Document
    """
    return PyLucene.Document()
283
def _add_plain_term(self, document, term, tokenize=True):
    """add a term to a document

    the term is stored under the unnamed default field

    @param document: the document to be changed
    @type document: PyLucene.Document
    @param term: a single term to be added
    @type term: str
    @param tokenize: should the term be tokenized automatically
    @type tokenize: bool
    """
    if tokenize:
        index_flag = PyLucene.Field.Index.TOKENIZED
    else:
        index_flag = PyLucene.Field.Index.UN_TOKENIZED
    new_field = PyLucene.Field(str(UNNAMED_FIELD_NAME), term,
            PyLucene.Field.Store.YES, index_flag)
    document.add(new_field)
300
def _add_field_term(self, document, field, term, tokenize=True):
    """add a field term to a document

    @param document: the document to be changed
    @type document: PyLucene.Document
    @param field: name of the field
    @type field: str
    @param term: term to be associated to the field
    @type term: str
    @param tokenize: should the term be tokenized automatically
    @type tokenize: bool
    """
    if tokenize:
        index_flag = PyLucene.Field.Index.TOKENIZED
    else:
        index_flag = PyLucene.Field.Index.UN_TOKENIZED
    new_field = PyLucene.Field(str(field), term,
            PyLucene.Field.Store.YES, index_flag)
    document.add(new_field)
319
def _add_document_to_index(self, document):
    """add a prepared document to the index database

    @param document: the document to be added
    @type document: PyLucene.Document
    """
    # make sure write access is open (no-op if the writer is already open)
    self._writer_open()
    self.writer.addDocument(document)
328
def begin_transaction(self):
    """PyLucene does not support transactions

    Thus this function just opens the database for write access.
    Call "cancel_transaction" or "commit_transaction" to close write
    access in order to remove the exclusive lock from the database
    directory.
    """
    self._writer_open()
338
def cancel_transaction(self):
    """PyLucene does not support transactions

    Thus this function just closes the database write access and removes
    the exclusive lock.

    See 'begin_transaction' for details.
    """
    self._writer_close()
348
def commit_transaction(self):
    """PyLucene does not support transactions

    Thus this function just closes the database write access and removes
    the exclusive lock.

    See 'begin_transaction' for details.
    """
    self._writer_close()
    # make the committed changes visible to the reader/searcher
    self._index_refresh()
359
def get_query_result(self, query):
    """return an object containing the results of a query

    @param query: a pre-compiled query
    @type query: a query object of the real implementation
    @return: an object that allows access to the results
    @rtype: subclass of CommonEnquire
    """
    # wrap the lucene "Hits" object in our CommonEnquire-compatible class
    return PyLuceneHits(self.searcher.search(query))
369
def delete_document_by_id(self, docid):
    """delete a specified document

    @param docid: the document ID to be deleted
    @type docid: int
    """
    # remove a possibly stale write lock before modifying the index
    self._delete_stale_lock()
    self.reader.deleteDocument(docid)
    # NOTE(review): "flush" on an IndexReader depends on the installed
    # PyLucene/Lucene version - confirm against the bindings in use
    self.reader.flush()
    # TODO: check the performance impact of calling "refresh" for each id
    self._index_refresh()
381
def search(self, query, fieldnames):
    """return a list of the contents of specified fields for all matches of
    a query

    @param query: the query to be issued
    @type query: a query object of the real implementation
    @param fieldnames: the name(s) of a field of the document content
    @type fieldnames: string | list of strings
    @return: a list of dicts containing the specified field(s)
    @rtype: list of dicts
    """
    # accept a single fieldname as well as a list of fieldnames
    if isinstance(fieldnames, basestring):
        fieldnames = [fieldnames]
    hits = self.searcher.search(query)
    if _COMPILER == 'jcc':
        # turn the "Hits" object into a list of (ranking, document) pairs
        hits = [(number, hits.doc(number))
                for number in range(hits.length())]
    result = []
    for dummy, doc in hits:
        field_values = {}
        for name in fieldnames:
            # the special fieldname "None" maps to the unnamed default field
            if name is None:
                lucene_name = UNNAMED_FIELD_NAME
            else:
                lucene_name = name
            field_values[name] = doc.getValues(lucene_name)
        result.append(field_values)
    return result
411
def _delete_stale_lock(self):
    """remove a stale write lock from the index directory

    Lucene leaves a "write.lock" file behind if a writer dies without
    closing; if such a lock is older than 15 minutes it is assumed to be
    stale and is removed. Any failure here is ignored (best effort).
    """
    if not self.reader.isLocked(self.location):
        return
    # HACKISH: there is a lock, but the Lucene api cannot tell us how old
    # it is - we have to check the lock file's timestamp on the filesystem
    try:
        # stay in a try block, in case the lock disappears while testing it
        lock_stat = os.stat(os.path.join(self.location, 'write.lock'))
        age_minutes = (time.time() - lock_stat.st_mtime) / 60
        if age_minutes > 15:
            # BUGFIX: "logging" was used here without being imported,
            # turning every stale-lock removal into a NameError
            logging.warning("stale lock found in %s, removing.",
                    self.location)
            self.reader.unlock(self.reader.directory())
    except Exception:
        # best effort: a vanished lock file or a failed unlock is not fatal
        pass
425
def _writer_open(self):
    """open write access for the indexing database and acquire an
    exclusive lock
    """
    if self._writer_is_open():
        # do nothing, if it is already open
        return
    self._delete_stale_lock()
    self.writer = PyLucene.IndexWriter(self.location, self.pyl_analyzer,
            False)
    # "setMaxFieldLength" is available since PyLucene v2
    # we must stay compatible to v1 for the derived class
    # (PyLuceneIndexer1) - thus we make this step optional
    if hasattr(self.writer, "setMaxFieldLength"):
        self.writer.setMaxFieldLength(MAX_FIELD_SIZE)
def _writer_close(self):
    """close indexing write access and remove the database lock"""
    if not self._writer_is_open():
        return
    # commit pending changes, release the lock and drop the writer
    self.writer.commit()
    self.writer.close()
    self.writer = None
447
448 - def _writer_is_open(self):
449 """check if the indexing write access is currently open""" 450 return not self.writer is None
451
def _index_refresh(self):
    """re-read the indexer database

    The reader and searcher are (re)opened if they are missing or if the
    on-disk index version changed since they were last opened.
    """
    try:
        if self.reader is None or self.searcher is None:
            # nothing is open, yet - open reader and searcher from scratch
            self.reader = PyLucene.IndexReader.open(self.location)
            self.searcher = PyLucene.IndexSearcher(self.reader)
        elif self.index_version != self.reader.getCurrentVersion( \
                self.location):
            # the index changed on disk - replace the stale reader/searcher
            self.searcher.close()
            self.reader.close()
            self.reader = PyLucene.IndexReader.open(self.location)
            self.searcher = PyLucene.IndexSearcher(self.reader)
        # remember which index version the open reader belongs to
        self.index_version = self.reader.getCurrentVersion(self.location)
    except PyLucene.JavaError, e:
        # a failed refresh is silently ignored (best effort)
        # TODO: add some debugging output?
        #self.errorhandler.logerror("Error attempting to read index - try reindexing: "+str(e))
        pass
class PyLuceneHits(CommonIndexer.CommonEnquire):
    """an enquire object contains the information about the result of a request
    """

    def get_matches(self, start, number):
        """return a specified number of qualified matches of a previous query

        @param start: index of the first match to return (starting from zero)
        @type start: int
        @param number: the number of matching entries to return
        @type number: int
        @return: a set of matching entries and some statistics
        @rtype: tuple of (returned number, available number, matches)
            "matches" is a dictionary of::
                ["rank", "percent", "document", "docid"]
        """
        available = self.enquire.length()
        # "stop" is the lowest index number to be omitted
        stop = min(start + number, available)
        if stop <= start:
            # the requested range is empty (or beyond the available results)
            return (0, available, [])
        matches = [{
                "rank": index,
                "docid": self.enquire.id(index),
                "percent": self.enquire.score(index),
                "document": self.enquire.doc(index),
                } for index in range(start, stop)]
        return (stop - start, available, matches)
504
def _occur(required, prohibited):
    """map a (required, prohibited) flag pair to a BooleanClause "Occur" value

    It is an error to specify a clause as both required and prohibited -
    this (like any other unknown combination) yields None.
    """
    occur_map = {
        (True, False): PyLucene.BooleanClause.Occur.MUST,
        (False, False): PyLucene.BooleanClause.Occur.SHOULD,
        (False, True): PyLucene.BooleanClause.Occur.MUST_NOT,
    }
    return occur_map.get((required, prohibited))
516
def _get_pylucene_version():
    """get the installed pylucene version

    @return: 1 -> PyLucene v1.x / 2 -> PyLucene v2.x / 0 -> unknown
    @rtype: int
    """
    for major, prefix in ((1, "1."), (2, "2.")):
        if PyLucene.VERSION.startswith(prefix):
            return major
    return 0
530 531
532 -def _escape_term_value(text):
533 return re.sub("\*", "", text)
534