1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 """
24 interface for the PyLucene (v2.x) indexing engine
25
26 take a look at PyLuceneIndexer1.py for the PyLucene v1.x interface
27 """
28
29 __revision__ = "$Id: PyLuceneIndexer.py 13070 2009-11-13 16:47:01Z alaaosh $"
30
import logging
import os
import re
import tempfile
import time

import CommonIndexer
36
37
38
39
try:
    # old PyLucene versions (compiled with gcj) provide a "PyLucene" module
    import PyLucene
    _COMPILER = 'gcj'
except ImportError:
    # newer PyLucene versions (compiled with jcc) provide a "lucene" module
    import lucene
    PyLucene = lucene
    # with jcc the Java VM must be started before lucene can be used
    PyLucene.initVM(PyLucene.CLASSPATH)
    _COMPILER = 'jcc'


# field name used for terms that were indexed without an explicit fieldname
UNNAMED_FIELD_NAME = "FieldWithoutAName"
# maximum number of tokens that lucene stores per field
MAX_FIELD_SIZE = 1048576
53
54
class PyLuceneDatabase(CommonIndexer.CommonDatabase):
    """manage and use a pylucene indexing database"""

    # query objects handed around for this engine are PyLucene queries
    QUERY_TYPE = PyLucene.Query
    # name of the subdirectory that stores the lucene index
    INDEX_DIRECTORY_NAME = "lucene"
65 - def __init__(self, basedir, analyzer=None, create_allowed=True):
66 """initialize or open an indexing database
67
68 Any derived class must override __init__.
69
70 @raise ValueError: the given location exists, but the database type
71 is incompatible (e.g. created by a different indexing engine)
72 @raise OSError: the database failed to initialize
73
74 @param basedir: the parent directory of the database
75 @type basedir: str
76 @param analyzer: bitwise combination of possible analyzer flags
77 to be used as the default analyzer for this database. Leave it empty
78 to use the system default analyzer (self.ANALYZER_DEFAULT).
79 see self.ANALYZER_TOKENIZE, self.ANALYZER_PARTIAL, ...
80 @type analyzer: int
81 @param create_allowed: create the database, if necessary; default: True
82 @type create_allowed: bool
83 """
84 super(PyLuceneDatabase, self).__init__(basedir, analyzer=analyzer,
85 create_allowed=create_allowed)
86 self.pyl_analyzer = PyLucene.StandardAnalyzer()
87 self.writer = None
88 self.reader = None
89 self.index_version = None
90 try:
91
92 tempreader = PyLucene.IndexReader.open(self.location)
93 tempreader.close()
94 except PyLucene.JavaError, err_msg:
95
96
97
98
99
100 if not create_allowed:
101 raise OSError("Indexer: skipping database creation")
102 try:
103
104 parent_path = os.path.dirname(self.location)
105 if not os.path.isdir(parent_path):
106
107 os.makedirs(parent_path)
108 except IOError, err_msg:
109 raise OSError("Indexer: failed to create the parent " \
110 + "directory (%s) of the indexing database: %s" \
111 % (parent_path, err_msg))
112 try:
113 tempwriter = PyLucene.IndexWriter(self.location,
114 self.pyl_analyzer, True)
115 tempwriter.close()
116 except PyLucene.JavaError, err_msg:
117 raise OSError("Indexer: failed to open or create a Lucene" \
118 + " database (%s): %s" % (self.location, err_msg))
119
120
121 numtries = 0
122
123
124 try:
125 while numtries < 10:
126 try:
127 self.reader = PyLucene.IndexReader.open(self.location)
128 self.indexVersion = self.reader.getCurrentVersion(
129 self.location)
130 self.searcher = PyLucene.IndexSearcher(self.reader)
131 break
132 except PyLucene.JavaError, e:
133
134 lock_error_msg = e
135 time.sleep(0.01)
136 numtries += 1
137 else:
138
139 raise OSError("Indexer: failed to lock index database" \
140 + " (%s)" % lock_error_msg)
141 finally:
142 pass
143
144
145 self._index_refresh()
146
148 """remove lock and close writer after loosing the last reference"""
149 self._writer_close()
150
151 - def flush(self, optimize=False):
152 """flush the content of the database - to force changes to be written
153 to disk
154
155 some databases also support index optimization
156
157 @param optimize: should the index be optimized if possible?
158 @type optimize: bool
159 """
160 if self._writer_is_open():
161 try:
162 if optimize:
163 self.writer.optimize()
164 finally:
165
166 self._writer_close()
167
168 self._index_refresh()
169
171 """generate a query based on an existing query object
172
173 basically this function should just create a copy of the original
174
175 @param query: the original query object
176 @type query: PyLucene.Query
177 @return: resulting query object
178 @rtype: PyLucene.Query
179 """
180
181
182 return query
183
186 """generate a query for a plain term of a string query
187
188 basically this function parses the string and returns the resulting
189 query
190
191 @param text: the query string
192 @type text: str
193 @param require_all: boolean operator
194 (True -> AND (default) / False -> OR)
195 @type require_all: bool
196 @param analyzer: the analyzer to be used
197 possible analyzers are:
198 - L{CommonDatabase.ANALYZER_TOKENIZE}
199 the field value is splitted to be matched word-wise
200 - L{CommonDatabase.ANALYZER_PARTIAL}
201 the field value must start with the query string
202 - L{CommonDatabase.ANALYZER_EXACT}
203 keep special characters and the like
204 @type analyzer: bool
205 @return: resulting query object
206 @rtype: PyLucene.Query
207 """
208 if analyzer is None:
209 analyzer = self.analyzer
210 if analyzer == self.ANALYZER_EXACT:
211 analyzer_obj = PyLucene.KeywordAnalyzer()
212 else:
213 text = _escape_term_value(text)
214 analyzer_obj = PyLucene.StandardAnalyzer()
215 qp = PyLucene.QueryParser(UNNAMED_FIELD_NAME, analyzer_obj)
216 if (analyzer & self.ANALYZER_PARTIAL > 0):
217
218 text += "*"
219 if require_all:
220 qp.setDefaultOperator(qp.Operator.AND)
221 else:
222 qp.setDefaultOperator(qp.Operator.OR)
223 return qp.parse(text)
224
226 """generate a field query
227
228 this functions creates a field->value query
229
230 @param field: the fieldname to be used
231 @type field: str
232 @param value: the wanted value of the field
233 @type value: str
234 @param analyzer: the analyzer to be used
235 possible analyzers are:
236 - L{CommonDatabase.ANALYZER_TOKENIZE}
237 the field value is splitted to be matched word-wise
238 - L{CommonDatabase.ANALYZER_PARTIAL}
239 the field value must start with the query string
240 - L{CommonDatabase.ANALYZER_EXACT}
241 keep special characters and the like
242 @type analyzer: bool
243 @return: resulting query object
244 @rtype: PyLucene.Query
245 """
246 if analyzer is None:
247 analyzer = self.analyzer
248 if analyzer == self.ANALYZER_EXACT:
249 analyzer_obj = PyLucene.KeywordAnalyzer()
250 else:
251 value = _escape_term_value(value)
252 analyzer_obj = PyLucene.StandardAnalyzer()
253 qp = PyLucene.QueryParser(field, analyzer_obj)
254 if (analyzer & self.ANALYZER_PARTIAL > 0):
255
256 value += "*"
257 return qp.parse(value)
258
260 """generate a combined query
261
262 @param queries: list of the original queries
263 @type queries: list of PyLucene.Query
264 @param require_all: boolean operator
265 (True -> AND (default) / False -> OR)
266 @type require_all: bool
267 @return: the resulting combined query object
268 @rtype: PyLucene.Query
269 """
270 combined_query = PyLucene.BooleanQuery()
271 for query in queries:
272 combined_query.add(
273 PyLucene.BooleanClause(query, _occur(require_all, False)))
274 return combined_query
275
277 """create an empty document to be filled and added to the index later
278
279 @return: the new document object
280 @rtype: PyLucene.Document
281 """
282 return PyLucene.Document()
283
285 """add a term to a document
286
287 @param document: the document to be changed
288 @type document: PyLucene.Document
289 @param term: a single term to be added
290 @type term: str
291 @param tokenize: should the term be tokenized automatically
292 @type tokenize: bool
293 """
294 if tokenize:
295 token_flag = PyLucene.Field.Index.TOKENIZED
296 else:
297 token_flag = PyLucene.Field.Index.UN_TOKENIZED
298 document.add(PyLucene.Field(str(UNNAMED_FIELD_NAME), term,
299 PyLucene.Field.Store.YES, token_flag))
300
302 """add a field term to a document
303
304 @param document: the document to be changed
305 @type document: PyLucene.Document
306 @param field: name of the field
307 @type field: str
308 @param term: term to be associated to the field
309 @type term: str
310 @param tokenize: should the term be tokenized automatically
311 @type tokenize: bool
312 """
313 if tokenize:
314 token_flag = PyLucene.Field.Index.TOKENIZED
315 else:
316 token_flag = PyLucene.Field.Index.UN_TOKENIZED
317 document.add(PyLucene.Field(str(field), term,
318 PyLucene.Field.Store.YES, token_flag))
319
321 """add a prepared document to the index database
322
323 @param document: the document to be added
324 @type document: PyLucene.Document
325 """
326 self._writer_open()
327 self.writer.addDocument(document)
328
330 """PyLucene does not support transactions
331
332 Thus this function just opens the database for write access.
333 Call "cancel_transaction" or "commit_transaction" to close write
334 access in order to remove the exclusive lock from the database
335 directory.
336 """
337 self._writer_open()
338
340 """PyLucene does not support transactions
341
342 Thus this function just closes the database write access and removes
343 the exclusive lock.
344
345 See 'start_transaction' for details.
346 """
347 self._writer_close()
348
350 """PyLucene does not support transactions
351
352 Thus this function just closes the database write access and removes
353 the exclusive lock.
354
355 See 'start_transaction' for details.
356 """
357 self._writer_close()
358 self._index_refresh()
359
361 """return an object containing the results of a query
362
363 @param query: a pre-compiled query
364 @type query: a query object of the real implementation
365 @return: an object that allows access to the results
366 @rtype: subclass of CommonEnquire
367 """
368 return PyLuceneHits(self.searcher.search(query))
369
371 """delete a specified document
372
373 @param docid: the document ID to be deleted
374 @type docid: int
375 """
376 self._delete_stale_lock()
377 self.reader.deleteDocument(docid)
378 self.reader.flush()
379
380 self._index_refresh()
381
382 - def search(self, query, fieldnames):
383 """return a list of the contents of specified fields for all matches of
384 a query
385
386 @param query: the query to be issued
387 @type query: a query object of the real implementation
388 @param fieldnames: the name(s) of a field of the document content
389 @type fieldnames: string | list of strings
390 @return: a list of dicts containing the specified field(s)
391 @rtype: list of dicts
392 """
393 if isinstance(fieldnames, basestring):
394 fieldnames = [fieldnames]
395 hits = self.searcher.search(query)
396 if _COMPILER == 'jcc':
397
398 hits = [(hit, hits.doc(hit)) for hit in range(hits.length())]
399 result = []
400 for hit, doc in hits:
401 fields = {}
402 for fieldname in fieldnames:
403
404 if fieldname is None:
405 pyl_fieldname = UNNAMED_FIELD_NAME
406 else:
407 pyl_fieldname = fieldname
408 fields[fieldname] = doc.getValues(pyl_fieldname)
409 result.append(fields)
410 return result
411
413 if self.reader.isLocked(self.location):
414
415
416 try:
417
418 stat = os.stat(os.path.join(self.location, 'write.lock'))
419 age = (time.time() - stat.st_mtime) / 60
420 if age > 15:
421 logging.warning("stale lock found in %s, removing.", self.location)
422 self.reader.unlock(self.reader.directory())
423 except:
424 pass
425
427 """open write access for the indexing database and acquire an
428 exclusive lock
429 """
430 if not self._writer_is_open():
431 self._delete_stale_lock()
432 self.writer = PyLucene.IndexWriter(self.location, self.pyl_analyzer,
433 False)
434
435
436
437 if hasattr(self.writer, "setMaxFieldLength"):
438 self.writer.setMaxFieldLength(MAX_FIELD_SIZE)
439
440
442 """close indexing write access and remove the database lock"""
443 if self._writer_is_open():
444 self.writer.commit()
445 self.writer.close()
446 self.writer = None
447
449 """check if the indexing write access is currently open"""
450 return not self.writer is None
451
453 """re-read the indexer database"""
454 try:
455 if self.reader is None or self.searcher is None:
456 self.reader = PyLucene.IndexReader.open(self.location)
457 self.searcher = PyLucene.IndexSearcher(self.reader)
458 elif self.index_version != self.reader.getCurrentVersion( \
459 self.location):
460 self.searcher.close()
461 self.reader.close()
462 self.reader = PyLucene.IndexReader.open(self.location)
463 self.searcher = PyLucene.IndexSearcher(self.reader)
464 self.index_version = self.reader.getCurrentVersion(self.location)
465 except PyLucene.JavaError,e:
466
467
468 pass
469
470
class PyLuceneHits(CommonIndexer.CommonEnquire):
    """an enquire object contains the information about the result of a request
    """
474
476 """return a specified number of qualified matches of a previous query
477
478 @param start: index of the first match to return (starting from zero)
479 @type start: int
480 @param number: the number of matching entries to return
481 @type number: int
482 @return: a set of matching entries and some statistics
483 @rtype: tuple of (returned number, available number, matches)
484 "matches" is a dictionary of::
485 ["rank", "percent", "document", "docid"]
486 """
487
488
489 stop = start + number
490 if stop > self.enquire.length():
491 stop = self.enquire.length()
492
493 if stop <= start:
494 return (0, self.enquire.length(), [])
495 result = []
496 for index in range(start, stop):
497 item = {}
498 item["rank"] = index
499 item["docid"] = self.enquire.id(index)
500 item["percent"] = self.enquire.score(index)
501 item["document"] = self.enquire.doc(index)
502 result.append(item)
503 return (stop-start, self.enquire.length(), result)
504
def _occur(required, prohibited):
    """map a (required, prohibited) flag pair to a lucene "Occur" constant

    Returns None for the invalid combination (required and prohibited).
    """
    mapping = {
        (True, False): PyLucene.BooleanClause.Occur.MUST,
        (False, False): PyLucene.BooleanClause.Occur.SHOULD,
        (False, True): PyLucene.BooleanClause.Occur.MUST_NOT,
    }
    # a clause may not be both required and prohibited -> None
    return mapping.get((required, prohibited))
516
def _get_pylucene_version():
    """get the installed pylucene version

    @return: 1 -> PyLucene v1.x / 2 -> PyLucene v2.x / 0 -> unknown
    @rtype: int
    """
    for prefix, major in (("1.", 1), ("2.", 2)):
        if PyLucene.VERSION.startswith(prefix):
            return major
    # neither a v1.x nor a v2.x version string
    return 0
530
531
534