1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 """
24 Interface to the Xapian indexing engine for the Translate Toolkit
25
26 Xapian v1.0 or higher is supported.
27
28 If you are interested in writing an interface for Xapian 0.x, then
29 you should check out the following::
30 svn export -r 7235 https://translate.svn.sourceforge.net/svnroot/translate/src/branches/translate-search-indexer-generic-merging/translate/search/indexer/
31 It is not completely working, but it should give you a good start.
32 """
33
34 __revision__ = "$Id: XapianIndexer.py 13124 2009-11-16 10:51:22Z friedelwolff $"
35
36
37 import sys
38 import re
39
if 'apache' in sys.modules or '_apache' in sys.modules:
    # The 'apache'/'_apache' modules are only loaded when we are running
    # inside Apache (presumably via mod_python - confirm).  In that case the
    # xapian bindings are only accepted if the installed xapian is at least
    # version 1.0.13; otherwise the import of this module is refused.
    import subprocess

    try:
        command = subprocess.Popen(['xapian-check', '--version'],
                                   stdout=subprocess.PIPE)
        stdout = command.communicate()[0]
        if not isinstance(stdout, str):
            # the pipe delivers bytes on interpreters where str is unicode
            stdout = stdout.decode('ascii', 'replace')
        # "search" instead of a greedy ".*" prefix, so that no leading digit
        # of the version number can be swallowed by the prefix
        version_match = re.search(r'([0-9]+)\.([0-9]+)\.([0-9]+)', stdout)
        # compare numerically: as strings '1.0.9' would sort after '1.0.13'
        xapian_version = tuple([int(part)
                                for part in version_match.groups()])
    except Exception:
        # 'xapian-check' is missing, could not be executed or printed
        # something that does not contain a version number
        raise ImportError("Running under apache, can't load xapian")
    if xapian_version < (1, 0, 13):
        raise ImportError("Running under apache, can't load xapian")
50
51 import CommonIndexer
52 import xapian
53 import os
54
55
57 return xapian.major_version() > 0
58
59
60
61
62
63 _MAX_TERM_LENGTH = 128
64
65
67 """interface to the xapian (http://xapian.org) indexer
68 """
69
70 QUERY_TYPE = xapian.Query
71 INDEX_DIRECTORY_NAME = "xapian"
72
73 - def __init__(self, basedir, analyzer=None, create_allowed=True):
74 """initialize or open a xapian database
75
76 @raise ValueError: the given location exists, but the database type
77 is incompatible (e.g. created by a different indexing engine)
78 @raise OSError: the database failed to initialize
79
80 @param basedir: the parent directory of the database
81 @type basedir: str
82 @param analyzer: bitwise combination of possible analyzer flags
83 to be used as the default analyzer for this database. Leave it empty
84 to use the system default analyzer (self.ANALYZER_DEFAULT).
85 see self.ANALYZER_TOKENIZE, self.ANALYZER_PARTIAL, ...
86 @type analyzer: int
87 @param create_allowed: create the database, if necessary; default: True
88 @type create_allowed: bool
89 """
90
91 super(XapianDatabase, self).__init__(basedir, analyzer=analyzer,
92 create_allowed=create_allowed)
93 if os.path.exists(self.location):
94
95 try:
96 self.database = xapian.WritableDatabase(self.location,
97 xapian.DB_OPEN)
98 except xapian.DatabaseOpeningError, err_msg:
99 raise ValueError("Indexer: failed to open xapian database " \
100 + "(%s) - maybe it is not a xapian database: %s" \
101 % (self.location, err_msg))
102 else:
103
104 if not create_allowed:
105 raise OSError("Indexer: skipping database creation")
106 try:
107
108 parent_path = os.path.dirname(self.location)
109 if not os.path.isdir(parent_path):
110
111 os.makedirs(parent_path)
112 except IOError, err_msg:
113 raise OSError("Indexer: failed to create the parent " \
114 + "directory (%s) of the indexing database: %s" \
115 % (parent_path, err_msg))
116 try:
117 self.database = xapian.WritableDatabase(self.location,
118 xapian.DB_CREATE_OR_OPEN)
119 except xapian.DatabaseOpeningError, err_msg:
120 raise OSError("Indexer: failed to open or create a xapian " \
121 + "database (%s): %s" % (self.location, err_msg))
122
def flush(self, optimize=False):
    """write all pending changes to disk and reopen the database

    The database handle is dropped after flushing and the database is
    prepared again via self._prepare_database() without requesting write
    access.

    @param optimize: ignored for xapian
    @type optimize: bool
    """
    # only a writable database can have pending changes
    if isinstance(self.database, xapian.WritableDatabase):
        self.database.flush()
    # drop our handle before reopening
    # (presumably this releases xapian's write lock - confirm)
    self.database = None
    # reopen with the default access mode of _prepare_database
    self._prepare_database()
136
138 """generate a query based on an existing query object
139
140 basically this function should just create a copy of the original
141
142 @param query: the original query object
143 @type query: xapian.Query
144 @return: the resulting query object
145 @rtype: xapian.Query
146 """
147
148 return xapian.Query(query)
149
152 """generate a query for a plain term of a string query
153
154 basically this function parses the string and returns the resulting
155 query
156
157 @param text: the query string
158 @type text: str
159 @param require_all: boolean operator
160 (True -> AND (default) / False -> OR)
161 @type require_all: bool
162 @param analyzer: Define query options (partial matching, exact matching,
163 tokenizing, ...) as bitwise combinations of
164 CommonIndexer.ANALYZER_???.
165 This can override previously defined field analyzer settings.
166 If analyzer is None (default), then the configured analyzer for the
167 field is used.
168 @type analyzer: int
169 @return: resulting query object
170 @rtype: xapian.Query
171 """
172 qp = xapian.QueryParser()
173 qp.set_database(self.database)
174 if require_all:
175 qp.set_default_op(xapian.Query.OP_AND)
176 else:
177 qp.set_default_op(xapian.Query.OP_OR)
178 if analyzer is None:
179 analyzer = self.analyzer
180 if analyzer & self.ANALYZER_PARTIAL > 0:
181 match_flags = xapian.QueryParser.FLAG_PARTIAL
182 return qp.parse_query(text, match_flags)
183 elif analyzer == self.ANALYZER_EXACT:
184
185 return xapian.Query(text)
186 else:
187
188 match_flags = 0
189 return qp.parse_query(text, match_flags)
190
192 """generate a field query
193
194 this function creates a field->value query
195
196 @param field: the fieldname to be used
197 @type field: str
198 @param value: the wanted value of the field
199 @type value: str
200 @param analyzer: Define query options (partial matching, exact matching,
201 tokenizing, ...) as bitwise combinations of
202 CommonIndexer.ANALYZER_???.
203 This can override previously defined field analyzer settings.
204 If analyzer is None (default), then the configured analyzer for the
205 field is used.
206 @type analyzer: int
207 @return: the resulting query object
208 @rtype: xapian.Query
209 """
210 if analyzer is None:
211 analyzer = self.analyzer
212 if analyzer == self.ANALYZER_EXACT:
213
214 return xapian.Query("%s%s" % (field.upper(), value))
215
216 qp = xapian.QueryParser()
217 qp.set_database(self.database)
218 if (analyzer & self.ANALYZER_PARTIAL > 0):
219
220 match_flags = xapian.QueryParser.FLAG_PARTIAL
221 return qp.parse_query(value, match_flags, field.upper())
222 else:
223
224 match_flags = 0
225 return qp.parse_query(value, match_flags, field.upper())
226
228 """generate a combined query
229
230 @param queries: list of the original queries
231 @type queries: list of xapian.Query
232 @param require_all: boolean operator
233 (True -> AND (default) / False -> OR)
234 @type require_all: bool
235 @return: the resulting combined query object
236 @rtype: xapian.Query
237 """
238 if require_all:
239 query_op = xapian.Query.OP_AND
240 else:
241 query_op = xapian.Query.OP_OR
242 return xapian.Query(query_op, queries)
243
245 """create an empty document to be filled and added to the index later
246
247 @return: the new document object
248 @rtype: xapian.Document
249 """
250 return xapian.Document()
251
253 """add a term to a document
254
255 @param document: the document to be changed
256 @type document: xapian.Document
257 @param term: a single term to be added
258 @type term: str
259 @param tokenize: should the term be tokenized automatically
260 @type tokenize: bool
261 """
262 if tokenize:
263 term_gen = xapian.TermGenerator()
264 term_gen.set_document(document)
265 term_gen.index_text(term)
266 else:
267 document.add_term(_truncate_term_length(term))
268
270 """add a field term to a document
271
272 @param document: the document to be changed
273 @type document: xapian.Document
274 @param field: name of the field
275 @type field: str
276 @param term: term to be associated to the field
277 @type term: str
278 @param tokenize: should the term be tokenized automatically
279 @type tokenize: bool
280 """
281 if tokenize:
282 term_gen = xapian.TermGenerator()
283 term_gen.set_document(document)
284 term_gen.index_text(term, 1, field.upper())
285 else:
286 document.add_term(_truncate_term_length("%s%s" % \
287 (field.upper(), term)))
288
290 """add a prepared document to the index database
291
292 @param document: the document to be added
293 @type document: xapian.Document
294 """
295
296 self._prepare_database(writable=True)
297 self.database.add_document(document)
298
300 """begin a transaction
301
302 Xapian supports transactions to group multiple database modifications.
303 This avoids intermediate flushing and therefore increases performance.
304 """
305 self._prepare_database(writable=True)
306 self.database.begin_transaction()
307
309 """cancel an ongoing transaction
310
311 no changes since the last execution of 'begin_transaction' are written
312 """
313 self._prepare_database(writable=True)
314 self.database.cancel_transaction()
315
317 """submit the changes of an ongoing transaction
318
319 all changes since the last execution of 'begin_transaction' are written
320 """
321 self._prepare_database(writable=True)
322 self.database.commit_transaction()
323
325 """return an object containing the results of a query
326
327 @param query: a pre-compiled xapian query
328 @type query: xapian.Query
329 @return: an object that allows access to the results
330 @rtype: XapianIndexer.CommonEnquire
331 """
332 enquire = xapian.Enquire(self.database)
333 enquire.set_query(query)
334 return XapianEnquire(enquire)
335
337 """delete a specified document
338
339 @param docid: the document ID to be deleted
340 @type docid: int
341 """
342
343 self._prepare_database(writable=True)
344 try:
345 self.database.delete_document(docid)
346 return True
347 except xapian.DocNotFoundError:
348 return False
349
def search(self, query, fieldnames):
    """collect the contents of the given fields for all matches of a query

    The matches are processed by self._walk_matches, which is handed
    '_extract_fieldvalues' together with the result list and the field
    names.

    @param query: the query to be issued
    @type query: xapian.Query
    @param fieldnames: the name(s) of a field of the document content
    @type fieldnames: string | list of strings
    @return: a list of dicts containing the specified field(s)
    @rtype: list of dicts
    """
    # a single field name is accepted as a shortcut for a one-item list
    if isinstance(fieldnames, basestring):
        fieldnames = [fieldnames]
    collected = []
    # the callback receives (collected, fieldnames) and fills 'collected'
    self._walk_matches(query, _extract_fieldvalues, (collected, fieldnames))
    return collected
366
368 """reopen the database as read-only or as writable if necessary
369
370 this fixes a xapian specific issue regarding open locks for
371 writable databases
372
373 @param writable: True for opening a writable database
374 @type writable: bool
375 """
376 if writable and (not isinstance(self.database,
377 xapian.WritableDatabase)):
378 self.database = xapian.WritableDatabase(self.location,
379 xapian.DB_OPEN)
380 elif not writable and (not isinstance(self.database, xapian.Database)):
381 self.database = xapian.Database(self.location)
382
383
385 """interface to the xapian object for storing sets of matches
386 """
387
389 """return a specified number of qualified matches of a previous query
390
391 @param start: index of the first match to return (starting from zero)
392 @type start: int
393 @param number: the number of matching entries to return
394 @type number: int
395 @return: a set of matching entries and some statistics
396 @rtype: tuple of (returned number, available number, matches)
397 "matches" is a dictionary of::
398 ["rank", "percent", "document", "docid"]
399 """
400 matches = self.enquire.get_mset(start, number)
401 result = []
402 for match in matches:
403 elem = {}
404 elem["rank"] = match[xapian.MSET_RANK]
405 elem["docid"] = match[xapian.MSET_DID]
406 elem["percent"] = match[xapian.MSET_PERCENT]
407 elem["document"] = match[xapian.MSET_DOCUMENT]
408 result.append(elem)
409 return (matches.size(), matches.get_matches_estimated(), result)
410
411
413 """truncate a term string to the maximum length allowed
414 for xapian terms
415
416 @param term: the value of the term, that should be truncated
417 @type term: str
418 @param taken: since a term consists of the name of the term and its
419 actual value, this additional parameter can be used to reduce the
420 maximum count of possible characters
421 @type taken: int
422 @return: the truncated string
423 @rtype: str
424 """
425 if len(term) > _MAX_TERM_LENGTH - taken:
426 return term[0:_MAX_TERM_LENGTH - taken - 1]
427 else:
428 return term
429
431 """add a dict of field values to a list
432
433 usually this function should be used together with '_walk_matches'
434 for traversing a list of matches
435 @param match: a single match object
436 @type match: xapian.MSet
437 @param result: the resulting dict will be added to this list
438 @type result: list of dict
439 @param fieldnames: the names of the fields to be added to the dict
440 @type fieldnames: list of str
441 """
442
443 item_fields = {}
444
445 for term in match["document"].termlist():
446 for fname in fieldnames:
447 if ((fname is None) and re.match("[^A-Z]", term.term)):
448 value = term.term
449 elif re.match("%s[^A-Z]" % str(fname).upper(), term.term):
450 value = term.term[len(fname):]
451 else:
452 continue
453
454 if item_fields.has_key(fname):
455 item_fields[fname].append(value)
456 else:
457 item_fields[fname] = [value]
458 result.append(item_fields)
459