1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 """
24 Interface to the Xapian indexing engine for the Translate Toolkit
25
26 Xapian v1.0 or higher is supported.
27
28 If you are interested in writing an interface for Xapian 0.x, then
29 you should check out the following::
30 svn export -r 7235 https://translate.svn.sourceforge.net/svnroot/translate/src/branches/translate-search-indexer-generic-merging/translate/search/indexer/
31 It is not completely working, but it should give you a good start.
32 """
33
34 __revision__ = "$Id: XapianIndexer.py 13411 2009-11-30 20:51:48Z alaaosh $"
35
36
37 import sys
38 import re
39
40
41 if 'apache' in sys.modules or '_apache' in sys.modules or 'mod_wsgi' in sys.modules:
43 return [int(i) for i in version.split('.')]
44
45 import subprocess
46
47 try:
48 command = subprocess.Popen(['xapian-check', '--version'], stdout=subprocess.PIPE)
49 stdout, stderr = command.communicate()
50 if _str2version(re.match('.*([0-9]+\.[0-9]+\.[0-9]+).*', stdout).groups()[0]) < [1, 0, 13]:
51 raise ImportError("Running under apache, can't load xapain")
52 except:
53
54 raise ImportError("Running under apache, can't load xapian")
55
56 import CommonIndexer
57 import xapian
58 import os
59
60
62 return xapian.major_version() > 0
63
64
65
66
67
# Upper bound on the len() of index terms: the term-truncation helper of this
# module shortens terms against this value before they are added to a document.
68 _MAX_TERM_LENGTH = 128
69
70
72 """interface to the xapian (http://xapian.org) indexer
73 """
74
75 QUERY_TYPE = xapian.Query
76 INDEX_DIRECTORY_NAME = "xapian"
77
def __init__(self, basedir, analyzer=None, create_allowed=True):
    """initialize or open a xapian database

    An existing location is opened as a writable database. A missing
    location is created (together with its parent directory, if needed)
    unless create_allowed is False.

    @raise ValueError: the given location exists, but the database type
            is incompatible (e.g. created by a different indexing engine)
    @raise OSError: the database failed to initialize

    @param basedir: the parent directory of the database
    @type basedir: str
    @param analyzer: bitwise combination of possible analyzer flags
            to be used as the default analyzer for this database. Leave it
            empty to use the system default analyzer
            (self.ANALYZER_DEFAULT).
            see self.ANALYZER_TOKENIZE, self.ANALYZER_PARTIAL, ...
    @type analyzer: int
    @param create_allowed: create the database, if necessary; default: True
    @type create_allowed: bool
    """
    # NOTE(review): self.location is not assigned in this method; presumably
    # the base class initializer provides it - confirm.
    super(XapianDatabase, self).__init__(basedir, analyzer=analyzer,
            create_allowed=create_allowed)
    if os.path.exists(self.location):
        # the location exists: it must be an openable xapian database
        try:
            self.database = xapian.WritableDatabase(self.location,
                    xapian.DB_OPEN)
        except xapian.DatabaseOpeningError:
            # sys.exc_info() is valid on every Python version, unlike
            # "except X, err" or "except X as err"
            err_msg = sys.exc_info()[1]
            raise ValueError("Indexer: failed to open xapian database "
                    "(%s) - maybe it is not a xapian database: %s"
                    % (self.location, err_msg))
    else:
        # the location does not exist yet: create a new database
        if not create_allowed:
            raise OSError("Indexer: skipping database creation")
        parent_path = os.path.dirname(self.location)
        try:
            if not os.path.isdir(parent_path):
                os.makedirs(parent_path)
        except (IOError, OSError):
            # os.makedirs reports failures as OSError; catching IOError alone
            # let them escape without the explanatory message below
            err_msg = sys.exc_info()[1]
            raise OSError("Indexer: failed to create the parent "
                    "directory (%s) of the indexing database: %s"
                    % (parent_path, err_msg))
        try:
            self.database = xapian.WritableDatabase(self.location,
                    xapian.DB_CREATE_OR_OPEN)
        except xapian.DatabaseOpeningError:
            err_msg = sys.exc_info()[1]
            raise OSError("Indexer: failed to open or create a xapian "
                    "database (%s): %s" % (self.location, err_msg))
127
def flush(self, optimize=False):
    """write pending changes to disk and re-acquire the database

    After an optional flush the current database handle is dropped and
    _prepare_database() is called without arguments to obtain a new one.

    @param optimize: ignored for xapian
    @type optimize: bool
    """
    current = self.database
    # only a writable handle has changes that can be flushed
    if isinstance(current, xapian.WritableDatabase):
        current.flush()
    # give up the handle before a new one is requested
    self.database = None
    self._prepare_database()
141
143 """generate a query based on an existing query object
144
145 basically this function should just create a copy of the original
146
147 @param query: the original query object
148 @type query: xapian.Query
149 @return: the resulting query object
150 @rtype: xapian.Query
151 """
152
153 return xapian.Query(query)
154
157 """generate a query for a plain term of a string query
158
159 basically this function parses the string and returns the resulting
160 query
161
162 @param text: the query string
163 @type text: str
164 @param require_all: boolean operator
165 (True -> AND (default) / False -> OR)
166 @type require_all: bool
167 @param analyzer: Define query options (partial matching, exact matching,
168 tokenizing, ...) as bitwise combinations of
169 CommonIndexer.ANALYZER_???.
170 This can override previously defined field analyzer settings.
171 If analyzer is None (default), then the configured analyzer for the
172 field is used.
173 @type analyzer: int
174 @return: resulting query object
175 @rtype: xapian.Query
176 """
177 qp = xapian.QueryParser()
178 qp.set_database(self.database)
179 if require_all:
180 qp.set_default_op(xapian.Query.OP_AND)
181 else:
182 qp.set_default_op(xapian.Query.OP_OR)
183 if analyzer is None:
184 analyzer = self.analyzer
185 if analyzer & self.ANALYZER_PARTIAL > 0:
186 match_flags = xapian.QueryParser.FLAG_PARTIAL
187 return qp.parse_query(text, match_flags)
188 elif analyzer == self.ANALYZER_EXACT:
189
190 return xapian.Query(text)
191 else:
192
193 match_flags = 0
194 return qp.parse_query(text, match_flags)
195
197 """generate a field query
198
199 this function creates a field->value query
200
201 @param field: the fieldname to be used
202 @type field: str
203 @param value: the wanted value of the field
204 @type value: str
205 @param analyzer: Define query options (partial matching, exact matching,
206 tokenizing, ...) as bitwise combinations of
207 CommonIndexer.ANALYZER_???.
208 This can override previously defined field analyzer settings.
209 If analyzer is None (default), then the configured analyzer for the
210 field is used.
211 @type analyzer: int
212 @return: the resulting query object
213 @rtype: xapian.Query
214 """
215 if analyzer is None:
216 analyzer = self.analyzer
217 if analyzer == self.ANALYZER_EXACT:
218
219 return xapian.Query("%s%s" % (field.upper(), value))
220
221 qp = xapian.QueryParser()
222 qp.set_database(self.database)
223 if (analyzer & self.ANALYZER_PARTIAL > 0):
224
225 match_flags = xapian.QueryParser.FLAG_PARTIAL
226 return qp.parse_query(value, match_flags, field.upper())
227 else:
228
229 match_flags = 0
230 return qp.parse_query(value, match_flags, field.upper())
231
233 """generate a combined query
234
235 @param queries: list of the original queries
236 @type queries: list of xapian.Query
237 @param require_all: boolean operator
238 (True -> AND (default) / False -> OR)
239 @type require_all: bool
240 @return: the resulting combined query object
241 @rtype: xapian.Query
242 """
243 if require_all:
244 query_op = xapian.Query.OP_AND
245 else:
246 query_op = xapian.Query.OP_OR
247 return xapian.Query(query_op, queries)
248
250 """create an empty document to be filled and added to the index later
251
252 @return: the new document object
253 @rtype: xapian.Document
254 """
255 return xapian.Document()
256
258 """add a term to a document
259
260 @param document: the document to be changed
261 @type document: xapian.Document
262 @param term: a single term to be added
263 @type term: str
264 @param tokenize: should the term be tokenized automatically
265 @type tokenize: bool
266 """
267 if tokenize:
268 term_gen = xapian.TermGenerator()
269 term_gen.set_document(document)
270 term_gen.index_text(term)
271 else:
272 document.add_term(_truncate_term_length(term))
273
275 """add a field term to a document
276
277 @param document: the document to be changed
278 @type document: xapian.Document
279 @param field: name of the field
280 @type field: str
281 @param term: term to be associated to the field
282 @type term: str
283 @param tokenize: should the term be tokenized automatically
284 @type tokenize: bool
285 """
286 if tokenize:
287 term_gen = xapian.TermGenerator()
288 term_gen.set_document(document)
289 term_gen.index_text(term, 1, field.upper())
290 else:
291 document.add_term(_truncate_term_length("%s%s" % \
292 (field.upper(), term)))
293
295 """add a prepared document to the index database
296
297 @param document: the document to be added
298 @type document: xapian.Document
299 """
300
301 self._prepare_database(writable=True)
302 self.database.add_document(document)
303
305 """begin a transaction
306
307 Xapian supports transactions to group multiple database modifications.
308 This avoids intermediate flushing and therefore increases performance.
309 """
310 self._prepare_database(writable=True)
311 self.database.begin_transaction()
312
314 """cancel an ongoing transaction
315
316 no changes since the last execution of 'begin_transaction' are written
317 """
318 self._prepare_database(writable=True)
319 self.database.cancel_transaction()
320
322 """submit the changes of an ongoing transaction
323
324 all changes since the last execution of 'begin_transaction' are written
325 """
326 self._prepare_database(writable=True)
327 self.database.commit_transaction()
328
330 """return an object containing the results of a query
331
332 @param query: a pre-compiled xapian query
333 @type query: xapian.Query
334 @return: an object that allows access to the results
335 @rtype: XapianIndexer.XapianEnquire
336 """
337 enquire = xapian.Enquire(self.database)
338 enquire.set_query(query)
339 return XapianEnquire(enquire)
340
342 """delete a specified document
343
344 @param docid: the document ID to be deleted
345 @type docid: int
346 """
347
348 self._prepare_database(writable=True)
349 try:
350 self.database.delete_document(docid)
351 return True
352 except xapian.DocNotFoundError:
353 return False
354
def search(self, query, fieldnames):
    """collect the contents of the requested fields for all matches of a
    query

    @param query: the query to be issued
    @type query: xapian.Query
    @param fieldnames: the name(s) of a field of the document content
    @type fieldnames: string | list of strings
    @return: a list of dicts containing the specified field(s)
    @rtype: list of dicts
    """
    # a single name is shorthand for a one-element list of names
    if isinstance(fieldnames, basestring):
        fieldnames = [fieldnames]
    collected = []
    # _extract_fieldvalues appends one dict to "collected" per call;
    # presumably _walk_matches calls it once for every match - confirm
    self._walk_matches(query, _extract_fieldvalues, (collected, fieldnames))
    return collected
371
373 """reopen the database as read-only or as writable if necessary
374
375 this fixes a xapian specific issue regarding open locks for
376 writable databases
377
378 @param writable: True for opening a writable database
379 @type writable: bool
380 """
381 if writable and (not isinstance(self.database,
382 xapian.WritableDatabase)):
383 self.database = xapian.WritableDatabase(self.location,
384 xapian.DB_OPEN)
385 elif not writable and (not isinstance(self.database, xapian.Database)):
386 self.database = xapian.Database(self.location)
387
388
390 """interface to the xapian object for storing sets of matches
391 """
392
394 """return a specified number of qualified matches of a previous query
395
396 @param start: index of the first match to return (starting from zero)
397 @type start: int
398 @param number: the number of matching entries to return
399 @type number: int
400 @return: a set of matching entries and some statistics
401 @rtype: tuple of (returned number, available number, matches)
402 "matches" is a dictionary of::
403 ["rank", "percent", "document", "docid"]
404 """
405 matches = self.enquire.get_mset(start, number)
406 result = []
407 for match in matches:
408 elem = {}
409 elem["rank"] = match[xapian.MSET_RANK]
410 elem["docid"] = match[xapian.MSET_DID]
411 elem["percent"] = match[xapian.MSET_PERCENT]
412 elem["document"] = match[xapian.MSET_DOCUMENT]
413 result.append(elem)
414 return (matches.size(), matches.get_matches_estimated(), result)
415
416
418 """truncate a term string to the maximum length allowed
419 for xapian terms
420
421 @param term: the value of the term, that should be truncated
422 @type term: str
423 @param taken: since a term consists of the name of the term and its
424 actual value, this additional parameter can be used to reduce the
425 maximum count of possible characters
426 @type taken: int
427 @return: the truncated string
428 @rtype: str
429 """
430 if len(term) > _MAX_TERM_LENGTH - taken:
431 return term[0:_MAX_TERM_LENGTH - taken - 1]
432 else:
433 return term
434
436 """add a dict of field values to a list
437
438 usually this function should be used together with '_walk_matches'
439 for traversing a list of matches
440 @param match: a single match object
441 @type match: xapian.MSet
442 @param result: the resulting dict will be added to this list
443 @type result: list of dict
444 @param fieldnames: the names of the fields to be added to the dict
445 @type fieldnames: list of str
446 """
447
448 item_fields = {}
449
450 for term in match["document"].termlist():
451 for fname in fieldnames:
452 if ((fname is None) and re.match("[^A-Z]", term.term)):
453 value = term.term
454 elif re.match("%s[^A-Z]" % str(fname).upper(), term.term):
455 value = term.term[len(fname):]
456 else:
457 continue
458
459 if item_fields.has_key(fname):
460 item_fields[fname].append(value)
461 else:
462 item_fields[fname] = [value]
463 result.append(item_fields)
464