"""
base class for interfaces to indexing engines for pootle
"""

import os

import translate.lang.data

__revision__ = "$Id: CommonIndexer.py 15615 2010-08-22 21:13:42Z dwaynebailey $"


def is_available():
    """check if this indexing engine interface is usable

    this function must exist in every module that contains indexing engine
    interfaces

    @return: is this interface usable?
    @rtype: bool
    """
    return False


class CommonDatabase(object):
    """base class for indexing support

    any real implementation must override most methods of this class
    """

    field_analyzers = {}
    """mapping of field names and analyzers - see 'set_field_analyzers'"""

    ANALYZER_EXACT = 0
    """exact matching: the query string must equal the whole term string"""

    ANALYZER_PARTIAL = 1 << 1
    """partial matching: a document matches, even if the query string only
    matches the beginning of the term value."""

    ANALYZER_TOKENIZE = 1 << 2
    """tokenize terms and queries automatically"""

    ANALYZER_DEFAULT = ANALYZER_TOKENIZE | ANALYZER_PARTIAL
    """the default analyzer to be used if nothing is configured"""

    QUERY_TYPE = None
    """override this with the query class of the implementation"""

    INDEX_DIRECTORY_NAME = None
    """override this with a string to be used as the name of the indexing
    directory/file in the filesystem
    """
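
    # The ANALYZER_* values above are plain bit flags, so they can be combined
    # and tested with bitwise operators.  A minimal sketch (illustration only,
    # not part of the API):
    #
    #     analyzer = CommonDatabase.ANALYZER_TOKENIZE | CommonDatabase.ANALYZER_PARTIAL
    #     if (analyzer & CommonDatabase.ANALYZER_PARTIAL) > 0:
    #         pass    # prefix matching is enabled
    #
    # Note that ANALYZER_EXACT is zero: it cannot be detected with '&' and
    # simply means "no other analyzer flag is set".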

    def __init__(self, basedir, analyzer=None, create_allowed=True):
        """initialize or open an indexing database

        Any derived class must override __init__.

        Any implementation can rely on the "self.location" attribute to be set
        by the __init__ function of the super class.

        @raise ValueError: the given location exists, but the database type
            is incompatible (e.g. created by a different indexing engine)
        @raise OSError: the database failed to initialize

        @param basedir: the parent directory of the database
        @type basedir: str
        @param analyzer: bitwise combination of possible analyzer flags
            to be used as the default analyzer for this database. Leave it
            empty to use the system default analyzer (self.ANALYZER_DEFAULT).
            See self.ANALYZER_TOKENIZE, self.ANALYZER_PARTIAL, ...
        @type analyzer: int
        @param create_allowed: create the database, if necessary; default: True
        @type create_allowed: bool
        """
        if self.QUERY_TYPE is None:
            raise NotImplementedError("Incomplete indexer implementation: " \
                    + "'QUERY_TYPE' is undefined")
        if self.INDEX_DIRECTORY_NAME is None:
            raise NotImplementedError("Incomplete indexer implementation: " \
                    + "'INDEX_DIRECTORY_NAME' is undefined")
        self.location = os.path.join(basedir, self.INDEX_DIRECTORY_NAME)
        if (not create_allowed) and (not os.path.exists(self.location)):
            raise OSError("Indexer: the database does not exist - and I am" \
                    + " not configured to create it.")
        if analyzer is None:
            self.analyzer = self.ANALYZER_DEFAULT
        else:
            self.analyzer = analyzer
        self.field_analyzers = {}
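
    # Construction sketch: this base class is never instantiated directly - a
    # concrete backend (e.g. a xapian- or PyLucene-based subclass elsewhere in
    # this package) is used instead.  The names below are illustrative only:
    #
    #     db = XapianIndexer(os.path.join(project_dir, ".index"),
    #             create_allowed=True)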

    def flush(self, optimize=False):
        """flush the content of the database - to force changes to be written
        to disk

        some databases also support index optimization

        @param optimize: should the index be optimized if possible?
        @type optimize: bool
        """
        raise NotImplementedError("Incomplete indexer implementation: " \
                + "'flush' is missing")

    def make_query(self, args, require_all=True, analyzer=None):
        """create simple queries (strings or field searches) or
        combine multiple queries (AND/OR)

        To specify rules for field searches, you may want to take a look at
        'set_field_analyzers'. The parameter 'analyzer' can override the
        previously defined default setting.

        @param args: queries or search string or description of field query
            examples::
                [xapian.Query("foo"), xapian.Query("bar")]
                xapian.Query("foo")
                "bar"
                {"foo": "bar", "foobar": "foo"}
        @type args: list of queries | single query | str | dict
        @param require_all: boolean operator
            (True -> AND (default) / False -> OR)
        @type require_all: boolean
        @param analyzer: (only applicable for 'dict' or 'str')
            Define query options (partial matching, exact matching, tokenizing,
            ...) as bitwise combinations of CommonDatabase.ANALYZER_???.
            This can override previously defined field analyzer settings.
            If analyzer is None (default), then the configured analyzer for the
            field is used.
        @type analyzer: int
        @return: the combined query
        @rtype: query type of the specific implementation
        """
        # turn a dict into a list of (field, value) tuples
        if isinstance(args, dict):
            args = args.items()
        # turn 'args' into a list
        if not isinstance(args, list):
            args = [args]
        # combine all given queries
        result = []
        for query in args:
            # add precompiled queries as they are
            if isinstance(query, self.QUERY_TYPE):
                result.append(self._create_query_for_query(query))
            # create field/value queries out of a tuple
            elif isinstance(query, tuple):
                field, value = query
                # perform unicode normalization
                field = translate.lang.data.normalize(unicode(field))
                value = translate.lang.data.normalize(unicode(value))
                # use the configured analyzer of the field, if none was given
                if analyzer is None:
                    analyzer = self.get_field_analyzers(field)
                result.append(self._create_query_for_field(field, value,
                        analyzer=analyzer))
            # parse plaintext queries
            elif isinstance(query, basestring):
                if analyzer is None:
                    analyzer = self.analyzer
                # perform unicode normalization
                query = translate.lang.data.normalize(unicode(query))
                result.append(self._create_query_for_string(query,
                        require_all=require_all, analyzer=analyzer))
            else:
                # other types of queries are not supported
                raise ValueError("Unable to handle query type: %s" \
                        % str(type(query)))
        # return the combined query
        return self._create_query_combined(result, require_all)
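
    # make_query usage sketch ('db' is an instance of a concrete subclass;
    # the field names are illustrative only):
    #
    #     q1 = db.make_query("hello world")                  # AND of all terms
    #     q2 = db.make_query({"source": "file", "target": "Datei"},
    #             require_all=False)                         # OR of field queries
    #     q3 = db.make_query([q1, q2], require_all=True)     # combine queries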

    def _create_query_for_query(self, query):
        """generate a query based on an existing query object

        basically this function should just create a copy of the original

        @param query: the original query object
        @type query: xapian.Query
        @return: the resulting query object
        @rtype: xapian.Query | PyLucene.Query
        """
        raise NotImplementedError("Incomplete indexer implementation: " \
                + "'_create_query_for_query' is missing")

    def _create_query_for_string(self, text, require_all=True,
            analyzer=None):
        """generate a query for a plain term of a string query

        basically this function parses the string and returns the resulting
        query

        @param text: the query string
        @type text: str
        @param require_all: boolean operator
            (True -> AND (default) / False -> OR)
        @type require_all: bool
        @param analyzer: Define query options (partial matching, exact matching,
            tokenizing, ...) as bitwise combinations of
            CommonDatabase.ANALYZER_???.
            This can override previously defined field analyzer settings.
            If analyzer is None (default), then the configured analyzer for the
            field is used.
        @type analyzer: int
        @return: resulting query object
        @rtype: xapian.Query | PyLucene.Query
        """
        raise NotImplementedError("Incomplete indexer implementation: " \
                + "'_create_query_for_string' is missing")

    def _create_query_for_field(self, field, value, analyzer=None):
        """generate a field query

        this function creates a field->value query

        @param field: the fieldname to be used
        @type field: str
        @param value: the wanted value of the field
        @type value: str
        @param analyzer: Define query options (partial matching, exact matching,
            tokenizing, ...) as bitwise combinations of
            CommonDatabase.ANALYZER_???.
            This can override previously defined field analyzer settings.
            If analyzer is None (default), then the configured analyzer for the
            field is used.
        @type analyzer: int
        @return: resulting query object
        @rtype: xapian.Query | PyLucene.Query
        """
        raise NotImplementedError("Incomplete indexer implementation: " \
                + "'_create_query_for_field' is missing")

    def _create_query_combined(self, queries, require_all=True):
        """generate a combined query

        @param queries: list of the original queries
        @type queries: list of xapian.Query
        @param require_all: boolean operator
            (True -> AND (default) / False -> OR)
        @type require_all: bool
        @return: the resulting combined query object
        @rtype: xapian.Query | PyLucene.Query
        """
        raise NotImplementedError("Incomplete indexer implementation: " \
                + "'_create_query_combined' is missing")

    def index_document(self, data):
        """add the given data to the database

        @param data: the data to be indexed.
            A dictionary will be treated as fieldname:value combinations.
            If the fieldname is None then the value will be interpreted as a
            plain term or as a list of plain terms.
            Lists of terms are indexed separately.
            Lists of strings are treated as plain terms.
        @type data: dict | list of str
        """
        doc = self._create_empty_document()
        if isinstance(data, dict):
            data = data.items()
        # add all data to the new document
        for dataset in data:
            if isinstance(dataset, tuple):
                # the dataset tuple consists of '(key, value)'
                key, value = dataset
                if key is None:
                    if isinstance(value, list):
                        terms = value[:]
                    elif isinstance(value, basestring):
                        terms = [value]
                    else:
                        raise ValueError("Invalid data type to be indexed: %s" \
                                % str(type(data)))
                    for one_term in terms:
                        self._add_plain_term(doc, self._decode(one_term),
                                (self.ANALYZER_DEFAULT & self.ANALYZER_TOKENIZE > 0))
                else:
                    analyze_settings = self.get_field_analyzers(key)
                    # handle single terms as well as lists of terms
                    if not isinstance(value, list):
                        value = [value]
                    for one_term in value:
                        self._add_field_term(doc, key, self._decode(one_term),
                                (analyze_settings & self.ANALYZER_TOKENIZE > 0))
            elif isinstance(dataset, basestring):
                self._add_plain_term(doc, self._decode(dataset),
                        (self.ANALYZER_DEFAULT & self.ANALYZER_TOKENIZE > 0))
            else:
                raise ValueError("Invalid data type to be indexed: %s" \
                        % str(type(data)))
        self._add_document_to_index(doc)
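
    # index_document usage sketch (the field names are illustrative only):
    #
    #     # plain terms without a field
    #     db.index_document(["hello", "world"])
    #     # field/value pairs; a list value adds one term per entry
    #     db.index_document({"source": "open file", "project": ["gnome", "gtk"]})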

    def _create_empty_document(self):
        """create an empty document to be filled and added to the index later

        @return: the new document object
        @rtype: xapian.Document | PyLucene.Document
        """
        raise NotImplementedError("Incomplete indexer implementation: " \
                + "'_create_empty_document' is missing")

    def _add_plain_term(self, document, term, tokenize=True):
        """add a term to a document

        @param document: the document to be changed
        @type document: xapian.Document | PyLucene.Document
        @param term: a single term to be added
        @type term: str
        @param tokenize: should the term be tokenized automatically
        @type tokenize: bool
        """
        raise NotImplementedError("Incomplete indexer implementation: " \
                + "'_add_plain_term' is missing")

    def _add_field_term(self, document, field, term, tokenize=True):
        """add a field term to a document

        @param document: the document to be changed
        @type document: xapian.Document | PyLucene.Document
        @param field: name of the field
        @type field: str
        @param term: term to be associated to the field
        @type term: str
        @param tokenize: should the term be tokenized automatically
        @type tokenize: bool
        """
        raise NotImplementedError("Incomplete indexer implementation: " \
                + "'_add_field_term' is missing")

    def _add_document_to_index(self, document):
        """add a prepared document to the index database

        @param document: the document to be added
        @type document: xapian.Document | PyLucene.Document
        """
        raise NotImplementedError("Incomplete indexer implementation: " \
                + "'_add_document_to_index' is missing")

    def begin_transaction(self):
        """begin a transaction

        You can group multiple modifications of a database as a transaction.
        This prevents time-consuming database flushing and helps if you want
        a changeset to be committed either completely or not at all.
        No changes will be written to disk until 'commit_transaction'.
        'cancel_transaction' can be used to revert an ongoing transaction.

        Database types that do not support transactions may silently ignore it.
        """
        raise NotImplementedError("Incomplete indexer implementation: " \
                + "'begin_transaction' is missing")

    def cancel_transaction(self):
        """cancel an ongoing transaction

        See 'begin_transaction' for details.
        """
        raise NotImplementedError("Incomplete indexer implementation: " \
                + "'cancel_transaction' is missing")

    def commit_transaction(self):
        """submit the currently ongoing transaction and write changes to disk

        See 'begin_transaction' for details.
        """
        raise NotImplementedError("Incomplete indexer implementation: " \
                + "'commit_transaction' is missing")
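
    # Transaction usage sketch: group several modifications so they are
    # flushed together ('units' is illustrative; backends without transaction
    # support may treat these calls as no-ops):
    #
    #     db.begin_transaction()
    #     try:
    #         for unit in units:
    #             db.index_document({"source": unit.source, "target": unit.target})
    #         db.commit_transaction()
    #     except Exception:
    #         db.cancel_transaction()
    #         raise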

    def get_query_result(self, query):
        """return an object containing the results of a query

        @param query: a pre-compiled query
        @type query: a query object of the real implementation
        @return: an object that allows access to the results
        @rtype: subclass of CommonEnquire
        """
        raise NotImplementedError("Incomplete indexer implementation: " \
                + "'get_query_result' is missing")

    def delete_document_by_id(self, docid):
        """delete a specified document

        @param docid: the document ID to be deleted
        @type docid: int
        """
        raise NotImplementedError("Incomplete indexer implementation: " \
                + "'delete_document_by_id' is missing")

    def search(self, query, fieldnames):
        """return a list of the contents of specified fields for all matches of
        a query

        @param query: the query to be issued
        @type query: a query object of the real implementation
        @param fieldnames: the name(s) of a field of the document content
        @type fieldnames: string | list of strings
        @return: a list of dicts containing the specified field(s)
        @rtype: list of dicts
        """
        raise NotImplementedError("Incomplete indexer implementation: " \
                + "'search' is missing")
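
    # search usage sketch (the field names are illustrative only):
    #
    #     query = db.make_query({"source": "file"})
    #     results = db.search(query, ["source", "target"])
    #     # each entry of 'results' is a dict keyed by the requested field names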

    def delete_doc(self, ident):
        """delete the documents returned by a query

        @param ident: [list of] document IDs | dict describing a query | query
        @type ident: int | list of ints | dict | list of dicts |
            query (e.g. xapian.Query) | list of queries
        """
        # turn 'ident' into a list of document IDs or query descriptions
        if isinstance(ident, list):
            # it is already a list
            ident_list = ident
        else:
            ident_list = [ident]
        if len(ident_list) == 0:
            # nothing to be deleted
            return 0
        if isinstance(ident_list[0], int):
            # delete all documents with the given IDs and count the successes
            success_delete = [match for match in ident_list
                    if self.delete_document_by_id(match)]
            return len(success_delete)
        if isinstance(ident_list[0], dict):
            # a list of dicts - each dict describes a field query
            # (e.g. {"fieldname": "value"}); combine them with AND
            query = self.make_query([self.make_query(query_dict,
                    require_all=True) for query_dict in ident_list],
                    require_all=True)
        elif isinstance(ident_list[0], object):
            # assume pre-compiled queries of the backend
            # (note: every remaining type ends up in this branch)
            query = self.make_query(ident_list, require_all=True)
        else:
            # other types are not supported
            raise TypeError("description of documents to-be-deleted is not " \
                    + "supported: list of %s" % type(ident_list[0]))
        # delete all documents matching the combined query:
        # collect their document IDs first, then remove them by ID
        remove_list = []

        def add_docid_to_list(match):
            """collect every document ID"""
            remove_list.append(match["docid"])
        self._walk_matches(query, add_docid_to_list)
        return self.delete_doc(remove_list)
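
    # delete_doc usage sketch - the accepted forms of 'ident'
    # (the field names are illustrative only):
    #
    #     db.delete_doc(42)                            # a single document ID
    #     db.delete_doc([42, 43])                      # a list of document IDs
    #     db.delete_doc({"source": "obsolete"})        # all matches of a field query
    #     db.delete_doc(db.make_query("obsolete"))     # all matches of a query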

    def _walk_matches(self, query, function, arg_for_function=None):
        """use this function if you want to do something with every single match
        of a query

        example::
            self._walk_matches(query, function_for_match, arg_for_func)

        'function_for_match' is called with the matched object as its only
        argument (and with 'arg_for_function' as a second argument, if given)

        @param query: a query object of the real implementation
        @type query: xapian.Query | PyLucene.Query
        @param function: the function to execute with every match
        @type function: function
        @param arg_for_function: an optional argument for the function
        @type arg_for_function: anything
        """
        # execute the query
        enquire = self.get_query_result(query)
        # start with the first match
        start = 0
        # fake values to enter the loop at least once
        size, avail = (0, 1)
        # number of matches to fetch in one step
        steps = 2
        while start < avail:
            (size, avail, matches) = enquire.get_matches(start, steps)
            for match in matches:
                if arg_for_function is None:
                    function(match)
                else:
                    function(match, arg_for_function)
            start += size

    def set_field_analyzers(self, field_analyzers):
        """set the analyzers for different fields of the database documents

        All bitwise combinations of CommonDatabase.ANALYZER_??? are possible.

        @param field_analyzers: mapping of field names and analyzers
        @type field_analyzers: dict containing field names and analyzers
        @raise TypeError: invalid values in 'field_analyzers'
        """
        for field, analyzer in field_analyzers.items():
            # check for invalid values
            if not isinstance(field, (str, unicode)):
                raise TypeError("field name must be a string")
            if not isinstance(analyzer, int):
                raise TypeError("the analyzer must be a whole number (int)")
            # store the analyzer for this field
            self.field_analyzers[field] = analyzer
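
    # set_field_analyzers usage sketch (the field names are illustrative only):
    #
    #     db.set_field_analyzers({
    #             "source": db.ANALYZER_PARTIAL | db.ANALYZER_TOKENIZE,
    #             "project": db.ANALYZER_EXACT,
    #     })
    #     # queries built via make_query pick these settings up automatically
    #     # through get_field_analyzers()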

    def get_field_analyzers(self, fieldnames=None):
        """return the analyzer that was mapped to a specific field

        see 'set_field_analyzers' for details

        @param fieldnames: the analyzer of this field (or all/multiple fields)
            is requested; leave empty (or "None") to request all fields
        @type fieldnames: str | list of str | None
        @return: the analyzer setting of the field - see
            CommonDatabase.ANALYZER_??? or a dict of field names and analyzers
        @rtype: int | dict
        """
        # all field analyzers are requested
        if fieldnames is None:
            # return a copy
            return dict(self.field_analyzers)
        # a single field is requested
        if isinstance(fieldnames, (str, unicode)):
            if fieldnames in self.field_analyzers:
                return self.field_analyzers[fieldnames]
            else:
                return self.analyzer
        # a list of fields is requested
        if isinstance(fieldnames, list):
            result = {}
            for field in fieldnames:
                result[field] = self.get_field_analyzers(field)
            return result
        # invalid request - fall back to the default analyzer
        return self.analyzer

    def _decode(self, text):
        """decode the string from utf-8 or charmap and
        perform unicode normalization
        """
        if isinstance(text, str):
            try:
                result = unicode(text.decode("UTF-8"))
            except UnicodeDecodeError:
                result = unicode(text.decode("charmap"))
        elif not isinstance(text, unicode):
            result = unicode(text)
        else:
            result = text
        return translate.lang.data.normalize(result)


class CommonEnquire(object):
    """an enquire object contains the information about the result of a request
    """

    def __init__(self, enquire):
        """initialization of a wrapper around enquires of different backends

        @param enquire: a previous enquire
        @type enquire: xapian.Enquire | pylucene-enquire
        """
        self.enquire = enquire

    def get_matches(self, start, number):
        """return a specified number of qualified matches of a previous query

        @param start: index of the first match to return (starting from zero)
        @type start: int
        @param number: the number of matching entries to return
        @type number: int
        @return: a set of matching entries and some statistics
        @rtype: tuple of (returned number, available number, matches)
            "matches" is a list of dictionaries with the keys::
                ["rank", "percent", "document", "docid"]
        """
        raise NotImplementedError("Incomplete indexing implementation: " \
                + "'get_matches' for the 'Enquire' class is missing")

    def get_matches_count(self):
        """return the estimated number of matches

        use "CommonDatabase.search" to retrieve the exact number of matches

        @return: the estimated number of matches
        @rtype: int
        """
        (returned, estimate_count, matches) = self.get_matches(0, 1)
        return estimate_count