
Source Code for Module translate.search.indexing.PyLuceneIndexer

# -*- coding: utf-8 -*-
#
# Copyright 2008 Zuza Software Foundation
#
# This file is part of translate.
#
# translate is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# translate is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with translate; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#


"""
interface for the PyLucene (v2.x) indexing engine

take a look at PyLuceneIndexer1.py for the PyLucene v1.x interface
"""

__revision__ = "$Id: PyLuceneIndexer.py 15537 2010-08-15 20:23:28Z alaaosh $"

import CommonIndexer
import re
import os
import time
import logging

# try to import the PyLucene package (with the two possible names) and
# remember the type of the detected package (compiled with jcc (>=v2.3) or
# with gcj (<=v2.2))
try:
    import PyLucene
    _COMPILER = 'gcj'
except ImportError:
    # the gcj-based "PyLucene" package is not available - fall back to the
    # jcc-based "lucene" package; if this fails too, then there is no
    # PyLucene installed
    import lucene
    PyLucene = lucene
    PyLucene.initVM(PyLucene.CLASSPATH)
    _COMPILER = 'jcc'


UNNAMED_FIELD_NAME = "FieldWithoutAName"
MAX_FIELD_SIZE = 1048576

def is_available():
    return _get_pylucene_version() == 2

class PyLuceneDatabase(CommonIndexer.CommonDatabase):
    """manage and use a PyLucene indexing database"""

    QUERY_TYPE = PyLucene.Query
    INDEX_DIRECTORY_NAME = "lucene"

    def __init__(self, basedir, analyzer=None, create_allowed=True):
        """initialize or open an indexing database

        Any derived class must override __init__.

        @raise ValueError: the given location exists, but the database type
            is incompatible (e.g. created by a different indexing engine)
        @raise OSError: the database failed to initialize

        @param basedir: the parent directory of the database
        @type basedir: str
        @param analyzer: bitwise combination of possible analyzer flags
            to be used as the default analyzer for this database. Leave it
            empty to use the system default analyzer (self.ANALYZER_DEFAULT).
            see self.ANALYZER_TOKENIZE, self.ANALYZER_PARTIAL, ...
        @type analyzer: int
        @param create_allowed: create the database, if necessary; default: True
        @type create_allowed: bool
        """
        jvm = PyLucene.getVMEnv()
        jvm.attachCurrentThread()
        super(PyLuceneDatabase, self).__init__(basedir, analyzer=analyzer,
                create_allowed=create_allowed)
        self.pyl_analyzer = PyLucene.StandardAnalyzer()
        self.writer = None
        self.reader = None
        self.index_version = None
        try:
            # try to open an existing database
            tempreader = PyLucene.IndexReader.open(self.location)
            tempreader.close()
        except PyLucene.JavaError, err_msg:
            # write an error out, in case this is a real problem instead of
            # an absence of an index
            # TODO: turn the following two lines into debug output
            #errorstr = str(e).strip() + "\n" + self.errorhandler.traceback_str()
            #DEBUG_FOO("could not open index, so going to create: " + errorstr)
            # create the index, so we can open cached readers on it
            if not create_allowed:
                raise OSError("Indexer: skipping database creation")
            try:
                # create the parent directory if it does not exist
                parent_path = os.path.dirname(self.location)
                if not os.path.isdir(parent_path):
                    # recursively create all directories up to parent_path
                    os.makedirs(parent_path)
            except IOError, err_msg:
                raise OSError("Indexer: failed to create the parent "
                        + "directory (%s) of the indexing database: %s"
                        % (parent_path, err_msg))
            try:
                tempwriter = PyLucene.IndexWriter(self.location,
                        self.pyl_analyzer, True)
                tempwriter.close()
            except PyLucene.JavaError, err_msg:
                raise OSError("Indexer: failed to open or create a Lucene"
                        + " database (%s): %s" % (self.location, err_msg))
        # the indexer is initialized - now we prepare the searcher
        # windows file locking seems inconsistent, so we try 10 times
        numtries = 0
        #self.dir_lock.acquire(blocking=True)
        # read "self.reader", "self.index_version" and "self.searcher"
        try:
            while numtries < 10:
                try:
                    self.reader = PyLucene.IndexReader.open(self.location)
                    self.index_version = self.reader.getCurrentVersion(
                            self.location)
                    self.searcher = PyLucene.IndexSearcher(self.reader)
                    break
                except PyLucene.JavaError, e:
                    # store error message for possible later re-raise (below)
                    lock_error_msg = e
                    time.sleep(0.01)
                    numtries += 1
            else:
                # locking failed for 10 times
                raise OSError("Indexer: failed to lock index database"
                        + " (%s)" % lock_error_msg)
        finally:
            pass
            # self.dir_lock.release()
        # initialize the searcher and the reader
        self._index_refresh()

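    # Illustrative usage sketch (not part of the original module): opening or
    # creating an index could look roughly like the lines below; the directory
    # path is a hypothetical example.
    #
    #   from translate.search.indexing import PyLuceneIndexer
    #   if PyLuceneIndexer.is_available():
    #       database = PyLuceneIndexer.PyLuceneDatabase("/tmp/example_index",
    #               create_allowed=True)
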
    def __del__(self):
        """remove the lock and close the writer after losing the last reference"""
        self._writer_close()
        if self.reader is not None:
            self.reader.close()
            self.reader = None
        if self.searcher is not None:
            self.searcher.close()
            self.searcher = None

    def flush(self, optimize=False):
        """flush the content of the database - to force changes to be written
        to disk

        some databases also support index optimization

        @param optimize: should the index be optimized if possible?
        @type optimize: bool
        """
        keep_open = self._writer_is_open()
        self._writer_open()
        try:
            if optimize:
                self.writer.optimize()
        finally:
            self.writer.flush()
            if not keep_open:
                self._writer_close()

    def make_query(self, *args, **kwargs):
        jvm = PyLucene.getVMEnv()
        jvm.attachCurrentThread()
        return super(PyLuceneDatabase, self).make_query(*args, **kwargs)

    def _create_query_for_query(self, query):
        """generate a query based on an existing query object

        basically this function should just create a copy of the original

        @param query: the original query object
        @type query: PyLucene.Query
        @return: resulting query object
        @rtype: PyLucene.Query
        """
        # TODO: a deep copy or a clone would be safer
        # somehow not working (returns "null"): copy.deepcopy(query)
        return query

    def _create_query_for_string(self, text, require_all=True,
            analyzer=None):
        """generate a query for a plain term of a string query

        basically this function parses the string and returns the resulting
        query

        @param text: the query string
        @type text: str
        @param require_all: boolean operator
            (True -> AND (default) / False -> OR)
        @type require_all: bool
        @param analyzer: the analyzer to be used
            possible analyzers are:
              - L{CommonDatabase.ANALYZER_TOKENIZE}
                the field value is split to be matched word-wise
              - L{CommonDatabase.ANALYZER_PARTIAL}
                the field value must start with the query string
              - L{CommonDatabase.ANALYZER_EXACT}
                keep special characters and the like
        @type analyzer: int
        @return: resulting query object
        @rtype: PyLucene.Query
        """
        if analyzer is None:
            analyzer = self.analyzer
        if analyzer == self.ANALYZER_EXACT:
            analyzer_obj = PyLucene.KeywordAnalyzer()
        else:
            text = _escape_term_value(text)
            analyzer_obj = PyLucene.StandardAnalyzer()
        qp = PyLucene.QueryParser(UNNAMED_FIELD_NAME, analyzer_obj)
        if (analyzer & self.ANALYZER_PARTIAL > 0):
            # PyLucene uses explicit wildcards for partial matching
            text += "*"
        if require_all:
            qp.setDefaultOperator(qp.Operator.AND)
        else:
            qp.setDefaultOperator(qp.Operator.OR)
        return qp.parse(text)

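    # Illustrative sketch (not part of the original module): with the
    # ANALYZER_PARTIAL flag a trailing "*" wildcard is appended, so the parsed
    # query matches terms that merely start with the given text, e.g.:
    #
    #   query = database._create_query_for_string("hel",
    #           analyzer=database.ANALYZER_PARTIAL)
    #
    # With ANALYZER_EXACT the KeywordAnalyzer keeps the string verbatim
    # instead of tokenizing it.
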
    def _create_query_for_field(self, field, value, analyzer=None):
        """generate a field query

        this function creates a field->value query

        @param field: the fieldname to be used
        @type field: str
        @param value: the wanted value of the field
        @type value: str
        @param analyzer: the analyzer to be used
            possible analyzers are:
              - L{CommonDatabase.ANALYZER_TOKENIZE}
                the field value is split to be matched word-wise
              - L{CommonDatabase.ANALYZER_PARTIAL}
                the field value must start with the query string
              - L{CommonDatabase.ANALYZER_EXACT}
                keep special characters and the like
        @type analyzer: int
        @return: resulting query object
        @rtype: PyLucene.Query
        """
        if analyzer is None:
            analyzer = self.analyzer
        if analyzer == self.ANALYZER_EXACT:
            analyzer_obj = PyLucene.KeywordAnalyzer()
        else:
            value = _escape_term_value(value)
            analyzer_obj = PyLucene.StandardAnalyzer()
        qp = PyLucene.QueryParser(field, analyzer_obj)
        if (analyzer & self.ANALYZER_PARTIAL > 0):
            # PyLucene uses explicit wildcards for partial matching
            value += "*"
        return qp.parse(value)

    def _create_query_combined(self, queries, require_all=True):
        """generate a combined query

        @param queries: list of the original queries
        @type queries: list of PyLucene.Query
        @param require_all: boolean operator
            (True -> AND (default) / False -> OR)
        @type require_all: bool
        @return: the resulting combined query object
        @rtype: PyLucene.Query
        """
        combined_query = PyLucene.BooleanQuery()
        for query in queries:
            combined_query.add(
                    PyLucene.BooleanClause(query, _occur(require_all, False)))
        return combined_query

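    # Illustrative sketch (not part of the original module): two sub-queries
    # could be AND-combined into one BooleanQuery roughly like this (the field
    # names are hypothetical):
    #
    #   first = database._create_query_for_field("source", "hello")
    #   second = database._create_query_for_field("target", "hallo")
    #   combined = database._create_query_combined([first, second],
    #           require_all=True)
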
    def _create_empty_document(self):
        """create an empty document to be filled and added to the index later

        @return: the new document object
        @rtype: PyLucene.Document
        """
        return PyLucene.Document()

    def _add_plain_term(self, document, term, tokenize=True):
        """add a term to a document

        @param document: the document to be changed
        @type document: PyLucene.Document
        @param term: a single term to be added
        @type term: str
        @param tokenize: should the term be tokenized automatically
        @type tokenize: bool
        """
        if tokenize:
            token_flag = PyLucene.Field.Index.TOKENIZED
        else:
            token_flag = PyLucene.Field.Index.UN_TOKENIZED
        document.add(PyLucene.Field(str(UNNAMED_FIELD_NAME), term,
                PyLucene.Field.Store.YES, token_flag))

    def _add_field_term(self, document, field, term, tokenize=True):
        """add a field term to a document

        @param document: the document to be changed
        @type document: PyLucene.Document
        @param field: name of the field
        @type field: str
        @param term: term to be associated to the field
        @type term: str
        @param tokenize: should the term be tokenized automatically
        @type tokenize: bool
        """
        if tokenize:
            token_flag = PyLucene.Field.Index.TOKENIZED
        else:
            token_flag = PyLucene.Field.Index.UN_TOKENIZED
        document.add(PyLucene.Field(str(field), term,
                PyLucene.Field.Store.YES, token_flag))

    def _add_document_to_index(self, document):
        """add a prepared document to the index database

        @param document: the document to be added
        @type document: PyLucene.Document
        """
        self._writer_open()
        self.writer.addDocument(document)

    def begin_transaction(self):
        """PyLucene does not support transactions

        Thus this function just opens the database for write access.
        Call "cancel_transaction" or "commit_transaction" to close write
        access in order to remove the exclusive lock from the database
        directory.
        """
        jvm = PyLucene.getVMEnv()
        jvm.attachCurrentThread()
        self._writer_open()

    def cancel_transaction(self):
        """PyLucene does not support transactions

        Thus this function just closes the database write access and removes
        the exclusive lock.

        See 'begin_transaction' for details.
        """
        if self._writer_is_open():
            self.writer.abort()
        self._writer_close()

    def commit_transaction(self):
        """PyLucene does not support transactions

        Thus this function just closes the database write access and removes
        the exclusive lock.

        See 'begin_transaction' for details.
        """
        self._writer_close()
        self._index_refresh()

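    # Illustrative sketch (not part of the original module): since a
    # "transaction" here only means holding the exclusive write lock, adding a
    # document via the internal helpers could look roughly like this (the
    # field name is hypothetical):
    #
    #   database.begin_transaction()
    #   doc = database._create_empty_document()
    #   database._add_field_term(doc, "source", "hello world")
    #   database._add_document_to_index(doc)
    #   database.commit_transaction()
    #   database.flush(optimize=False)
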
    def get_query_result(self, query):
        """return an object containing the results of a query

        @param query: a pre-compiled query
        @type query: a query object of the real implementation
        @return: an object that allows access to the results
        @rtype: subclass of CommonEnquire
        """
        return PyLuceneHits(self.searcher.search(query))

    def delete_doc(self, ident):
        super(PyLuceneDatabase, self).delete_doc(ident)
        self.reader.flush()
        self._index_refresh()

    def delete_document_by_id(self, docid):
        """delete a specified document

        @param docid: the document ID to be deleted
        @type docid: int
        """
        if self._writer_is_open():
            self._writer_close()
        try:
            self.reader.deleteDocument(docid)
        except PyLucene.JavaError:
            self._index_refresh()
            self.reader.deleteDocument(docid)

    def search(self, query, fieldnames):
        """return a list of the contents of specified fields for all matches
        of a query

        @param query: the query to be issued
        @type query: a query object of the real implementation
        @param fieldnames: the name(s) of a field of the document content
        @type fieldnames: string | list of strings
        @return: a list of dicts containing the specified field(s)
        @rtype: list of dicts
        """
        if isinstance(fieldnames, basestring):
            fieldnames = [fieldnames]
        hits = self.searcher.search(query)
        if _COMPILER == 'jcc':
            # add the ranking number and the retrieved document to the array
            hits = [(hit, hits.doc(hit)) for hit in range(hits.length())]
        result = []
        for hit, doc in hits:
            fields = {}
            for fieldname in fieldnames:
                # take care of the special field name "None"
                if fieldname is None:
                    pyl_fieldname = UNNAMED_FIELD_NAME
                else:
                    pyl_fieldname = fieldname
                fields[fieldname] = doc.getValues(pyl_fieldname)
            result.append(fields)
        return result

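    # Illustrative sketch (not part of the original module): a full
    # query/search round trip could look roughly like this (field names are
    # hypothetical):
    #
    #   query = database._create_query_for_field("source", "hello")
    #   for item in database.search(query, ["source", "target"]):
    #       # each item is a dict mapping the requested field names to the
    #       # stored values of one matching document
    #       print item
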
    def _delete_stale_lock(self):
        if self.reader.isLocked(self.location):
            # HACKISH: there is a lock but the Lucene api can't tell us how
            # old it is, we will have to check the filesystem
            try:
                # in a try block, just in case the lock disappears on us while
                # testing it
                stat = os.stat(os.path.join(self.location, 'write.lock'))
                age = (time.time() - stat.st_mtime) / 60
                if age > 15:
                    logging.warning("stale lock found in %s, removing.",
                            self.location)
                    self.reader.unlock(self.reader.directory())
            except:
                pass

    def _writer_open(self):
        """open write access for the indexing database and acquire an
        exclusive lock
        """
        if not self._writer_is_open():
            self._delete_stale_lock()
            self.writer = PyLucene.IndexWriter(self.location,
                    self.pyl_analyzer, False)
            # "setMaxFieldLength" is available since PyLucene v2
            # we must stay compatible to v1 for the derived class
            # (PyLuceneIndexer1) - thus we make this step optional
            if hasattr(self.writer, "setMaxFieldLength"):
                self.writer.setMaxFieldLength(MAX_FIELD_SIZE)
        # do nothing, if it is already open

    def _writer_close(self):
        """close indexing write access and remove the database lock"""
        if self._writer_is_open():
            self.writer.close()
            self.writer = None

    def _writer_is_open(self):
        """check if the indexing write access is currently open"""
        return self.writer is not None

    def _index_refresh(self):
        """re-read the indexer database"""
        try:
            if self.reader is None or self.searcher is None:
                self.reader = PyLucene.IndexReader.open(self.location)
                self.searcher = PyLucene.IndexSearcher(self.reader)
            elif self.index_version != self.reader.getCurrentVersion(
                    self.location):
                self.searcher.close()
                self.reader.close()
                self.reader = PyLucene.IndexReader.open(self.location)
                self.searcher = PyLucene.IndexSearcher(self.reader)
                self.index_version = self.reader.getCurrentVersion(
                        self.location)
        except PyLucene.JavaError, e:
            # TODO: add some debugging output?
            #self.errorhandler.logerror("Error attempting to read index - try reindexing: "+str(e))
            pass


class PyLuceneHits(CommonIndexer.CommonEnquire):
    """an enquire object that contains the information about the result of a
    request
    """

    def get_matches(self, start, number):
        """return a specified number of qualified matches of a previous query

        @param start: index of the first match to return (starting from zero)
        @type start: int
        @param number: the number of matching entries to return
        @type number: int
        @return: a set of matching entries and some statistics
        @rtype: tuple of (returned number, available number, matches)
            "matches" is a dictionary of::
                ["rank", "percent", "document", "docid"]
        """
        # check if the requested results do not exist
        # "stop" is the lowest index number to be omitted
        stop = start + number
        if stop > self.enquire.length():
            stop = self.enquire.length()
        # invalid request range
        if stop <= start:
            return (0, self.enquire.length(), [])
        result = []
        for index in range(start, stop):
            item = {}
            item["rank"] = index
            item["docid"] = self.enquire.id(index)
            item["percent"] = self.enquire.score(index)
            item["document"] = self.enquire.doc(index)
            result.append(item)
        return (stop - start, self.enquire.length(), result)

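# Illustrative sketch (not part of the original module): paging through the
# results returned by PyLuceneDatabase.get_query_result could look roughly
# like this:
#
#   hits = database.get_query_result(query)
#   returned, available, matches = hits.get_matches(0, 10)
#   for match in matches:
#       # each match carries "rank", "percent", "docid" and "document"
#       doc = match["document"]
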
def _occur(required, prohibited):
    if required == True and prohibited == False:
        return PyLucene.BooleanClause.Occur.MUST
    elif required == False and prohibited == False:
        return PyLucene.BooleanClause.Occur.SHOULD
    elif required == False and prohibited == True:
        return PyLucene.BooleanClause.Occur.MUST_NOT
    else:
        # it is an error to specify a clause as both required
        # and prohibited
        return None

def _get_pylucene_version():
    """get the installed pylucene version

    @return: 1 -> PyLucene v1.x / 2 -> PyLucene v2.x / 0 -> unknown
    @rtype: int
    """
    version = PyLucene.VERSION
    if version.startswith("1."):
        return 1
    elif version.startswith("2."):
        return 2
    else:
        return 0


def _escape_term_value(text):
    return re.sub("\*", "", text)