Opera 12.15 Source Code
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

ACT.h 8.2KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240
  1. /* -*- Mode: c++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*-
  2. **
  3. ** Copyright (C) 1995-2011 Opera Software ASA. All rights reserved.
  4. **
  5. ** This file is part of the Opera web browser.
  6. ** It may not be distributed under any circumstances.
  7. */
  8. #ifndef ACT_H
  9. #define ACT_H
  10. #include "modules/search_engine/BlockStorage.h"
  11. #include "modules/search_engine/BSCache.h"
  12. #include "modules/search_engine/ResultBase.h"
  13. // skip non-printable characters
  14. #define FIRST_CHAR ' '
  15. #define RANDOM_STATUS_SIZE 17
  16. class TrieBranch;
  17. /**
  18. * @brief Array Compacted Trie implementation for searching/indexing utf-8 text
  19. * @author Pavel Studeny <pavels@opera.com>
  20. *
  21. * A trie is an array of pointers indexed by the corresponding character of a word.
  22. * Each pointer points to the array for the next character.
  23. @verbatim Example:
  24. "word"
  25. [a|b|c|....|w|...]
  26. \-> [a|b|c|.....|o|....]
  27. \-> [.....]
  28. @endverbatim
  29. * These tries are very sparse, so the pointer in ACTs can point to a free position in the current array
  30. * instead of pointing to next array.
  31. * @see http://www.n3labs.com/pdf/fast-and-space-efficient.pdf for more information about ACTs
  32. */
  33. class ACT : public BSCache
  34. {
  35. public:
  36. typedef UINT32 WordID;
  37. struct PrefixResult : public NonCopyable // one prefix-search hit: word ID plus the utf-8 word
  38. {
  39. PrefixResult(void) : id(0), utf8_word(NULL) {}
  40. ~PrefixResult(void) {OP_DELETEA(utf8_word);} // the result owns utf8_word and releases it here
  41. WordID id;
  42. char *utf8_word; // array freed with OP_DELETEA in the destructor
  43. };
  44. /**
  45. * Tail compression callback type (see Open()).
      * @param stored_value should be in uppercase if you use case-insensitive insertions; must be allocated by new [] and will be deleted by caller
      * @param id word ID handed to the callback
      * @param usr_val user parameter; Open() documents callback_val as the user parameter to TailCallback
  46. */
  47. typedef CHECK_RESULT(OP_STATUS (* TailCallback)(char **stored_value, WordID id, void *usr_val));
  48. ACT(void);
  49. /**
  50. * ACT must be opened before you call any other method
  51. * @param path file storing the data; file is always created if it doesn't exist
  52. * @param mode Read/ReadWrite mode
  53. * @param tc tail compression callback if tail compression is required
  54. * @param callback_val user parameter to TailCallback
  55. * @param folder might be one of the predefined folders
  56. */
  57. CHECK_RESULT(OP_STATUS Open(const uni_char* path, BlockStorage::OpenMode mode,
  58. TailCallback tc = NULL, void *callback_val = NULL, OpFileFolder folder = OPFILE_ABSOLUTE_FOLDER));
  59. /**
  60. * flush all unsaved data, commit any pending transaction and close the file
  61. */
  62. CHECK_RESULT(OP_STATUS Close(void));
  63. /**
  64. * erase all data
  65. */
  66. CHECK_RESULT(OP_STATUS Clear(void));
  67. /**
  68. * index a new word; it will have the given ID if it doesn't exist in the index already
  69. * @param word a word to index
  70. * @param id ID for a newly created word, shouldn't be 0
  71. * @param overwrite_existing overwrite ID of the word if it was already present in the database
  72. * @return OpBoolean::IS_TRUE if the word has been created, OpBoolean::IS_FALSE if the word has been already indexed, OpStatus::OK on empty word or word without any valid character
  73. */
  74. CHECK_RESULT(OP_BOOLEAN AddWord(const uni_char *word, WordID id, BOOL overwrite_existing = TRUE));
  75. CHECK_RESULT(OP_BOOLEAN AddWord(const char *utf8_word, WordID id, BOOL overwrite_existing = TRUE));
  76. /** case-sensitive variants of AddWord */
  77. CHECK_RESULT(OP_BOOLEAN AddCaseWord(const uni_char *word, WordID id, BOOL overwrite_existing = TRUE));
  78. CHECK_RESULT(OP_BOOLEAN AddCaseWord(const char *utf8_word, WordID id, BOOL overwrite_existing = TRUE));
  79. /**
  80. * delete a word from index, file might be truncated;
  81. * be careful to delete only previously added words if you use tail compression
  82. */
  83. CHECK_RESULT(OP_STATUS DeleteWord(const uni_char *word));
  84. CHECK_RESULT(OP_STATUS DeleteWord(const char *utf8_word));
  85. /** case-sensitive variants of DeleteWord */
  86. CHECK_RESULT(OP_STATUS DeleteCaseWord(const uni_char *word));
  87. CHECK_RESULT(OP_STATUS DeleteCaseWord(const char *utf8_word));
  88. /**
  89. * abort all write operations since the first AddWord or DeleteWord
  90. */
  91. void Abort(void);
  92. /**
  93. * flushes all data and ends any pending transaction
  94. */
  95. CHECK_RESULT(OP_STATUS Commit(void));
  96. /**
  97. * write all unsaved data to disk
  98. */
  99. // OP_STATUS Flush(ReleaseSeverity severity = ReleaseNo) {return BSCache::Flush(severity);}
  100. /**
  101. * search for a word
  102. * @return ID of the word, 0 on error or if not found
  103. */
  104. WordID Search(const uni_char *word);
  105. WordID Search(const char *utf8_word);
  106. /** case-sensitive variants of Search */
  107. WordID CaseSearch(const uni_char *word);
  108. WordID CaseSearch(const char *utf8_word);
  109. /**
  110. * @deprecated Use the iterator methods instead
  111. */
  112. int PrefixWords(uni_char **result, const uni_char *prefix, int max_results);
  113. int PrefixWords(char **result, const char *utf8_prefix, int max_results);
  114. int PrefixCaseWords(uni_char **result, const uni_char *prefix, int max_results);
  115. int PrefixCaseWords(char **result, const char *utf8_prefix, int max_results);
  116. /**
  117. * @deprecated Use the iterator methods instead
  118. */
  119. int PrefixSearch(WordID *result, const uni_char *prefix, int max_results);
  120. int PrefixSearch(WordID *result, const char *utf8_prefix, int max_results);
  121. int PrefixCaseSearch(WordID *result, const uni_char *prefix, int max_results);
  122. int PrefixCaseSearch(WordID *result, const char *utf8_prefix, int max_results);
  123. /**
  124. * search for all words with given prefix
  125. * @param prefix word prefix
  126. * @param single_word if TRUE, the returned iterator will only search the given prefix as a word (and not do prefix search after all)
  127. * @return Iterator containing the first result or empty. Must be deleted by caller. NULL on error.
  128. */
  129. SearchIterator<PrefixResult> *PrefixSearch(const uni_char *prefix, BOOL single_word = FALSE);
  130. SearchIterator<PrefixResult> *PrefixSearch(const char *utf8_prefix, BOOL single_word = FALSE);
  131. SearchIterator<PrefixResult> *PrefixCaseSearch(const uni_char *prefix, BOOL single_word = FALSE);
  132. SearchIterator<PrefixResult> *PrefixCaseSearch(const char *utf8_prefix, BOOL single_word = FALSE);
  133. /**
  134. * Used internally by Prefix(Case)Search.
  135. * Find the first word with the given prefix, ordered by unicode values, case sensitive.
  136. */
  137. CHECK_RESULT(OP_BOOLEAN FindFirst(PrefixResult &res, const char *utf8_prefix));
  138. /**
  139. * Used internally by Prefix(Case)Search.
  140. * Find the next word with the given prefix, ordered by unicode values, case sensitive.
  141. */
  142. CHECK_RESULT(OP_BOOLEAN FindNext(PrefixResult &res, const char *utf8_prefix));
  143. /**
  144. * pseudo-random number generator, RANROT B algorithm
  145. */
  146. int Random(void);
  147. /**
  148. * save status of the random number generator
  149. */
  150. CHECK_RESULT(OP_STATUS SaveStatus(void));
  151. /**
  152. * restore status of the random number generator
  153. */
  154. void RestoreStatus(void);
  155. /**
  156. * case-sensitive word comparison skipping the invalid characters
  157. * @return the number of valid common characters or -1 on match
  158. */
  159. static int WordsEqual(const char *w1, const char *w2, int max = -1);
  160. /**
  161. * @return an estimate of the memory used by this data structure
  162. */
  163. #ifdef ESTIMATE_MEMORY_USED_AVAILABLE
  164. virtual size_t EstimateMemoryUsed() const;
  165. #endif
  166. friend class NodePointer;
  167. friend class TrieBranch;
  168. protected:
  169. CHECK_RESULT(OP_BOOLEAN AddCaseWord(const char *utf8_word, WordID id, int new_len, BOOL overwrite_existing)); // NOTE(review): meaning of new_len is not visible in this header - confirm against the implementation
  170. virtual Item *NewMemoryItem(int id, Item *rbranch, int rnode, unsigned short nur);
  171. virtual Item *NewDiskItem(OpFileLength id, unsigned short nur);
  172. TailCallback m_TailCallback; // presumably the tc passed to Open() - TODO confirm
  173. void *m_callback_val; // presumably the callback_val passed to Open() - TODO confirm
  174. private:
  175. void InitRandom(void);
  176. UINT32 random_status[RANDOM_STATUS_SIZE + 2]; // presumably the state saved/restored by SaveStatus()/RestoreStatus() - TODO confirm
  177. #ifdef _DEBUG
  178. // statistics (debug builds only)
  179. public:
  180. // branch_type: 0 ~ all, 1 ~ parents, 2 ~ children
  181. int GetFillFactor(int *f_average, int *f_min, int *f_max, int *empty, int branch_type);
  182. int GetFillFactor(int *f_average, int *f_min, int *f_max, int branch_type) {return GetFillFactor(f_average, f_min, f_max, NULL, branch_type);} // same as above with empty = NULL
  183. int GetFillDistribution(int *levels, int *counts, int max_level, int *total = NULL, OpFileLength disk_id = 2);
  184. int GetFillDistribution(int *levels, int max_level, int *total = NULL, OpFileLength disk_id = 2) {return GetFillDistribution(levels, NULL, max_level, total, disk_id);} // same as above with counts = NULL
  185. int collision_count;
  186. #endif
  187. public:
  188. CHECK_RESULT(OP_BOOLEAN CheckConsistency(void)); // NOTE(review): return value semantics are not documented in this header
  189. static void SkipNonPrintableChars(const char* &s) { // advance s past every character <= FIRST_CHAR (so FIRST_CHAR itself is skipped too), stopping at the terminating 0
  190. #if FIRST_CHAR >= 0
  191. while ((unsigned char)*s <= FIRST_CHAR && *s != 0)
  192. ++s;
  193. #endif
  194. }
  195. static void SkipNonPrintableChars(const uni_char* &s) { // uni_char overload of the same skip; s is modified in place
  196. #if FIRST_CHAR >= 0
  197. while ((unsigned)*s <= FIRST_CHAR && *s != 0)
  198. ++s;
  199. #endif
  200. }
  201. };
  202. #endif // ACT_H