Opera 12.15 Source Code
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

VisitedSearch.h 15KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464
  /* -*- Mode: c++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*-
  **
  ** Copyright (C) 1995-2011 Opera Software ASA. All rights reserved.
  **
  ** This file is part of the Opera web browser.
  ** It may not be distributed under any circumstances.
  */
  8. #ifndef VISITEDSEARCH_H
  9. #define VISITEDSEARCH_H
  10. #include "modules/search_engine/Cursor.h"
  11. #include "modules/search_engine/ResultBase.h"
  12. #include "modules/search_engine/Vector.h"
  13. #include "modules/search_engine/RankIndex.h"
  14. #include "modules/hardcore/mh/mh.h"
  15. #if defined SEARCH_ENGINE_LOG && (SEARCH_ENGINE_LOG & SEARCH_ENGINE_LOG_VISITEDSEARCH)
  16. #include "modules/search_engine/log/Log.h"
  17. #endif
  18. #ifdef SEARCH_ENGINE_PHRASESEARCH
  19. #include "modules/search_engine/PhraseSearch.h"
  20. #endif
  /*
   * Ranking values to pass as the `ranking` argument of VisitedSearch::AddTextBlock,
   * which expects a float from the open interval (0, 1).
   * NOTE(review): the suffixes presumably name the HTML element the text block
   * was taken from (title, h1..h6, i, em, p) - confirm against the callers.
   */
  #define RANK_TITLE 0.5F
  #define RANK_H1    0.6F
  #define RANK_H2    0.7F
  #define RANK_H3    0.8F
  #define RANK_H4    0.85F
  #define RANK_H5    0.88F
  #define RANK_H6    0.89F
  #define RANK_I     0.9F
  #define RANK_EM    0.91F
  #define RANK_P     0.95F

  /* Only pointers to these types are used in this header, so forward declarations suffice. */
  class RecordHandleRec;
  struct FileWord;
  33. /**
  34. * @brief fulltext indexing/searching designed to hold URLs
  35. * @author Pavel Studeny <pavels@opera.com>
  36. */
  37. class VisitedSearch : public MessageObject, private NonCopyable
  38. {
  39. public:
  40. /** handle to create a record for a URL */
  41. typedef RecordHandleRec *RecordHandle;
  42. VisitedSearch(void);
  43. ~VisitedSearch(void)
  44. {
  45. OP_ASSERT(m_pending.GetCount() == 0);
  46. if (m_index.GetCount() != 0)
  47. OpStatus::Ignore(Close());
  48. }
  49. /**
  50. * change the maximum size of the index;
  51. * depending on the size and flush phase, the change may not come into effect immediately
  52. * @param max_size new index size in MB
  53. */
  54. void SetMaxSize(OpFileLength max_size);
  55. /**
  56. * change the maximum size to contain approximately max_items.
  57. * Could be analyzed later, because mapping to history items is not always 1:1 this could be improved.
  58. * @param max_items maximal number of items the full index should roughly contain
  59. */
  60. void SetMaxItems(int max_items);
  61. #ifdef SELFTEST
  62. void SetSubindexSize(OpFileLength ssize) {m_subindex_size = ssize;}
  63. #endif
  64. /**
  65. * Open/create the index in the given directory.
  66. * Must be called before using other methods except IsOpen.
  67. * @param directory full path without terminating path separator
  68. */
  69. CHECK_RESULT(OP_STATUS Open(const uni_char *directory));
  70. /**
  71. * flush all cached data and closes all resources
  72. * @param force_close close all resources even if all operations cannot be completed e.g. because out of disk space
  73. * @return if force_close was set, returns error anyway, but the resources are released
  74. */
  75. CHECK_RESULT(OP_STATUS Close(BOOL force_close = TRUE));
  76. BOOL IsOpen(void)
  77. {
  78. return m_index.GetCount() > 0;
  79. }
  80. /**
  81. * erase all data
  82. * @param reopen if FALSE, close the index when all data is cleared
  83. */
  84. CHECK_RESULT(OP_STATUS Clear(BOOL reopen = TRUE));
  85. /*** indexing part ***/
  86. /**
  87. * Open a new record for writing. Not closing the record by CloseRecord or AbortRecord is evil.
  88. * VisitedSearch must be opened before using this method.
  89. * @param url address of the topmost document in the current window
  90. * @param title the <title> tag from the header of the document
  91. * @return 0 on out of memory or if indexing is disabled/uninitialized
  92. */
  93. RecordHandle CreateRecord(const char *url, const uni_char *title = NULL);
  94. /**
  95. * add the title of the document if it wasn't available at the time of CreateRecord
  96. * @param handle handle created by CreateRecord
  97. * @param title the <title> tag from the header of the document
  98. */
  99. CHECK_RESULT(OP_STATUS AddTitle(RecordHandle handle, const uni_char *title));
  100. /**
  101. * add a block of text with the same ranking
  102. * @param handle handle created by CreateRecord
  103. * @param text plain text words
  104. * @param ranking float value from an open interval of (0, 1), should be one of the RANK_XXX
  105. * @param is_contination if TRUE, this text block should be considered a continuation of the previous
  106. */
  107. CHECK_RESULT(OP_STATUS AddTextBlock(RecordHandle handle, const uni_char *text, float ranking, BOOL is_continuation = FALSE));
  108. /**
  109. * make the data available for writing, no further changes will be possible
  110. * @param handle handle created by CreateRecord
  111. * @return aborts the record on error
  112. */
  113. CHECK_RESULT(OP_STATUS CloseRecord(RecordHandle handle));
  114. /**
  115. * cancel insertion of this document to the index, handle becomes invalid after this
  116. * @param handle handle created by CreateRecord
  117. */
  118. void AbortRecord(RecordHandle handle);
  119. /**
  120. * attach a small picture of the document
  121. * @param url URL of existing record created by CreateRecord
  122. * @param picture data in a format known to the caller
  123. * @param size size of the picture in bytes
  124. */
  125. CHECK_RESULT(OP_STATUS AssociateThumbnail(const char *url, const void *picture, int size));
  126. /**
  127. * associate a locally saved copy of the web page with the current handle
  128. * @param url URL of existing record created by CreateRecord
  129. * @param fname full path to a single file
  130. */
  131. CHECK_RESULT(OP_STATUS AssociateFileName(const char *url, const uni_char *fname));
  132. /**
  133. * begin transaction and prepare data from the cache to be written to disk;
  134. * all Inserts or Deletes until Commit will not be included in this transaction;
  135. * an error cancels the whole transaction and the data are returned back to cache
  136. *
  137. * @param max_ms maximum time to spend by PreFlush in miliseconds, 0 means unlimited
  138. * @return OpBoolean::IS_TRUE if finished successfully, OpBoolean::IS_FALSE if time limit was reached (call PreFlush again)
  139. * @see BlockStorage for more information about the transaction modes
  140. */
  141. CHECK_RESULT(OP_BOOLEAN PreFlush(int max_ms = 0));
  142. /**
  143. * write the data prepared by PreFlush;
  144. * calling PreFlush before Flush is optional, but a delay (roughly 30s) between PreFlush and Flush
  145. * reduces the time spent in operating system calls;
  146. * an error cancels the whole transaction and the data are returned back to cache
  147. *
  148. * @param max_ms maximum time to spend by PreFlush in miliseconds, 0 means unlimited
  149. * @return OpBoolean::IS_TRUE if finished successfully, OpBoolean::IS_FALSE if time limit was reached (call Flush again)
  150. */
  151. CHECK_RESULT(OP_BOOLEAN Flush(int max_ms = 0));
  152. /**
  153. * finish the transaction begun by PreFlush;
  154. * calling Flush before Commit is optional, but a delay (roughly 30s) between Flush and Commit
  155. * reduces the time spent in operating system calls
  156. * @return not supposed to fail under normal circumstances if Flush was called and finished successfully
  157. */
  158. CHECK_RESULT(OP_STATUS Commit(void));
  159. /*** searching part ***/
  160. enum Sort
  161. {
  162. RankSort, /**< sort the results by ranking, best ranking first */
  163. DateSort, /**< sort the results by date, latest date first */
  164. Autocomplete /**< like RankSort, but no results for empty query */
  165. };
  166. /** one row of a result */
  167. struct Result
  168. {
  169. char *url;
  170. uni_char *title;
  171. unsigned char *thumbnail;
  172. int thumbnail_size;
  173. uni_char *filename;
  174. time_t visited;
  175. float ranking;
  176. #ifndef SELFTEST
  177. protected:
  178. #endif
  179. UINT32 id; // for sorting purposes
  180. bool invalid;
  181. UINT16 prev_idx;
  182. UINT32 prev;
  183. UINT16 next_idx;
  184. UINT32 next;
  185. mutable uni_char *plaintext;
  186. mutable unsigned char *compressed_plaintext;
  187. mutable int compressed_plaintext_size;
  188. friend class RankIterator;
  189. friend class AllDocIterator;
  190. friend class TimeIterator;
  191. friend class VisitedSearch;
  192. friend class FastPrefixIterator;
  193. public:
  194. Result(void);
  195. CHECK_RESULT(OP_STATUS SetCompressedPlaintext(const unsigned char *buf, int size));
  196. uni_char *GetPlaintext() const;
  197. BOOL operator<(const Result &right) const
  198. {
  199. float s, a;
  200. s = ranking - right.ranking;
  201. a = (ranking + right.ranking) / 100000.0F;
  202. if (s < a && s > -a)
  203. {
  204. if (id == (UINT32)-1 && right.id == (UINT32)-1)
  205. {
  206. if (visited == right.visited)
  207. return op_strcmp(url, right.url) != 0;
  208. return visited > right.visited;
  209. }
  210. return id < right.id;
  211. }
  212. return s < 0.0;
  213. }
  214. CHECK_RESULT(static OP_STATUS ReadResult(Result &res, BlockStorage *metadata));
  215. static BOOL Later(const void *left, const void *right);
  216. static BOOL CompareId(const void *left, const void *right);
  217. static void DeleteResult(void *item);
  218. CHECK_RESULT(static OP_STATUS Assign(void *dst, const void *src));
  219. #ifdef ESTIMATE_MEMORY_USED_AVAILABLE
  220. static size_t EstimateMemoryUsed(const void *item);
  221. #endif
  222. };
  223. struct SearchSpec
  224. {
  225. OpString query;
  226. Sort sort_by;
  227. int max_items;
  228. int max_chars;
  229. OpString start_tag;
  230. OpString end_tag;
  231. int prefix_ratio;
  232. };
  233. struct SearchResult : public NonCopyable
  234. {
  235. OpString8 url;
  236. OpString title;
  237. unsigned char *thumbnail;
  238. int thumbnail_size;
  239. OpString filename;
  240. OpString excerpt;
  241. time_t visited;
  242. float ranking;
  243. SearchResult() : thumbnail(NULL), thumbnail_size(0), visited(0), ranking(0) {}
  244. ~SearchResult() { OP_DELETEA(thumbnail); }
  245. CHECK_RESULT(OP_STATUS CopyFrom(const Result & result, SearchSpec * search_spec));
  246. };
  247. /**
  248. * lookup documents contatining all the given words. Full ranking
  249. * @param text plain text words
  250. * @param sort_by sort results by relevance or a date of viewing
  251. * @param phrase_flags flags built up from PhraseMatcher::PhraseFlags, controlling what kind of phrase search is performed
  252. * @return iterator to be deleted by a caller, NULL on error
  253. */
  254. #ifdef SEARCH_ENGINE_PHRASESEARCH
  255. SearchIterator<Result> *Search(const uni_char *text, Sort sort_by = RankSort, int phrase_flags = PhraseMatcher::AllPhrases);
  256. #else
  257. SearchIterator<Result> *Search(const uni_char *text, Sort sort_by = RankSort, int phrase_flags = 0/*PhraseMatcher::NoPhrases*/);
  258. #endif
  259. /**
  260. * lookup documents contating all the words contained in text, last one as a prefix
  261. * ranking or other sorting is not involved as a trade off for a fast response
  262. * @param text searched phrase
  263. * @param phrase_flags flags built up from PhraseMatcher::PhraseFlags, controlling what kind of
  264. * phrase search is performed. PhraseMatcher::PrefixSearch is implied.
  265. * @return iterator to be deleted by a caller, NULL on error
  266. */
  267. SearchIterator<Result> *FastPrefixSearch(const uni_char *text, int phrase_flags = 0/*PhraseMatcher::NoPhrases*/);
  268. /**
  269. * disable the result from appearing in any further search results
  270. */
  271. CHECK_RESULT(OP_STATUS InvalidateResult(const Result &row));
  272. /**
  273. * disable the url from appearing in any further search results
  274. */
  275. CHECK_RESULT(OP_STATUS InvalidateUrl(const char *url));
  276. CHECK_RESULT(OP_STATUS InvalidateUrl(const uni_char *url));
  277. /**
  278. * find the indexed words for autocompletition
  279. * @param prefix word prefix to search
  280. * @param result resulting words, mustn't be NULL, the fields must be freed by caller
  281. * @param result_size maximum number of results on input, number of results on output
  282. */
  283. CHECK_RESULT(OP_STATUS WordSearch(const uni_char *prefix, uni_char **result, int *result_size));
  284. /*** message part ***/
  285. virtual void HandleCallback(OpMessage msg, MH_PARAM_1 par1, MH_PARAM_2 par2);
  286. /**
  287. * cancel the data being written by PreFlush/Flush/Commit
  288. */
  289. void AbortFlush(void);
  290. protected:
  291. enum Flags
  292. {
  293. PreFlushed = 4096, /*< PreFlush had finished successfully before Flush */
  294. Flushed = 8192, /*< Flush had finished successfully before Commit */
  295. PreFlushing = 32768, /*< time limit occured during PreFlush */
  296. UseNUR = 65536 /*< PreFlush hadn't been called before Flush, use alternative write strategy */
  297. };
  298. CHECK_RESULT(OP_STATUS IndexTextBlock(RecordHandle handle, const uni_char *text, float ranking, BOOL fine_segmenting = FALSE));
  299. CHECK_RESULT(OP_STATUS IndexURL(RecordHandle handle, const char *url, float ranking));
  300. CHECK_RESULT(OP_STATUS Insert(TVector<FileWord *> &cache, RecordHandle h, const uni_char *word, float rank));
  301. void CancelPreFlushItem(int i);
  302. CHECK_RESULT(OP_STATUS WriteMetadata(void));
  303. CHECK_RESULT(OP_STATUS FindLastIndex(UINT32 *prev, UINT16 *prev_idx, const char *url, RankIndex *current_index));
  304. CHECK_RESULT(OP_STATUS InsertHandle(RecordHandle handle));
  305. CHECK_RESULT(OP_STATUS CompressText(RecordHandle handle));
  306. CHECK_RESULT(OP_STATUS WipeData(BSCursor::RowID id, unsigned short r_index));
  307. void CheckMaxSize_RemoveOldIndexes();
  308. // Posts a message in a threadsafe manner when necessary
  309. void PostMessage(OpMessage msg, MH_PARAM_1 par1, MH_PARAM_2 par2, unsigned long delay=0);
  310. int m_flags;
  311. OpFileLength m_max_size; //total size of document index, matched to History size in desktop browser.
  312. int m_pos;
  313. int m_flush_errors;
  314. TVector<RankIndex *> m_index; //array of indexes, the newest index is at index 0.
  315. TVector<FileWord *> m_cache; //word lists inserted but not yet written to disk.
  316. TVector<FileWord *> m_preflush; //word lists that are being written.
  317. TVector<RecordHandle> m_pending; //contains documents that are being parsed.
  318. TVector<RecordHandle> m_meta; //plain text, hash etc.
  319. TVector<RecordHandle> m_meta_pf; //meta data that is being written.
  320. OpFileLength m_subindex_size; //10 times less than m_max_size above.
  321. #if defined SEARCH_ENGINE_LOG && (SEARCH_ENGINE_LOG & SEARCH_ENGINE_LOG_VISITEDSEARCH)
  322. OutputLogDevice *m_log;
  323. #endif
  324. OP_STATUS AddWord(VisitedSearch::RecordHandle handle, const uni_char *text, float ranking);
  325. private:
  326. CHECK_RESULT(OP_STATUS AbortPreFlush(OP_STATUS err));
  327. CHECK_RESULT(OP_STATUS AbortFlush(OP_STATUS err));
  328. #ifdef SEARCH_ENGINE_PHRASESEARCH
  329. class VisitedSearchDocumentSource : public DocumentSource<Result>
  330. {
  331. virtual const uni_char *GetDocument(const Result &item)
  332. {
  333. return item.GetPlaintext();
  334. }
  335. };
  336. VisitedSearchDocumentSource m_doc_source;
  337. #endif
  338. };
  #ifdef VPS_WRAPPER
  /**
   * Abstract interface declaring the same operations as VisitedSearch as pure
   * virtual methods; only available when VPS_WRAPPER is defined.
   * Instances are obtained from Create().
   * NOTE(review): presumably implemented by a wrapper running the search engine
   * asynchronously - confirm against the implementation.
   */
  class AsyncVisitedSearch
  {
  public:
      /** @return a new instance; NOTE(review): ownership and the failure value are not visible here - confirm */
      static AsyncVisitedSearch *Create(void);

      virtual ~AsyncVisitedSearch(void) {}

      /* The following methods carry the same signatures as their VisitedSearch counterparts. */
      virtual void SetMaxSize(OpFileLength max_size) = 0;
      virtual void SetMaxItems(int max_items) = 0;

      CHECK_RESULT(virtual OP_STATUS Open(const uni_char *directory)) = 0;
      CHECK_RESULT(virtual OP_STATUS Close(BOOL force_close = TRUE)) = 0;
      virtual BOOL IsOpen(void) = 0;
      CHECK_RESULT(virtual OP_STATUS Clear(BOOL reopen = TRUE)) = 0;

      virtual VisitedSearch::RecordHandle CreateRecord(const char *url, const uni_char *title = NULL) = 0;
      CHECK_RESULT(virtual OP_STATUS AddTitle(VisitedSearch::RecordHandle handle, const uni_char *title)) = 0;
      CHECK_RESULT(virtual OP_STATUS AddTextBlock(VisitedSearch::RecordHandle handle, const uni_char *text, float ranking, BOOL is_continuation = FALSE)) = 0;
      CHECK_RESULT(virtual OP_STATUS CloseRecord(VisitedSearch::RecordHandle handle)) = 0;
      virtual void AbortRecord(VisitedSearch::RecordHandle handle) = 0;

      /**
       * search variant taking a callback object instead of returning an iterator;
       * NOTE(review): the results are presumably delivered to callback as messages - confirm.
       */
      CHECK_RESULT(virtual OP_STATUS Search(const uni_char *text,
          VisitedSearch::Sort sort_by,
          int max_items,
          int excerpt_max_chars,
          const OpStringC & excerpt_start_tag,
          const OpStringC & excerpt_end_tag,
          int excerpt_prefix_ratio,
          MessageObject * callback)) = 0;

      /** search variant returning an iterator, same signature and defaults as VisitedSearch::Search */
      virtual SearchIterator<VisitedSearch::Result> *Search(const uni_char *text,
          VisitedSearch::Sort sort_by = VisitedSearch::RankSort,
  #ifdef SEARCH_ENGINE_PHRASESEARCH
          int phrase_flags = PhraseMatcher::AllPhrases
  #else
          int phrase_flags = 0/*PhraseMatcher::NoPhrases*/
  #endif
          ) = 0;

      CHECK_RESULT(virtual OP_STATUS InvalidateUrl(const char *url)) = 0;
      CHECK_RESULT(virtual OP_STATUS InvalidateUrl(const uni_char *url)) = 0;
  };
  #endif
  376. #endif // VISITEDSEARCH_H