Opera 12.15 Source Code
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

PhraseSearch.h 9.8KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283
  1. /* -*- Mode: c++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*-
  2. **
  3. ** Copyright (C) 1995-2011 Opera Software ASA. All rights reserved.
  4. **
  5. ** This file is part of the Opera web browser.
  6. ** It may not be distributed under any circumstances.
  7. */
  8. #ifndef PHRASESEARCH_H
  9. #define PHRASESEARCH_H
  10. #include "modules/search_engine/ResultBase.h"
  11. #include "modules/search_engine/Vector.h"
  12. #include "modules/search_engine/WordSegmenter.h"
  13. /**
  14. * @brief The PhraseMatcher extracts phrase elements from a text query and can
  15. * check if a document matches.
  16. *
  17. * Unless the FullSearch flag is set,
  18. * parts of the text query that are not phrase related are ignored and not
  19. * searched for in the document. The main usage is to post-process search
  20. * results where all whole words are already accounted for and present in the
  21. * results. If a query does not contain any phrase elements, all documents
  22. * will match. Phrases match case-insensitively.
  23. *
  24. * @par
  25. * Definitions:
  26. * @li A word is defined as a substring that would be returned if the query
  27. * string was parsed by the WordSegmenter. This also includes single
  28. * CJK characters that are not separated by space.
  29. * @li Only space, '+' and double quotes have special meaning in phrase
  30. * search. All whitespace and '+' is treated as space.
  31. *
  32. * @par
  33. * Phrase elements in a search query are defined as follows:
  34. * @li More than one word in double quotes is a phrase element if the
  35. * QuotedPhrases flag is set.
  36. * @li A sub-string containing non-word characters other than space, '+' and
  37. * double quotes is a phrase element if the PunctuationPhrases flag is set.
  38. * @li A sub-string consisting of only word characters that contains more
  39. * than one word is a phrase element if the CJKPhrases flag is set.
  40. *
  41. * @par
  42. * Examples of quoted phrase elements:
  43. * @li "to be, or not to be"
  44. * @li "to be"
  45. *
  46. * @par
  47. * Examples of "punctuation" phrase elements:
  48. * @li be,
  49. * @li [BTS]
  50. * @li opera.com
  51. *
  52. * @par
  53. * Examples of CJK phrase elements
  54. * @li ABC (where A, B and C are CJK characters)
  55. *
  56. * @par
  57. * Examples of non-phrase elements:
  58. * @li to be or not to be
  59. * @li "to"
  60. * @li to+be+" " ('+' is considered a space (esp. in forms))
  61. *
  62. * @par
  63. * A query with phrase content in double quotes matches if all the phrase
  64. * elements within the quotes are located consecutively in the document,
  65. * separated by only non-word characters. That is, a phrase like
  66. * "to be or not to be" will match a document containing
  67. * "to be, or not to be".
  68. */
  69. class PhraseMatcher
  70. {
  71. public:
  72. enum PhraseFlags
  73. {
  74. NoPhrases = 0, ///< Don't do phrase filtering
  75. CJKPhrases = 1, ///< Match phrases consisting of multiple consecutive CJK characters.
  76. QuotedPhrases = 2, ///< Match quoted phrases. This also implies CJKPhrases within the quotes.
  77. PunctuationPhrases = 4, ///< Match phrases built up by a combination of word and non-word characters. This also implies CJKPhrases.
  78. AllPhrases = 7, ///< Match all types of phrases
  79. PrefixSearch = 8, ///< If set, do not require that the found phrase ends on a word boundary
  80. FullSearch =16, ///< If set, search for all words, not only phrase content. (Do not assume that all single words have already been found. Useful with AppendHighlight)
  81. DontCopyInputString=32 /**< The query string set with Init will be used directly, not copied.
  82. The string must not be deleted before the PhraseMatcher is.
  83. Note that this flag prevents preprocessing of the input string (e.g. to remove &shy; characters) */
  84. };
  85. PhraseMatcher();
  86. virtual ~PhraseMatcher();
  87. /**
  88. * initialize with a search query
  89. * @param query the query used for searching, presumably containing phrases
  90. * @param phrase_flags flags built up from PhraseFlags, controlling which phrases are matched
  91. * @return OK if there were no errors
  92. */
  93. CHECK_RESULT(OP_STATUS Init(const uni_char *query, int phrase_flags));
  94. /**
  95. * @return TRUE if no phrase elements were found in the query
  96. */
  97. BOOL Empty() const { return m_phrases.GetCount() == 0; }
  98. /**
  99. * Search the haystack for the all the words in the query
  100. * @param haystack to search
  101. * @return TRUE if all the phrases are present in the haystack
  102. */
  103. BOOL Matches(const uni_char *haystack) const;
  104. /**
  105. * append src to dst with searched words marked by start_tag and end_tag
  106. * @param dst output with tagged words
  107. * @param src plaintext excerpt, parser treats XML tags as regular words
  108. * @param max_chars maximum number of plaintext characters which should appear in the output
  109. * @param start_tag tag to prefix at the beginning of a searched word found in the plaintext; may be NULL
  110. * @param end_tag tag to prefix at the end of a searched word found in the plaintext; may be NULL
  111. * @param prefix_ratio to use when generating the context before the first matched word, if less than or
  112. * equal to zero the beginning of src will be used as the beginning of the excerpt
  113. */
  114. CHECK_RESULT(OP_STATUS AppendHighlight(OpString &dst,
  115. const uni_char *src,
  116. int max_chars,
  117. const OpStringC &start_tag,
  118. const OpStringC &end_tag,
  119. int prefix_ratio = 15));
  120. #ifdef SELFTEST
  121. uni_char *GetPhrases() const;
  122. #endif
  123. protected:
  124. struct Word : public WordSegmenter::Word
  125. {
  126. const uni_char *found_pos; ///< Pointer to where in the haystack the word was found
  127. int found_len; ///< Length of the occurrence that was found
  128. BOOL is_last; ///< TRUE if this was the last, possibly unfinished word and we are doing prefix search
  129. Word() : WordSegmenter::Word(), found_pos(NULL), found_len(0), is_last(FALSE) {}
  130. };
  131. static BOOL IsDoubleQuote(UnicodePoint c);
  132. BOOL TreatAsSpace(UnicodePoint c) const;
  133. static BOOL FindPhrase(TVector<Word> *phrase, const uni_char *haystack);
  134. static BOOL FindWord(Word &word, const uni_char *s, const uni_char *haystack, BOOL first_word);
  135. static int CopyText(uni_char *&dst_ptr, const uni_char *from, const uni_char *end);
  136. static int CopyResult(uni_char *&dst_ptr, Word &result, const OpStringC &start_tag, const OpStringC &end_tag);
  137. static const uni_char *FindEnd(const uni_char *start, int length);
  138. static const uni_char *FindStart(const uni_char *text, const uni_char *start, int length, int prefix_ratio);
  139. const uni_char *m_query;
  140. TVector<TVector<Word> *> m_phrases;
  141. int m_phrase_flags;
  142. };
  143. /**
  144. * @brief Used by PhraseFilter to get the document corresponding to a search
  145. * result to be able to look for the actual phrase.
  146. */
  147. template <typename T> class DocumentSource
  148. {
  149. public:
  150. virtual ~DocumentSource() {}
  151. /**
  152. * Get the document associated with a search result.
  153. *
  154. * Since phrase filtering in worst case can take a long time, it might be
  155. * necessary to abort the search (e.g. in search-as-you-type, if another letter
  156. * has been written to the search query). To effect an abort, the DocumentSource
  157. * may return NULL on all subsequent calls. Since NULL documents do not match,
  158. * no further results will be added, and the filtering will be quickly finished.
  159. *
  160. * If several "records" are concatenated into one document, but phrases should
  161. * not match across record boundaries (e.g. message subject + message body), the
  162. * records may be separated using the form-feed character ('\f').
  163. *
  164. * @param item A search result, potentially matching
  165. * @return the document associated with item, or NULL to abort the search
  166. */
  167. virtual const uni_char *GetDocument(const T &item) = 0;
  168. };
  169. /**
  170. * @brief A DocumentSource that automatically deletes the acquired documents.
  171. */
  172. template <typename T> class AutoDeleteDocumentSource : public DocumentSource<T>
  173. {
  174. public:
  175. AutoDeleteDocumentSource() : m_doc(NULL) {}
  176. virtual ~AutoDeleteDocumentSource()
  177. {
  178. OP_DELETEA(m_doc);
  179. }
  180. virtual const uni_char *GetDocument(const T &item)
  181. {
  182. OP_DELETEA(m_doc);
  183. m_doc = AcquireDocument(item);
  184. return m_doc;
  185. }
  186. protected:
  187. /**
  188. * Get the document associated with a search result
  189. * @param item A search result
  190. * @return a document associated with item. Will be freed by the
  191. * destructor of this class using OP_DELETEA().
  192. */
  193. virtual uni_char *AcquireDocument(const T &item) = 0;
  194. uni_char *m_doc;
  195. };
  196. /**
  197. * @brief PhraseFilter is a SearchFilter to be used with FilterIterator.
  198. *
  199. * It uses a PhraseMatcher to match documents retrieved from a DocumentSource.
  200. */
  201. template <typename T> class PhraseFilter : public SearchFilter<T>
  202. {
  203. public:
  204. /**
  205. * Construct a PhraseFilter
  206. * @param query the query used for searching, presumably containing phrases
  207. * @param doc_source The document source used to match phrases
  208. * @param phrase_flags flags built up from PhraseMatcher::PhraseFlags, controlling what kind of phrase search is performed
  209. */
  210. PhraseFilter(const uni_char *query, DocumentSource<T> &doc_source, int phrase_flags)
  211. : m_doc_source(doc_source)
  212. {
  213. m_status = OpStatus::OK;
  214. m_matcher = OP_NEW(PhraseMatcher, ());
  215. if (m_matcher == NULL)
  216. {
  217. m_status = OpStatus::ERR_NO_MEMORY;
  218. }
  219. else if (OpStatus::IsError(m_status = m_matcher->Init(query, phrase_flags)) || m_matcher->Empty())
  220. {
  221. OP_DELETE(m_matcher);
  222. m_matcher = NULL;
  223. }
  224. m_first_time = TRUE;
  225. }
  226. virtual ~PhraseFilter()
  227. {
  228. OP_DELETE(m_matcher);
  229. }
  230. /**
  231. * @return TRUE if no phrases were found in the query
  232. */
  233. virtual BOOL Empty() const { return m_matcher == NULL; }
  234. /**
  235. * @param item A search result
  236. * @return TRUE if the document associated with the search result matches the phrase
  237. */
  238. virtual BOOL Matches(const T &item) const
  239. {
  240. if (!m_first_time)
  241. m_status = OpStatus::OK;
  242. m_first_time = FALSE;
  243. return m_matcher == NULL || m_matcher->Matches(m_doc_source.GetDocument(item));
  244. }
  245. CHECK_RESULT(virtual OP_STATUS Error(void) const) { return m_status; }
  246. protected:
  247. mutable OP_STATUS m_status;
  248. mutable BOOL m_first_time;
  249. PhraseMatcher *m_matcher;
  250. DocumentSource<T> &m_doc_source;
  251. };
  252. #endif // PHRASESEARCH_H