Opera 12.15 Source Code
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

WordSegmenter.h 5.3KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142
  1. /* -*- Mode: c++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*-
  2. **
  3. ** Copyright (C) 1995-2011 Opera Software ASA. All rights reserved.
  4. **
  5. ** This file is part of the Opera web browser.
  6. ** It may not be distributed under any circumstances.
  7. */
  8. #ifndef WORDSEGMENTER_H
  9. #define WORDSEGMENTER_H
  10. #include "modules/search_engine/Vector.h"
  11. #ifdef USE_UNICODE_SEGMENTATION
  12. #include "modules/unicode/unicode_segmenter.h"
  13. #endif
  14. /**
  15. * @brief Parses plain text into separate words.
  16. * @author Pavel Studeny <pavels@opera.com>
  17. *
  18. * Finds words in any 16-bit unicode script or mixture of scripts.
  19. * Emails and URLs are detected and returned both segmented to words and as a whole.
  20. * Special behavior for languages with no spaces between words:
  21. * returns single characters for Chinese and n-grams for the others
  22. */
  23. class WordSegmenter : public NonCopyable
  24. {
  25. public:
  26. enum Flags
  27. {
  28. DisableNGrams = 1, ///< bigrams and trigrams are used for spaceless scripts like hiragana, not desirable for word highlighting
  29. // DisableSpecialBlock would be easy to implement, if needed
  30. DontCopyInputString = 2, /**< The input string set with Set will be used directly, not copied.
  31. The string must not be deleted before the WordSegmenter is.
  32. Note that this flag prevents preprocessing of the input string (e.g. to remove &shy; characters) */
  33. FineSegmenting = 4 ///< Fine-grained segmenting with additional word boundaries: letter-nonletter, number-nonnumber, lower-uppercase transition
  34. };
  35. /* return values of WordSegmenter::GetCharFlags */
  36. enum CharFlags
  37. {
  38. BreakBefore = 1, ///< word can end before this character
  39. BreakAfter = 2, ///< word can end this character
  40. NoBreakBefore = 4, ///< word mustn't end before this character
  41. NoBreakAfter = 8, ///< word mustn't end after this character
  42. AlNum = 16, ///< letter or number
  43. CJK = 32, ///< chinese characters (china, japan, korea) returns 1 character
  44. Hiragana = 64, ///< japan returns 2 characters in a sliding window
  45. Hangul = 128, ///< korea returns 2 characters in a sliding window
  46. Katakana = 256, ///< japan returns 2 characters in a sliding window
  47. Thai = 512, ///< thailand returns 3 characters in a sliding window
  48. NoSpaceLng = CJK | Hiragana | Hangul | Katakana | Thai
  49. };
  50. struct Word
  51. {
  52. const uni_char *ptr;
  53. int len;
  54. Word() : ptr(NULL), len(0) {}
  55. void Set(const uni_char *p, int l) { ptr = p; len = l; }
  56. void Empty() { ptr = NULL; len = 0; }
  57. BOOL IsEmpty() const { return ptr==NULL; }
  58. uni_char* Extract() const { uni_char* s=OP_NEWA(uni_char,len+1); if(s){ uni_strncpy(s,ptr,len); s[len]=0; } return s; }
  59. BOOL operator<(const Word &right) { return (ptr == right.ptr && len < right.len) || ptr < right.ptr; }
  60. };
  61. WordSegmenter(unsigned flags = 0);
  62. ~WordSegmenter();
  63. CHECK_RESULT(OP_STATUS Set(const OpStringC &string)) {return Set(string.CStr());}
  64. /**
  65. * delete the old data, if any, and set a new string
  66. */
  67. CHECK_RESULT(OP_STATUS Set(const uni_char *string));
  68. /**
  69. * get next word from the text set with the Set function.
  70. */
  71. void GetNextToken(Word &token);
  72. /**
  73. * get next word from the text set with the Set function.
  74. */
  75. CHECK_RESULT(OP_BOOLEAN GetNextToken(OpString &token));
  76. /**
  77. * get list of all the words, the words are destructed automatically in the output Vector's destructor
  78. * @return NULL on out of memory or if no string had been set
  79. */
  80. TVector<Word> *GetTokens(void);
  81. /**
  82. * get list of all the words, the words are destructed automatically in the output Vector's destructor
  83. * @param last_is_prefix if not NULL, will be set to TRUE if the last parsed word can be considered a prefix
  84. * @return NULL on out of memory or if no string had been set
  85. */
  86. TVector<uni_char *> *Parse(BOOL *last_is_prefix = NULL);
  87. /**
  88. * @param buf beginning of the string containing the word break
  89. * @param s character after the possible word break, s == buf and *s == 0 is allowed
  90. * @param fine_segmenting TRUE to achieve the same effect as flag "FineSegmenting"
  91. * @return TRUE if a word break is possible before s
  92. */
  93. static BOOL WordBreak(const uni_char *buf, const uni_char *s, BOOL fine_segmenting = FALSE);
  94. /**
  95. * Check if a character belongs to a character class that can be a part of
  96. * a word, but is invisible. E.g soft-hyphen (0xAD)
  97. * @param ch character to check
  98. * @return TRUE if ch is an invisible word character
  99. */
  100. static BOOL IsInvisibleWordCharacter(const uni_char ch);
  101. /**
  102. * Preprocess while duplicating a string to be used in the WordSegmenter.
  103. * This removes IsInvisibleWordCharacter characters from the input.
  104. * @param string The input string
  105. * @return The preprocessed string, or NULL if out of memory
  106. */
  107. static uni_char *PreprocessDup(const uni_char *string);
  108. private:
  109. // a lot of the work is outsouced to the calling layer #ifdef USE_UNICODE_SEGMENTATION
  110. inline static int GetCharFlags(UnicodePoint c);
  111. static BOOL UniStrCompare(const void *left, const void *right);
  112. const uni_char *m_original_string;
  113. const uni_char *m_word_break;
  114. const uni_char *m_original_string_end;
  115. // Should also possibly run without unicode segmenter.
  116. #ifdef USE_UNICODE_SEGMENTATION
  117. UnicodeSegmenter m_boundary_finder;
  118. #endif
  119. unsigned m_flags;
  120. };
  121. #endif // WORDSEGMENTER_H