Opera 12.15 Source Code
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

WordSegmenter.cpp 12KB


  1. /* -*- Mode: c++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*-
  2. **
  3. ** Copyright (C) 1995-2011 Opera Software ASA. All rights reserved.
  4. **
  5. ** This file is part of the Opera web browser.
  6. ** It may not be distributed under any circumstances.
  7. */
  8. #include "core/pch.h"
  9. #if defined(VISITED_PAGES_SEARCH) || defined(USE_SEARCH_ENGINE_WORDHIGHLIGHTER)
  10. #include "modules/search_engine/WordSegmenter.h"
  11. WordSegmenter::WordSegmenter(unsigned flags)
  12. #ifdef USE_UNICODE_SEGMENTATION
  13. : m_boundary_finder(UnicodeSegmenter::Word)
  14. #endif
  15. {
  16. m_original_string = NULL;
  17. m_word_break = NULL;
  18. m_original_string_end = NULL;
  19. m_flags = flags;
  20. }
  21. WordSegmenter::~WordSegmenter()
  22. {
  23. if ((m_flags & DontCopyInputString) == 0)
  24. OP_DELETEA(const_cast<uni_char*>(m_original_string));
  25. }
  26. OP_STATUS WordSegmenter::Set(const uni_char *string)
  27. {
  28. if (m_original_string != NULL && (m_flags & DontCopyInputString) == 0)
  29. {
  30. OP_DELETEA(const_cast<uni_char*>(m_original_string));
  31. m_original_string = NULL;
  32. }
  33. if (string == NULL)
  34. {
  35. m_original_string_end = NULL;
  36. m_word_break = NULL;
  37. return OpStatus::OK;
  38. }
  39. if ((m_flags & DontCopyInputString) == 0)
  40. RETURN_OOM_IF_NULL(m_original_string = PreprocessDup(string));
  41. else
  42. m_original_string = string;
  43. m_original_string_end = m_original_string + uni_strlen(m_original_string);
  44. m_word_break = m_original_string;
  45. return OpStatus::OK;
  46. }
  47. /* special character ranges:
  48. 0E00 .. 0E7F Thai
  49. 0E80 .. 0EFF Lao
  50. 0F00 .. 0FFF Tibetan
  51. 1000 .. 109F Myanmar (Burma)
  52. 1100 .. 11FF Hangul (Korea)
  53. 1780 .. 17FF Khmer (Cambodia)
  54. 19E0 .. 19FF Khmer Symbols
  55. 2E80 .. 2FFF CJK radicals
  56. 3000 .. 303F CJK symbols and punctuation
  57. 3040 .. 309F Hiragana (Japan)
  58. 30A0 .. 30FF Katakana (Japan)
  59. 3100 .. 312F Bopomofo (Taiwan, not special, has spaces!)
  60. 3130 .. 318F Hangul (Korea)
  61. 3190 .. 319F Kanbun (Japan)
  62. 31A0 .. 31BF Bopomofo (not special, has spaces!)
  63. 31C0 .. 31EF CJK strokes
  64. 31F0 .. 31FF Katakana
  65. 3200 .. 9FBF CJK
  66. /9FC0 .. 9FFF nothing
  67. A000 .. A4CF Yi (China)
  68. /A4D0 .. A6FF nothing
  69. A700 .. A71F Chinese tone modifiers
  70. /A720 .. A7FF nothing
  71. AC00 .. D7AF Hangul
  72. /D7B0 .. F8FF nothing
  73. F900 .. FAFF CJK
  74. FF65 .. FF9F Katakana/2
  75. FFA0 .. FFDF Hangul/2
  76. */
  77. #define between(i1, c, i2) ((c) >= i1 && (c) <= i2)
  78. #define uni_isthai(c) (between(0x0E00, c, 0x0EFF) || between(0x1000, c, 0x109F) || between(0x1780, c, 0x17FF) || between(0x19E0, c, 0x19FF))
  79. #define uni_isCJK(c) (between(0x2E80, c, 0x303F) || between(0x31C0, c, 0x31EF) || between(0x3200, c, 0xA7FF) || between(0xF900, c, 0xFAFF))
  80. #define uni_ishiragana(c) between(0x3040, c, 0x309F)
  81. #define uni_ishangul(c) (between(0xAC00, c, 0xD7AF) || between(0xFFA0, c, 0xFFDF))
  82. #define uni_iskatakana(c) (between(0x30A0, c, 0x30FF) || between(0x31F0, c, 0x31FF) || between(0xFF65, c, 0xFF9F))
  83. #define uni_nospacelng(c) (between(0x0E00, c, 0x11FF) || between(0x1780, c, 0x17FF) || between(0x19E0, c, 0x19FF) ||\
  84. between(0x2E80, c, 0x30FF) || between(0x3130, c, 0x319F) || between(0x31C0, c, 0xFAFF) || between(0xFF65, c, 0xFFDF))
  85. #ifndef USE_UNICODE_SEGMENTATION
  86. int WordSegmenter::GetCharFlags(UnicodePoint c)
  87. {
  88. int prop;
  89. int rv = 0;
  90. if (c == 0)
  91. return BreakBefore | BreakAfter;
  92. prop = Unicode::GetLinebreakProperties(c);
  93. if ((prop & LINEBREAK_ALLOW_BEFORE) != 0)
  94. rv |= BreakBefore;
  95. if ((prop & LINEBREAK_ALLOW_AFTER) != 0)
  96. rv |= BreakAfter;
  97. if ((prop & LINEBREAK_PROHIBIT_BEFORE) != 0)
  98. rv |= NoBreakBefore;
  99. if ((prop & LINEBREAK_PROHIBIT_AFTER) != 0)
  100. rv |= NoBreakAfter;
  101. if (c <= ' ')
  102. {
  103. rv |= BlockLimit;
  104. return rv;
  105. }
  106. if (uni_isalnum(c))
  107. rv |= AlNum;
  108. if (c <= 127)
  109. {
  110. if (CharSearch(block_limits, (unsigned char)c))
  111. rv |= BlockLimit;
  112. if (CharSearch(no_block_end, (unsigned char)c))
  113. rv |= NoBlockEnd;
  114. return rv;
  115. }
  116. rv |= BlockLimit;
  117. if (c < 0x0E00)
  118. return rv;
  119. if (c <= 0x19FF)
  120. {
  121. if (c <= 0x0EFF || between(0x1000, c, 0x109F) || between(0x1780, c, 0x17FF) || c >= 0x19E0)
  122. {
  123. if (rv & AlNum)
  124. rv &= ~0xF;
  125. rv |= Thai;
  126. }
  127. return rv;
  128. }
  129. if (uni_isCJK(c))
  130. rv |= CJK;
  131. if (uni_ishiragana(c))
  132. {
  133. if (rv & AlNum)
  134. rv &= ~0xF;
  135. rv |= Hiragana;
  136. }
  137. if (uni_ishangul(c))
  138. {
  139. if (rv & AlNum)
  140. rv &= ~0xF;
  141. rv |= Hangul;
  142. }
  143. if (uni_iskatakana(c))
  144. {
  145. if (rv & AlNum)
  146. rv &= ~0xF;
  147. rv |= Katakana;
  148. }
  149. return rv;
  150. }
  151. #else // USE_UNICODE_SEGMENTATION
  152. #define SET_ALNUM(x, char_class) switch (char_class) \
  153. { \
  154. case CC_Nd: \
  155. case CC_Nl: \
  156. case CC_No: \
  157. case CC_Ll: \
  158. case CC_Lm: \
  159. case CC_Lo: \
  160. case CC_Lt: \
  161. case CC_Lu: \
  162. x |= AlNum; \
  163. }
  164. int WordSegmenter::GetCharFlags(UnicodePoint c)
  165. {
  166. int rv = 0;
  167. if (c == 0)
  168. return BreakBefore | BreakAfter;
  169. if (c <= 127)
  170. return rv;
  171. if (c < 0x0E00)
  172. return rv;
  173. if (c <= 0x19FF)
  174. {
  175. if (c <= 0x0EFF || between(0x1000, c, 0x109F) || between(0x1780, c, 0x17FF) || c >= 0x19E0)
  176. rv |= Thai;
  177. return rv;
  178. }
  179. if (uni_isCJK(c))
  180. rv |= CJK;
  181. if (uni_ishiragana(c))
  182. rv |= Hiragana;
  183. if (uni_ishangul(c))
  184. rv |= Hangul;
  185. if (uni_iskatakana(c))
  186. rv |= Katakana;
  187. return rv;
  188. }
  189. #endif // USE_UNICODE_SEGMENTATION
  190. /*
  191. what makes a word:
  192. * single CJK (chinese, hanja, kanji, yi) character
  193. * tibetan syllable
  194. * couples of katakana characters (or a single self-standing character)
  195. * couples of hangul characters (or a single self-standing character)
  196. * triplets of hiragana characters (singles and couples are discarded)
  197. * triplets of thai/khmer/lao/myanmar characters (or less self-standing characters)
  198. * other alphanum characters
  199. what makes a block:
  200. * Block is a sequence of ascii characters beginning with a word.
  201. * Block contains at least two words separated by not-block-breaking characters.
  202. * Block can contain "=.,:;&\?!-@", but doesn't end with them.
  203. (aargh!@#$5 is a valid block)
  204. */
  205. void WordSegmenter::GetNextToken(Word &token)
  206. {
  207. const uni_char *pos, *next_pos;
  208. int c_prop, next_prop;
  209. BOOL word_end;
  210. UnicodePoint uc, next_uc;
  211. int uc_width, next_uc_width;
  212. if (m_original_string == NULL)
  213. {
  214. token.Empty();
  215. return;
  216. }
  217. if (m_original_string_end - m_word_break)
  218. uc = Unicode::GetUnicodePoint(m_word_break, (int)(m_original_string_end - m_word_break), uc_width);
  219. else
  220. {
  221. uc = 0;
  222. uc_width = 0;
  223. }
  224. #ifdef USE_UNICODE_SEGMENTATION
  225. m_boundary_finder.Reset();
  226. if (m_boundary_finder.FindBoundary(m_word_break, uc_width) == 0)
  227. m_boundary_finder.FindBoundary(m_word_break, uc_width);
  228. c_prop = GetCharFlags(uc);
  229. SET_ALNUM(c_prop, m_boundary_finder.LastClass())
  230. #else
  231. c_prop = GetCharFlags(uc);
  232. #endif
  233. for (pos = m_word_break; pos < m_original_string_end;
  234. pos = next_pos, uc = next_uc, uc_width = next_uc_width, c_prop = next_prop)
  235. {
  236. next_pos = pos + uc_width;
  237. if (next_pos < m_original_string_end)
  238. next_uc = Unicode::GetUnicodePoint(next_pos, (int)(m_original_string_end - next_pos), next_uc_width);
  239. else
  240. {
  241. next_uc_width = 1;
  242. next_uc = 0;
  243. }
  244. next_prop = GetCharFlags(next_uc);
  245. #ifdef USE_UNICODE_SEGMENTATION
  246. if ((c_prop & NoSpaceLng) != (next_prop & NoSpaceLng))
  247. {
  248. word_end = TRUE;
  249. if (uni_isalnum(next_uc))
  250. next_prop |= AlNum;
  251. if (m_boundary_finder.FindBoundary(next_pos, next_uc_width) == 0)
  252. m_boundary_finder.FindBoundary(next_pos, next_uc_width);
  253. }
  254. else {
  255. word_end = (m_boundary_finder.FindBoundary(next_pos, next_uc_width) == 0);
  256. if (word_end)
  257. m_boundary_finder.FindBoundary(next_pos, next_uc_width);
  258. // workaround for unicode always breaking on hiragana characters
  259. if ((c_prop & Hiragana) != 0 && (next_prop & Hiragana) != 0)
  260. word_end = FALSE;
  261. // workaround for unicode always breaking on thai characters
  262. else if ((c_prop & Thai) != 0 && (next_prop & Thai) != 0)
  263. word_end = FALSE;
  264. SET_ALNUM(next_prop, m_boundary_finder.LastClass())
  265. }
  266. // Additional word breaks for fine-grained word-segmenting
  267. if ((m_flags & FineSegmenting) != 0)
  268. {
  269. WordBreakType wb1 = Unicode::GetWordBreakType(uc);
  270. WordBreakType wb2 = Unicode::GetWordBreakType(next_uc);
  271. if (((wb1 == WB_ALetter) != (wb2 == WB_ALetter)) ||
  272. ((wb1 == WB_Numeric) != (wb2 == WB_Numeric)) ||
  273. (Unicode::IsLower(uc) && Unicode::IsUpper(next_uc)))
  274. word_end = TRUE;
  275. }
  276. #else
  277. word_end = ((c_prop & BreakAfter) != 0 && (next_prop & NoBreakBefore) == 0) ||
  278. ((next_prop & BreakBefore) != 0 && (c_prop & NoBreakAfter) == 0) ||
  279. (c_prop & NoSpaceLng) != (next_prop & NoSpaceLng);
  280. #endif
  281. // skip leading spaces/punctuation
  282. if ((c_prop & AlNum) == 0 && pos == m_word_break)
  283. {
  284. m_word_break = next_pos;
  285. continue;
  286. }
  287. // word end
  288. if (word_end) // change of script
  289. {
  290. token.Set(m_word_break, (int)(next_pos - m_word_break));
  291. m_word_break = next_pos;
  292. return;
  293. }
  294. // japanese/korean, thai/lao/khmer/myanmar
  295. if ((m_flags & DisableNGrams) == 0 &&
  296. ((pos - m_word_break >= 1 && (c_prop & (Katakana | Hangul)) != 0) ||
  297. (pos - m_word_break >= 2 && (c_prop & (Thai | Hiragana)) != 0)))
  298. {
  299. token.Set(m_word_break, (int)(next_pos - m_word_break));
  300. int width;
  301. Unicode::GetUnicodePoint(m_word_break, (int)(m_original_string_end - m_word_break), width);
  302. m_word_break += width;
  303. return;
  304. }
  305. }
  306. token.Empty();
  307. }
  308. OP_BOOLEAN WordSegmenter::GetNextToken(OpString &token)
  309. {
  310. Word tmp;
  311. GetNextToken(tmp);
  312. RETURN_IF_ERROR(token.Set(tmp.ptr, tmp.len));
  313. return tmp.IsEmpty() ? OpBoolean::IS_FALSE : OpBoolean::IS_TRUE;
  314. }
  315. TVector<WordSegmenter::Word> *WordSegmenter::GetTokens(void)
  316. {
  317. OpAutoPtr< TVector<Word> > rv;
  318. Word token;
  319. rv.reset(OP_NEW(TVector<Word>, ()));
  320. if (rv.get() == NULL)
  321. return NULL;
  322. GetNextToken(token);
  323. while (!token.IsEmpty())
  324. {
  325. RETURN_VALUE_IF_ERROR(rv->Add(token), NULL);
  326. GetNextToken(token);
  327. }
  328. return rv.release();
  329. }
  330. TVector<uni_char *> *WordSegmenter::Parse(BOOL *last_is_prefix)
  331. {
  332. OpAutoPtr< TVector<uni_char *> > rv;
  333. Word token;
  334. uni_char *str;
  335. if (last_is_prefix != NULL)
  336. *last_is_prefix = FALSE;
  337. rv.reset(OP_NEW(TVector<uni_char *>, (&UniStrCompare, &PtrDescriptor<uni_char>::DestructArray)));
  338. if (rv.get() == NULL)
  339. return NULL;
  340. GetNextToken(token);
  341. while (!token.IsEmpty())
  342. {
  343. if ((str = token.Extract()) == NULL)
  344. return NULL;
  345. RETURN_VALUE_IF_ERROR(rv->Add(str), NULL);
  346. if (last_is_prefix != NULL)
  347. *last_is_prefix = (*(token.ptr+token.len) == '\0'); // Prefix only if the last word ends the string
  348. GetNextToken(token);
  349. }
  350. return rv.release();
  351. }
  352. #ifdef USE_UNICODE_SEGMENTATION
  353. BOOL WordSegmenter::WordBreak(const uni_char *buf, const uni_char *s, BOOL fine_segmenting)
  354. {
  355. const uni_char *gc_boundary;
  356. int boundary;
  357. UnicodeSegmenter segmenter(UnicodeSegmenter::Word);
  358. if (s <= buf)
  359. return TRUE;
  360. if (!UnicodeSegmenter::IsGraphemeClusterBoundary(s[-1], s[0]))
  361. return FALSE;
  362. WordBreakType wb1 = Unicode::GetWordBreakType(s[-1]);
  363. WordBreakType wb2 = Unicode::GetWordBreakType(s[0]);
  364. // Override rule WB13 in UnicodeSegmenter
  365. if (wb1 == WB_Katakana || wb2 == WB_Katakana)
  366. return TRUE;
  367. // Additional word breaks for fine-grained word-segmenting
  368. if (fine_segmenting)
  369. {
  370. if (((wb1 == WB_ALetter) != (wb2 == WB_ALetter)) ||
  371. ((wb1 == WB_Numeric) != (wb2 == WB_Numeric)) ||
  372. (Unicode::IsLower(s[-1]) && Unicode::IsUpper(s[0])))
  373. return TRUE;
  374. }
  375. gc_boundary = s - 1;
  376. while (gc_boundary > buf && !UnicodeSegmenter::IsGraphemeClusterBoundary(gc_boundary[-1], gc_boundary[0]))
  377. --gc_boundary;
  378. if ((boundary = segmenter.FindBoundary(gc_boundary, (int)(s - gc_boundary + 1))) == 0)
  379. boundary = segmenter.FindBoundary(gc_boundary, (int)(s - gc_boundary + 1));
  380. return boundary == s - gc_boundary;
  381. }
  382. #else
  383. BOOL WordSegmenter::WordBreak(const uni_char *buf, const uni_char *s)
  384. {
  385. int f1, f2;
  386. if (s > buf)
  387. f1 = GetCharFlags(s[-1]);
  388. else
  389. f1 = GetCharFlags(0);
  390. f2 = GetCharFlags(*s);
  391. return (f1 & AlNum) == 0 || (f2 & AlNum) == 0 ||
  392. ((f1 & BreakAfter) != 0 && (f2 & NoBreakBefore) == 0) || ((f2 & BreakBefore) != 0 && (f1 & NoBreakAfter) == 0);
  393. }
  394. #endif
  395. BOOL WordSegmenter::IsInvisibleWordCharacter(const uni_char ch)
  396. {
  397. return Unicode::GetWordBreakType((UnicodePoint)ch) == WB_Format;
  398. }
  399. uni_char *WordSegmenter::PreprocessDup(const uni_char *src)
  400. {
  401. if (!src)
  402. return NULL;
  403. uni_char *dst = OP_NEWA(uni_char, uni_strlen(src)+1);
  404. if (!dst)
  405. return NULL;
  406. uni_char *p = dst;
  407. uni_char ch;
  408. while ((ch = *src++) != 0)
  409. if (!IsInvisibleWordCharacter(ch))
  410. *p++ = ch;
  411. *p = 0;
  412. return dst;
  413. }
  414. BOOL WordSegmenter::UniStrCompare(const void *left, const void *right)
  415. {
  416. return uni_strcmp(*(uni_char**)left, *(uni_char**)right) < 0;
  417. }
  418. #endif // VISITED_PAGES_SEARCH