Opera 12.15 Source Code
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

ACT.cpp 29KB


  1. /* -*- Mode: c++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*-
  2. **
  3. ** Copyright (C) 1995-2011 Opera Software ASA. All rights reserved.
  4. **
  5. ** This file is part of the Opera web browser.
  6. ** It may not be distributed under any circumstances.
  7. */
  8. #include "core/pch.h"
  9. #ifdef SEARCH_ENGINE // to remove compilation errors with ADVANCED_OPVECTOR
  10. #include "modules/search_engine/ACT.h"
  11. #include "modules/search_engine/ACTUtil.h"
  // Mask applied every time m_NUR_mark is advanced in this file, keeping the counter in the range 0..0x7FFF.
  12. #define NUR_MASK 0x7FFF
  // One past the largest value NUR_MASK lets through; not referenced in the visible part of this file.
  13. #define NUR_MAX 0x8000
  14. class ACTPrefixIterator : public SearchIterator<ACT::PrefixResult>
  15. {
  16. public:
  17. ACTPrefixIterator(void) {m_prefix = NULL; m_current_result.id = 0; m_current_result.utf8_word = NULL; m_status = OpBoolean::IS_FALSE; m_act = NULL;}
  18. ~ACTPrefixIterator(void) {op_free(m_prefix);}
  19. virtual BOOL Next(void)
  20. {
  21. if (m_status != OpBoolean::IS_TRUE)
  22. return FALSE;
  23. return (m_status = m_act->FindNext(m_current_result, m_prefix)) == OpBoolean::IS_TRUE;
  24. }
  25. virtual BOOL Prev(void) {return FALSE;}
  26. CHECK_RESULT(virtual OP_STATUS Error(void) const) {return OpStatus::IsError(m_status) ? m_status : OpStatus::OK;}
  27. virtual int Count(void) const {return m_status != OpBoolean::IS_TRUE ? 0 : -1;}
  28. virtual BOOL End(void) const {return m_status != OpBoolean::IS_TRUE;}
  29. virtual BOOL Beginning(void) const {return TRUE;}
  30. virtual const ACT::PrefixResult &Get(void) {return m_current_result;}
  31. protected:
  32. friend class ACT;
  33. CHECK_RESULT(virtual OP_STATUS Init(const char *prefix, ACT *act))
  34. {
  35. RETURN_OOM_IF_NULL(m_prefix = op_strdup(prefix));
  36. m_act = act;
  37. m_status = m_act->FindFirst(m_current_result, m_prefix);
  38. return OpStatus::IsError(m_status) ? m_status : OpStatus::OK;
  39. }
  40. char *m_prefix;
  41. ACT::PrefixResult m_current_result;
  42. OP_BOOLEAN m_status;
  43. ACT *m_act;
  44. };
  45. /** Used as prefix iterator when the last word of the query is a whole word, not a prefix after all */
  46. class ACTSingleWordIterator : public ACTPrefixIterator
  47. {
  48. virtual BOOL Next(void)
  49. {
  50. m_status = OpBoolean::IS_FALSE;
  51. return FALSE;
  52. }
  53. CHECK_RESULT(virtual OP_STATUS Init(const char *prefix, ACT *act))
  54. {
  55. RETURN_OOM_IF_NULL(m_current_result.utf8_word = SetNewStr(prefix));
  56. m_current_result.id = act->CaseSearch(prefix);
  57. m_status = m_current_result.id ? OpBoolean::IS_TRUE : OpBoolean::IS_FALSE;
  58. return OpStatus::OK;
  59. }
  60. };
  61. ACT::ACT(void) : BSCache(ACT_MAX_CACHE_BRANCHES)
  62. {
  63. op_memset(random_status, 0, sizeof(random_status));
  64. m_TailCallback = NULL;
  65. #ifdef _DEBUG
  66. collision_count = 0;
  67. #endif
  68. }
  69. OP_STATUS ACT::Open(const uni_char* path, BlockStorage::OpenMode mode, TailCallback tc, void *callback_val, OpFileFolder folder)
  70. {
  71. int block_size;
  72. TrieBranch *t;
  73. m_TailCallback = tc;
  74. m_callback_val = callback_val;
  75. block_size = (TrieBranch::GetPackedSize() + 12 + 511) & ~511;
  76. RETURN_IF_ERROR(m_storage.Open(path, mode, block_size, 0, folder));
  77. if (m_storage.GetBlockSize() != block_size)
  78. {
  79. m_storage.Close();
  80. return OpStatus::ERR_PARSING_FAILED;
  81. }
  82. if (!m_storage.IsNativeEndian())
  83. m_storage.SetOnTheFlyCnvFunc(&TrieNode::SwitchEndian, NULL);
  84. if (m_storage.GetFileSize() < (OpFileLength)(m_storage.GetBlockSize() * 2)) // empty file
  85. {
  86. if ((t = OP_NEW(TrieBranch, (0, NULL, 0, 0))) == NULL)
  87. {
  88. m_storage.Close();
  89. return OpStatus::ERR_NO_MEMORY;
  90. }
  91. if (OpStatus::IsError(t->Write(&m_storage)))
  92. {
  93. m_storage.Close();
  94. return OpStatus::ERR_NO_DISK;
  95. }
  96. InitRandom();
  97. #ifdef _DEBUG
  98. ++branches_created;
  99. #endif
  100. OP_DELETE(t);
  101. }
  102. else
  103. RestoreStatus(); // Restores Random
  104. return OpStatus::OK;
  105. }
  106. OP_STATUS ACT::Close(void)
  107. {
  108. OP_STATUS err, err2;
  109. err = Flush(ReleaseAll);
  110. ClearCache();
  111. if (m_storage.InTransaction())
  112. {
  113. err2 = SaveStatus();
  114. err = OpStatus::IsError(err) ? err : err2;
  115. err2 = m_storage.Commit();
  116. err = OpStatus::IsError(err) ? err : err2;
  117. }
  118. m_storage.Close();
  119. return err;
  120. }
  121. OP_STATUS ACT::Clear(void)
  122. {
  123. TrieBranch *t;
  124. Abort();
  125. RETURN_IF_ERROR(m_storage.Clear());
  126. if ((t = OP_NEW(TrieBranch, (0, NULL, 0, 0))) == NULL)
  127. {
  128. m_storage.Close();
  129. return OpStatus::ERR_NO_MEMORY;
  130. }
  131. if (OpStatus::IsError(t->Write(&m_storage)))
  132. {
  133. m_storage.Close();
  134. return OpStatus::ERR_NO_DISK;
  135. }
  136. InitRandom();
  137. #ifdef _DEBUG
  138. ++branches_created;
  139. #endif
  140. OP_DELETE(t);
  141. return OpStatus::OK;
  142. }
  143. int ACT::WordsEqual(const char *w1, const char *w2, int max)
  144. {
  145. int len = 0;
  146. if (max == 0)
  147. return TRUE;
  148. if (max < 0)
  149. max = (int)op_strlen(w1) + 1;
  150. while (*w1 != 0 && *w2 != 0)
  151. {
  152. SkipNonPrintableChars(w1);
  153. SkipNonPrintableChars(w2);
  154. if (*w1 != *w2)
  155. return len;
  156. if (--max <= 0)
  157. return -1;
  158. ++w1;
  159. ++len;
  160. ++w2;
  161. }
  162. SkipNonPrintableChars(w1);
  163. SkipNonPrintableChars(w2);
  164. return *w1 == 0 && *w2 == 0 ? -1 : len;
  165. }
  166. #ifdef ESTIMATE_MEMORY_USED_AVAILABLE
  167. size_t ACT::EstimateMemoryUsed() const
  168. {
  169. return BSCache::EstimateMemoryUsed() +
  170. sizeof(m_TailCallback) +
  171. sizeof(m_callback_val);
  172. }
  173. #endif
  174. OP_STATUS ACT::AddCaseWord(const char *utf8_word, WordID id, BOOL overwrite_existing)
  175. {
  176. if (!m_storage.InTransaction())
  177. {
  178. RETURN_IF_ERROR(m_storage.BeginTransaction(m_flush_mode == JournalOnly));
  179. }
  180. if (m_TailCallback != NULL)
  181. {
  182. WordID tail_id;
  183. char *tail_word;
  184. int eq_len;
  185. OP_STATUS err;
  186. if ((tail_id = CaseSearch(utf8_word)) != 0)
  187. {
  188. RETURN_IF_ERROR(m_TailCallback(&tail_word, tail_id, m_callback_val));
  189. if ((eq_len = WordsEqual(tail_word, utf8_word)) > 0)
  190. {
  191. if (OpStatus::IsError((err = AddCaseWord(tail_word, tail_id, eq_len, overwrite_existing))))
  192. {
  193. OP_DELETEA(tail_word);
  194. return err;
  195. }
  196. }
  197. OP_DELETEA(tail_word);
  198. }
  199. }
  200. else
  201. m_NUR_mark = (m_NUR_mark + 1) & NUR_MASK; // in the first case it's done in CaseSearch
  202. return AddCaseWord(utf8_word, id, 0, overwrite_existing);
  203. }
  /**
   * Inserts utf8_word into the trie character by character and stores id on the node where insertion stops.
   *
   * @param utf8_word word to insert; characters stepped over by SkipNonPrintableChars are ignored
   * @param id value stored with the word
   * @param new_len counter decremented once per consumed character while positive; when it is <= 0 and a
   *        tail callback is installed, insertion stops early at the first free node (marked Final) or at an
   *        existing Final node that is not a word
   * @param overwrite_existing if TRUE, an already present entry gets its id replaced when it differs
   * @return OpBoolean::IS_TRUE if a new entry was created, OpBoolean::IS_FALSE if the path already existed,
   *         OpStatus::OK without any change for an empty word; on error the pending changes are aborted
   *         (Abort()) and the error is returned
   */
  204. OP_STATUS ACT::AddCaseWord(const char *utf8_word, WordID id, int new_len, BOOL overwrite_existing)
  205. {
  206. NodePointer t(this);
  207. OP_STATUS err;
  // stays TRUE only if every node on the path was already in use for this word
  208. BOOL e_exist = TRUE;
  209. int current_super, conflict_super;
  210. const char *next_char;
  211. if (!m_storage.InTransaction())
  212. {
  213. RETURN_IF_ERROR(m_storage.BeginTransaction(m_flush_mode == JournalOnly));
  214. }
  215. SkipNonPrintableChars(utf8_word);
  216. if (*utf8_word == 0)
  217. return OpStatus::OK;
  // start from branch 2, the same starting point every lookup in this file uses
  218. if (OpStatus::IsError(err = t.Reset(2)))
  219. goto add_word_rollback;
  220. while (*utf8_word != 0)
  221. {
  // next_char: the character that follows the current one once ignored characters are skipped
  222. next_char = utf8_word + 1;
  223. SkipNonPrintableChars(next_char);
  // the current character cannot be addressed from the current position:
  // try Reposition() first and fall back to Move() if that fails
  224. if (!t.ValidChar(utf8_word[0]))
  225. {
  226. if (!t.Reposition(t.GetCurrentOffset(), utf8_word[0]))
  227. if (OpStatus::IsError(err = t.Move(t.GetCurrentOffset())))
  228. goto add_word_rollback;
  229. }
  230. if (OpStatus::IsError(err = t.Goto(utf8_word[0])))
  231. goto add_word_rollback;
  232. // conflict?
  // the target node is in use and belongs to a different parent than the one we came from
  233. if (!t.IsFree() && t.GetParent() != t.GetCurrentParent())
  234. {
  235. current_super = t.GetSuperParent(t.GetCurrentParent()); // the first predecessor in this branch
  236. conflict_super = t.GetSuperParent(t.GetParent());
  237. // there are better algorithms in theory, but the experiments proved that this one is the best (see older versions in cvs)
  // choose which of the two chains gives way: the one occupying the node, or the one being inserted
  238. if (t.GetParent() != 0 &&
  239. (t.GetCurrentParent() == 0 ||
  240. t.GetChildrenSize(current_super) + (int)op_strlen(utf8_word) < t.GetChildrenSize(conflict_super)))
  241. {
  242. if (!t.Reposition(t.GetParent())) // repositioning the shorter chain makes the fill factor worse
  243. if (OpStatus::IsError(err = t.Move(conflict_super)))
  244. goto add_word_rollback;
  245. }
  246. else {
  247. if (!t.Reposition())
  248. if (OpStatus::IsError(err = t.Move(current_super)))
  249. goto add_word_rollback;
  250. }
  251. }
  252. if (!t.IsFree()) // is this node already used with the current word as a prefix?
  253. {
  254. if (*next_char == 0)
  255. {
  // last character of the word: mark the existing node as a word if it is not one yet
  256. if (!t.IsWord())
  257. {
  258. e_exist = FALSE;
  259. t.SetIsWord();
  260. }
  261. }
  262. else if (t.IsFinal()) // is the current word longer?
  263. {
  264. if (m_TailCallback != NULL && !t.IsWord() && new_len <= 0)
  265. break; // just overwriting the WordId of the same word
  266. if (OpStatus::IsError(err = t.NewNode((unsigned char)*next_char)))
  267. goto add_word_rollback;
  268. }
  269. }
  270. else {
  // free node: claim it for the current parent
  271. e_exist = FALSE;
  272. t.SetParent(t.GetCurrentParent());
  273. if (*next_char == 0)
  274. t.SetFinalWord();
  275. else if (m_TailCallback != NULL && new_len <= 0 && *next_char != 0)
  276. {
  // with a tail callback the remaining characters are not stored in the trie
  277. t.SetFinal();
  278. break; // end of input
  279. }
  280. else {
  281. t.SetFinal(); // Temporarily, until NewNode sets a child
  282. if (OpStatus::IsError(err = t.NewNode((unsigned char)*next_char)))
  283. goto add_word_rollback;
  284. }
  285. }
  286. utf8_word = next_char;
  287. if (new_len > 0)
  288. --new_len;
  289. }
  // store the id for a new entry, or replace a differing id when overwriting was requested
  290. if (!e_exist || (overwrite_existing && t.GetId() != id))
  291. t.SetId(id);
  292. #ifdef _DEBUG
  293. if (t.GetCurrentPointer()->modified)
  294. OP_ASSERT(TrieBranch::CheckIntegrity(t.GetCurrentPointer(), this));
  295. #endif
  296. return e_exist ? OpBoolean::IS_FALSE : OpBoolean::IS_TRUE;
  // error exit: release the node pointer and abort the pending changes
  297. add_word_rollback:
  298. t.Reset();
  299. Abort();
  300. return err;
  301. }
  /**
   * Removes utf8_word from the trie.
   *
   * The first pass walks the word and remembers (last_branch, last_pos, last_word) - the position from which
   * the rest of the path belongs to this word only. The second pass frees the nodes from that position on.
   *
   * @param utf8_word word to delete; characters stepped over by SkipNonPrintableChars are ignored
   * @return OpStatus::OK on success; OpStatus::ERR_OUT_OF_RANGE if the word is empty or not found in the
   *         trie; any error from moving through the trie (errors in the second pass also trigger Abort())
   */
  302. OP_STATUS ACT::DeleteCaseWord(const char *utf8_word)
  303. {
  304. BSCache::Item::DiskId last_branch;
  305. int last_pos;
  306. const char *last_word, *word_origin;
  307. NodePointer t(this), tmp(this);
  308. OP_STATUS err;
  309. int i, max_offset, children;
  310. const char *next_char;
  311. SkipNonPrintableChars(utf8_word);
  312. if (*utf8_word == 0)
  313. return OpStatus::ERR_OUT_OF_RANGE;
  314. m_NUR_mark = (m_NUR_mark + 1) & NUR_MASK;
  315. word_origin = utf8_word;
  // last_pos == 0 means "no unique tail found (yet)"
  316. last_branch = 1;
  317. last_pos = 0;
  318. last_word = word_origin;
  319. RETURN_IF_ERROR(t.Reset(2));
  // [i, max_offset) is the range of node offsets scanned for children of the current node
  320. i = 1;
  321. max_offset = 257 - FIRST_CHAR;
  322. // find the point from which the word is unique
  323. while (*utf8_word != 0)
  324. {
  325. next_char = utf8_word + 1;
  326. SkipNonPrintableChars(next_char);
  // tmp looks one character ahead while t stays on the current node
  327. tmp = t;
  328. RETURN_IF_ERROR(tmp.Goto(utf8_word[0]));
  329. if (tmp.IsFree() || tmp.GetParent() != tmp.GetCurrentParent())
  330. return OpStatus::ERR_OUT_OF_RANGE;
  // count the nodes in the scanned range that have the same parent as the node tmp points at
  331. children = 0;
  332. while (i < max_offset)
  333. {
  334. if (!tmp.GetCurrentPointer()->IsFree(i) &&
  335. tmp.GetCurrentPointer()->GetParent(i) == tmp.GetCurrentParent())
  336. if (++children > 1) // previous node seem to be unique, but it isn't
  337. {
  338. last_branch = 1;
  339. last_pos = 0;
  340. last_word = word_origin;
  341. break;
  342. }
  343. ++i;
  344. }
  345. // current path might be unique, but another word is prefix of this one
  346. if (t.IsWord() && last_pos != 0)
  347. {
  348. last_branch = t.GetCurrentBranch();
  349. last_pos = t.GetCurrentOffset();
  350. last_word = utf8_word;
  351. }
  352. // this particular node has just one child
  353. if (children <= 1 && last_pos == 0)
  354. {
  355. last_branch = t.GetCurrentBranch();
  356. last_pos = t.GetCurrentOffset();
  357. last_word = utf8_word;
  358. }
  // a Final node before the end of the word: without a tail callback (or if the node is a word) the
  // searched word is not stored; otherwise the rest of the word is not kept in the trie, so skip it
  359. if (tmp.IsFinal() && *next_char != 0)
  360. {
  361. if (m_TailCallback == NULL || tmp.IsWord())
  362. return OpStatus::ERR_OUT_OF_RANGE;
  363. else {
  364. while (*next_char != 0)
  365. ++next_char;
  366. }
  367. }
  368. utf8_word = next_char;
  369. t = tmp;
  370. tmp.Reset();
  // set up the offset range to scan for the next character
  371. if (utf8_word[0] != 0)
  372. {
  373. if (t.HasChild())
  374. i = 1;
  375. else i = t.GetOffset();
  376. if ((max_offset = i + 256 - FIRST_CHAR) > TRIE_SIZE)
  377. max_offset = TRIE_SIZE;
  378. if (i <= 0)
  379. i = 1;
  380. }
  381. }
  382. // currently found string isn't an indexed word
  383. if (!t.IsWord() && !t.IsFinal())
  384. return OpStatus::ERR_OUT_OF_RANGE;
  // modifications start here, so a transaction is required from this point on
  385. if (!m_storage.InTransaction())
  386. {
  387. RETURN_IF_ERROR(m_storage.BeginTransaction(m_flush_mode == JournalOnly));
  388. }
  389. // current word is a prefix of another word(s)
  390. if (!t.IsFinal())
  391. {
  392. t.SetIsNotWord();
  393. return OpStatus::OK;
  394. }
  395. // since here, Word and Final are set
  396. // not unique until the last character
  397. if (last_pos == 0)
  398. {
  399. t.SetFree();
  400. return OpStatus::OK;
  401. }
  402. // delete the nodes
  // second pass: restart at the remembered position and free every node of the unique tail
  403. utf8_word = last_word;
  404. if (OpStatus::IsError(err = t.Reset(last_branch)))
  405. goto delete_word_rollback;
  406. t.SetCurrentOffset(last_pos);
  407. // set the last non-unique position
  408. tmp = t;
  409. if (OpStatus::IsError(err = tmp.Goto(utf8_word[0])))
  410. goto delete_word_rollback;
  // the node at the remembered position stays as a final word if it is a word itself, otherwise it is freed
  411. if (t.IsWord())
  412. t.SetFinalWord();
  413. else t.SetFree();
  414. t = tmp;
  415. tmp.Reset();
  416. ++utf8_word;
  417. SkipNonPrintableChars(utf8_word);
  // with a tail callback, a Final non-word node ends the stored part of the word
  418. if (!t.IsWord() && t.IsFinal() && m_TailCallback != NULL)
  419. {
  420. while (*utf8_word != 0)
  421. ++utf8_word;
  422. }
  423. while (*utf8_word != 0)
  424. {
  425. next_char = utf8_word + 1;
  426. SkipNonPrintableChars(next_char);
  427. tmp = t;
  428. if (OpStatus::IsError(err = tmp.Goto(utf8_word[0])))
  429. goto delete_word_rollback;
  430. if (!tmp.IsWord() && tmp.IsFinal() && m_TailCallback != NULL)
  431. {
  432. while (*next_char != 0)
  433. ++next_char;
  434. }
  435. t.SetFree();
  // when the walk leaves a branch other than the one deletion started in, that branch is unlinked
  436. if (t.GetCurrentBranch() != tmp.GetCurrentBranch() && t.GetCurrentBranch() != last_branch)
  437. Unlink(t.GetCurrentPointer());
  438. t = tmp;
  439. tmp.Reset();
  440. utf8_word = next_char;
  441. }
  442. t.SetFree();
  443. if (t.GetCurrentBranch() != last_branch)
  444. Unlink(t.GetCurrentPointer());
  445. #ifdef _DEBUG
  446. if (t.GetCurrentPointer()->modified)
  447. OP_ASSERT(TrieBranch::CheckIntegrity(t.GetCurrentPointer(), this));
  448. #endif
  449. return OpStatus::OK;
  // error exit of the second pass: release both node pointers and abort the pending changes
  450. delete_word_rollback:
  451. t.Reset();
  452. tmp.Reset();
  453. Abort();
  454. return err;
  455. }
  456. /*OP_STATUS ACT::ToNativeEndian(BlockStorage::ProgressCallback progress, void *user_value)
  457. {
  458. return m_storage.ToNativeEndian((BlockStorage::EndianCallback)&TrieNode::SwitchEndian,
  459. 10, progress, user_value);
  460. }*/
  461. void ACT::Abort(void)
  462. {
  463. ClearCache();
  464. if (m_storage.InTransaction())
  465. OpStatus::Ignore(m_storage.Rollback());
  466. RestoreStatus(); // Restores Random
  467. }
  468. OP_STATUS ACT::Commit(void)
  469. {
  470. OP_STATUS err, err2;
  471. err = Flush();
  472. if (!m_storage.InTransaction())
  473. return OpStatus::OK;
  474. if (m_flush_mode != BSCache::JournalOnly)
  475. {
  476. err2 = SaveStatus();
  477. err = OpStatus::IsError(err) ? err : err2;
  478. }
  479. err2 = m_storage.Commit();
  480. err = OpStatus::IsError(err) ? err : err2;
  481. return err;
  482. }
  483. ACT::WordID ACT::CaseSearch(const char *utf8_word)
  484. {
  485. const char *next_char;
  486. NodePointer t(this);
  487. SkipNonPrintableChars(utf8_word);
  488. if (*utf8_word == 0)
  489. return 0;
  490. m_NUR_mark = (m_NUR_mark + 1) & NUR_MASK;
  491. RETURN_VALUE_IF_ERROR(t.Reset(2), 0);
  492. while (*utf8_word != 0)
  493. {
  494. next_char = utf8_word + 1;
  495. SkipNonPrintableChars(next_char);
  496. RETURN_VALUE_IF_ERROR(t.Goto(utf8_word[0]), 0);
  497. if (t.IsFree() || t.GetParent() != t.GetCurrentParent())
  498. return 0;
  499. if (t.IsFinal())
  500. {
  501. if (m_TailCallback != NULL && !t.IsWord())
  502. return t.GetId();
  503. if (*next_char != 0)
  504. return 0;
  505. }
  506. utf8_word = next_char;
  507. }
  508. if (!t.IsWord())
  509. return 0;
  510. return t.GetId();
  511. }
  /**
   * Collects the words starting with utf8_prefix into result.
   *
   * @param result array of at least max_results pointers; all entries are reset to NULL first and the
   *        returned entries are allocated with OP_NEWA (or by the tail callback)
   * @param utf8_prefix prefix to search for
   * @param max_results capacity of result; values <= 0 yield 0
   * @return number of entries stored in result, 0 if nothing was found or on any error
   */
  512. int ACT::PrefixCaseWords(char **result, const char *utf8_prefix, int max_results)
  513. {
  514. NodePointer t(this);
  515. int rcount;
  516. int i;
  517. const char *prefix;
  518. const char *next_char;
  519. size_t size;
  520. RETURN_VALUE_IF_ERROR(t.Reset(2), 0);
  521. if (max_results <= 0)
  522. return 0;
  523. m_NUR_mark = (m_NUR_mark + 1) & NUR_MASK;
  524. for (i = 0; i < max_results; ++i)
  525. result[i] = NULL;
  // keep the unmodified prefix; utf8_prefix is advanced while walking the trie
  526. prefix = utf8_prefix;
  527. SkipNonPrintableChars(utf8_prefix);
  528. while (*utf8_prefix != 0)
  529. {
  530. next_char = utf8_prefix + 1;
  531. SkipNonPrintableChars(next_char);
  532. RETURN_VALUE_IF_ERROR(t.Goto(utf8_prefix[0]), 0);
  533. if (t.IsFree() || t.GetParent() != t.GetCurrentParent())
  534. return 0;
  // the path ends before the prefix does: only a tail entry (Final, not a word) can still match
  535. if (t.IsFinal() && *next_char != 0)
  536. {
  537. if (m_TailCallback == NULL || t.IsWord())
  538. return 0;
  539. else
  540. break;
  541. }
  542. utf8_prefix = next_char;
  543. }
  544. rcount = 0;
  // tail entry: the callback supplies the single result directly in result[0]
  545. if (m_TailCallback != NULL && !t.IsWord() && t.IsFinal())
  546. {
  547. RETURN_VALUE_IF_ERROR(m_TailCallback(result, t.GetId(), m_callback_val), 0);
  548. return 1;
  549. }
  // buffers are sized in steps of 32 characters plus the terminator
  550. size = prefix[0] == 0 ? 33 : 32 * (op_strlen(prefix) / 32 + 1) + 1;
  551. if ((result[0] = OP_NEWA(char, size)) == NULL)
  552. return 0;
  553. op_strcpy(result[0], prefix);
  554. ++rcount;
  555. if (t.IsFinal() && t.IsWord())
  556. return rcount;
  // NOTE(review): result[1] is written here without checking max_results, so a caller passing
  // max_results == 1 gets a write past the part of the array initialised above - confirm callers
  // always pass at least 2, or guard this.
  557. if (t.GetCurrentOffset() != 0 && t.IsWord())
  558. {
  559. size = 32 * (op_strlen(prefix) / 32 + 1) + 1;
  560. if ((result[1] = OP_NEWA(char, size)) == NULL)
  561. {
  562. OP_DELETEA(result[0]);
  563. result[0] = NULL;
  564. return 0;
  565. }
  566. op_strcpy(result[1], prefix);
  567. ++rcount;
  568. }
  // on failure every entry collected so far is released
  569. if (!t.IsFinal() && OpStatus::IsError(NodePointer::GetSubTree(result, &rcount, t, max_results)))
  570. {
  571. for (i = 0; i < max_results; ++i)
  572. if (result[i] != NULL)
  573. {
  574. OP_DELETEA(result[i]);
  575. result[i] = NULL;
  576. }
  577. return 0;
  578. }
  // rcount is at least 1 before GetSubTree; presumably GetSubTree can lower it when the prefix
  // entry in result[0] turns out not to be a result - TODO confirm against NodePointer::GetSubTree
  579. if (rcount == 0)
  580. {
  581. OP_DELETEA(result[0]);
  582. result[0] = NULL;
  583. }
  584. return rcount;
  585. }
  586. int ACT::PrefixCaseSearch(WordID *result, const char *utf8_prefix, int max_results)
  587. {
  588. NodePointer t(this);
  589. OP_STATUS err;
  590. int rcount;
  591. const char *next_char;
  592. RETURN_VALUE_IF_ERROR(t.Reset(2), 0);
  593. if (max_results <= 0)
  594. return 0;
  595. m_NUR_mark = (m_NUR_mark + 1) & NUR_MASK;
  596. SkipNonPrintableChars(utf8_prefix);
  597. while (*utf8_prefix != 0)
  598. {
  599. next_char = utf8_prefix + 1;
  600. SkipNonPrintableChars(next_char);
  601. if (OpStatus::IsError(err = t.Goto(utf8_prefix[0])))
  602. return 0;
  603. if (t.IsFree() || t.GetParent() != t.GetCurrentParent())
  604. return 0;
  605. if (t.IsFinal() && *next_char != 0)
  606. {
  607. if (m_TailCallback == NULL || t.IsWord())
  608. return 0;
  609. else {
  610. result[0] = t.GetId();
  611. return 1;
  612. }
  613. }
  614. utf8_prefix = next_char;
  615. }
  616. rcount = 0;
  617. if ((t.GetCurrentOffset() != 0 && t.IsWord()) || (m_TailCallback != NULL && t.IsFinal()))
  618. result[rcount++] = t.GetId();
  619. if (!t.IsFinal())
  620. RETURN_VALUE_IF_ERROR(NodePointer::GetSubTree(result, &rcount, t, max_results), 0);
  621. return rcount;
  622. }
  623. SearchIterator<ACT::PrefixResult> *ACT::PrefixCaseSearch(const char *utf8_prefix, BOOL single_word)
  624. {
  625. ACTPrefixIterator *it;
  626. if (single_word)
  627. {
  628. if ((it = OP_NEW(ACTSingleWordIterator, ())) == NULL)
  629. return NULL;
  630. }
  631. else
  632. {
  633. if ((it = OP_NEW(ACTPrefixIterator, ())) == NULL)
  634. return NULL;
  635. }
  636. if (OpStatus::IsError(it->Init(utf8_prefix, this)))
  637. {
  638. OP_DELETE(it);
  639. return NULL;
  640. }
  641. return it;
  642. }
  643. OP_BOOLEAN ACT::FindFirst(PrefixResult &res, const char *utf8_prefix)
  644. {
  645. NodePointer t(this);
  646. OP_STATUS err;
  647. const char *next_char;
  648. size_t size;
  649. RETURN_IF_ERROR(t.Reset(2));
  650. if ((int)t.GetCurrentPointer()->NumFilled() == 0)
  651. return OpBoolean::IS_FALSE;
  652. m_NUR_mark = (m_NUR_mark + 1) & NUR_MASK;
  653. OP_DELETEA(res.utf8_word);
  654. size = 32 * (op_strlen(utf8_prefix) / 32 + 1) + 1;
  655. RETURN_OOM_IF_NULL(res.utf8_word = OP_NEWA(char, size));
  656. op_strcpy(res.utf8_word, utf8_prefix);
  657. SkipNonPrintableChars(utf8_prefix);
  658. while (*utf8_prefix != 0)
  659. {
  660. next_char = utf8_prefix + 1;
  661. SkipNonPrintableChars(next_char);
  662. if (OpStatus::IsError(err = t.Goto(utf8_prefix[0])))
  663. {
  664. if (err == OpStatus::ERR_OUT_OF_RANGE)
  665. return OpBoolean::IS_FALSE;
  666. else return err;
  667. }
  668. if (t.IsFree() || t.GetParent() != t.GetCurrentParent())
  669. return OpBoolean::IS_FALSE;
  670. if (t.IsFinal() && *next_char != 0)
  671. {
  672. if (m_TailCallback == NULL || t.IsWord())
  673. return OpBoolean::IS_FALSE;
  674. else {
  675. res.id = t.GetId();
  676. return OpBoolean::IS_TRUE;
  677. }
  678. }
  679. utf8_prefix = next_char;
  680. }
  681. if (t.GetCurrentOffset() != 0 && t.IsWord())
  682. {
  683. res.id = t.GetId();
  684. return OpBoolean::IS_TRUE;
  685. }
  686. RETURN_IF_ERROR(NodePointer::GetFirstEntry(res, t));
  687. return res.id == 0 ? OpBoolean::IS_FALSE : OpBoolean::IS_TRUE;
  688. }
  /**
   * Finds the entry that follows the one currently held in res among the entries matching utf8_prefix.
   *
   * @param res in: the previous result; out: the next result. Its utf8_word buffer is replaced by a
   *        newly allocated one.
   * @param utf8_prefix the prefix the iteration was started with
   * @return OpBoolean::IS_TRUE if another entry was found, OpBoolean::IS_FALSE if not (or if res holds no
   *         word), OpStatus::ERR if the prefix path no longer exists in the trie, or another error code
   */
  689. OP_BOOLEAN ACT::FindNext(PrefixResult &res, const char *utf8_prefix)
  690. {
  691. char *prev;
  692. char *match;
  693. int match_len;
  694. OP_BOOLEAN rv;
  695. NodePointer t(this);
  696. const char *next_char;
  697. size_t size;
  698. if ((prev = res.utf8_word) == NULL)
  699. return OpBoolean::IS_FALSE;
  // reduce prev, in place, to the part of the previous word that follows the prefix
  700. if ((match_len = ACT::WordsEqual(res.utf8_word, utf8_prefix)) != -1)
  701. {
  702. match = res.utf8_word;
  // step over match_len characters, counting only those above FIRST_CHAR
  703. while (match_len > 0 && *match != 0)
  704. {
  705. if ((unsigned char)*match > FIRST_CHAR)
  706. --match_len;
  707. ++match;
  708. }
  // move the remainder to the start of the buffer
  709. match_len = 0;
  710. while (*match != 0)
  711. prev[match_len++] = *(match++);
  712. prev[match_len] = 0;
  713. }
  714. else
  // the previous word equals the prefix: nothing follows it
  715. prev[0] = 0;
  // prev_a takes over the old buffer (presumably releasing it when this function returns -
  // see OpAutoArray); res.utf8_word gets a fresh buffer below
  716. OpAutoArray<char> prev_a(prev);
  717. size = 32 * (op_strlen(utf8_prefix) / 32 + 1) + 1;
  718. RETURN_OOM_IF_NULL(res.utf8_word = OP_NEWA(char, size));
  719. op_strcpy(res.utf8_word, utf8_prefix);
  720. RETURN_IF_ERROR(t.Reset(2));
  721. m_NUR_mark = (m_NUR_mark + 1) & NUR_MASK;
  // walk down to the node addressed by the prefix
  722. SkipNonPrintableChars(utf8_prefix);
  723. while (*utf8_prefix != 0)
  724. {
  725. next_char = utf8_prefix + 1;
  726. SkipNonPrintableChars(next_char);
  727. RETURN_IF_ERROR(t.Goto(utf8_prefix[0]));
  728. if (t.IsFree() || t.GetParent() != t.GetCurrentParent())
  729. return OpStatus::ERR;
  // a Final node on the prefix path has nothing below it, so there is no further result
  730. if (t.IsFinal())
  731. return OpBoolean::IS_FALSE;
  732. utf8_prefix = next_char;
  733. }
  734. // t is not at TrieNode::Final here
  // the previous result was the prefix itself: continue with the first entry below the prefix node
  735. if (t.GetCurrentOffset() != 0 && t.IsWord() && *prev == 0)
  736. {
  737. if (OpStatus::IsSuccess(rv = NodePointer::GetFirstEntry(res, t)))
  738. rv = res.id == 0 ? OpBoolean::IS_FALSE : OpBoolean::IS_TRUE;
  739. }
  740. else
  // otherwise continue after the remembered remainder of the previous word
  741. rv = NodePointer::GetNextEntry(res, t, prev);
  742. return rv;
  743. }
  744. #define R1 13
  745. #define R2 9
  746. static inline UINT32 ACT_ROTATE(UINT32 value, int shift)
  747. {
  748. return (value << shift) | (value >> (32 - shift));
  749. }
  750. int ACT::Random(void)
  751. {
  752. UINT32 rv;
  753. UINT32 &i1 = random_status[RANDOM_STATUS_SIZE];
  754. UINT32 &i2 = random_status[RANDOM_STATUS_SIZE + 1];
  755. rv = random_status[i1] = ACT_ROTATE(random_status[i2], R1) + ACT_ROTATE(random_status[i1], R2);
  756. if (i1 == 0) i1 = RANDOM_STATUS_SIZE; --i1;
  757. if (i2 == 0) i2 = RANDOM_STATUS_SIZE; --i2;
  758. return (int)rv & 0x7FFFFFFF;
  759. }
  760. void ACT::InitRandom(void)
  761. {
  762. int i;
  763. random_status[0] = 1655855145U;
  764. for (i = 1; i < RANDOM_STATUS_SIZE; ++i)
  765. random_status[i] = random_status[i - 1] * 2891336453U + 1;
  766. random_status[RANDOM_STATUS_SIZE] = 0;
  767. random_status[RANDOM_STATUS_SIZE + 1] = 10;
  768. }
  769. OP_STATUS ACT::SaveStatus(void)
  770. {
  771. return m_storage.WriteUserHeader(random_status, 4, 4, RANDOM_STATUS_SIZE+2) ? OpStatus::OK : OpStatus::ERR;
  772. }
  773. void ACT::RestoreStatus(void)
  774. {
  775. if (!m_storage.ReadUserHeader(random_status, 4, 4, RANDOM_STATUS_SIZE+2))
  776. InitRandom();
  777. if (random_status[RANDOM_STATUS_SIZE] == random_status[RANDOM_STATUS_SIZE + 1] ||
  778. random_status[RANDOM_STATUS_SIZE] >= RANDOM_STATUS_SIZE ||
  779. random_status[RANDOM_STATUS_SIZE + 1] >= RANDOM_STATUS_SIZE ||
  780. (random_status[random_status[RANDOM_STATUS_SIZE]] == 0 &&
  781. random_status[random_status[RANDOM_STATUS_SIZE + 1]] == 0))
  782. InitRandom();
  783. }
  784. BSCache::Item *ACT::NewMemoryItem(int id, Item *rbranch, int rnode, unsigned short nur)
  785. {
  786. return OP_NEW(TrieBranch, (id, (TrieBranch *)rbranch, rnode, nur));
  787. }
  788. BSCache::Item *ACT::NewDiskItem(OpFileLength id, unsigned short nur)
  789. {
  790. return OP_NEW(TrieBranch, (id, nur));
  791. }
  792. class DBlockReader : public BlockStorage
  793. {
  794. public:
  795. CHECK_RESULT(OP_STATUS ReadPtr(OpFileLength pos, OpFileLength *ptr))
  796. {
  797. *ptr = 0;
  798. RETURN_IF_ERROR(m_file->SetFilePos(pos));
  799. return ReadOFL(m_file, ptr);
  800. }
  801. CHECK_RESULT(OP_STATUS ReadDeletedBitField(void *bf, OpFileLength pos))
  802. {
  803. RETURN_IF_ERROR(m_file->SetFilePos(pos));
  804. return BlockStorage::ReadFully(m_file, bf, m_blocksize);
  805. }
  806. };
  807. #ifdef _DEBUG
  808. int ACT::GetFillFactor(int *f_average, int *f_min, int *f_max, int *empty, int branch_type)
  809. {
  810. TrieBranch b(0, 0);
  811. int j;
  812. int fill_nodes, branches_total;
  813. int num_children;
  814. OpFileLength i, fsize, bitfield_pos;
  815. int block_size;
  816. unsigned char *deleted;
  817. block_size = m_storage.GetBlockSize();
  818. fsize = m_storage.GetFileSize();
  819. *f_average = 0;
  820. *f_min = TRIE_SIZE;
  821. *f_max = 0;
  822. if (empty != NULL)
  823. *empty = 0;
  824. if ((deleted = OP_NEWA(unsigned char, m_storage.GetBlockSize())) == NULL)
  825. return 0;
  826. branches_total = 0;
  827. bitfield_pos = block_size;
  828. for (i = block_size; i < fsize; i += block_size)
  829. {
  830. if ((i - block_size) % (8 * block_size * block_size + block_size) == 0)
  831. {
  832. bitfield_pos = i;
  833. if (OpStatus::IsError(((DBlockReader *)&m_storage)->ReadDeletedBitField(deleted, i)))
  834. break;
  835. continue;
  836. }
  837. if (deleted[((i - bitfield_pos) / block_size - 1) >> 3] & (1 << ((((i - bitfield_pos) / block_size - 1)) & 7)))
  838. {
  839. if (empty != NULL)
  840. ++(*empty);
  841. continue;
  842. }
  843. b.disk_id = (BSCache::Item::DiskId)(i/block_size);
  844. if (OpStatus::IsError(b.Read(&m_storage)))
  845. break;
  846. fill_nodes = 0;
  847. num_children = 0;
  848. for (j = 1; j < TRIE_SIZE; ++j)
  849. {
  850. if (!b.IsFree(j))
  851. ++fill_nodes;
  852. if (b.HasChild(j))
  853. ++num_children;
  854. }
  855. OP_ASSERT(fill_nodes == (int)b.NumFilled());
  856. OP_ASSERT(fill_nodes > 0 || i == (OpFileLength)(2 * block_size));
  857. OP_ASSERT(num_children <= fill_nodes);
  858. if (branch_type == 0 || // all
  859. (branch_type == 1 && num_children != 0) || // parents
  860. (branch_type == 2 && num_children == 0)) // !parents
  861. {
  862. *f_average += fill_nodes;
  863. if (fill_nodes < *f_min)
  864. *f_min = fill_nodes;
  865. if (fill_nodes > *f_max)
  866. *f_max = fill_nodes;
  867. ++branches_total;
  868. }
  869. }
  870. OP_DELETEA(deleted);
  871. // *f_average = *f_average * 100 / (branches_created * (TRIE_SIZE - 1));
  872. if (branches_total != 0)
  873. *f_average = *f_average * 100 / (branches_total * (TRIE_SIZE - 1));
  874. else *f_average = 0;
  875. if (empty != NULL)
  876. *empty = *empty * 100 / branches_total;
  877. *f_min = *f_min * 100 / (TRIE_SIZE - 1);
  878. *f_max = *f_max * 100 / (TRIE_SIZE - 1);
  879. return branches_total;
  880. }
  /**
   * Debug statistics: recursively walks the branches below disk_id and accumulates fill figures per
   * depth level.
   *
   * @param levels output array; one entry per level, advanced by one for every recursion step
   * @param counts optional output array (may be NULL) with the number of branches seen per level
   * @param max_level number of levels the arrays can still take
   * @param total optional counter (may be NULL) of all branches visited
   * @param disk_id branch to start from; the value 2 marks the top-level call, which also resets the
   *        output arrays
   * @return number of child branches directly below disk_id, 0 on a read error
   */
  881. int ACT::GetFillDistribution(int *levels, int *counts, int max_level, int *total, OpFileLength disk_id)
  882. {
  883. TrieBranch b(0, 0), sub(0, 0);
  884. int i, node_count;
  885. int own_branches, branches;
  // top-level call: initialise the outputs and record the starting branch as level 0
  886. if (disk_id == 2)
  887. {
  888. if (total != NULL)
  889. *total = 1;
  890. if (max_level <= 1)
  891. return 0;
  892. for (i = 0; i < max_level; ++i)
  893. {
  894. levels[i] = 0;
  895. if (counts != NULL)
  896. counts[i] = 0;
  897. }
  898. b.disk_id = (BSCache::Item::DiskId)disk_id;
  899. RETURN_VALUE_IF_ERROR(b.Read(&m_storage), 0);
  // fill of the starting branch in per mille
  900. *levels = (int)b.NumFilled() * 1000 / (TRIE_SIZE - 1);
  901. if (counts != NULL)
  902. *counts = 1;
  // from here on levels/counts refer to the level of the children
  903. ++levels;
  904. if (counts != NULL)
  905. ++counts;
  906. --max_level;
  907. node_count = 0;
  908. for (i = 1; i < TRIE_SIZE; ++i)
  909. if (!b.IsFree(i))
  910. ++node_count;
  911. OP_ASSERT((int)b.NumFilled() == node_count);
  912. }
  913. branches = 0;
  914. own_branches = 0;
  915. b.disk_id = (BSCache::Item::DiskId)disk_id;
  916. RETURN_VALUE_IF_ERROR(b.Read(&m_storage), 0);
  917. node_count = 0;
  918. for (i = 1; i < TRIE_SIZE; ++i)
  919. {
  920. if (!b.IsFree(i))
  921. ++node_count;
  // every node with a child branch on disk: add the child's fill to this level and descend
  922. if (b.HasDiskChild(i))
  923. {
  924. if (total != NULL)
  925. ++(*total);
  926. sub.disk_id = b.GetDiskChild(i);
  927. RETURN_VALUE_IF_ERROR(sub.Read(&m_storage), 0);
  928. OP_ASSERT(sub.NumFilled() > 0);
  929. *levels += (int)sub.NumFilled();
  930. if (counts != NULL)
  931. ++(*counts);
  932. ++own_branches;
  933. if (max_level > 0)
  934. branches += GetFillDistribution(levels + 1, counts == NULL ? NULL : counts + 1, max_level - 1, total, (OpFileLength)(b.GetDiskChild(i)));
  935. }
  936. }
  937. OP_ASSERT((int)b.NumFilled() == node_count);
  // turn the accumulated node count of the next level into per mille of its capacity
  938. if (max_level > 0 && branches > 0)
  939. {
  940. levels[1] = levels[1] * 1000 / ((TRIE_SIZE - 1) * branches);
  941. // if (counts != NULL)
  942. // counts[1] = branches;
  943. }
  // the same conversion for the first level below the start, done once by the top-level call
  944. if (disk_id == 2 && own_branches > 0)
  945. {
  946. *levels = *levels * 1000 / ((TRIE_SIZE - 1) * own_branches);
  947. // if (counts != NULL)
  948. // *counts = own_branches;
  949. }
  950. return own_branches;
  951. }
  952. #endif // _DEBUG
  953. OP_BOOLEAN ACT::CheckConsistency(void)
  954. {
  955. TrieBranch branch(0, 0);
  956. int fill_nodes, i;
  957. char *block, *deleted, *refs;
  958. OpAutoArray<char> block_a, deleted_a, refs_a;
  959. OpFileLength fsize, pos;
  960. int block_size, block_len;
  961. UINT32 ptr, num_blocks, b, b2;
  962. block_size = m_storage.GetBlockSize();
  963. fsize = m_storage.GetFileSize();
  964. num_blocks = (UINT32)(fsize/block_size);
  965. if (fsize < (OpFileLength)3 * block_size)
  966. return OpBoolean::IS_TRUE;
  967. block_a .reset(block = OP_NEWA(char, block_size));
  968. deleted_a.reset(deleted = OP_NEWA(char, (num_blocks + 7) / 8));
  969. refs_a .reset(refs = OP_NEWA(char, (num_blocks + 7) / 8));
  970. RETURN_OOM_IF_NULL(block);
  971. RETURN_OOM_IF_NULL(deleted);
  972. RETURN_OOM_IF_NULL(refs);
  973. op_memset(deleted, 0, (num_blocks + 7) / 8);
  974. op_memset(refs, 0, (num_blocks + 7) / 8);
  975. RETURN_IF_ERROR(((DBlockReader *)&m_storage)->ReadPtr(0, &pos));
  976. if (pos >= fsize)
  977. return OpBoolean::IS_FALSE; // corrupted file
  978. for (b = 1; b < num_blocks; b += 8 * block_size + 1)
  979. {
  980. pos = (OpFileLength)b * block_size;
  981. RETURN_IF_ERROR(((DBlockReader *)&m_storage)->ReadDeletedBitField(block, pos));
  982. for (b2 = b+1; b2 < b+1+8*block_size && b2 < num_blocks; ++b2)
  983. if (block[(b2 - b - 1) / 8] & (1 << ((b2 - b - 1) % 8)))
  984. deleted[b2 / 8] |= (1 << (b2 % 8));
  985. deleted[b / 8] |= (1 << (b % 8));
  986. }
  987. op_memcpy(refs, deleted, (num_blocks + 7) / 8);
  988. for (b = 2; b < num_blocks; ++b)
  989. {
  990. pos = (OpFileLength)b * block_size;
  991. if (deleted[b / 8] & (1 << (b % 8)))
  992. {
  993. if (m_storage.IsStartBlock(pos))
  994. return OpBoolean::IS_FALSE;
  995. continue;
  996. }
  997. if (m_storage.IsStartBlocksSupported() && !m_storage.IsStartBlock(pos))
  998. return OpBoolean::IS_FALSE;
  999. block_len = m_storage.DataLength(pos);
  1000. //if (block_len == 0 && m_head != NULL)
  1001. // continue; // empty branch might be tolerated for reserved pages during a transaction
  1002. if (block_len != branch.GetPackedSize())
  1003. return OpBoolean::IS_FALSE;
  1004. branch.disk_id = b;
  1005. RETURN_IF_ERROR(branch.Read(&m_storage));
  1006. if (branch.IsWord(0) ||
  1007. branch.IsFinal(0) ||
  1008. branch.GetParent(0) != 0 ||
  1009. branch.GetOffset(0) != 0)
  1010. return OpBoolean::IS_FALSE;
  1011. fill_nodes = 0;
  1012. for (i = 1; i < TRIE_SIZE; ++i)
  1013. {
  1014. if (!branch.IsFree(i))
  1015. {
  1016. ++fill_nodes;
  1017. if (branch.HasMemoryChild(i))
  1018. {
  1019. return OpBoolean::IS_FALSE;
  1020. }
  1021. else if (branch.HasDiskChild(i))
  1022. {
  1023. if (branch.IsFinal(i))
  1024. return OpBoolean::IS_FALSE;
  1025. ptr = branch.GetDiskChild(i);
  1026. if ( ptr == 0 // ptr_error
  1027. || (ptr >= num_blocks || (ptr - 1) % (8 * block_size + 1) == 0 || ptr == b) // ptr_bad_error
  1028. || (deleted[ptr / 8] & (1 << (ptr % 8))) != 0 // ptr_deleted_error
  1029. || (refs[ptr / 8] & (1 << (ptr % 8))) != 0) // ref_error
  1030. return OpBoolean::IS_FALSE;
  1031. refs[ptr / 8] |= (1 << (ptr % 8));
  1032. }
  1033. else if (!branch.IsFinal(i))
  1034. {
  1035. if (branch.GetOffset(i) + 255 - FIRST_CHAR <= 0 ||
  1036. branch.GetOffset(i) >= TRIE_SIZE)
  1037. return OpBoolean::IS_FALSE;
  1038. }
  1039. if (!branch.IsWord(i) && branch.IsFinal(i) && m_TailCallback == NULL)
  1040. return FALSE;
  1041. if (branch.GetParent(i) == 0)
  1042. {
  1043. if (i > 256 - FIRST_CHAR)
  1044. return OpBoolean::IS_FALSE;
  1045. }
  1046. else
  1047. {
  1048. if (branch.HasChild(branch.GetParent(i)))
  1049. return OpBoolean::IS_FALSE;
  1050. if (branch.IsFinal(branch.GetParent(i)))
  1051. return OpBoolean::IS_FALSE;
  1052. }
  1053. }
  1054. }
  1055. if (fill_nodes != (int)branch.NumFilled())
  1056. return OpBoolean::IS_FALSE;
  1057. }
  1058. // Check that all branches are referenced, deleted or a deleted-bit-field
  1059. for (b = 3; b < num_blocks; ++b)
  1060. if ((refs[b / 8] & (1 << (b % 8))) == 0)
  1061. return OpBoolean::IS_FALSE;
  1062. return OpBoolean::IS_TRUE;
  1063. }
  1064. #endif // SEARCH_ENGINE