Opera 12.15 Source Code
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

RankIndex.h 4.8KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166
  1. /* -*- Mode: c++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*-
  2. **
  3. ** Copyright (C) 1995-2011 Opera Software ASA. All rights reserved.
  4. **
  5. ** This file is part of the Opera web browser.
  6. ** It may not be distributed under any circumstances.
  7. */
  8. #ifndef RANKINDEX_H
  9. #define RANKINDEX_H
  10. #include "modules/search_engine/Cursor.h"
  11. #include "modules/search_engine/BlockStorage.h"
  12. #include "modules/search_engine/ACT.h"
  13. #include "modules/search_engine/SingleBTree.h"
  14. #if defined SEARCH_ENGINE_LOG && (SEARCH_ENGINE_LOG & SEARCH_ENGINE_LOG_VISITEDSEARCH)
  15. #include "modules/search_engine/log/Log.h"
  16. #endif
  17. //#define RANKID_RANK 0
  18. #define RANKID_ID 1
  19. struct RankId
  20. {
  21. // float rank;
  22. // UINT32 id;
  23. union {
  24. float rank;
  25. UINT32 data[2]; // pragma pack is not recognized on a lot of platforms
  26. };
  27. RankId(void) {rank = 0.0; data[RANKID_ID] = 0;}
  28. RankId(float r) {rank = r; data[RANKID_ID] = 0;}
  29. RankId(UINT32 i) {rank = 0.0; data[RANKID_ID] = i;}
  30. RankId(float r, UINT32 i) {rank = r; data[RANKID_ID] = i;}
  31. /* BOOL operator<(const RankId &right) const
  32. {
  33. return (double)this->id + this->rank < (double)right.id + right.rank; // rank is in open interval (0,1)
  34. }*/
  35. static BOOL CompareRank(const void *left, const void *right)
  36. {
  37. float s, a;
  38. s = ((RankId *)left)->rank - ((RankId *)right)->rank;
  39. a = (((RankId *)left)->rank + ((RankId *)right)->rank) / 100000.0F;
  40. if (s < a && s > -a)
  41. return ((RankId *)left)->data[RANKID_ID] < ((RankId *)right)->data[RANKID_ID];
  42. return s < 0.0;
  43. }
  44. static BOOL CompareId(const void *left, const void *right)
  45. {
  46. return ((RankId *)left)->data[RANKID_ID] < ((RankId *)right)->data[RANKID_ID]; // id is unique
  47. }
  48. };
  49. #define IDTIME_VISITED 0
  50. #define IDTIME_ID 1
  51. struct IdTime
  52. {
  53. // UINT32 visited;
  54. // UINT32 id;
  55. UINT32 data[2]; // pragma pack is not recognized on a lot of platforms
  56. IdTime(void) {data[IDTIME_VISITED] = 0; data[IDTIME_ID] = 0;}
  57. IdTime(time_t visited, UINT32 id) {data[IDTIME_VISITED] = (UINT32)visited; data[IDTIME_ID] = id;}
  58. BOOL operator<(const IdTime &right) const
  59. {
  60. if (this->data[IDTIME_VISITED] == right.data[IDTIME_VISITED])
  61. return this->data[IDTIME_ID] > right.data[IDTIME_ID];
  62. return this->data[IDTIME_VISITED] > right.data[IDTIME_VISITED]; // last time first
  63. }
  64. };
  65. /**
  66. * On initialization, the vps folder is scanned for all subdirectories, which are sorted by each directory's last-modification date (obtained using fstat).
  67. * The oldest directory by date is deleted first; directories 0-9 are always used, but there can be fewer or more.
  68. */
  69. struct RankIndex : public NonCopyable
  70. {
  71. unsigned short m_id; //name of directory where files are stored.
  72. ACT m_act;
  73. BlockStorage m_wordbag; //unsorted vector of document IDs and rank.
  74. BlockStorage m_metadata; //contains full plain text, title, url, time, links to previous occurence of url if previously indexed, hash of text.
  75. SingleBTree<IdTime> m_alldoc; //list of all url documents indexed in this directory. Allows for searching by date.
  76. //Also used for stop words. These are identified by whenever a word is in half the documents and number of documents is > 500.
  77. ACT m_url;
  78. unsigned m_doc_count;
  79. RankIndex() : m_id(0), m_doc_count(0)
  80. {
  81. m_act.GroupWith(m_wordbag);
  82. m_act.GroupWith(m_metadata);
  83. m_act.GroupWith(m_alldoc);
  84. m_act.GroupWith(m_url);
  85. }
  86. ~RankIndex(void)
  87. {
  88. Close();
  89. }
  90. CHECK_RESULT(OP_STATUS Open(const uni_char *path, unsigned short id));
  91. void Close(void);
  92. /**
  93. * @return actual size of the files on disk, any buffered data is not counted in
  94. */
  95. OpFileLength Size();
  96. /**
  97. * @return last modification time of the RabkIndex's directory or -1 on error
  98. */
  99. time_t ModifTime(void);
  100. /**
  101. * close and delete all the files; RankIndex remains opened on error
  102. */
  103. CHECK_RESULT(OP_STATUS Clear(void));
  104. /**
  105. * abort the pending transaction
  106. */
  107. CHECK_RESULT(OP_STATUS Rollback(void));
  108. /**
  109. * setup a cursor for the metadata file previously described at m_metadata.
  110. */
  111. CHECK_RESULT(static OP_STATUS SetupCursor(BSCursor &cursor));
  112. /**
  113. * callback for tail-compressed URLs
  114. * Tail compressed urls are used for deleting searched urls.
  115. */
  116. CHECK_RESULT(static OP_STATUS GetTail(char **stored_value, ACT::WordID id, void *usr_val));
  117. /**
  118. * to find an id in Vector
  119. */
  120. static BOOL CompareId(const void *left, const void *right)
  121. {
  122. return (*(RankIndex **)left)->m_id < (*(RankIndex **)right)->m_id;
  123. }
  124. #if defined SEARCH_ENGINE_LOG && (SEARCH_ENGINE_LOG & SEARCH_ENGINE_LOG_VISITEDSEARCH)
  125. CHECK_RESULT(static OP_STATUS LogFile(OutputLogDevice *log, const uni_char *path, unsigned short id, const uni_char *fname, const uni_char *suffix = NULL));
  126. CHECK_RESULT(static OP_STATUS LogSubDir(OutputLogDevice *log, const uni_char *path, unsigned short id));
  127. #endif
  128. };
  129. #define FNAME_ACT "w.axx" /* NOTE(review): presumably the file name backing RankIndex::m_act -- confirm in RankIndex::Open */
  130. #define FNAME_WB "wb.vx" /* NOTE(review): presumably the file name backing RankIndex::m_wordbag -- confirm */
  131. #define FNAME_META "md.dat" /* NOTE(review): presumably the file name backing RankIndex::m_metadata -- confirm */
  132. #define FNAME_BTREE "adoc.bx" /* NOTE(review): presumably the file name backing RankIndex::m_alldoc -- confirm */
  133. #define FNAME_URL "url.axx" /* NOTE(review): presumably the file name backing RankIndex::m_url -- confirm */
  134. #endif // RANKINDEX_H