Opera 12.15 Source Code
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

UniCompressor.cpp 5.8KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310
  1. /* -*- Mode: c++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*-
  2. **
  3. ** Copyright (C) 1995-2011 Opera Software ASA. All rights reserved.
  4. **
  5. ** This file is part of the Opera web browser.
  6. ** It may not be distributed under any circumstances.
  7. */
  8. #include "core/pch.h"
  9. #ifdef SEARCH_ENGINE
  10. #include "modules/search_engine/UniCompressor.h"
  11. #define DICT_SIZE 0x4000
  12. #define DICT_MASK 0x3FFF
  13. // Modified Bernstein hash (experimentally as good as FNV-1a hash)
  14. #define hash(cp) (((((((cp[0] * 0x21) ^ cp[1]) * 0x21) ^ cp[2]) * 0x21) ^ cp[3]) & DICT_MASK)
  15. OP_STATUS UniCompressor::InitCompDict(void)
  16. {
  17. OP_ASSERT(m_dict == NULL); // it is not necessary to init the dictionary several times
  18. FreeCompDict();
  19. RETURN_OOM_IF_NULL(m_dict = OP_NEWA(UINT32, DICT_SIZE));
  20. #ifdef VALGRIND
  21. // Mark 'm_dict' as defined, even though it is not, since the
  22. // algorithm is not sensitive to the initial values for correct
  23. // operation.
  24. op_valgrind_set_defined(m_dict, DICT_SIZE*sizeof(UINT32));
  25. #endif
  26. return OpStatus::OK;
  27. }
  28. unsigned UniCompressor::Compress(unsigned char *dst, const uni_char *src)
  29. {
  30. const uni_char *sp, *cp, *end, *match_end;
  31. unsigned char *op;
  32. int i, di;
  33. const uni_char *dptr;
  34. unsigned len = (unsigned)uni_strlen(src);
  35. if (len < 8)
  36. {
  37. dst[0] = len;
  38. dst[1] = 0;
  39. dst[2] = 0;
  40. dst[3] = 0;
  41. if (len == 0)
  42. return 4;
  43. return (unsigned)(OutputLiteral(dst + 4, src, src + len) - dst);
  44. }
  45. sp = src;
  46. cp = src;
  47. end = cp + len;
  48. match_end = end - 4;
  49. op = dst + 4;
  50. while (cp < match_end)
  51. {
  52. di = hash(cp);
  53. dptr = src+m_dict[di];
  54. m_dict[di] = (UINT32)(cp-src);
  55. if (dptr >= src && dptr < cp && cp - dptr <= 0xFFFF &&
  56. dptr[0] == cp[0] && dptr[1] == cp[1] && dptr[2] == cp[2] && dptr[3] == cp[3])
  57. {
  58. // match found
  59. if (cp > sp)
  60. op = OutputLiteral(op, sp, cp);
  61. sp = cp;
  62. cp += 4;
  63. i = 4;
  64. while (*cp == dptr[i])
  65. {
  66. ++cp;
  67. ++i;
  68. }
  69. op = OutputMatch(op, (unsigned)(cp - sp), (unsigned short)(sp - dptr));
  70. sp++;
  71. while (sp < cp && sp < match_end)
  72. {
  73. m_dict[hash(sp)] = (UINT32)(sp-src);
  74. ++sp;
  75. }
  76. sp = cp;
  77. }
  78. else {
  79. ++cp;
  80. }
  81. }
  82. cp = end;
  83. if (cp > sp)
  84. op = OutputLiteral(op, sp, cp);
  85. i = (int)(cp - src);
  86. dst[0] = i & 0xFF;
  87. dst[1] = (i >> 8) & 0xFF;
  88. dst[2] = (i >> 16) & 0xFF;
  89. dst[3] = (i >> 24) & 0xFF;
  90. return (unsigned)(op - dst);
  91. }
  92. unsigned UniCompressor::Decompress(uni_char *dst, const unsigned char *src, unsigned len)
  93. {
  94. const unsigned char *cp;
  95. uni_char *op;
  96. register uni_char *shiftp;
  97. int c;
  98. int lit_len, max_len;
  99. int shift;
  100. if (!dst || !src || len < 4)
  101. return 0;
  102. max_len = (int)Length(src);
  103. cp = src + 4;
  104. op = dst;
  105. src += len;
  106. while (cp < src && op - dst < max_len)
  107. {
  108. if ((*cp & 0x40) == 0) // literal
  109. {
  110. // length
  111. lit_len = *cp & 0x3F;
  112. shift = 6;
  113. while ((*cp++ & 0x80) != 0)
  114. {
  115. if (shift > 30 || cp >= src)
  116. return 0;
  117. lit_len |= (*cp & 0x7F) << shift;
  118. shift += 7;
  119. }
  120. if (lit_len == 0 ||
  121. op - dst + lit_len > max_len ||
  122. cp + 2 > src)
  123. return 0;
  124. // first char
  125. *op = *cp++;
  126. *op++ |= ((uni_char)(*cp++)) << 8;
  127. // differences
  128. while (--lit_len > 0)
  129. {
  130. if (cp >= src)
  131. return 0;
  132. c = *cp & 0x7F;
  133. if ((*cp++ & 0x80) != 0)
  134. {
  135. if (cp >= src)
  136. return 0;
  137. c |= ((uni_char)*cp & 0x7F) << 7;
  138. if ((*cp++ & 0x80) != 0)
  139. {
  140. if (cp >= src)
  141. return 0;
  142. c |= *cp++ << 14;
  143. if ((c & 0x10000) != 0)
  144. c |= 0xFFFF0000;
  145. }
  146. else if ((c & 0x2000) != 0)
  147. c |= 0xFFFFC000;
  148. }
  149. else if (c & 0x40)
  150. c |= 0xFFFFFF80;
  151. *op = op[-1] - c;
  152. ++op;
  153. }
  154. }
  155. else { // match
  156. // length
  157. lit_len = *cp & 0x1F;
  158. c = (*cp & 0x20) == 0;
  159. shift = 5;
  160. while ((*cp++ & 0x80) != 0)
  161. {
  162. if (shift > 30 || cp >= src)
  163. return 0;
  164. lit_len |= (*cp & 0x7F) << shift;
  165. shift += 7;
  166. }
  167. lit_len += 2;
  168. if (cp >= src || (c && cp+1 >= src))
  169. return 0;
  170. shift = *cp++;
  171. if (c)
  172. shift |= ((int)*cp++) << 8;
  173. shiftp = op - shift;
  174. if (shiftp < dst || shiftp >= op || op - dst + lit_len > max_len)
  175. return 0;
  176. while (lit_len-- > 0)
  177. *op++ = *shiftp++;
  178. }
  179. }
  180. *op = 0;
  181. return (unsigned)(op - dst);
  182. }
  183. unsigned UniCompressor::Length(const unsigned char *src)
  184. {
  185. return (unsigned)(src[0] | ((int)src[1]) << 8 | ((int)src[2]) << 16 | ((int)src[3]) << 24);
  186. }
  187. unsigned char *UniCompressor::OutputLiteral(unsigned char *op, const uni_char *sp, const uni_char *cp)
  188. {
  189. register INT32 prev, diff;
  190. int length;
  191. OP_ASSERT(cp - sp > 0);
  192. length = (int)(cp - sp);
  193. *(op++) = (unsigned char)(length & 0x3F) | ((length > 0x3F) << 7);
  194. length >>= 6;
  195. while (length > 0)
  196. {
  197. *(op++) = (unsigned char)(length & 0x7F) | ((length > 0x7F) << 7);
  198. length >>= 7;
  199. }
  200. prev = *sp++;
  201. *op++ = prev & 0xFF;
  202. *op++ = prev >> 8;
  203. while (sp < cp)
  204. {
  205. diff = prev - *sp;
  206. if ((diff & 0xFFFFFFC0) == 0 || (diff | 0x3F) == -1)
  207. *op++ = (unsigned char)(diff & 0x7F);
  208. else {
  209. *op++ = (unsigned char)((diff & 0x7F) | 0x80);
  210. diff >>= 7;
  211. if ((diff & 0xFFFFFFC0) == 0 || (diff | 0x3F) == -1)
  212. *op++ = (unsigned char)(diff & 0x7F);
  213. else {
  214. *op++ = (unsigned char)((diff & 0x7F) | 0x80);
  215. *op++ = diff >> 7;
  216. }
  217. }
  218. prev = *sp++;
  219. }
  220. return op;
  221. }
  222. unsigned char *UniCompressor::OutputMatch(unsigned char *op, unsigned length, unsigned short offset)
  223. {
  224. length -= 2; // Must still be 2 (not 4) for backwards compatibility (compression impact: 0.0016%)
  225. *(op++) = (unsigned char)(length & 0x1F) | ((length > 0x1F) << 7) | 0x40 | ((offset <= 0xFF) << 5);
  226. length >>= 5;
  227. while (length > 0)
  228. {
  229. *(op++) = (unsigned char)(length & 0x7F) | ((length > 0x7F) << 7);
  230. length >>= 7;
  231. }
  232. if (offset <= 0xFF)
  233. *op++ = (unsigned char)offset;
  234. else
  235. {
  236. *op++ = (unsigned char)(offset & 0xFF);
  237. *op++ = (unsigned char)(offset >> 8);
  238. }
  239. return op;
  240. }
  241. #undef DICT_SIZE
  242. #undef DICT_MASK
  243. #undef hash
  244. #endif // SEARCH_ENGINE