Ви не можете вибрати більше 25 тем. Теми мають розпочинатися з літери або цифри, можуть містити дефіси (-) і не повинні перевищувати 35 символів.

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299
  1. #!/usr/bin/python
  2. #
  3. # linearize-data.py: Construct a linear, no-fork version of the chain.
  4. #
  5. # Copyright (c) 2013-2014 The Bitcoin developers
  6. # Distributed under the MIT/X11 software license, see the accompanying
  7. # file COPYING or http://www.opensource.org/licenses/mit-license.php.
  8. #
  9. from __future__ import print_function, division
  10. import json
  11. import struct
  12. import re
  13. import os
  14. import base64
  15. import httplib
  16. import sys
  17. import hashlib
  18. import datetime
  19. import time
  20. from collections import namedtuple
  21. settings = {}
  22. def uint32(x):
  23. return x & 0xffffffffL
  24. def bytereverse(x):
  25. return uint32(( ((x) << 24) | (((x) << 8) & 0x00ff0000) |
  26. (((x) >> 8) & 0x0000ff00) | ((x) >> 24) ))
  27. def bufreverse(in_buf):
  28. out_words = []
  29. for i in range(0, len(in_buf), 4):
  30. word = struct.unpack('@I', in_buf[i:i+4])[0]
  31. out_words.append(struct.pack('@I', bytereverse(word)))
  32. return ''.join(out_words)
  33. def wordreverse(in_buf):
  34. out_words = []
  35. for i in range(0, len(in_buf), 4):
  36. out_words.append(in_buf[i:i+4])
  37. out_words.reverse()
  38. return ''.join(out_words)
  39. def calc_hdr_hash(blk_hdr):
  40. hash1 = hashlib.sha256()
  41. hash1.update(blk_hdr)
  42. hash1_o = hash1.digest()
  43. hash2 = hashlib.sha256()
  44. hash2.update(hash1_o)
  45. hash2_o = hash2.digest()
  46. return hash2_o
  47. def calc_hash_str(blk_hdr):
  48. hash = calc_hdr_hash(blk_hdr)
  49. hash = bufreverse(hash)
  50. hash = wordreverse(hash)
  51. hash_str = hash.encode('hex')
  52. return hash_str
  53. def get_blk_dt(blk_hdr):
  54. members = struct.unpack("<I", blk_hdr[68:68+4])
  55. nTime = members[0]
  56. dt = datetime.datetime.fromtimestamp(nTime)
  57. dt_ym = datetime.datetime(dt.year, dt.month, 1)
  58. return (dt_ym, nTime)
  59. def get_block_hashes(settings):
  60. blkindex = []
  61. f = open(settings['hashlist'], "r")
  62. for line in f:
  63. line = line.rstrip()
  64. blkindex.append(line)
  65. print("Read " + str(len(blkindex)) + " hashes")
  66. return blkindex
  67. def mkblockmap(blkindex):
  68. blkmap = {}
  69. for height,hash in enumerate(blkindex):
  70. blkmap[hash] = height
  71. return blkmap
  72. # Block header and extent on disk
  73. BlockExtent = namedtuple('BlockExtent', ['fn', 'offset', 'inhdr', 'blkhdr', 'size'])
  74. class BlockDataCopier:
  75. def __init__(self, settings, blkindex, blkmap):
  76. self.settings = settings
  77. self.blkindex = blkindex
  78. self.blkmap = blkmap
  79. self.inFn = 0
  80. self.inF = None
  81. self.outFn = 0
  82. self.outsz = 0
  83. self.outF = None
  84. self.outFname = None
  85. self.blkCountIn = 0
  86. self.blkCountOut = 0
  87. self.lastDate = datetime.datetime(2000, 1, 1)
  88. self.highTS = 1408893517 - 315360000
  89. self.timestampSplit = False
  90. self.fileOutput = True
  91. self.setFileTime = False
  92. self.maxOutSz = settings['max_out_sz']
  93. if 'output' in settings:
  94. self.fileOutput = False
  95. if settings['file_timestamp'] != 0:
  96. self.setFileTime = True
  97. if settings['split_timestamp'] != 0:
  98. self.timestampSplit = True
  99. # Extents and cache for out-of-order blocks
  100. self.blockExtents = {}
  101. self.outOfOrderData = {}
  102. self.outOfOrderSize = 0 # running total size for items in outOfOrderData
  103. def writeBlock(self, inhdr, blk_hdr, rawblock):
  104. if not self.fileOutput and ((self.outsz + self.inLen) > self.maxOutSz):
  105. self.outF.close()
  106. if self.setFileTime:
  107. os.utime(outFname, (int(time.time()), highTS))
  108. self.outF = None
  109. self.outFname = None
  110. self.outFn = outFn + 1
  111. self.outsz = 0
  112. (blkDate, blkTS) = get_blk_dt(blk_hdr)
  113. if self.timestampSplit and (blkDate > self.lastDate):
  114. print("New month " + blkDate.strftime("%Y-%m") + " @ " + hash_str)
  115. lastDate = blkDate
  116. if outF:
  117. outF.close()
  118. if setFileTime:
  119. os.utime(outFname, (int(time.time()), highTS))
  120. self.outF = None
  121. self.outFname = None
  122. self.outFn = self.outFn + 1
  123. self.outsz = 0
  124. if not self.outF:
  125. if self.fileOutput:
  126. outFname = self.settings['output_file']
  127. else:
  128. outFname = "%s/blk%05d.dat" % (self.settings['output'], outFn)
  129. print("Output file" + outFname)
  130. self.outF = open(outFname, "wb")
  131. self.outF.write(inhdr)
  132. self.outF.write(blk_hdr)
  133. self.outF.write(rawblock)
  134. self.outsz = self.outsz + len(inhdr) + len(blk_hdr) + len(rawblock)
  135. self.blkCountOut = self.blkCountOut + 1
  136. if blkTS > self.highTS:
  137. self.highTS = blkTS
  138. if (self.blkCountOut % 1000) == 0:
  139. print('%i blocks scanned, %i blocks written (of %i, %.1f%% complete)' %
  140. (self.blkCountIn, self.blkCountOut, len(self.blkindex), 100.0 * self.blkCountOut / len(self.blkindex)))
  141. def inFileName(self, fn):
  142. return "%s/blk%05d.dat" % (self.settings['input'], fn)
  143. def fetchBlock(self, extent):
  144. '''Fetch block contents from disk given extents'''
  145. with open(self.inFileName(extent.fn), "rb") as f:
  146. f.seek(extent.offset)
  147. return f.read(extent.size)
  148. def copyOneBlock(self):
  149. '''Find the next block to be written in the input, and copy it to the output.'''
  150. extent = self.blockExtents.pop(self.blkCountOut)
  151. if self.blkCountOut in self.outOfOrderData:
  152. # If the data is cached, use it from memory and remove from the cache
  153. rawblock = self.outOfOrderData.pop(self.blkCountOut)
  154. self.outOfOrderSize -= len(rawblock)
  155. else: # Otherwise look up data on disk
  156. rawblock = self.fetchBlock(extent)
  157. self.writeBlock(extent.inhdr, extent.blkhdr, rawblock)
  158. def run(self):
  159. while self.blkCountOut < len(self.blkindex):
  160. if not self.inF:
  161. fname = self.inFileName(self.inFn)
  162. print("Input file" + fname)
  163. try:
  164. self.inF = open(fname, "rb")
  165. except IOError:
  166. print("Premature end of block data")
  167. return
  168. inhdr = self.inF.read(8)
  169. if (not inhdr or (inhdr[0] == "\0")):
  170. self.inF.close()
  171. self.inF = None
  172. self.inFn = self.inFn + 1
  173. continue
  174. inMagic = inhdr[:4]
  175. if (inMagic != self.settings['netmagic']):
  176. print("Invalid magic:" + inMagic)
  177. return
  178. inLenLE = inhdr[4:]
  179. su = struct.unpack("<I", inLenLE)
  180. inLen = su[0] - 80 # length without header
  181. blk_hdr = self.inF.read(80)
  182. inExtent = BlockExtent(self.inFn, self.inF.tell(), inhdr, blk_hdr, inLen)
  183. hash_str = calc_hash_str(blk_hdr)
  184. if not hash_str in blkmap:
  185. print("Skipping unknown block " + hash_str)
  186. self.inF.seek(inLen, os.SEEK_CUR)
  187. continue
  188. blkHeight = self.blkmap[hash_str]
  189. self.blkCountIn += 1
  190. if self.blkCountOut == blkHeight:
  191. # If in-order block, just copy
  192. rawblock = self.inF.read(inLen)
  193. self.writeBlock(inhdr, blk_hdr, rawblock)
  194. # See if we can catch up to prior out-of-order blocks
  195. while self.blkCountOut in self.blockExtents:
  196. self.copyOneBlock()
  197. else: # If out-of-order, skip over block data for now
  198. self.blockExtents[blkHeight] = inExtent
  199. if self.outOfOrderSize < self.settings['out_of_order_cache_sz']:
  200. # If there is space in the cache, read the data
  201. # Reading the data in file sequence instead of seeking and fetching it later is preferred,
  202. # but we don't want to fill up memory
  203. self.outOfOrderData[blkHeight] = self.inF.read(inLen)
  204. self.outOfOrderSize += inLen
  205. else: # If no space in cache, seek forward
  206. self.inF.seek(inLen, os.SEEK_CUR)
  207. print("Done (%i blocks written)" % (self.blkCountOut))
  208. if __name__ == '__main__':
  209. if len(sys.argv) != 2:
  210. print("Usage: linearize-data.py CONFIG-FILE")
  211. sys.exit(1)
  212. f = open(sys.argv[1])
  213. for line in f:
  214. # skip comment lines
  215. m = re.search('^\s*#', line)
  216. if m:
  217. continue
  218. # parse key=value lines
  219. m = re.search('^(\w+)\s*=\s*(\S.*)$', line)
  220. if m is None:
  221. continue
  222. settings[m.group(1)] = m.group(2)
  223. f.close()
  224. if 'netmagic' not in settings:
  225. settings['netmagic'] = 'f9beb4d9'
  226. if 'input' not in settings:
  227. settings['input'] = 'input'
  228. if 'hashlist' not in settings:
  229. settings['hashlist'] = 'hashlist.txt'
  230. if 'file_timestamp' not in settings:
  231. settings['file_timestamp'] = 0
  232. if 'split_timestamp' not in settings:
  233. settings['split_timestamp'] = 0
  234. if 'max_out_sz' not in settings:
  235. settings['max_out_sz'] = 1000L * 1000 * 1000
  236. if 'out_of_order_cache_sz' not in settings:
  237. settings['out_of_order_cache_sz'] = 100 * 1000 * 1000
  238. settings['max_out_sz'] = long(settings['max_out_sz'])
  239. settings['split_timestamp'] = int(settings['split_timestamp'])
  240. settings['file_timestamp'] = int(settings['file_timestamp'])
  241. settings['netmagic'] = settings['netmagic'].decode('hex')
  242. settings['out_of_order_cache_sz'] = int(settings['out_of_order_cache_sz'])
  243. if 'output_file' not in settings and 'output' not in settings:
  244. print("Missing output file / directory")
  245. sys.exit(1)
  246. blkindex = get_block_hashes(settings)
  247. blkmap = mkblockmap(blkindex)
  248. if not "000000000019d6689c085ae165831e934ff763ae46a2a6c172b3f1b60a8ce26f" in blkmap:
  249. print("not found")
  250. else:
  251. BlockDataCopier(settings, blkindex, blkmap).run()