You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

linearize-data.py 9.8KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322
  1. #!/usr/bin/env python3
  2. #
  3. # linearize-data.py: Construct a linear, no-fork version of the chain.
  4. #
  5. # Copyright (c) 2013-2016 The Starwels developers
  6. # Distributed under the MIT software license, see the accompanying
  7. # file COPYING or http://www.opensource.org/licenses/mit-license.php.
  8. #
  9. from __future__ import print_function, division
  10. import struct
  11. import re
  12. import os
  13. import os.path
  14. import sys
  15. import hashlib
  16. import datetime
  17. import time
  18. from collections import namedtuple
  19. from binascii import hexlify, unhexlify
  20. settings = {}
  21. ##### Switch endian-ness #####
  22. def hex_switchEndian(s):
  23. """ Switches the endianness of a hex string (in pairs of hex chars) """
  24. pairList = [s[i:i+2].encode() for i in range(0, len(s), 2)]
  25. return b''.join(pairList[::-1]).decode()
  26. def uint32(x):
  27. return x & 0xffffffff
  28. def bytereverse(x):
  29. return uint32(( ((x) << 24) | (((x) << 8) & 0x00ff0000) |
  30. (((x) >> 8) & 0x0000ff00) | ((x) >> 24) ))
  31. def bufreverse(in_buf):
  32. out_words = []
  33. for i in range(0, len(in_buf), 4):
  34. word = struct.unpack('@I', in_buf[i:i+4])[0]
  35. out_words.append(struct.pack('@I', bytereverse(word)))
  36. return b''.join(out_words)
  37. def wordreverse(in_buf):
  38. out_words = []
  39. for i in range(0, len(in_buf), 4):
  40. out_words.append(in_buf[i:i+4])
  41. out_words.reverse()
  42. return b''.join(out_words)
  43. def calc_hdr_hash(blk_hdr):
  44. hash1 = hashlib.sha256()
  45. hash1.update(blk_hdr)
  46. hash1_o = hash1.digest()
  47. hash2 = hashlib.sha256()
  48. hash2.update(hash1_o)
  49. hash2_o = hash2.digest()
  50. return hash2_o
  51. def calc_hash_str(blk_hdr):
  52. hash = calc_hdr_hash(blk_hdr)
  53. hash = bufreverse(hash)
  54. hash = wordreverse(hash)
  55. hash_str = hexlify(hash).decode('utf-8')
  56. return hash_str
  57. def get_blk_dt(blk_hdr):
  58. members = struct.unpack("<I", blk_hdr[68:68+4])
  59. nTime = members[0]
  60. dt = datetime.datetime.fromtimestamp(nTime)
  61. dt_ym = datetime.datetime(dt.year, dt.month, 1)
  62. return (dt_ym, nTime)
  63. # When getting the list of block hashes, undo any byte reversals.
  64. def get_block_hashes(settings):
  65. blkindex = []
  66. f = open(settings['hashlist'], "r")
  67. for line in f:
  68. line = line.rstrip()
  69. if settings['rev_hash_bytes'] == 'true':
  70. line = hex_switchEndian(line)
  71. blkindex.append(line)
  72. print("Read " + str(len(blkindex)) + " hashes")
  73. return blkindex
  74. # The block map shouldn't give or receive byte-reversed hashes.
  75. def mkblockmap(blkindex):
  76. blkmap = {}
  77. for height,hash in enumerate(blkindex):
  78. blkmap[hash] = height
  79. return blkmap
  80. # Block header and extent on disk
  81. BlockExtent = namedtuple('BlockExtent', ['fn', 'offset', 'inhdr', 'blkhdr', 'size'])
  82. class BlockDataCopier:
  83. def __init__(self, settings, blkindex, blkmap):
  84. self.settings = settings
  85. self.blkindex = blkindex
  86. self.blkmap = blkmap
  87. self.inFn = 0
  88. self.inF = None
  89. self.outFn = 0
  90. self.outsz = 0
  91. self.outF = None
  92. self.outFname = None
  93. self.blkCountIn = 0
  94. self.blkCountOut = 0
  95. self.lastDate = datetime.datetime(2000, 1, 1)
  96. self.highTS = 1408893517 - 315360000
  97. self.timestampSplit = False
  98. self.fileOutput = True
  99. self.setFileTime = False
  100. self.maxOutSz = settings['max_out_sz']
  101. if 'output' in settings:
  102. self.fileOutput = False
  103. if settings['file_timestamp'] != 0:
  104. self.setFileTime = True
  105. if settings['split_timestamp'] != 0:
  106. self.timestampSplit = True
  107. # Extents and cache for out-of-order blocks
  108. self.blockExtents = {}
  109. self.outOfOrderData = {}
  110. self.outOfOrderSize = 0 # running total size for items in outOfOrderData
  111. def writeBlock(self, inhdr, blk_hdr, rawblock):
  112. blockSizeOnDisk = len(inhdr) + len(blk_hdr) + len(rawblock)
  113. if not self.fileOutput and ((self.outsz + blockSizeOnDisk) > self.maxOutSz):
  114. self.outF.close()
  115. if self.setFileTime:
  116. os.utime(self.outFname, (int(time.time()), self.highTS))
  117. self.outF = None
  118. self.outFname = None
  119. self.outFn = self.outFn + 1
  120. self.outsz = 0
  121. (blkDate, blkTS) = get_blk_dt(blk_hdr)
  122. if self.timestampSplit and (blkDate > self.lastDate):
  123. print("New month " + blkDate.strftime("%Y-%m") + " @ " + self.hash_str)
  124. self.lastDate = blkDate
  125. if self.outF:
  126. self.outF.close()
  127. if self.setFileTime:
  128. os.utime(self.outFname, (int(time.time()), self.highTS))
  129. self.outF = None
  130. self.outFname = None
  131. self.outFn = self.outFn + 1
  132. self.outsz = 0
  133. if not self.outF:
  134. if self.fileOutput:
  135. self.outFname = self.settings['output_file']
  136. else:
  137. self.outFname = os.path.join(self.settings['output'], "blk%05d.dat" % self.outFn)
  138. print("Output file " + self.outFname)
  139. self.outF = open(self.outFname, "wb")
  140. self.outF.write(inhdr)
  141. self.outF.write(blk_hdr)
  142. self.outF.write(rawblock)
  143. self.outsz = self.outsz + len(inhdr) + len(blk_hdr) + len(rawblock)
  144. self.blkCountOut = self.blkCountOut + 1
  145. if blkTS > self.highTS:
  146. self.highTS = blkTS
  147. if (self.blkCountOut % 1000) == 0:
  148. print('%i blocks scanned, %i blocks written (of %i, %.1f%% complete)' %
  149. (self.blkCountIn, self.blkCountOut, len(self.blkindex), 100.0 * self.blkCountOut / len(self.blkindex)))
  150. def inFileName(self, fn):
  151. return os.path.join(self.settings['input'], "blk%05d.dat" % fn)
  152. def fetchBlock(self, extent):
  153. '''Fetch block contents from disk given extents'''
  154. with open(self.inFileName(extent.fn), "rb") as f:
  155. f.seek(extent.offset)
  156. return f.read(extent.size)
  157. def copyOneBlock(self):
  158. '''Find the next block to be written in the input, and copy it to the output.'''
  159. extent = self.blockExtents.pop(self.blkCountOut)
  160. if self.blkCountOut in self.outOfOrderData:
  161. # If the data is cached, use it from memory and remove from the cache
  162. rawblock = self.outOfOrderData.pop(self.blkCountOut)
  163. self.outOfOrderSize -= len(rawblock)
  164. else: # Otherwise look up data on disk
  165. rawblock = self.fetchBlock(extent)
  166. self.writeBlock(extent.inhdr, extent.blkhdr, rawblock)
  167. def run(self):
  168. while self.blkCountOut < len(self.blkindex):
  169. if not self.inF:
  170. fname = self.inFileName(self.inFn)
  171. print("Input file " + fname)
  172. try:
  173. self.inF = open(fname, "rb")
  174. except IOError:
  175. print("Premature end of block data")
  176. return
  177. inhdr = self.inF.read(8)
  178. if (not inhdr or (inhdr[0] == "\0")):
  179. self.inF.close()
  180. self.inF = None
  181. self.inFn = self.inFn + 1
  182. continue
  183. inMagic = inhdr[:4]
  184. if (inMagic != self.settings['netmagic']):
  185. print("Invalid magic: " + hexlify(inMagic).decode('utf-8'))
  186. return
  187. inLenLE = inhdr[4:]
  188. su = struct.unpack("<I", inLenLE)
  189. inLen = su[0] - 80 # length without header
  190. blk_hdr = self.inF.read(80)
  191. inExtent = BlockExtent(self.inFn, self.inF.tell(), inhdr, blk_hdr, inLen)
  192. self.hash_str = calc_hash_str(blk_hdr)
  193. if not self.hash_str in blkmap:
  194. # Because blocks can be written to files out-of-order as of 0.10, the script
  195. # may encounter blocks it doesn't know about. Treat as debug output.
  196. if settings['debug_output'] == 'true':
  197. print("Skipping unknown block " + self.hash_str)
  198. self.inF.seek(inLen, os.SEEK_CUR)
  199. continue
  200. blkHeight = self.blkmap[self.hash_str]
  201. self.blkCountIn += 1
  202. if self.blkCountOut == blkHeight:
  203. # If in-order block, just copy
  204. rawblock = self.inF.read(inLen)
  205. self.writeBlock(inhdr, blk_hdr, rawblock)
  206. # See if we can catch up to prior out-of-order blocks
  207. while self.blkCountOut in self.blockExtents:
  208. self.copyOneBlock()
  209. else: # If out-of-order, skip over block data for now
  210. self.blockExtents[blkHeight] = inExtent
  211. if self.outOfOrderSize < self.settings['out_of_order_cache_sz']:
  212. # If there is space in the cache, read the data
  213. # Reading the data in file sequence instead of seeking and fetching it later is preferred,
  214. # but we don't want to fill up memory
  215. self.outOfOrderData[blkHeight] = self.inF.read(inLen)
  216. self.outOfOrderSize += inLen
  217. else: # If no space in cache, seek forward
  218. self.inF.seek(inLen, os.SEEK_CUR)
  219. print("Done (%i blocks written)" % (self.blkCountOut))
  220. if __name__ == '__main__':
  221. if len(sys.argv) != 2:
  222. print("Usage: linearize-data.py CONFIG-FILE")
  223. sys.exit(1)
  224. f = open(sys.argv[1])
  225. for line in f:
  226. # skip comment lines
  227. m = re.search('^\s*#', line)
  228. if m:
  229. continue
  230. # parse key=value lines
  231. m = re.search('^(\w+)\s*=\s*(\S.*)$', line)
  232. if m is None:
  233. continue
  234. settings[m.group(1)] = m.group(2)
  235. f.close()
  236. # Force hash byte format setting to be lowercase to make comparisons easier.
  237. # Also place upfront in case any settings need to know about it.
  238. if 'rev_hash_bytes' not in settings:
  239. settings['rev_hash_bytes'] = 'false'
  240. settings['rev_hash_bytes'] = settings['rev_hash_bytes'].lower()
  241. if 'netmagic' not in settings:
  242. settings['netmagic'] = 'f9beb4d9'
  243. if 'genesis' not in settings:
  244. settings['genesis'] = '000000003d69a915e9da53348c5c272978bb743442e3a6341c11061c125811a2'
  245. if 'input' not in settings:
  246. settings['input'] = 'input'
  247. if 'hashlist' not in settings:
  248. settings['hashlist'] = 'hashlist.txt'
  249. if 'file_timestamp' not in settings:
  250. settings['file_timestamp'] = 0
  251. if 'split_timestamp' not in settings:
  252. settings['split_timestamp'] = 0
  253. if 'max_out_sz' not in settings:
  254. settings['max_out_sz'] = 1000 * 1000 * 1000
  255. if 'out_of_order_cache_sz' not in settings:
  256. settings['out_of_order_cache_sz'] = 100 * 1000 * 1000
  257. if 'debug_output' not in settings:
  258. settings['debug_output'] = 'false'
  259. settings['max_out_sz'] = int(settings['max_out_sz'])
  260. settings['split_timestamp'] = int(settings['split_timestamp'])
  261. settings['file_timestamp'] = int(settings['file_timestamp'])
  262. settings['netmagic'] = unhexlify(settings['netmagic'].encode('utf-8'))
  263. settings['out_of_order_cache_sz'] = int(settings['out_of_order_cache_sz'])
  264. settings['debug_output'] = settings['debug_output'].lower()
  265. if 'output_file' not in settings and 'output' not in settings:
  266. print("Missing output file / directory")
  267. sys.exit(1)
  268. blkindex = get_block_hashes(settings)
  269. blkmap = mkblockmap(blkindex)
  270. # Block hash map won't be byte-reversed. Neither should the genesis hash.
  271. if not settings['genesis'] in blkmap:
  272. print("Genesis block not found in hashlist")
  273. else:
  274. BlockDataCopier(settings, blkindex, blkmap).run()