You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

linearize-data.py 5.4KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233
  1. #!/usr/bin/python
  2. #
  3. # linearize-data.py: Construct a linear, no-fork version of the chain.
  4. #
  5. # Copyright (c) 2013 The Bitcoin developers
  6. # Distributed under the MIT/X11 software license, see the accompanying
  7. # file COPYING or http://www.opensource.org/licenses/mit-license.php.
  8. #
import base64
import binascii
import datetime
import hashlib
import httplib
import json
import os
import re
import struct
import sys
import time
# Module-level configuration dictionary. The __main__ block fills it from the
# key=value lines of the config file and then adds defaults for missing keys.
settings = {}
  20. def uint32(x):
  21. return x & 0xffffffffL
  22. def bytereverse(x):
  23. return uint32(( ((x) << 24) | (((x) << 8) & 0x00ff0000) |
  24. (((x) >> 8) & 0x0000ff00) | ((x) >> 24) ))
  25. def bufreverse(in_buf):
  26. out_words = []
  27. for i in range(0, len(in_buf), 4):
  28. word = struct.unpack('@I', in_buf[i:i+4])[0]
  29. out_words.append(struct.pack('@I', bytereverse(word)))
  30. return ''.join(out_words)
  31. def wordreverse(in_buf):
  32. out_words = []
  33. for i in range(0, len(in_buf), 4):
  34. out_words.append(in_buf[i:i+4])
  35. out_words.reverse()
  36. return ''.join(out_words)
  37. def calc_hdr_hash(blk_hdr):
  38. hash1 = hashlib.sha256()
  39. hash1.update(blk_hdr)
  40. hash1_o = hash1.digest()
  41. hash2 = hashlib.sha256()
  42. hash2.update(hash1_o)
  43. hash2_o = hash2.digest()
  44. return hash2_o
  45. def calc_hash_str(blk_hdr):
  46. hash = calc_hdr_hash(blk_hdr)
  47. hash = bufreverse(hash)
  48. hash = wordreverse(hash)
  49. hash_str = hash.encode('hex')
  50. return hash_str
  51. def get_blk_dt(blk_hdr):
  52. members = struct.unpack("<I", blk_hdr[68:68+4])
  53. nTime = members[0]
  54. dt = datetime.datetime.fromtimestamp(nTime)
  55. dt_ym = datetime.datetime(dt.year, dt.month, 1)
  56. return (dt_ym, nTime)
  57. def get_block_hashes(settings):
  58. blkindex = []
  59. f = open(settings['hashlist'], "r")
  60. for line in f:
  61. line = line.rstrip()
  62. blkindex.append(line)
  63. print("Read " + str(len(blkindex)) + " hashes")
  64. return blkindex
  65. def mkblockset(blkindex):
  66. blkmap = {}
  67. for hash in blkindex:
  68. blkmap[hash] = True
  69. return blkmap
  70. def copydata(settings, blkindex, blkset):
  71. inFn = 0
  72. inF = None
  73. outFn = 0
  74. outsz = 0
  75. outF = None
  76. outFname = None
  77. blkCount = 0
  78. lastDate = datetime.datetime(2000, 1, 1)
  79. highTS = 1408893517 - 315360000
  80. timestampSplit = False
  81. fileOutput = True
  82. setFileTime = False
  83. maxOutSz = settings['max_out_sz']
  84. if 'output' in settings:
  85. fileOutput = False
  86. if settings['file_timestamp'] != 0:
  87. setFileTime = True
  88. if settings['split_timestamp'] != 0:
  89. timestampSplit = True
  90. while True:
  91. if not inF:
  92. fname = "%s/blk%05d.dat" % (settings['input'], inFn)
  93. print("Input file" + fname)
  94. inF = open(fname, "rb")
  95. inhdr = inF.read(8)
  96. if (not inhdr or (inhdr[0] == "\0")):
  97. inF.close()
  98. inF = None
  99. inFn = inFn + 1
  100. continue
  101. inMagic = inhdr[:4]
  102. if (inMagic != settings['netmagic']):
  103. print("Invalid magic:" + inMagic)
  104. return
  105. inLenLE = inhdr[4:]
  106. su = struct.unpack("<I", inLenLE)
  107. inLen = su[0]
  108. rawblock = inF.read(inLen)
  109. blk_hdr = rawblock[:80]
  110. hash_str = calc_hash_str(blk_hdr)
  111. if not hash_str in blkset:
  112. print("Skipping unknown block " + hash_str)
  113. continue
  114. if blkindex[blkCount] != hash_str:
  115. print("Out of order block.")
  116. print("Expected " + blkindex[blkCount])
  117. print("Got " + hash_str)
  118. sys.exit(1)
  119. if not fileOutput and ((outsz + inLen) > maxOutSz):
  120. outF.close()
  121. if setFileTime:
  122. os.utime(outFname, (int(time.time()), highTS))
  123. outF = None
  124. outFname = None
  125. outFn = outFn + 1
  126. outsz = 0
  127. (blkDate, blkTS) = get_blk_dt(blk_hdr)
  128. if timestampSplit and (blkDate > lastDate):
  129. print("New month " + blkDate.strftime("%Y-%m") + " @ " + hash_str)
  130. lastDate = blkDate
  131. if outF:
  132. outF.close()
  133. if setFileTime:
  134. os.utime(outFname, (int(time.time()), highTS))
  135. outF = None
  136. outFname = None
  137. outFn = outFn + 1
  138. outsz = 0
  139. if not outF:
  140. if fileOutput:
  141. outFname = settings['output_file']
  142. else:
  143. outFname = "%s/blk%05d.dat" % (settings['output'], outFn)
  144. print("Output file" + outFname)
  145. outF = open(outFname, "wb")
  146. outF.write(inhdr)
  147. outF.write(rawblock)
  148. outsz = outsz + inLen + 8
  149. blkCount = blkCount + 1
  150. if blkTS > highTS:
  151. highTS = blkTS
  152. if (blkCount % 1000) == 0:
  153. print("Wrote " + str(blkCount) + " blocks")
  154. if __name__ == '__main__':
  155. if len(sys.argv) != 2:
  156. print "Usage: linearize-data.py CONFIG-FILE"
  157. sys.exit(1)
  158. f = open(sys.argv[1])
  159. for line in f:
  160. # skip comment lines
  161. m = re.search('^\s*#', line)
  162. if m:
  163. continue
  164. # parse key=value lines
  165. m = re.search('^(\w+)\s*=\s*(\S.*)$', line)
  166. if m is None:
  167. continue
  168. settings[m.group(1)] = m.group(2)
  169. f.close()
  170. if 'netmagic' not in settings:
  171. settings['netmagic'] = 'f9beb4d9'
  172. if 'input' not in settings:
  173. settings['input'] = 'input'
  174. if 'hashlist' not in settings:
  175. settings['hashlist'] = 'hashlist.txt'
  176. if 'file_timestamp' not in settings:
  177. settings['file_timestamp'] = 0
  178. if 'split_timestamp' not in settings:
  179. settings['split_timestamp'] = 0
  180. if 'max_out_sz' not in settings:
  181. settings['max_out_sz'] = 1000L * 1000 * 1000
  182. settings['max_out_sz'] = long(settings['max_out_sz'])
  183. settings['split_timestamp'] = int(settings['split_timestamp'])
  184. settings['file_timestamp'] = int(settings['file_timestamp'])
  185. settings['netmagic'] = settings['netmagic'].decode('hex')
  186. if 'output_file' not in settings and 'output' not in settings:
  187. print("Missing output file / directory")
  188. sys.exit(1)
  189. blkindex = get_block_hashes(settings)
  190. blkset = mkblockset(blkindex)
  191. if not "000000000019d6689c085ae165831e934ff763ae46a2a6c172b3f1b60a8ce26f" in blkset:
  192. print("not found")
  193. else:
  194. copydata(settings, blkindex, blkset)