You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

dbcrash.py 13KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282
  1. #!/usr/bin/env python3
  2. # Copyright (c) 2017 The Bitcoin Core developers
  3. # Distributed under the MIT software license, see the accompanying
  4. # file COPYING or http://www.opensource.org/licenses/mit-license.php.
  5. """Test recovery from a crash during chainstate writing.
  6. - 4 nodes
  7. * node0, node1, and node2 will have different dbcrash ratios, and different
  8. dbcache sizes
  9. * node3 will be a regular node, with no crashing.
  10. * The nodes will not connect to each other.
  11. - use default test framework starting chain. initialize starting_tip_height to
  12. tip height.
  13. - Main loop:
  14. * generate lots of transactions on node3, enough to fill up a block.
  15. * uniformly randomly pick a tip height from starting_tip_height to
  16. tip_height; with probability 1/(height_difference+4), invalidate this block.
  17. * mine enough blocks to overtake tip_height at start of loop.
  18. * for each node in [node0,node1,node2]:
  19. - for each mined block:
  20. * submit block to node
  21. * if node crashed on/after submitting:
  22. - restart until recovery succeeds
  23. - check that utxo matches node3 using gettxoutsetinfo"""
  24. import errno
  25. import http.client
  26. import random
  27. import sys
  28. import time
  29. from test_framework.mininode import *
  30. from test_framework.script import *
  31. from test_framework.test_framework import BitcoinTestFramework
  32. from test_framework.util import *
  33. HTTP_DISCONNECT_ERRORS = [http.client.CannotSendRequest]
  34. try:
  35. HTTP_DISCONNECT_ERRORS.append(http.client.RemoteDisconnected)
  36. except AttributeError:
  37. pass
  38. class ChainstateWriteCrashTest(BitcoinTestFramework):
  39. def set_test_params(self):
  40. self.num_nodes = 4
  41. self.setup_clean_chain = False
  42. # Set -maxmempool=0 to turn off mempool memory sharing with dbcache
  43. # Set -rpcservertimeout=900 to reduce socket disconnects in this
  44. # long-running test
  45. self.base_args = ["-limitdescendantsize=0", "-maxmempool=0", "-rpcservertimeout=900", "-dbbatchsize=200000"]
  46. # Set different crash ratios and cache sizes. Note that not all of
  47. # -dbcache goes to pcoinsTip.
  48. self.node0_args = ["-dbcrashratio=8", "-dbcache=4"] + self.base_args
  49. self.node1_args = ["-dbcrashratio=16", "-dbcache=8"] + self.base_args
  50. self.node2_args = ["-dbcrashratio=24", "-dbcache=16"] + self.base_args
  51. # Node3 is a normal node with default args, except will mine full blocks
  52. self.node3_args = ["-blockmaxweight=4000000"]
  53. self.extra_args = [self.node0_args, self.node1_args, self.node2_args, self.node3_args]
  54. def setup_network(self):
  55. # Need a bit of extra time for the nodes to start up for this test
  56. self.add_nodes(self.num_nodes, extra_args=self.extra_args, timewait=90)
  57. self.start_nodes()
  58. # Leave them unconnected, we'll use submitblock directly in this test
  59. def restart_node(self, node_index, expected_tip):
  60. """Start up a given node id, wait for the tip to reach the given block hash, and calculate the utxo hash.
  61. Exceptions on startup should indicate node crash (due to -dbcrashratio), in which case we try again. Give up
  62. after 60 seconds. Returns the utxo hash of the given node."""
  63. time_start = time.time()
  64. while time.time() - time_start < 120:
  65. try:
  66. # Any of these RPC calls could throw due to node crash
  67. self.start_node(node_index)
  68. self.nodes[node_index].waitforblock(expected_tip)
  69. utxo_hash = self.nodes[node_index].gettxoutsetinfo()['hash_serialized_2']
  70. return utxo_hash
  71. except:
  72. # An exception here should mean the node is about to crash.
  73. # If bitcoind exits, then try again. wait_for_node_exit()
  74. # should raise an exception if bitcoind doesn't exit.
  75. self.wait_for_node_exit(node_index, timeout=10)
  76. self.crashed_on_restart += 1
  77. time.sleep(1)
  78. # If we got here, bitcoind isn't coming back up on restart. Could be a
  79. # bug in bitcoind, or we've gotten unlucky with our dbcrash ratio --
  80. # perhaps we generated a test case that blew up our cache?
  81. # TODO: If this happens a lot, we should try to restart without -dbcrashratio
  82. # and make sure that recovery happens.
  83. raise AssertionError("Unable to successfully restart node %d in allotted time", node_index)
  84. def submit_block_catch_error(self, node_index, block):
  85. """Try submitting a block to the given node.
  86. Catch any exceptions that indicate the node has crashed.
  87. Returns true if the block was submitted successfully; false otherwise."""
  88. try:
  89. self.nodes[node_index].submitblock(block)
  90. return True
  91. except http.client.BadStatusLine as e:
  92. # Prior to 3.5 BadStatusLine('') was raised for a remote disconnect error.
  93. if sys.version_info[0] == 3 and sys.version_info[1] < 5 and e.line == "''":
  94. self.log.debug("node %d submitblock raised exception: %s", node_index, e)
  95. return False
  96. else:
  97. raise
  98. except tuple(HTTP_DISCONNECT_ERRORS) as e:
  99. self.log.debug("node %d submitblock raised exception: %s", node_index, e)
  100. return False
  101. except OSError as e:
  102. self.log.debug("node %d submitblock raised OSError exception: errno=%s", node_index, e.errno)
  103. if e.errno in [errno.EPIPE, errno.ECONNREFUSED, errno.ECONNRESET]:
  104. # The node has likely crashed
  105. return False
  106. else:
  107. # Unexpected exception, raise
  108. raise
  109. def sync_node3blocks(self, block_hashes):
  110. """Use submitblock to sync node3's chain with the other nodes
  111. If submitblock fails, restart the node and get the new utxo hash.
  112. If any nodes crash while updating, we'll compare utxo hashes to
  113. ensure recovery was successful."""
  114. node3_utxo_hash = self.nodes[3].gettxoutsetinfo()['hash_serialized_2']
  115. # Retrieve all the blocks from node3
  116. blocks = []
  117. for block_hash in block_hashes:
  118. blocks.append([block_hash, self.nodes[3].getblock(block_hash, 0)])
  119. # Deliver each block to each other node
  120. for i in range(3):
  121. nodei_utxo_hash = None
  122. self.log.debug("Syncing blocks to node %d", i)
  123. for (block_hash, block) in blocks:
  124. # Get the block from node3, and submit to node_i
  125. self.log.debug("submitting block %s", block_hash)
  126. if not self.submit_block_catch_error(i, block):
  127. # TODO: more carefully check that the crash is due to -dbcrashratio
  128. # (change the exit code perhaps, and check that here?)
  129. self.wait_for_node_exit(i, timeout=30)
  130. self.log.debug("Restarting node %d after block hash %s", i, block_hash)
  131. nodei_utxo_hash = self.restart_node(i, block_hash)
  132. assert nodei_utxo_hash is not None
  133. self.restart_counts[i] += 1
  134. else:
  135. # Clear it out after successful submitblock calls -- the cached
  136. # utxo hash will no longer be correct
  137. nodei_utxo_hash = None
  138. # Check that the utxo hash matches node3's utxo set
  139. # NOTE: we only check the utxo set if we had to restart the node
  140. # after the last block submitted:
  141. # - checking the utxo hash causes a cache flush, which we don't
  142. # want to do every time; so
  143. # - we only update the utxo cache after a node restart, since flushing
  144. # the cache is a no-op at that point
  145. if nodei_utxo_hash is not None:
  146. self.log.debug("Checking txoutsetinfo matches for node %d", i)
  147. assert_equal(nodei_utxo_hash, node3_utxo_hash)
  148. def verify_utxo_hash(self):
  149. """Verify that the utxo hash of each node matches node3.
  150. Restart any nodes that crash while querying."""
  151. node3_utxo_hash = self.nodes[3].gettxoutsetinfo()['hash_serialized_2']
  152. self.log.info("Verifying utxo hash matches for all nodes")
  153. for i in range(3):
  154. try:
  155. nodei_utxo_hash = self.nodes[i].gettxoutsetinfo()['hash_serialized_2']
  156. except OSError:
  157. # probably a crash on db flushing
  158. nodei_utxo_hash = self.restart_node(i, self.nodes[3].getbestblockhash())
  159. assert_equal(nodei_utxo_hash, node3_utxo_hash)
  160. def generate_small_transactions(self, node, count, utxo_list):
  161. FEE = 1000 # TODO: replace this with node relay fee based calculation
  162. num_transactions = 0
  163. random.shuffle(utxo_list)
  164. while len(utxo_list) >= 2 and num_transactions < count:
  165. tx = CTransaction()
  166. input_amount = 0
  167. for i in range(2):
  168. utxo = utxo_list.pop()
  169. tx.vin.append(CTxIn(COutPoint(int(utxo['txid'], 16), utxo['vout'])))
  170. input_amount += int(utxo['amount'] * COIN)
  171. output_amount = (input_amount - FEE) // 3
  172. if output_amount <= 0:
  173. # Sanity check -- if we chose inputs that are too small, skip
  174. continue
  175. for i in range(3):
  176. tx.vout.append(CTxOut(output_amount, hex_str_to_bytes(utxo['scriptPubKey'])))
  177. # Sign and send the transaction to get into the mempool
  178. tx_signed_hex = node.signrawtransaction(ToHex(tx))['hex']
  179. node.sendrawtransaction(tx_signed_hex)
  180. num_transactions += 1
  181. def run_test(self):
  182. # Track test coverage statistics
  183. self.restart_counts = [0, 0, 0] # Track the restarts for nodes 0-2
  184. self.crashed_on_restart = 0 # Track count of crashes during recovery
  185. # Start by creating a lot of utxos on node3
  186. initial_height = self.nodes[3].getblockcount()
  187. utxo_list = create_confirmed_utxos(self.nodes[3].getnetworkinfo()['relayfee'], self.nodes[3], 5000)
  188. self.log.info("Prepped %d utxo entries", len(utxo_list))
  189. # Sync these blocks with the other nodes
  190. block_hashes_to_sync = []
  191. for height in range(initial_height + 1, self.nodes[3].getblockcount() + 1):
  192. block_hashes_to_sync.append(self.nodes[3].getblockhash(height))
  193. self.log.debug("Syncing %d blocks with other nodes", len(block_hashes_to_sync))
  194. # Syncing the blocks could cause nodes to crash, so the test begins here.
  195. self.sync_node3blocks(block_hashes_to_sync)
  196. starting_tip_height = self.nodes[3].getblockcount()
  197. # Main test loop:
  198. # each time through the loop, generate a bunch of transactions,
  199. # and then either mine a single new block on the tip, or some-sized reorg.
  200. for i in range(40):
  201. self.log.info("Iteration %d, generating 2500 transactions %s", i, self.restart_counts)
  202. # Generate a bunch of small-ish transactions
  203. self.generate_small_transactions(self.nodes[3], 2500, utxo_list)
  204. # Pick a random block between current tip, and starting tip
  205. current_height = self.nodes[3].getblockcount()
  206. random_height = random.randint(starting_tip_height, current_height)
  207. self.log.debug("At height %d, considering height %d", current_height, random_height)
  208. if random_height > starting_tip_height:
  209. # Randomly reorg from this point with some probability (1/4 for
  210. # tip, 1/5 for tip-1, ...)
  211. if random.random() < 1.0 / (current_height + 4 - random_height):
  212. self.log.debug("Invalidating block at height %d", random_height)
  213. self.nodes[3].invalidateblock(self.nodes[3].getblockhash(random_height))
  214. # Now generate new blocks until we pass the old tip height
  215. self.log.debug("Mining longer tip")
  216. block_hashes = []
  217. while current_height + 1 > self.nodes[3].getblockcount():
  218. block_hashes.extend(self.nodes[3].generate(min(10, current_height + 1 - self.nodes[3].getblockcount())))
  219. self.log.debug("Syncing %d new blocks...", len(block_hashes))
  220. self.sync_node3blocks(block_hashes)
  221. utxo_list = self.nodes[3].listunspent()
  222. self.log.debug("Node3 utxo count: %d", len(utxo_list))
  223. # Check that the utxo hashes agree with node3
  224. # Useful side effect: each utxo cache gets flushed here, so that we
  225. # won't get crashes on shutdown at the end of the test.
  226. self.verify_utxo_hash()
  227. # Check the test coverage
  228. self.log.info("Restarted nodes: %s; crashes on restart: %d", self.restart_counts, self.crashed_on_restart)
  229. # If no nodes were restarted, we didn't test anything.
  230. assert self.restart_counts != [0, 0, 0]
  231. # Make sure we tested the case of crash-during-recovery.
  232. assert self.crashed_on_restart > 0
  233. # Warn if any of the nodes escaped restart.
  234. for i in range(3):
  235. if self.restart_counts[i] == 0:
  236. self.log.warn("Node %d never crashed during utxo flush!", i)
  237. if __name__ == "__main__":
  238. ChainstateWriteCrashTest().main()