A command line (CLI) program for monitoring and downloading 8chan threads. Licensed under MIT.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

web_methods.py 4.5KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130
  1. import json
  2. import re
  3. import pkg_resources
  4. import requests
  5. from lizard.json_methods import extract_filenames
  6. from lizard.file_io import *
# Version string of the installed 'lizard' package, read from its metadata.
version = pkg_resources.require("lizard")[0].version
# User-Agent header sent with every HTTP request (see download_file),
# identifying the tool and linking to its repository.
user_agent = 'Lizard v{}: https://git.teknik.io/abrax/lizard'.format(version)
  9. def fetch_and_parse_thread_json(board, thread_no, use_onion_link=False):
  10. url = generate_thread_json_url(board, thread_no, use_onion_link=use_onion_link)
  11. p = generate_json_path(board, thread_no)
  12. if not download_file(url, p):
  13. return None
  14. else:
  15. return json.load(open(p))
  16. def dump_thread_html(board, thread_no, use_onion_link=False):
  17. print('Downloading HTML for /{}/{}...'.format(board, thread_no))
  18. url = generate_thread_html_url(board, thread_no, use_onion_link=use_onion_link)
  19. p = generate_html_path(board, thread_no)
  20. print('Downloading html to {}'.format(p))
  21. download_file(url, p)
  22. def dump_thread_files(board, thread_no, thread_json, use_onion_link=False):
  23. """ Downloads the files referenced in the supplied JSON. """
  24. # Make a list of the files in the thread
  25. filenames = extract_filenames(thread_json)
  26. # Filter out files that already exist in the cache
  27. filtered_filenames = [f for f in filenames if not file_exists_in_cache(board, thread_no, normalize_filename(f))]
  28. # Download the files
  29. print('Downloading {} files:'.format(len(filtered_filenames)))
  30. for f in filtered_filenames:
  31. normalized = normalize_filename(f)
  32. print('\t{}'.format(normalized))
  33. file_url = generate_file_url(f['hashed_name'], use_onion_link=use_onion_link)
  34. p = path_to_cached_file(board, thread_no, normalized)
  35. if not download_file(file_url, p):
  36. print('\t\tAttempt failed, trying alternate link.')
  37. # Try alternate link
  38. alternate_url = generate_alternate_file_url(board, f['hashed_name'], use_onion_link=use_onion_link)
  39. download_file(alternate_url, p)
  40. def download_file(file_url, file_path):
  41. """ If the request succeeds, downloads the file and returns True.
  42. On a 404 or 504, returns False. On other responses, raises exception. """
  43. r = requests.get(file_url, headers={'user-agent': user_agent})
  44. if r.status_code == requests.codes.ok:
  45. save_file(file_path, r.content)
  46. return True
  47. elif r.status_code == 404:
  48. return False
  49. elif r.status_code == 504:
  50. print("Server busy (504), couldn't get {}, try again later.".format(file_url))
  51. return False
  52. elif r.status_code == 403:
  53. print("8ch.net returned 403 (forbidden). There can be many reasons for this, but a common one is trying to "
  54. "connect to 8ch.net (instead oxwugzccvk3dk6tj.onion) through Tor. If you are using Tor, "
  55. "please substitute r commands with tr (trl, trc, etc).")
  56. return False
  57. else:
  58. raise RuntimeError("Unexpected status code {} while trying to fetch {} - try opening in the browser, "
  59. "if that doesn't work submit an issue to the tracker.".format(r.status_code, file_url))
  60. def thread_domain(use_onion_link=False):
  61. if use_onion_link:
  62. url = 'http://oxwugzccvk3dk6tj.onion'
  63. else:
  64. url = 'https://8ch.net'
  65. return url
  66. def generate_thread_json_url(board, thread_no, use_onion_link=False):
  67. url = '{}/{}/res/{}.json'.format(thread_domain(use_onion_link), board, thread_no)
  68. return url
  69. def generate_thread_html_url(board, thread_no, use_onion_link=False):
  70. url = '{}/{}/res/{}.html'.format(thread_domain(use_onion_link), board, thread_no)
  71. return url
  72. def generate_file_url(filename, use_onion_link=False):
  73. if use_onion_link:
  74. url = 'http://oxwugzccvk3dk6tj.onion/file_store/{}'.format(filename)
  75. else:
  76. url = 'https://media.8ch.net/file_store/{}'.format(filename)
  77. return url
  78. def generate_alternate_file_url(board, filename, use_onion_link=False):
  79. """Some images, like the OP pic of the /tech/ sticky, use an alternate media2 URL. """
  80. if use_onion_link:
  81. url = 'http://oxwugzccvk3dk6tj.onion/{}/res/{}.html'.format(board, filename)
  82. else:
  83. url = 'https://media2.8ch.net/{}/src/{}'.format(board, filename)
  84. return url
  85. def parse_url(url):
  86. """ Extracts the board name and thread no from a URL. """
  87. parts = url.split('#', 1)
  88. board, thread_no = re.findall('(\w+)\/res\/(\d+)', parts[0])[0]
  89. anchored_reply = '' if len(parts) < 2 else re.findall('q?(\d+)$', parts[1])[0]
  90. return [board, thread_no, anchored_reply]