A command line (CLI) program for monitoring and downloading 8chan threads. Licensed under MIT.
Ви не можете вибрати більше 25 тем. Теми мають розпочинатися з літери або цифри, можуть містити дефіси (-) і не повинні перевищувати 35 символів.

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105
  1. import json
  2. import re
  3. import requests
  4. from file_io import *
  5. from file_io import generate_json_path
  6. from json_methods import extract_filenames
  7. user_agent = 'https://git.teknik.io/abrax/lizard'
  8. def fetch_and_parse_thread_json(board, thread_no):
  9. url = generate_thread_json_url(board, thread_no)
  10. p = generate_json_path(board, thread_no)
  11. if not download_file(url, p):
  12. return None
  13. else:
  14. return json.load(open(p))
  15. def dump_thread_html(board, thread_no):
  16. print('Downloading HTML for /{}/{}...'.format(board, thread_no))
  17. url = generate_thread_html_url(board, thread_no)
  18. p = generate_html_path(board, thread_no)
  19. print('Downloading html to {}'.format(p))
  20. download_file(url, p)
  21. def dump_thread_files(board, thread_no, thread_json):
  22. """ Downloads the files referenced in the supplied JSON. """
  23. # Make a list of the files in the thread
  24. filenames = extract_filenames(thread_json)
  25. # Filter out files that already exist in the cache
  26. filtered_filenames = [f for f in filenames if not file_exists_in_cache(board, thread_no, normalize_filename(f))]
  27. # Download the files
  28. print('Downloading {} files:'.format(len(filtered_filenames)))
  29. for f in filtered_filenames:
  30. normalized = normalize_filename(f)
  31. print('\t{}'.format(normalized))
  32. file_url = generate_file_url(f['hashed_name'])
  33. p = path_to_cached_file(board, thread_no, normalized)
  34. if not download_file(file_url, p):
  35. print('\t\tGot a 404, trying alternate link.')
  36. # Try alternate link
  37. alternate_url = generate_alternate_file_url(board, f['hashed_name'])
  38. download_file(alternate_url, p)
  39. def download_file(file_url, file_path):
  40. """ If the request succeeds, downloads the file and returns True.
  41. On a 404, returns False. On other responses, raises exception. """
  42. r = requests.get(file_url, headers={'user-agent': user_agent})
  43. if r.status_code == requests.codes.ok:
  44. save_file(file_path, r.content)
  45. return True
  46. elif r.status_code == 404:
  47. return False
  48. else:
  49. raise "Unexpected status code {} while trying to fetch {} - try opening in the browser, if that doesn't work " \
  50. "submit an issue to the tracker.".format(r.status_code, file_url)
  51. def generate_thread_json_url(board, thread_no):
  52. url = 'https://8ch.net/{}/res/{}.json'.format(board, thread_no)
  53. return url
  54. def generate_thread_html_url(board, thread_no):
  55. url = 'https://8ch.net/{}/res/{}.html'.format(board, thread_no)
  56. return url
  57. def generate_file_url(filename):
  58. url = 'https://media.8ch.net/file_store/{}'.format(filename)
  59. return url
  60. def generate_alternate_file_url(board, filename):
  61. """Some images, like the OP pic of the /tech/ sticky, use an alternate media2 URL. """
  62. url = 'https://media2.8ch.net/{}/src/{}'.format(board, filename)
  63. return url
  64. def parse_url(url):
  65. """Extracts the board name and thread no from a URL.
  66. """
  67. parts = url.split('#', 1)
  68. board, thread_no = re.findall('(\w+)\/res\/(\d+)', parts[0])[0]
  69. anchored_reply = '' if len(parts)<2 else re.findall('q?(\d+)$', parts[1])[0]
  70. return [board, thread_no, anchored_reply]