A command line (CLI) program for monitoring and downloading 8chan threads. Licensed under MIT.
Du kannst nicht mehr als 25 Themen auswählen. Themen müssen mit entweder einem Buchstaben oder einer Ziffer beginnen. Sie können Bindestriche („-“) enthalten und bis zu 35 Zeichen lang sein.

web_methods.py 3.2KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105
  1. import json
  2. import re
  3. import requests
  4. from file_io import *
  5. from file_io import generate_json_path
  6. from json_methods import extract_filenames
# User-Agent sent with every request; points at the project's repository
# so server operators can identify this scraper.
user_agent = 'https://git.teknik.io/abrax/lizard'
  8. def fetch_and_parse_thread_json(board, thread_no):
  9. url = generate_thread_json_url(board, thread_no)
  10. p = generate_json_path(board, thread_no)
  11. if not download_file(url, p):
  12. return None
  13. else:
  14. return json.load(open(p))
  15. def dump_thread_html(board, thread_no):
  16. print('Downloading HTML for /{}/{}...'.format(board, thread_no))
  17. url = generate_thread_html_url(board, thread_no)
  18. p = generate_html_path(board, thread_no)
  19. print('Downloading html to {}'.format(p))
  20. download_file(url, p)
  21. def dump_thread_files(board, thread_no, thread_json):
  22. """ Downloads the files referenced in the supplied JSON. """
  23. # Make a list of the files in the thread
  24. filenames = extract_filenames(thread_json)
  25. # Filter out files that already exist in the cache
  26. filtered_filenames = [f for f in filenames if not file_exists_in_cache(board, thread_no, normalize_filename(f))]
  27. # Download the files
  28. print('Downloading {} files:'.format(len(filtered_filenames)))
  29. for f in filtered_filenames:
  30. normalized = normalize_filename(f)
  31. print('\t{}'.format(normalized))
  32. file_url = generate_file_url(f['hashed_name'])
  33. p = path_to_cached_file(board, thread_no, normalized)
  34. if not download_file(file_url, p):
  35. print('\t\tGot a 404, trying alternate link.')
  36. # Try alternate link
  37. alternate_url = generate_alternate_file_url(board, f['hashed_name'])
  38. download_file(alternate_url, p)
  39. def download_file(file_url, file_path):
  40. """ If the request succeeds, downloads the file and returns True.
  41. On a 404, returns False. On other responses, raises exception. """
  42. r = requests.get(file_url, headers={'user-agent': user_agent})
  43. if r.status_code == requests.codes.ok:
  44. save_file(file_path, r.content)
  45. return True
  46. elif r.status_code == 404:
  47. return False
  48. else:
  49. raise "Unexpected status code {} while trying to fetch {} - try opening in the browser, if that doesn't work " \
  50. "submit an issue to the tracker.".format(r.status_code, file_url)
  51. def generate_thread_json_url(board, thread_no):
  52. url = 'https://8ch.net/{}/res/{}.json'.format(board, thread_no)
  53. return url
  54. def generate_thread_html_url(board, thread_no):
  55. url = 'https://8ch.net/{}/res/{}.html'.format(board, thread_no)
  56. return url
  57. def generate_file_url(filename):
  58. url = 'https://media.8ch.net/file_store/{}'.format(filename)
  59. return url
  60. def generate_alternate_file_url(board, filename):
  61. """Some images, like the OP pic of the /tech/ sticky, use an alternate media2 URL. """
  62. url = 'https://media2.8ch.net/{}/src/{}'.format(board, filename)
  63. return url
  64. def parse_url(url):
  65. """Extracts the board name and thread no from a URL.
  66. """
  67. parts = url.split('#', 1)
  68. board, thread_no = re.findall('(\w+)\/res\/(\d+)', parts[0])[0]
  69. anchored_reply = '' if len(parts)<2 else re.findall('q?(\d+)$', parts[1])[0]
  70. return [board, thread_no, anchored_reply]