- import json
- import re
-
- import pkg_resources
- import requests
- from lizard.json_methods import extract_filenames
-
- from lizard.file_io import *
-
# Installed version of this package, looked up from package metadata so the
# User-Agent string always matches the release actually running.
version = pkg_resources.require("lizard")[0].version
# Sent with every HTTP request (see download_file) so the server operators
# can identify this client and find its repository.
user_agent = 'Lizard v{}: https://git.teknik.io/abrax/lizard'.format(version)
-
-
def fetch_and_parse_thread_json(board, thread_no):
    """ Downloads the JSON for a thread and parses it.

    Returns the parsed JSON object on success, or None when the download
    fails (missing thread, busy server, etc. -- see download_file). """
    url = generate_thread_json_url(board, thread_no)
    p = generate_json_path(board, thread_no)

    if not download_file(url, p):
        return None

    # Use a context manager so the handle is closed deterministically;
    # the original json.load(open(p)) leaked the file object until GC.
    with open(p, encoding='utf-8') as f:
        return json.load(f)
-
-
def dump_thread_html(board, thread_no):
    """ Fetches the HTML page of a thread and saves it to the local cache. """
    print('Downloading HTML for /{}/{}...'.format(board, thread_no))

    page_url = generate_thread_html_url(board, thread_no)
    cache_path = generate_html_path(board, thread_no)
    print('Downloading html to {}'.format(cache_path))
    download_file(page_url, cache_path)
-
-
def dump_thread_files(board, thread_no, thread_json):
    """ Downloads the files referenced in the supplied JSON.

    Files already present in the cache are skipped. When the primary media
    URL fails, one retry is made against the alternate media2 URL. """

    # Make a list of the files in the thread
    filenames = extract_filenames(thread_json)

    # Normalize each name exactly once and keep only files missing from the
    # cache (the original called normalize_filename twice per file: once in
    # the filter and once in the download loop).
    to_download = [
        (f, n)
        for f, n in ((f, normalize_filename(f)) for f in filenames)
        if not file_exists_in_cache(board, thread_no, n)
    ]

    # Download the files
    print('Downloading {} files:'.format(len(to_download)))
    for f, normalized in to_download:
        print('\t{}'.format(normalized))

        file_url = generate_file_url(f['hashed_name'])
        p = path_to_cached_file(board, thread_no, normalized)
        if not download_file(file_url, p):
            print('\t\tAttempt failed, trying alternate link.')

            # Try alternate link
            alternate_url = generate_alternate_file_url(board, f['hashed_name'])
            download_file(alternate_url, p)
-
-
def download_file(file_url, file_path, timeout=30):
    """ If the request succeeds, downloads the file and returns True.

    On a 404, 504 or 403, returns False. On other responses, raises an
    exception.

    timeout: seconds to wait for the server before aborting. Without a
    timeout a stalled connection would hang the process forever; callers
    that prefer the old unbounded wait can pass timeout=None. May raise
    requests.exceptions.Timeout when the deadline is exceeded. """
    r = requests.get(file_url, headers={'user-agent': user_agent}, timeout=timeout)

    if r.status_code == requests.codes.ok:
        save_file(file_path, r.content)
        return True
    elif r.status_code == 404:
        return False
    elif r.status_code == 504:
        print("Server busy (504), couldn't get {}, try again later.".format(file_url))
        return False
    elif r.status_code == 403:
        print("8ch.net returned 403 (forbidden). There can be many reasons for this, but a common one is trying to "
              "connect to 8ch.net (instead oxwugzccvk3dk6tj.onion) through Tor. If you are using Tor, "
              "please substitute r commands with tr (trl, trc, etc).")
        return False
    else:
        raise RuntimeError("Unexpected status code {} while trying to fetch {} - try opening in the browser, "
                           "if that doesn't work submit an issue to the tracker.".format(r.status_code, file_url))
-
-
def generate_thread_json_url(board, thread_no, use_onion_link=False):
    """ Builds the URL of a thread's JSON endpoint.

    use_onion_link defaults to False for backward compatibility; the
    parameter mirrors generate_thread_html_url so both endpoints can be
    reached through the Tor hidden service. """
    if use_onion_link:
        return 'http://oxwugzccvk3dk6tj.onion/{}/res/{}.json'.format(board, thread_no)
    return 'https://8ch.net/{}/res/{}.json'.format(board, thread_no)
-
-
def generate_thread_html_url(board, thread_no, use_onion_link=False):
    """ Builds the URL of a thread's HTML page.

    When use_onion_link is set, the Tor hidden-service host is used in
    place of the clearnet domain. """
    if use_onion_link:
        return 'http://oxwugzccvk3dk6tj.onion/{}/res/{}.html'.format(board, thread_no)
    return 'https://8ch.net/{}/res/{}.html'.format(board, thread_no)
-
-
def generate_file_url(filename):
    """ Returns the primary media URL for a file in the site-wide store. """
    return 'https://media.8ch.net/file_store/{}'.format(filename)
-
-
def generate_alternate_file_url(board, filename):
    """Some images, like the OP pic of the /tech/ sticky, use an alternate media2 URL. """
    return 'https://media2.8ch.net/{}/src/{}'.format(board, filename)
-
-
def parse_url(url):
    """Extracts the board name and thread no from a URL.

    Returns [board, thread_no, anchored_reply]; anchored_reply is '' when
    the URL carries no fragment. Raises IndexError when the URL does not
    contain a '<board>/res/<number>' path. """
    # Split off the '#...' fragment so the anchor, if present, can be
    # parsed separately from the path. partition also makes a bare trailing
    # '#' behave like no anchor instead of crashing on an empty fragment.
    path, _, fragment = url.partition('#')

    # Raw strings: the original patterns '(\w+)\/res\/(\d+)' and 'q?(\d+)$'
    # relied on invalid string escapes ('\w', '\/', '\d'), which raise
    # DeprecationWarning/SyntaxWarning in modern Python; '/' needs no
    # escaping in a regex at all.
    board, thread_no = re.findall(r'(\w+)/res/(\d+)', path)[0]
    anchored_reply = '' if not fragment else re.findall(r'q?(\d+)$', fragment)[0]

    return [board, thread_no, anchored_reply]
|