You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

pluralsight.py 19KB


  1. import json
  2. import os
  3. import random
  4. import re
  5. import secrets
  6. import socket
  7. import string
  8. import sys
  9. import time
  10. from functools import partial
  11. from multiprocessing.pool import Pool
  12. from pathlib import Path
  13. from typing import List, Tuple, Dict
  14. import click
  15. import requests
  16. import youtube_dl
  17. from bs4 import BeautifulSoup
  18. from selenium import webdriver
  19. from selenium.webdriver import FirefoxProfile
  20. from selenium.webdriver.common.by import By
  21. from selenium.webdriver.firefox.options import Options
  22. from selenium.webdriver.support import expected_conditions
  23. from selenium.webdriver.support.wait import WebDriverWait
  24. # region Global Constant(s) and Readonly Variable(s)
  25. # True/False to determine whether selenium instances will be visible or not (headless)
  26. HIDE_SELENIUM_INSTANCES = False
  27. # Denotes Maximum seconds selenium will wait for an element to load
  28. SELENIUM_TIMEOUT_IN_SECONDS = 30
  29. # Maximum parallel course download count
  30. MAX_PARALLEL_COURSE_DOWNLOAD_COUNT = 1
  31. # Maximum number of retry count for downloads
  32. DOWNLOAD_RETRY_COUNT = 100
  33. # Maximum number of account registrations allowed per IP
  34. MAX_ACCOUNT_PER_IP = 5
  35. # True/False to determine whether consecutive video downloading should have delays in between
  36. IS_DELAY_BETWEEN_VIDEO = True
  37. # Minimum number of seconds to wait between consecutive video downloads
  38. MIN_VIDEO_DOWNLOAD_DELAY = 10
  39. # Minimum number of seconds to wait between consecutive video downloads
  40. MAX_VIDEO_DOWNLOAD_DELAY = 20
  41. # Check if current OS/platform is Windows
  42. IS_WINDOWS = sys.platform.startswith("win")
  43. # Master Directory Path (Default: Working Directory)
  44. MASTER_DIRECTORY = os.path.dirname(os.path.realpath(__file__))
  45. # Path of the archive text file to log all downloaded videos
  46. ARCHIVE_FILE_PATH = os.path.join(MASTER_DIRECTORY, "archive.txt")
  47. # Path of the directory where downloaded paths will be saved
  48. SAVE_PATHS_DIRECTORY_PATH = os.path.join(MASTER_DIRECTORY, "Paths")
  49. # Path of the directory where downloaded independent courses will be saved
  50. SAVE_COURSES_DIRECTORY_PATH = os.path.join(MASTER_DIRECTORY, "Courses")
  51. # JSON File (.json) containing the JSON of the paths/courses dump of Pluralsight
  52. PATHS_JSON_LOCATION = os.path.join(MASTER_DIRECTORY, "pluralsight.json")
  53. # Text File (.txt) containing the independent courses dump of Pluralsight
  54. INDEPENDENT_COURSES_FILE_LOCATION = os.path.join(MASTER_DIRECTORY, "courses.txt")
  55. # Text File (.txt) containing the list of links of the downloaded courses
  56. DOWNLOADED_COURSES_FILE_LOCATION = os.path.join(MASTER_DIRECTORY, "downloaded.txt")
  57. # Options for youtube-dl. For a complete list of options, check https://github.com/ytdl-org/youtube-dl/blob/3e4cedf9e8cd3157df2457df7274d0c842421945/youtube_dl/YoutubeDL.py#L137-L312
  58. ydl_options = {
  59. 'writesubtitles': True,
  60. 'nooverwrites': True,
  61. 'retries': 100,
  62. 'download_archive': ARCHIVE_FILE_PATH
  63. }
  64. if IS_DELAY_BETWEEN_VIDEO:
  65. ydl_options['sleep_interval'] = MIN_VIDEO_DOWNLOAD_DELAY
  66. ydl_options['max_sleep_interval'] = MAX_VIDEO_DOWNLOAD_DELAY
  67. # endregion
  68. # region Classes
  69. class PluralsightPath(object):
  70. def __init__(self, id: int, link: str, title: str, slug: str, course_links: List[str]):
  71. self.id = id
  72. self.link = link
  73. self.title = title
  74. self.slug = slug
  75. self.course_links = course_links
  76. class DisposableMail:
  77. """
  78. This class is used to generate random disposable emails
  79. """
  80. def __init__(self):
  81. self.email_address = requests.post("https://gmailnator.com/index/indexquery", {'action': 'GenerateEmail'}).text
  82. def get_mail_item_id(self) -> object:
  83. post_url = "https://gmailnator.com/mailbox/mailboxquery"
  84. post_data = {
  85. 'action': 'LoadMailList',
  86. 'Email_address': self.email_address
  87. }
  88. while True:
  89. try:
  90. time.sleep(1)
  91. response_text = requests.post(post_url, post_data).json()[0]['content']
  92. result = re.findall('#(.*)\\">', response_text)
  93. mail_id = result[0]
  94. return mail_id
  95. except:
  96. pass
  97. def get_verification_link(self) -> str:
  98. post_url = "https://gmailnator.com/mailbox/get_single_message/"
  99. post_data = {
  100. 'action': 'LoadMailList',
  101. 'message_id': self.get_mail_item_id(),
  102. 'email': self.email_address.split("+")[0]
  103. }
  104. response_data = requests.post(post_url, post_data).text
  105. soup = BeautifulSoup(response_data, 'html.parser')
  106. for link in soup.findAll('a', href=True):
  107. if "https://app.pluralsight.com/id/forgotpassword/reset?token" in link['href']:
  108. return link['href']
  109. class Pluralsight:
  110. """
  111. This class handles the registration and verification of new Pluralsight accounts
  112. """
  113. def __init__(self, email: str, password: str, is_headless: bool = True, proxy_tuple: Tuple[str, int] = None):
  114. profile = FirefoxProfile()
  115. if proxy_tuple is not None:
  116. profile.set_preference("network.proxy.type", 1)
  117. profile.set_preference("network.proxy.http", proxy_tuple[0])
  118. profile.set_preference("network.proxy.http_port", proxy_tuple[1])
  119. profile.set_preference("network.proxy.ssl", proxy_tuple[0])
  120. profile.set_preference("network.proxy.ssl_port", proxy_tuple[1])
  121. if is_headless:
  122. options = Options()
  123. options.add_argument("--headless")
  124. self.driver = webdriver.Firefox(options=options, firefox_profile=profile)
  125. else:
  126. self.driver = webdriver.Firefox(firefox_profile=profile)
  127. self.email = email
  128. self.password = password
  129. def __enter__(self):
  130. return self
  131. def __exit__(self, exc_type, exc_val, exc_tb):
  132. self.driver.quit()
  133. def register(self) -> None:
  134. """
  135. Registers new Pluralsight account
  136. """
  137. self.driver.get("https://www.pluralsight.com/offer/2020/free-april-month")
  138. WebDriverWait(self.driver, SELENIUM_TIMEOUT_IN_SECONDS) \
  139. .until(expected_conditions
  140. .presence_of_element_located((By.CLASS_NAME, "cookie_notification--opt_in")))
  141. accept_cookie_button_element = self.driver.find_element_by_class_name("cookie_notification--opt_in")
  142. accept_cookie_button_element.click()
  143. sign_up_now_button_element = self.driver.find_element_by_xpath('//a[@data-aa-title="Free-April-Start-Now"]')
  144. sign_up_now_button_element.click()
  145. email_input_element = self.driver.find_element_by_name("email")
  146. firstname_input_element = self.driver.find_element_by_name("firstname")
  147. lastname_input_element = self.driver.find_element_by_name("lastname")
  148. tos_checkbox_element = self.driver.find_element_by_name("optInBox")
  149. firstname, lastname = get_name()
  150. email_input_element.send_keys(self.email)
  151. firstname_input_element.send_keys(firstname)
  152. lastname_input_element.send_keys(lastname)
  153. tos_checkbox_element.click()
  154. WebDriverWait(self.driver, SELENIUM_TIMEOUT_IN_SECONDS) \
  155. .until(expected_conditions
  156. .presence_of_element_located((By.XPATH, "//*[contains(text(), 'I agree, activate benefit')]")))
  157. create_account_button_element = self.driver.find_element_by_xpath(
  158. "//*[contains(text(), 'I agree, activate benefit')]")
  159. create_account_button_element.click()
  160. WebDriverWait(self.driver, SELENIUM_TIMEOUT_IN_SECONDS * 4) \
  161. .until(expected_conditions
  162. .presence_of_element_located((By.CLASS_NAME, "cancelButton---CKAut")))
  163. cancel_button_element = self.driver.find_element_by_class_name("cancelButton---CKAut")
  164. cancel_button_element.click()
  165. def set_password(self, verification_link: str) -> None:
  166. """
  167. Sets password in the given verification link
  168. Args:
  169. verification_link: The verification link (as string) to set up password
  170. """
  171. self.driver.get(verification_link)
  172. WebDriverWait(self.driver, SELENIUM_TIMEOUT_IN_SECONDS) \
  173. .until(expected_conditions
  174. .presence_of_element_located((By.ID, "Password")))
  175. password_input_element = self.driver.find_element_by_id("Password")
  176. password_confirm_input_element = self.driver.find_element_by_id("PasswordConfirmation")
  177. save_button_element = self.driver.find_element_by_class_name("psds-button--appearance-primary")
  178. password_input_element.send_keys(self.password)
  179. password_confirm_input_element.send_keys(self.password)
  180. save_button_element.click()
  181. WebDriverWait(self.driver, SELENIUM_TIMEOUT_IN_SECONDS) \
  182. .until(expected_conditions
  183. .presence_of_element_located((By.ID, "Password")))
  184. # endregion
  185. # region Static Utility Functions
  186. def get_name() -> Tuple[str, str]:
  187. """
  188. Generate a random string to be used as first or last name
  189. Returns:
  190. str: Generated string
  191. """
  192. request_url = "http://names.drycodes.com/1?nameOptions=boy_names" if random.choice([True, False]) \
  193. else "http://names.drycodes.com/1?nameOptions=girl_names"
  194. first_last_name = requests.get(request_url).text.strip('"[]').split('_')
  195. return first_last_name
  196. def get_password(min_length: int = 25, max_length: int = 50) -> str:
  197. """
  198. Generates a random password using ascii letters and numerical digits
  199. Args:
  200. min_length: Minimum length of the password, default is 25
  201. max_length: Minimum length of the password, default is 50
  202. Returns: Generated password as string
  203. """
  204. length = random.randint(min_length, max_length)
  205. alphabet = string.ascii_letters + string.digits
  206. password = ''.join(secrets.choice(alphabet) for _ in range(length))
  207. return password
  208. # endregion
  209. def create_pluralsight_account(proxy_tuple: Tuple[str, int] = None) -> Dict[str, str]:
  210. """
  211. Creates new Pluralsight account and returns the email/password as a dictionary
  212. Returns:
  213. Dict[str, str]: Dictionary containing email and password with eponymous pair keys
  214. """
  215. disposable_email = DisposableMail()
  216. password = get_password()
  217. with Pluralsight(email=disposable_email.email_address,
  218. password=password,
  219. is_headless=HIDE_SELENIUM_INSTANCES,
  220. proxy_tuple=proxy_tuple) as ps:
  221. ps.register()
  222. verification_link = disposable_email.get_verification_link()
  223. ps.set_password(verification_link=verification_link)
  224. return {'email': disposable_email.email_address, 'password': password}
  225. def get_paths_and_independent_courses(paths_file: str, courses_file: str) -> Tuple[List[PluralsightPath], List[str]]:
  226. if paths_file.startswith("http"):
  227. json_string = requests.get(paths_file).text
  228. else:
  229. json_string = Path(paths_file).read_text()
  230. pluralsight_paths_dicts_list = json.loads(json_string)
  231. pluralsight_paths_list = [PluralsightPath(**paths_dict) for paths_dict in pluralsight_paths_dicts_list]
  232. if not os.path.isfile(courses_file):
  233. return pluralsight_paths_list, []
  234. if courses_file.startswith("http"):
  235. courses_list_string = requests.get(courses_file).text
  236. else:
  237. courses_list_string = Path(courses_file).read_text()
  238. courses_list = list(filter(None, courses_list_string.split('\n')))
  239. return pluralsight_paths_list, courses_list
  240. def get_directory_full_path(root_directory_path: str, pluralsight_path: PluralsightPath) -> str:
  241. directory_name = f"{pluralsight_path.id:03d} - {pluralsight_path.title}" if not IS_WINDOWS \
  242. else f"{pluralsight_path.id:03d} - {pluralsight_path.slug}"
  243. directory_full_path = os.path.join(root_directory_path, directory_name)
  244. return directory_full_path
  245. def save_progress(course_link: str) -> None:
  246. with open(DOWNLOADED_COURSES_FILE_LOCATION, 'a+') as downloaded_file:
  247. downloaded_file.write(f"{course_link}\n")
  248. def is_downloaded(course_link: str) -> bool:
  249. if not os.path.isfile(DOWNLOADED_COURSES_FILE_LOCATION):
  250. return False
  251. downloaded_courses_list = list(filter(None, Path(DOWNLOADED_COURSES_FILE_LOCATION).read_text().split('\n')))
  252. return course_link in downloaded_courses_list
  253. # region Downloading related Functions
  254. def download_single_course(course_link: str,
  255. username: str,
  256. password: str,
  257. save_directory_path: str,
  258. proxy_tuple: Tuple[str, int] = None) -> bool:
  259. """
  260. Download the given course using the provided credential
  261. Args:
  262. course_link: The link of the course to download
  263. username: Username (Email) of the Pluralsight account to be used for download
  264. password: Password of the Pluralsight account to be used for download
  265. save_directory_path: Absolute path of Root save directory
  266. proxy_tuple: Proxy in (IP, Port) Tuple format
  267. Returns: True/False bool value denoting the success status of the download
  268. """
  269. if is_downloaded(course_link):
  270. return True
  271. retry_count = 0
  272. while retry_count < DOWNLOAD_RETRY_COUNT:
  273. try:
  274. ydl_options['username'] = username
  275. ydl_options['password'] = password
  276. if proxy_tuple is not None:
  277. ydl_options['proxy'] = f"https://{proxy_tuple[0]}:{proxy_tuple[1]}"
  278. if IS_WINDOWS:
  279. ydl_options[
  280. 'outtmpl'] = f"{save_directory_path}\\%(playlist)s\\%(chapter_number)s - %(chapter)s\\%(playlist_index)s - %(title)s.%(ext)s"
  281. else:
  282. ydl_options[
  283. 'outtmpl'] = f"{save_directory_path}/%(playlist)s/%(chapter_number)s - %(chapter)s/%(playlist_index)s - %(title)s.%(ext)s"
  284. with youtube_dl.YoutubeDL(ydl_options) as ydl:
  285. ydl.download([course_link])
  286. save_progress(course_link)
  287. return True
  288. except KeyboardInterrupt:
  289. break
  290. except Exception as e:
  291. if "http error 429" in str(e).lower():
  292. time.sleep(300)
  293. elif "http error 403" in str(e).lower():
  294. return False
  295. retry_count += 1
  296. return False
  297. def download_batch_course(course_links_list: List[str],
  298. username: str,
  299. password: str,
  300. save_directory_path: str,
  301. proxy_tuple: Tuple[str, int] = None) -> bool:
  302. download_single_course_function = partial(download_single_course,
  303. username=username,
  304. password=password,
  305. proxy_tuple=proxy_tuple,
  306. save_directory_path=save_directory_path)
  307. with Pool(MAX_PARALLEL_COURSE_DOWNLOAD_COUNT) as p:
  308. download_results = map(lambda link: link, p.map(download_single_course_function, course_links_list))
  309. return False not in download_results
  310. def download_pluralsight_path(pluralsight_paths_list: List[PluralsightPath],
  311. path_id: int,
  312. proxy_tuple: Tuple[str, int] = None) -> bool:
  313. try:
  314. retry_count = 0
  315. while retry_count < MAX_ACCOUNT_PER_IP:
  316. credential_dict = create_pluralsight_account(proxy_tuple)
  317. for pluralsight_path in pluralsight_paths_list:
  318. if pluralsight_path.id == path_id:
  319. save_directory_path = get_directory_full_path(SAVE_PATHS_DIRECTORY_PATH, pluralsight_path)
  320. result = download_batch_course(course_links_list=pluralsight_path.course_links,
  321. username=credential_dict['email'],
  322. password=credential_dict['password'],
  323. save_directory_path=save_directory_path,
  324. proxy_tuple=proxy_tuple)
  325. if result:
  326. return True
  327. retry_count += 1
  328. except Exception as e:
  329. print(e)
  330. return False
  331. def download_independent_courses(course_list: List[str], proxy_tuple: Tuple[str, int] = None) -> bool:
  332. try:
  333. retry_count = 0
  334. while retry_count < MAX_ACCOUNT_PER_IP:
  335. credential_dict = create_pluralsight_account()
  336. result = download_batch_course(course_links_list=course_list,
  337. username=credential_dict['email'],
  338. password=credential_dict['password'],
  339. save_directory_path=SAVE_COURSES_DIRECTORY_PATH,
  340. proxy_tuple=proxy_tuple)
  341. if result:
  342. return True
  343. retry_count += 1
  344. except Exception as e:
  345. print(e)
  346. return False
  347. # endregion
  348. # region Click Callback Functions
  349. def click_validate_proxy(ctx, param, value) -> Tuple[str, int]:
  350. """
  351. Click callback validation to determine if a given proxy is valid or in correct format (IP:Port)
  352. Args:
  353. ctx: NA
  354. param: NA
  355. value: Callback caller's data
  356. Returns:
  357. Tuple[str, int]: Proxy in a Tuple containing IP as str and port as int
  358. """
  359. try:
  360. if value is None:
  361. return None
  362. ip, port = value.split(":")
  363. try:
  364. socket.inet_aton(ip)
  365. except socket.error:
  366. raise ValueError
  367. if not 1 <= int(port) <= 65535:
  368. raise ValueError
  369. return ip, int(port)
  370. except ValueError:
  371. raise click.BadParameter("Proxy needs to be in IP:Port format (example: 127.0.0.1:8000)")
  372. # endregion
  373. @click.command()
  374. @click.option("-i", "--path-id", "path_id", type=int, help="Path ID")
  375. @click.option("-p", "--proxy", "proxy_tuple", default=None, type=str, callback=click_validate_proxy, help="Proxy in IP:Port Format")
  376. def main(path_id: int, proxy_tuple: Tuple[str, int]):
  377. try:
  378. paths_list, individual_courses_list = get_paths_and_independent_courses(PATHS_JSON_LOCATION,
  379. INDEPENDENT_COURSES_FILE_LOCATION)
  380. if path_id is None:
  381. download_independent_courses(individual_courses_list, proxy_tuple)
  382. else:
  383. if not 1 <= path_id <= len(paths_list):
  384. raise Exception("Path ID is invalid or is out of range")
  385. result = download_pluralsight_path(paths_list, path_id, proxy_tuple)
  386. if result:
  387. print("SUCCESS")
  388. else:
  389. print("ERROR OCCURRED!!")
  390. except Exception as exception:
  391. print(exception)
  392. if __name__ == '__main__':
  393. main()