You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

pluralsight.py 16KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470
  1. import json
  2. import os
  3. import random
  4. import re
  5. import secrets
  6. import string
  7. import sys
  8. import time
  9. from functools import partial
  10. from multiprocessing.pool import Pool
  11. from pathlib import Path
  12. from typing import List, Tuple, Dict
  13. import requests
  14. import youtube_dl
  15. from bs4 import BeautifulSoup
  16. from selenium import webdriver
  17. from selenium.webdriver.common.by import By
  18. from selenium.webdriver.firefox.options import Options
  19. from selenium.webdriver.support import expected_conditions
  20. from selenium.webdriver.support.wait import WebDriverWait
  21. # region Global Constant(s) and Readonly Variable(s)
  22. # True/False to determine whether selenium instances will be visible or not (headless)
  23. HIDE_SELENIUM_INSTANCES = False
  24. # Denotes Time.Sleep() duration in seconds
  25. SELENIUM_TIMEOUT_IN_SECONDS = 30
  26. # Maximum parallel course download count
  27. MAX_PARALLEL_COURSE_DOWNLOAD_COUNT = 1
  28. # Maximum number of retry count for downloads
  29. DOWNLOAD_RETRY_COUNT = 100
  30. # Maximum number of account registrations allowed per IP
  31. MAX_ACCOUNT_PER_IP = 5
  32. # True/False to determine whether consecutive video downloading should have delays in between
  33. IS_DELAY_BETWEEN_VIDEO = True
  34. # Minimum number of seconds to wait between consecutive video downloads
  35. MIN_VIDEO_DOWNLOAD_DELAY = 10
  36. # Minimum number of seconds to wait between consecutive video downloads
  37. MAX_VIDEO_DOWNLOAD_DELAY = 30
  38. # Check if current OS/platform is Windows
  39. IS_WINDOWS = sys.platform.startswith("win")
  40. # Master Directory Path (Default: Working Directory)
  41. MASTER_DIRECTORY = os.path.dirname(os.path.realpath(__file__))
  42. # Path of the archive text file to log all downloaded videos
  43. ARCHIVE_FILE_PATH = os.path.join(MASTER_DIRECTORY, "archive.txt")
  44. # Path of the directory where downloaded paths will be saved
  45. SAVE_PATHS_DIRECTORY_PATH = os.path.join(MASTER_DIRECTORY, "Paths")
  46. # Path of the directory where downloaded independent courses will be saved
  47. SAVE_COURSES_DIRECTORY_PATH = os.path.join(MASTER_DIRECTORY, "Courses")
  48. # JSON File (.json) containing the JSON of the paths/courses dump of Pluralsight
  49. PATHS_JSON_LOCATION = os.path.join(MASTER_DIRECTORY, "pluralsight.json")
  50. # Text File (.txt) containing the independent courses dump of Pluralsight
  51. INDEPENDENT_COURSES_FILE_LOCATION = os.path.join(MASTER_DIRECTORY, "courses.txt")
  52. # Text File (.txt) containing the list of links of the downloaded courses
  53. DOWNLOADED_COURSES_FILE_LOCATION = os.path.join(MASTER_DIRECTORY, "downloaded.txt")
  54. # Options for youtube-dl. For a complete list of options, check https://github.com/ytdl-org/youtube-dl/blob/3e4cedf9e8cd3157df2457df7274d0c842421945/youtube_dl/YoutubeDL.py#L137-L312
  55. ydl_options = {
  56. 'writesubtitles': True,
  57. 'nooverwrites': True,
  58. 'retries': 100,
  59. 'download_archive': ARCHIVE_FILE_PATH
  60. }
  61. if IS_DELAY_BETWEEN_VIDEO:
  62. ydl_options['sleep_interval'] = MIN_VIDEO_DOWNLOAD_DELAY
  63. ydl_options['max_sleep_interval'] = MAX_VIDEO_DOWNLOAD_DELAY
  64. # endregion
# region Classes
  66. class PluralsightPath(object):
  67. def __init__(self, id: int, link: str, title: str, slug: str, course_links: List[str]):
  68. self.id = id
  69. self.link = link
  70. self.title = title
  71. self.slug = slug
  72. self.course_links = course_links
  73. class DisposableMail:
  74. """
  75. This class is used to generate random disposable emails
  76. """
  77. def __init__(self):
  78. self.email_address = requests.post("https://gmailnator.com/index/indexquery", {'action': 'GenerateEmail'}).text
  79. def get_mail_item_id(self) -> object:
  80. post_url = "https://gmailnator.com/mailbox/mailboxquery"
  81. post_data = {
  82. 'action': 'LoadMailList',
  83. 'Email_address': self.email_address
  84. }
  85. while True:
  86. try:
  87. time.sleep(1)
  88. response_text = requests.post(post_url, post_data).json()[0]['content']
  89. result = re.findall('#(.*)\\">', response_text)
  90. mail_id = result[0]
  91. return mail_id
  92. except:
  93. pass
  94. def get_verification_link(self) -> str:
  95. post_url = "https://gmailnator.com/mailbox/get_single_message/"
  96. post_data = {
  97. 'action': 'LoadMailList',
  98. 'message_id': self.get_mail_item_id(),
  99. 'email': self.email_address.split("+")[0]
  100. }
  101. response_data = requests.post(post_url, post_data).text
  102. soup = BeautifulSoup(response_data, 'html.parser')
  103. for link in soup.findAll('a', href=True):
  104. if "https://app.pluralsight.com/id/forgotpassword/reset?token" in link['href']:
  105. return link['href']
  106. class Pluralsight:
  107. """
  108. This class handles the registration and verification of new Pluralsight accounts
  109. """
  110. def __init__(self, email: str, password: str, is_headless: bool = True):
  111. if is_headless:
  112. options = Options()
  113. options.add_argument("--headless")
  114. self.driver = webdriver.Firefox(options=options)
  115. else:
  116. self.driver = webdriver.Firefox()
  117. self.email = email
  118. self.password = password
  119. def __enter__(self):
  120. return self
  121. def __exit__(self, exc_type, exc_val, exc_tb):
  122. self.driver.quit()
  123. def register(self) -> None:
  124. """
  125. Registers new Pluralsight account
  126. """
  127. self.driver.get("https://www.pluralsight.com/offer/2020/free-april-month")
  128. WebDriverWait(self.driver, SELENIUM_TIMEOUT_IN_SECONDS) \
  129. .until(expected_conditions
  130. .presence_of_element_located((By.CLASS_NAME, "cookie_notification--opt_in")))
  131. accept_cookie_button_element = self.driver.find_element_by_class_name("cookie_notification--opt_in")
  132. accept_cookie_button_element.click()
  133. sign_up_now_button_element = self.driver.find_element_by_xpath('//a[@data-aa-title="Free-April-Start-Now"]')
  134. sign_up_now_button_element.click()
  135. email_input_element = self.driver.find_element_by_name("email")
  136. firstname_input_element = self.driver.find_element_by_name("firstname")
  137. lastname_input_element = self.driver.find_element_by_name("lastname")
  138. tos_checkbox_element = self.driver.find_element_by_name("optInBox")
  139. firstname, lastname = get_name()
  140. email_input_element.send_keys(self.email)
  141. firstname_input_element.send_keys(firstname)
  142. lastname_input_element.send_keys(lastname)
  143. tos_checkbox_element.click()
  144. WebDriverWait(self.driver, SELENIUM_TIMEOUT_IN_SECONDS) \
  145. .until(expected_conditions
  146. .presence_of_element_located((By.XPATH, "//*[contains(text(), 'I agree, activate benefit')]")))
  147. create_account_button_element = self.driver.find_element_by_xpath(
  148. "//*[contains(text(), 'I agree, activate benefit')]")
  149. create_account_button_element.click()
  150. WebDriverWait(self.driver, SELENIUM_TIMEOUT_IN_SECONDS * 4) \
  151. .until(expected_conditions
  152. .presence_of_element_located((By.CLASS_NAME, "cancelButton---CKAut")))
  153. cancel_button_element = self.driver.find_element_by_class_name("cancelButton---CKAut")
  154. cancel_button_element.click()
  155. def set_password(self, verification_link: str) -> None:
  156. """
  157. Sets password in the given verification link
  158. Args:
  159. verification_link: The verification link (as string) to set up password
  160. """
  161. self.driver.get(verification_link)
  162. WebDriverWait(self.driver, SELENIUM_TIMEOUT_IN_SECONDS) \
  163. .until(expected_conditions
  164. .presence_of_element_located((By.ID, "Password")))
  165. password_input_element = self.driver.find_element_by_id("Password")
  166. password_confirm_input_element = self.driver.find_element_by_id("PasswordConfirmation")
  167. save_button_element = self.driver.find_element_by_class_name("psds-button--appearance-primary")
  168. password_input_element.send_keys(self.password)
  169. password_confirm_input_element.send_keys(self.password)
  170. save_button_element.click()
  171. WebDriverWait(self.driver, SELENIUM_TIMEOUT_IN_SECONDS) \
  172. .until(expected_conditions
  173. .presence_of_element_located((By.ID, "Password")))
# endregion
# region Utility Functions
  176. def get_name() -> Tuple[str, str]:
  177. """
  178. Generate a random string to be used as first or last name
  179. Returns:
  180. str: Generated string
  181. """
  182. request_url = "http://names.drycodes.com/1?nameOptions=boy_names" if random.choice([True, False]) \
  183. else "http://names.drycodes.com/1?nameOptions=girl_names"
  184. first_last_name = requests.get(request_url).text.strip('"[]').split('_')
  185. return first_last_name
  186. def get_password(min_length: int = 25, max_length: int = 50) -> str:
  187. """
  188. Generates a random password using ascii letters and numerical digits
  189. Args:
  190. min_length: Minimum length of the password, default is 25
  191. max_length: Minimum length of the password, default is 50
  192. Returns: Generated password as string
  193. """
  194. length = random.randint(min_length, max_length)
  195. alphabet = string.ascii_letters + string.digits
  196. password = ''.join(secrets.choice(alphabet) for _ in range(length))
  197. return password
# endregion
  199. def create_pluralsight_account() -> Dict[str, str]:
  200. """
  201. Creates new Pluralsight account and returns the email/password as a dictionary
  202. Returns:
  203. Dict[str, str]: Dictionary containing email and password with eponymous pair keys
  204. """
  205. disposable_email = DisposableMail()
  206. password = get_password()
  207. with Pluralsight(email=disposable_email.email_address,
  208. password=password,
  209. is_headless=HIDE_SELENIUM_INSTANCES) as ps:
  210. ps.register()
  211. verification_link = disposable_email.get_verification_link()
  212. ps.set_password(verification_link=verification_link)
  213. return {'email': disposable_email.email_address, 'password': password}
  214. def get_paths_and_independent_courses(paths_file: str, courses_file: str) -> Tuple[List[PluralsightPath], List[str]]:
  215. if paths_file.startswith("http"):
  216. json_string = requests.get(paths_file).text
  217. else:
  218. json_string = Path(paths_file).read_text()
  219. pluralsight_paths_dicts_list = json.loads(json_string)
  220. pluralsight_paths_list = [PluralsightPath(**paths_dict) for paths_dict in pluralsight_paths_dicts_list]
  221. if courses_file.startswith("http"):
  222. courses_list_string = requests.get(courses_file).text
  223. else:
  224. courses_list_string = Path(courses_file).read_text()
  225. courses_list = list(filter(None, courses_list_string.split('\n')))
  226. return pluralsight_paths_list, courses_list
  227. def get_directory_full_path(root_directory_path: str, pluralsight_path: PluralsightPath) -> str:
  228. directory_name = f"{pluralsight_path.id:03d} - {pluralsight_path.title}" if not IS_WINDOWS \
  229. else f"{pluralsight_path.id:03d} - {pluralsight_path.slug}"
  230. directory_full_path = os.path.join(root_directory_path, directory_name)
  231. return directory_full_path
  232. def save_progress(course_link: str) -> None:
  233. with open(DOWNLOADED_COURSES_FILE_LOCATION, 'a+') as downloaded_file:
  234. downloaded_file.write(f"{course_link}\n")
  235. def is_downloaded(course_link: str) -> bool:
  236. if not os.path.isfile(DOWNLOADED_COURSES_FILE_LOCATION):
  237. return False
  238. downloaded_courses_list = list(filter(None, Path(DOWNLOADED_COURSES_FILE_LOCATION).read_text().split('\n')))
  239. return course_link in downloaded_courses_list
  240. def download_single_course(course_link: str, username: str, password: str, save_directory_path: str) -> bool:
  241. """
  242. Download the given course using the provided credential
  243. Args:
  244. course_link: The link of the course to download
  245. username: Username (Email) of the Pluralsight account to be used for download
  246. password: Password of the Pluralsight account to be used for download
  247. save_directory_path: Absolute path of Root save directory
  248. Returns: True/False bool value denoting the success status of the download
  249. """
  250. if is_downloaded(course_link):
  251. return True
  252. retry_count = 0
  253. while retry_count < DOWNLOAD_RETRY_COUNT:
  254. try:
  255. ydl_options['username'] = username
  256. ydl_options['password'] = password
  257. if IS_WINDOWS:
  258. ydl_options[
  259. 'outtmpl'] = f"{save_directory_path}\\%(playlist)s\\%(chapter_number)s - %(chapter)s\\%(playlist_index)s - %(title)s.%(ext)s"
  260. else:
  261. ydl_options[
  262. 'outtmpl'] = f"{save_directory_path}/%(playlist)s/%(chapter_number)s - %(chapter)s/%(playlist_index)s - %(title)s.%(ext)s"
  263. with youtube_dl.YoutubeDL(ydl_options) as ydl:
  264. ydl.download([course_link])
  265. save_progress(course_link)
  266. return True
  267. except KeyboardInterrupt:
  268. break
  269. except Exception as e:
  270. if "http error 429" in str(e).lower():
  271. time.sleep(300)
  272. elif "http error 403" in str(e).lower():
  273. return False
  274. retry_count += 1
  275. return False
  276. def download_batch_course(course_links_list: List[str], username: str, password: str, save_directory_path: str) -> bool:
  277. download_single_course_function = partial(download_single_course,
  278. username=username,
  279. password=password,
  280. save_directory_path=save_directory_path)
  281. with Pool(MAX_PARALLEL_COURSE_DOWNLOAD_COUNT) as p:
  282. download_results = map(lambda link: link, p.map(download_single_course_function, course_links_list))
  283. return False not in download_results
  284. def download_pluralsight_path(pluralsight_paths_list: List[PluralsightPath], path_id: int) -> bool:
  285. try:
  286. retry_count = 0
  287. while retry_count < MAX_ACCOUNT_PER_IP:
  288. credential_dict = create_pluralsight_account()
  289. for pluralsight_path in pluralsight_paths_list:
  290. if pluralsight_path.id == path_id:
  291. save_directory_path = get_directory_full_path(SAVE_PATHS_DIRECTORY_PATH, pluralsight_path)
  292. result = download_batch_course(course_links_list=pluralsight_path.course_links,
  293. username=credential_dict['email'],
  294. password=credential_dict['password'],
  295. save_directory_path=save_directory_path)
  296. if result:
  297. return True
  298. retry_count += 1
  299. except Exception as e:
  300. print(e)
  301. return False
  302. def download_independent_courses(course_list: List[str]) -> bool:
  303. try:
  304. retry_count = 0
  305. while retry_count < MAX_ACCOUNT_PER_IP:
  306. credential_dict = create_pluralsight_account()
  307. result = download_batch_course(course_links_list=course_list,
  308. username=credential_dict['email'],
  309. password=credential_dict['password'],
  310. save_directory_path=SAVE_COURSES_DIRECTORY_PATH)
  311. if result:
  312. return True
  313. retry_count += 1
  314. except Exception as e:
  315. print(e)
  316. return False
  317. def main():
  318. try:
  319. paths_list, individual_courses_list = get_paths_and_independent_courses(PATHS_JSON_LOCATION,
  320. INDEPENDENT_COURSES_FILE_LOCATION)
  321. if len(sys.argv) == 1:
  322. download_independent_courses(individual_courses_list)
  323. else:
  324. for arg in sys.argv[1:]:
  325. if 1 <= int(arg) <= len(paths_list):
  326. result = download_pluralsight_path(paths_list, int(arg))
  327. if result:
  328. print("SUCCESS")
  329. else:
  330. print("ERROR OCCURRED!!")
  331. except Exception as exception:
  332. print(exception)
  333. if __name__ == '__main__':
  334. main()