You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

pluralsight.py 15KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448
  1. import json
  2. import os
  3. import random
  4. import re
  5. import secrets
  6. import string
  7. import sys
  8. import time
  9. from functools import partial
  10. from multiprocessing.pool import Pool
  11. from pathlib import Path
  12. from typing import List, Tuple, Dict
  13. import requests
  14. import youtube_dl
  15. from bs4 import BeautifulSoup
  16. from selenium import webdriver
  17. from selenium.webdriver.common.by import By
  18. from selenium.webdriver.firefox.options import Options
  19. from selenium.webdriver.support import expected_conditions
  20. from selenium.webdriver.support.wait import WebDriverWait
# region Global Constant(s) and Readonly Variable(s)
# Passed as `is_headless` when the Firefox driver is created: True runs the browser headless, False shows it
HIDE_SELENIUM_INSTANCES = False
# Timeout in seconds of the Selenium explicit waits (WebDriverWait)
SELENIUM_TIMEOUT_IN_SECONDS = 30
# Maximum parallel course download count (size of the multiprocessing pool)
MAX_PARALLEL_COURSE_DOWNLOAD_COUNT = 3
# Maximum number of download attempts per course
DOWNLOAD_RETRY_COUNT = 3
# Minimum number of seconds to wait between consecutive video downloads (youtube-dl 'sleep_interval')
MIN_VIDEO_DOWNLOAD_DELAY = 10
# Maximum number of seconds to wait between consecutive video downloads (youtube-dl 'max_sleep_interval')
MAX_VIDEO_DOWNLOAD_DELAY = 30
# Check if current OS/platform is Windows
IS_WINDOWS = sys.platform.startswith("win")
# Master Directory Path: the directory that contains this script file (symlinks resolved)
MASTER_DIRECTORY = os.path.dirname(os.path.realpath(__file__))
# Path of the archive text file to log all downloaded videos (youtube-dl 'download_archive')
ARCHIVE_FILE_PATH = os.path.join(MASTER_DIRECTORY, "archive.txt")
# Path of the directory where downloaded paths will be saved
SAVE_PATHS_DIRECTORY_PATH = os.path.join(MASTER_DIRECTORY, "Paths")
# Path of the directory where downloaded independent courses will be saved
SAVE_COURSES_DIRECTORY_PATH = os.path.join(MASTER_DIRECTORY, "Courses")
# JSON File (.json) containing the JSON of the paths/courses dump of Pluralsight
PATHS_JSON_LOCATION = os.path.join(MASTER_DIRECTORY, "pluralsight.json")
# Text File (.txt) containing the independent courses dump of Pluralsight (one link per line)
INDEPENDENT_COURSES_FILE_LOCATION = os.path.join(MASTER_DIRECTORY, "courses.txt")
# Text File (.txt) containing the list of links of the downloaded courses (one link per line)
DOWNLOADED_COURSES_FILE_LOCATION = os.path.join(MASTER_DIRECTORY, "downloaded.txt")
# Options for youtube-dl. For a complete list of options, check https://github.com/ytdl-org/youtube-dl/blob/3e4cedf9e8cd3157df2457df7274d0c842421945/youtube_dl/YoutubeDL.py#L137-L312
# NOTE: download_single_course() adds 'username', 'password' and 'outtmpl' to this dictionary at runtime
ydl_options = {
    'writesubtitles': True,
    'nooverwrites': True,
    'sleep_interval': MIN_VIDEO_DOWNLOAD_DELAY,
    'max_sleep_interval': MAX_VIDEO_DOWNLOAD_DELAY,
    'download_archive': ARCHIVE_FILE_PATH
}
# List of links of already downloaded courses (loaded from and written back to downloaded.txt by main())
DOWNLOADED_COURSE_LINKS_LIST = []
# endregion
  61. # region Classes
  62. class PluralsightPath(object):
  63. def __init__(self, id: int, link: str, title: str, slug: str, course_links: List[str]):
  64. self.id = id
  65. self.link = link
  66. self.title = title
  67. self.slug = slug
  68. self.course_links = course_links
  69. class DisposableMail:
  70. """
  71. This class is used to generate random disposable emails
  72. """
  73. def __init__(self):
  74. self.email_address = requests.post("https://gmailnator.com/index/indexquery", {'action': 'GenerateEmail'}).text
  75. def get_mail_item_id(self) -> object:
  76. post_url = "https://gmailnator.com/mailbox/mailboxquery"
  77. post_data = {
  78. 'action': 'LoadMailList',
  79. 'Email_address': self.email_address
  80. }
  81. while True:
  82. try:
  83. time.sleep(1)
  84. response_text = requests.post(post_url, post_data).json()[0]['content']
  85. result = re.findall('#(.*)\\">', response_text)
  86. mail_id = result[0]
  87. return mail_id
  88. except:
  89. pass
  90. def get_verification_link(self) -> str:
  91. post_url = "https://gmailnator.com/mailbox/get_single_message/"
  92. post_data = {
  93. 'action': 'LoadMailList',
  94. 'message_id': self.get_mail_item_id(),
  95. 'email': self.email_address.split("+")[0]
  96. }
  97. response_data = requests.post(post_url, post_data).text
  98. soup = BeautifulSoup(response_data, 'html.parser')
  99. for link in soup.findAll('a', href=True):
  100. if "https://app.pluralsight.com/id/forgotpassword/reset?token" in link['href']:
  101. return link['href']
  102. class Pluralsight:
  103. """
  104. This class handles the registration and verification of new Pluralsight accounts
  105. """
  106. def __init__(self, email: str, password: str, is_headless: bool = True):
  107. if is_headless:
  108. options = Options()
  109. options.add_argument("--headless")
  110. self.driver = webdriver.Firefox(options=options)
  111. else:
  112. self.driver = webdriver.Firefox()
  113. self.email = email
  114. self.password = password
  115. def __enter__(self):
  116. return self
  117. def __exit__(self, exc_type, exc_val, exc_tb):
  118. self.driver.quit()
  119. def register(self) -> None:
  120. """
  121. Registers new Pluralsight account
  122. """
  123. self.driver.get("https://www.pluralsight.com/offer/2020/free-april-month")
  124. WebDriverWait(self.driver, SELENIUM_TIMEOUT_IN_SECONDS) \
  125. .until(expected_conditions
  126. .presence_of_element_located((By.CLASS_NAME, "cookie_notification--opt_in")))
  127. accept_cookie_button_element = self.driver.find_element_by_class_name("cookie_notification--opt_in")
  128. accept_cookie_button_element.click()
  129. sign_up_now_button_element = self.driver.find_element_by_xpath('//a[@data-aa-title="Free-April-Start-Now"]')
  130. sign_up_now_button_element.click()
  131. email_input_element = self.driver.find_element_by_name("email")
  132. firstname_input_element = self.driver.find_element_by_name("firstname")
  133. lastname_input_element = self.driver.find_element_by_name("lastname")
  134. tos_checkbox_element = self.driver.find_element_by_name("optInBox")
  135. firstname, lastname = get_name()
  136. email_input_element.send_keys(self.email)
  137. firstname_input_element.send_keys(firstname)
  138. lastname_input_element.send_keys(lastname)
  139. tos_checkbox_element.click()
  140. WebDriverWait(self.driver, SELENIUM_TIMEOUT_IN_SECONDS) \
  141. .until(expected_conditions
  142. .presence_of_element_located((By.XPATH, "//*[contains(text(), 'I agree, activate benefit')]")))
  143. # time.sleep(SELENIUM_SLEEP_DURATION)
  144. create_account_button_element = self.driver.find_element_by_xpath(
  145. "//*[contains(text(), 'I agree, activate benefit')]")
  146. create_account_button_element.click()
  147. WebDriverWait(self.driver, SELENIUM_TIMEOUT_IN_SECONDS * 4) \
  148. .until(expected_conditions
  149. .presence_of_element_located((By.CLASS_NAME, "cancelButton---CKAut")))
  150. # time.sleep(SELENIUM_SLEEP_DURATION*3)
  151. cancel_button_element = self.driver.find_element_by_class_name("cancelButton---CKAut")
  152. cancel_button_element.click()
  153. # time.sleep(SELENIUM_SLEEP_DURATION)
  154. def set_password(self, verification_link: str) -> None:
  155. """
  156. Sets password in the given verification link
  157. Args:
  158. verification_link: The verification link (as string) to set up password
  159. """
  160. self.driver.get(verification_link)
  161. WebDriverWait(self.driver, SELENIUM_TIMEOUT_IN_SECONDS) \
  162. .until(expected_conditions
  163. .presence_of_element_located((By.ID, "Password")))
  164. # time.sleep(SELENIUM_SLEEP_DURATION)
  165. password_input_element = self.driver.find_element_by_id("Password")
  166. password_confirm_input_element = self.driver.find_element_by_id("PasswordConfirmation")
  167. save_button_element = self.driver.find_element_by_class_name("psds-button--appearance-primary")
  168. password_input_element.send_keys(self.password)
  169. password_confirm_input_element.send_keys(self.password)
  170. save_button_element.click()
  171. WebDriverWait(self.driver, SELENIUM_TIMEOUT_IN_SECONDS) \
  172. .until(expected_conditions
  173. .presence_of_element_located((By.ID, "Password")))
  174. # time.sleep(SELENIUM_SLEEP_DURATION)
  175. # endregion
  176. # region Utility Functions
  177. def get_name() -> Tuple[str, str]:
  178. """
  179. Generate a random string to be used as first or last name
  180. Returns:
  181. str: Generated string
  182. """
  183. request_url = "http://names.drycodes.com/1?nameOptions=boy_names" if random.choice([True, False]) \
  184. else "http://names.drycodes.com/1?nameOptions=girl_names"
  185. first_last_name = requests.get(request_url).text.strip('"[]').split('_')
  186. return first_last_name
  187. def get_password(min_length: int = 25, max_length: int = 50) -> str:
  188. """
  189. Generates a random password using ascii letters and numerical digits
  190. Args:
  191. min_length: Minimum length of the password, default is 25
  192. max_length: Minimum length of the password, default is 50
  193. Returns: Generated password as string
  194. """
  195. length = random.randint(min_length, max_length)
  196. alphabet = string.ascii_letters + string.digits
  197. password = ''.join(secrets.choice(alphabet) for _ in range(length))
  198. return password
  199. # endregion
  200. def create_pluralsight_account() -> Dict[str, str]:
  201. """
  202. Creates new Pluralsight account and returns the email/password as a dictionary
  203. Returns:
  204. Dict[str, str]: Dictionary containing email and password with eponymous pair keys
  205. """
  206. disposable_email = DisposableMail()
  207. password = get_password()
  208. with Pluralsight(email=disposable_email.email_address,
  209. password=password,
  210. is_headless=HIDE_SELENIUM_INSTANCES) as ps:
  211. ps.register()
  212. verification_link = disposable_email.get_verification_link()
  213. ps.set_password(verification_link=verification_link)
  214. return {'email': disposable_email.email_address, 'password': password}
  215. def get_paths_and_independent_courses(paths_file: str, courses_file: str) -> Tuple[List[PluralsightPath], List[str]]:
  216. if paths_file.startswith("http"):
  217. json_string = requests.get(paths_file).text
  218. else:
  219. json_string = Path(paths_file).read_text()
  220. pluralsight_paths_dicts_list = json.loads(json_string)
  221. pluralsight_paths_list = [PluralsightPath(**paths_dict) for paths_dict in pluralsight_paths_dicts_list]
  222. if courses_file.startswith("http"):
  223. courses_list_string = requests.get(paths_file).text
  224. else:
  225. courses_list_string = Path(paths_file).read_text()
  226. courses_list = list(filter(None, courses_list_string.split('\n')))
  227. return pluralsight_paths_list, courses_list
  228. def get_directory_full_path(root_directory_path: str, pluralsight_path: PluralsightPath) -> str:
  229. directory_name = f"{pluralsight_path.id:03d} - {pluralsight_path.title}" if not IS_WINDOWS \
  230. else f"{pluralsight_path.id:03d} - {pluralsight_path.slug}"
  231. directory_full_path = os.path.join(root_directory_path, directory_name)
  232. return directory_full_path
  233. def download_single_course(course_link: str, username: str, password: str, save_directory_path: str) -> bool:
  234. """
  235. Download the given course using the provided credential
  236. Args:
  237. course_link: The link of the course to download
  238. username: Username (Email) of the Pluralsight account to be used for download
  239. password: Password of the Pluralsight account to be used for download
  240. save_directory_path: Absolute path of Root save directory
  241. Returns: True/False bool value denoting the success status of the download
  242. """
  243. global DOWNLOADED_COURSE_LINKS_LIST
  244. if course_link in DOWNLOADED_COURSE_LINKS_LIST:
  245. return True
  246. retry_count = 0
  247. while retry_count < DOWNLOAD_RETRY_COUNT:
  248. try:
  249. ydl_options['username'] = username
  250. ydl_options['password'] = password
  251. if IS_WINDOWS:
  252. ydl_options[
  253. 'outtmpl'] = f"{save_directory_path}\\%(playlist)s\\%(chapter_number)s - %(chapter)s\\%(playlist_index)s - %(title)s.%(ext)s"
  254. else:
  255. ydl_options[
  256. 'outtmpl'] = f"{save_directory_path}/%(playlist)s/%(chapter_number)s - %(chapter)s/%(playlist_index)s - %(title)s.%(ext)s"
  257. with youtube_dl.YoutubeDL(ydl_options) as ydl:
  258. ydl.download([course_link])
  259. DOWNLOADED_COURSE_LINKS_LIST.append(course_link)
  260. return True
  261. except KeyboardInterrupt:
  262. break
  263. except:
  264. retry_count += 1
  265. return False
  266. def download_batch_course(course_links_list: List[str], username: str, password: str, save_directory_path: str) -> bool:
  267. download_single_course_function = partial(download_single_course,
  268. username=username,
  269. password=password,
  270. save_directory_path=save_directory_path)
  271. with Pool(MAX_PARALLEL_COURSE_DOWNLOAD_COUNT) as p:
  272. download_results = map(lambda link: link, p.map(download_single_course_function, course_links_list))
  273. return False in download_results
  274. def download_pluralsight_path(pluralsight_paths_list: List[PluralsightPath], path_id: int) -> None:
  275. credential_dict = create_pluralsight_account()
  276. for pluralsight_path in pluralsight_paths_list:
  277. if pluralsight_path.id == path_id:
  278. save_directory_path = get_directory_full_path(SAVE_PATHS_DIRECTORY_PATH, pluralsight_path)
  279. download_batch_course(course_links_list=pluralsight_path.course_links,
  280. username=credential_dict['email'],
  281. password=credential_dict['password'],
  282. save_directory_path=save_directory_path)
  283. def download_independent_courses(course_list: List[str]) -> None:
  284. credential_dict = create_pluralsight_account()
  285. download_batch_course(course_links_list=course_list,
  286. username=credential_dict['email'],
  287. password=credential_dict['password'],
  288. save_directory_path=SAVE_COURSES_DIRECTORY_PATH)
  289. def main():
  290. global DOWNLOADED_COURSE_LINKS_LIST
  291. while True:
  292. try:
  293. if os.path.isfile(DOWNLOADED_COURSES_FILE_LOCATION):
  294. DOWNLOADED_COURSE_LINKS_LIST = list(
  295. filter(None, Path(DOWNLOADED_COURSES_FILE_LOCATION).read_text().split('\n')))
  296. paths_list, individual_courses_list = get_paths_and_independent_courses(PATHS_JSON_LOCATION,
  297. INDEPENDENT_COURSES_FILE_LOCATION)
  298. if len(sys.argv) == 1:
  299. download_independent_courses(individual_courses_list)
  300. else:
  301. for arg in sys.argv[1:]:
  302. if 1 <= int(arg) <= len(paths_list):
  303. download_pluralsight_path(paths_list, int(arg))
  304. break
  305. except Exception as exception:
  306. print(exception)
  307. retry = input("Retry (Y/y for yes): ").lower()
  308. if retry != 'y':
  309. break
  310. finally:
  311. with open(DOWNLOADED_COURSES_FILE_LOCATION, 'w+') as downloaded_file:
  312. downloaded_file.write('\n'.join(set(DOWNLOADED_COURSE_LINKS_LIST)))
# Run main() only when this file is executed as a script, not when it is imported.
# The guard also matters for the multiprocessing pool used by the downloads: worker processes
# may import this module again (e.g. with the "spawn" start method) and must not re-run main()
if __name__ == '__main__':
    main()