You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

pluralsight.py 18KB


import json
import os
import random
import re
import secrets
import string
import sys
import time
from functools import partial
from multiprocessing.pool import Pool
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import requests
import youtube_dl
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver import FirefoxProfile
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.support.wait import WebDriverWait
  22. # region Global Constant(s) and Readonly Variable(s)
  23. # True/False to determine whether selenium instances will be visible or not (headless)
  24. HIDE_SELENIUM_INSTANCES = False
  25. # Denotes Maximum seconds selenium will wait for an element to load
  26. SELENIUM_TIMEOUT_IN_SECONDS = 30
  27. # Maximum parallel course download count
  28. MAX_PARALLEL_COURSE_DOWNLOAD_COUNT = 1
  29. # Maximum number of retry count for downloads
  30. DOWNLOAD_RETRY_COUNT = 100
  31. # Maximum number of account registrations allowed per IP
  32. MAX_ACCOUNT_PER_IP = 5
  33. # True/False to determine whether consecutive video downloading should have delays in between
  34. IS_DELAY_BETWEEN_VIDEO = True
  35. # Minimum number of seconds to wait between consecutive video downloads
  36. MIN_VIDEO_DOWNLOAD_DELAY = 10
  37. # Minimum number of seconds to wait between consecutive video downloads
  38. MAX_VIDEO_DOWNLOAD_DELAY = 20
  39. # Check if current OS/platform is Windows
  40. IS_WINDOWS = sys.platform.startswith("win")
  41. # Master Directory Path (Default: Working Directory)
  42. MASTER_DIRECTORY = os.path.dirname(os.path.realpath(__file__))
  43. # Path of the archive text file to log all downloaded videos
  44. ARCHIVE_FILE_PATH = os.path.join(MASTER_DIRECTORY, "archive.txt")
  45. # Path of the directory where downloaded paths will be saved
  46. SAVE_PATHS_DIRECTORY_PATH = os.path.join(MASTER_DIRECTORY, "Paths")
  47. # Path of the directory where downloaded independent courses will be saved
  48. SAVE_COURSES_DIRECTORY_PATH = os.path.join(MASTER_DIRECTORY, "Courses")
  49. # JSON File (.json) containing the JSON of the paths/courses dump of Pluralsight
  50. PATHS_JSON_LOCATION = os.path.join(MASTER_DIRECTORY, "pluralsight.json")
  51. # Text File (.txt) containing the independent courses dump of Pluralsight
  52. INDEPENDENT_COURSES_FILE_LOCATION = os.path.join(MASTER_DIRECTORY, "courses.txt")
  53. # Text File (.txt) containing the list of links of the downloaded courses
  54. DOWNLOADED_COURSES_FILE_LOCATION = os.path.join(MASTER_DIRECTORY, "downloaded.txt")
  55. # Options for youtube-dl. For a complete list of options, check https://github.com/ytdl-org/youtube-dl/blob/3e4cedf9e8cd3157df2457df7274d0c842421945/youtube_dl/YoutubeDL.py#L137-L312
  56. ydl_options = {
  57. 'writesubtitles': True,
  58. 'nooverwrites': True,
  59. 'retries': 100,
  60. 'download_archive': ARCHIVE_FILE_PATH
  61. }
  62. if IS_DELAY_BETWEEN_VIDEO:
  63. ydl_options['sleep_interval'] = MIN_VIDEO_DOWNLOAD_DELAY
  64. ydl_options['max_sleep_interval'] = MAX_VIDEO_DOWNLOAD_DELAY
  65. # endregion
# region Classes
  67. class PluralsightPath(object):
  68. def __init__(self, id: int, link: str, title: str, slug: str, course_links: List[str]):
  69. self.id = id
  70. self.link = link
  71. self.title = title
  72. self.slug = slug
  73. self.course_links = course_links
  74. class DisposableMail:
  75. """
  76. This class is used to generate random disposable emails
  77. """
  78. def __init__(self):
  79. self.email_address = requests.post("https://gmailnator.com/index/indexquery", {'action': 'GenerateEmail'}).text
  80. def get_mail_item_id(self) -> object:
  81. post_url = "https://gmailnator.com/mailbox/mailboxquery"
  82. post_data = {
  83. 'action': 'LoadMailList',
  84. 'Email_address': self.email_address
  85. }
  86. while True:
  87. try:
  88. time.sleep(1)
  89. response_text = requests.post(post_url, post_data).json()[0]['content']
  90. result = re.findall('#(.*)\\">', response_text)
  91. mail_id = result[0]
  92. return mail_id
  93. except:
  94. pass
  95. def get_verification_link(self) -> str:
  96. post_url = "https://gmailnator.com/mailbox/get_single_message/"
  97. post_data = {
  98. 'action': 'LoadMailList',
  99. 'message_id': self.get_mail_item_id(),
  100. 'email': self.email_address.split("+")[0]
  101. }
  102. response_data = requests.post(post_url, post_data).text
  103. soup = BeautifulSoup(response_data, 'html.parser')
  104. for link in soup.findAll('a', href=True):
  105. if "https://app.pluralsight.com/id/forgotpassword/reset?token" in link['href']:
  106. return link['href']
  107. class Pluralsight:
  108. """
  109. This class handles the registration and verification of new Pluralsight accounts
  110. """
  111. def __init__(self, email: str, password: str, is_headless: bool = True, proxy_tuple: Tuple[str, str] = None):
  112. profile = FirefoxProfile()
  113. if proxy_tuple is not None:
  114. profile.set_preference("network.proxy.type", 1)
  115. profile.set_preference("network.proxy.http", proxy_tuple[0])
  116. profile.set_preference("network.proxy.http_port", int(proxy_tuple[1]))
  117. profile.set_preference("network.proxy.ssl", proxy_tuple[0])
  118. profile.set_preference("network.proxy.ssl_port", int(proxy_tuple[1]))
  119. if is_headless:
  120. options = Options()
  121. options.add_argument("--headless")
  122. self.driver = webdriver.Firefox(options=options, firefox_profile=profile)
  123. else:
  124. self.driver = webdriver.Firefox(firefox_profile=profile)
  125. self.email = email
  126. self.password = password
  127. def __enter__(self):
  128. return self
  129. def __exit__(self, exc_type, exc_val, exc_tb):
  130. self.driver.quit()
  131. def register(self) -> None:
  132. """
  133. Registers new Pluralsight account
  134. """
  135. self.driver.get("https://www.pluralsight.com/offer/2020/free-april-month")
  136. WebDriverWait(self.driver, SELENIUM_TIMEOUT_IN_SECONDS) \
  137. .until(expected_conditions
  138. .presence_of_element_located((By.CLASS_NAME, "cookie_notification--opt_in")))
  139. accept_cookie_button_element = self.driver.find_element_by_class_name("cookie_notification--opt_in")
  140. accept_cookie_button_element.click()
  141. sign_up_now_button_element = self.driver.find_element_by_xpath('//a[@data-aa-title="Free-April-Start-Now"]')
  142. sign_up_now_button_element.click()
  143. email_input_element = self.driver.find_element_by_name("email")
  144. firstname_input_element = self.driver.find_element_by_name("firstname")
  145. lastname_input_element = self.driver.find_element_by_name("lastname")
  146. tos_checkbox_element = self.driver.find_element_by_name("optInBox")
  147. firstname, lastname = get_name()
  148. email_input_element.send_keys(self.email)
  149. firstname_input_element.send_keys(firstname)
  150. lastname_input_element.send_keys(lastname)
  151. tos_checkbox_element.click()
  152. WebDriverWait(self.driver, SELENIUM_TIMEOUT_IN_SECONDS) \
  153. .until(expected_conditions
  154. .presence_of_element_located((By.XPATH, "//*[contains(text(), 'I agree, activate benefit')]")))
  155. create_account_button_element = self.driver.find_element_by_xpath(
  156. "//*[contains(text(), 'I agree, activate benefit')]")
  157. create_account_button_element.click()
  158. WebDriverWait(self.driver, SELENIUM_TIMEOUT_IN_SECONDS * 4) \
  159. .until(expected_conditions
  160. .presence_of_element_located((By.CLASS_NAME, "cancelButton---CKAut")))
  161. cancel_button_element = self.driver.find_element_by_class_name("cancelButton---CKAut")
  162. cancel_button_element.click()
  163. def set_password(self, verification_link: str) -> None:
  164. """
  165. Sets password in the given verification link
  166. Args:
  167. verification_link: The verification link (as string) to set up password
  168. """
  169. self.driver.get(verification_link)
  170. WebDriverWait(self.driver, SELENIUM_TIMEOUT_IN_SECONDS) \
  171. .until(expected_conditions
  172. .presence_of_element_located((By.ID, "Password")))
  173. password_input_element = self.driver.find_element_by_id("Password")
  174. password_confirm_input_element = self.driver.find_element_by_id("PasswordConfirmation")
  175. save_button_element = self.driver.find_element_by_class_name("psds-button--appearance-primary")
  176. password_input_element.send_keys(self.password)
  177. password_confirm_input_element.send_keys(self.password)
  178. save_button_element.click()
  179. WebDriverWait(self.driver, SELENIUM_TIMEOUT_IN_SECONDS) \
  180. .until(expected_conditions
  181. .presence_of_element_located((By.ID, "Password")))
# endregion
# region Utility Functions
  184. def get_name() -> Tuple[str, str]:
  185. """
  186. Generate a random string to be used as first or last name
  187. Returns:
  188. str: Generated string
  189. """
  190. request_url = "http://names.drycodes.com/1?nameOptions=boy_names" if random.choice([True, False]) \
  191. else "http://names.drycodes.com/1?nameOptions=girl_names"
  192. first_last_name = requests.get(request_url).text.strip('"[]').split('_')
  193. return first_last_name
  194. def get_password(min_length: int = 25, max_length: int = 50) -> str:
  195. """
  196. Generates a random password using ascii letters and numerical digits
  197. Args:
  198. min_length: Minimum length of the password, default is 25
  199. max_length: Minimum length of the password, default is 50
  200. Returns: Generated password as string
  201. """
  202. length = random.randint(min_length, max_length)
  203. alphabet = string.ascii_letters + string.digits
  204. password = ''.join(secrets.choice(alphabet) for _ in range(length))
  205. return password
# endregion
  207. def create_pluralsight_account(proxy_tuple: Tuple[str, str] = None) -> Dict[str, str]:
  208. """
  209. Creates new Pluralsight account and returns the email/password as a dictionary
  210. Returns:
  211. Dict[str, str]: Dictionary containing email and password with eponymous pair keys
  212. """
  213. disposable_email = DisposableMail()
  214. password = get_password()
  215. with Pluralsight(email=disposable_email.email_address,
  216. password=password,
  217. is_headless=HIDE_SELENIUM_INSTANCES,
  218. proxy_tuple=proxy_tuple) as ps:
  219. ps.register()
  220. verification_link = disposable_email.get_verification_link()
  221. ps.set_password(verification_link=verification_link)
  222. return {'email': disposable_email.email_address, 'password': password}
  223. def get_paths_and_independent_courses(paths_file: str, courses_file: str) -> Tuple[List[PluralsightPath], List[str]]:
  224. if paths_file.startswith("http"):
  225. json_string = requests.get(paths_file).text
  226. else:
  227. json_string = Path(paths_file).read_text()
  228. pluralsight_paths_dicts_list = json.loads(json_string)
  229. pluralsight_paths_list = [PluralsightPath(**paths_dict) for paths_dict in pluralsight_paths_dicts_list]
  230. if not os.path.isfile(courses_file):
  231. return pluralsight_paths_list, []
  232. if courses_file.startswith("http"):
  233. courses_list_string = requests.get(courses_file).text
  234. else:
  235. courses_list_string = Path(courses_file).read_text()
  236. courses_list = list(filter(None, courses_list_string.split('\n')))
  237. return pluralsight_paths_list, courses_list
  238. def get_directory_full_path(root_directory_path: str, pluralsight_path: PluralsightPath) -> str:
  239. directory_name = f"{pluralsight_path.id:03d} - {pluralsight_path.title}" if not IS_WINDOWS \
  240. else f"{pluralsight_path.id:03d} - {pluralsight_path.slug}"
  241. directory_full_path = os.path.join(root_directory_path, directory_name)
  242. return directory_full_path
  243. def save_progress(course_link: str) -> None:
  244. with open(DOWNLOADED_COURSES_FILE_LOCATION, 'a+') as downloaded_file:
  245. downloaded_file.write(f"{course_link}\n")
  246. def is_downloaded(course_link: str) -> bool:
  247. if not os.path.isfile(DOWNLOADED_COURSES_FILE_LOCATION):
  248. return False
  249. downloaded_courses_list = list(filter(None, Path(DOWNLOADED_COURSES_FILE_LOCATION).read_text().split('\n')))
  250. return course_link in downloaded_courses_list
  251. def download_single_course(course_link: str,
  252. username: str,
  253. password: str,
  254. save_directory_path: str,
  255. proxy_tuple: Tuple[str, str] = None) -> bool:
  256. """
  257. Download the given course using the provided credential
  258. Args:
  259. course_link: The link of the course to download
  260. username: Username (Email) of the Pluralsight account to be used for download
  261. password: Password of the Pluralsight account to be used for download
  262. save_directory_path: Absolute path of Root save directory
  263. proxy_tuple: Proxy in (IP, Port) Tuple format
  264. Returns: True/False bool value denoting the success status of the download
  265. """
  266. if is_downloaded(course_link):
  267. return True
  268. retry_count = 0
  269. while retry_count < DOWNLOAD_RETRY_COUNT:
  270. try:
  271. ydl_options['username'] = username
  272. ydl_options['password'] = password
  273. if proxy_tuple is not None:
  274. ydl_options['proxy'] = f"https://{proxy_tuple[0]}:{proxy_tuple[1]}"
  275. if IS_WINDOWS:
  276. ydl_options[
  277. 'outtmpl'] = f"{save_directory_path}\\%(playlist)s\\%(chapter_number)s - %(chapter)s\\%(playlist_index)s - %(title)s.%(ext)s"
  278. else:
  279. ydl_options[
  280. 'outtmpl'] = f"{save_directory_path}/%(playlist)s/%(chapter_number)s - %(chapter)s/%(playlist_index)s - %(title)s.%(ext)s"
  281. with youtube_dl.YoutubeDL(ydl_options) as ydl:
  282. ydl.download([course_link])
  283. save_progress(course_link)
  284. return True
  285. except KeyboardInterrupt:
  286. break
  287. except Exception as e:
  288. if "http error 429" in str(e).lower():
  289. time.sleep(300)
  290. elif "http error 403" in str(e).lower():
  291. return False
  292. retry_count += 1
  293. return False
  294. def download_batch_course(course_links_list: List[str],
  295. username: str,
  296. password: str,
  297. save_directory_path: str,
  298. proxy_tuple: Tuple[str, str] = None) -> bool:
  299. download_single_course_function = partial(download_single_course,
  300. username=username,
  301. password=password,
  302. proxy_tuple=proxy_tuple,
  303. save_directory_path=save_directory_path)
  304. with Pool(MAX_PARALLEL_COURSE_DOWNLOAD_COUNT) as p:
  305. download_results = map(lambda link: link, p.map(download_single_course_function, course_links_list))
  306. return False not in download_results
  307. def download_pluralsight_path(pluralsight_paths_list: List[PluralsightPath],
  308. path_id: int,
  309. proxy_tuple: Tuple[str, str] = None) -> bool:
  310. try:
  311. retry_count = 0
  312. while retry_count < MAX_ACCOUNT_PER_IP:
  313. credential_dict = create_pluralsight_account(proxy_tuple)
  314. for pluralsight_path in pluralsight_paths_list:
  315. if pluralsight_path.id == path_id:
  316. save_directory_path = get_directory_full_path(SAVE_PATHS_DIRECTORY_PATH, pluralsight_path)
  317. result = download_batch_course(course_links_list=pluralsight_path.course_links,
  318. username=credential_dict['email'],
  319. password=credential_dict['password'],
  320. save_directory_path=save_directory_path,
  321. proxy_tuple=proxy_tuple)
  322. if result:
  323. return True
  324. retry_count += 1
  325. except Exception as e:
  326. print(e)
  327. return False
  328. def download_independent_courses(course_list: List[str], proxy_tuple:Tuple[str, str] = None) -> bool:
  329. try:
  330. retry_count = 0
  331. while retry_count < MAX_ACCOUNT_PER_IP:
  332. credential_dict = create_pluralsight_account()
  333. result = download_batch_course(course_links_list=course_list,
  334. username=credential_dict['email'],
  335. password=credential_dict['password'],
  336. save_directory_path=SAVE_COURSES_DIRECTORY_PATH,
  337. proxy_tuple=proxy_tuple)
  338. if result:
  339. return True
  340. retry_count += 1
  341. except Exception as e:
  342. print(e)
  343. return False
  344. def main():
  345. try:
  346. paths_list, individual_courses_list = get_paths_and_independent_courses(PATHS_JSON_LOCATION,
  347. INDEPENDENT_COURSES_FILE_LOCATION)
  348. proxy_tuple = None
  349. if len(sys.argv[1].split(":")) == 2:
  350. proxy_tuple = sys.argv[1].split(":")
  351. if len(sys.argv) == 2:
  352. download_independent_courses(individual_courses_list, proxy_tuple)
  353. else:
  354. if 1 <= int(sys.argv[2]) <= len(paths_list):
  355. result = download_pluralsight_path(paths_list, int(sys.argv[2]), proxy_tuple)
  356. if result:
  357. print("SUCCESS")
  358. else:
  359. print("ERROR OCCURRED!!")
  360. except Exception as exception:
  361. print(exception)
  362. if __name__ == '__main__':
  363. main()