Você não pode selecionar mais de 25 tópicos Os tópicos devem começar com uma letra ou um número, podem incluir traços ('-') e podem ter até 35 caracteres.

pluralsight.py 16KB


  1. import json
  2. import os
  3. import random
  4. import re
  5. import secrets
  6. import string
  7. import sys
  8. import time
  9. from functools import partial
  10. from multiprocessing.pool import Pool
  11. from pathlib import Path
  12. from typing import List, Tuple, Dict
  13. import requests
  14. import youtube_dl
  15. from bs4 import BeautifulSoup
  16. from selenium import webdriver
  17. from selenium.webdriver.common.by import By
  18. from selenium.webdriver.firefox.options import Options
  19. from selenium.webdriver.support import expected_conditions
  20. from selenium.webdriver.support.wait import WebDriverWait
# region Global Constant(s) and Readonly Variable(s)
# True/False to determine whether selenium instances will be hidden (headless) or visible;
# the value is passed as the is_headless flag when the browser is created
HIDE_SELENIUM_INSTANCES = False
# Base timeout in seconds handed to WebDriverWait while waiting for page elements to appear
SELENIUM_TIMEOUT_IN_SECONDS = 30
# Maximum parallel course download count (size of the worker process pool)
MAX_PARALLEL_COURSE_DOWNLOAD_COUNT = 3
# Maximum number of retry count for downloads
DOWNLOAD_RETRY_COUNT = 100
# Maximum number of account registrations allowed per IP
MAX_ACCOUNT_PER_IP = 5
# Minimum number of seconds to wait between consecutive video downloads
MIN_VIDEO_DOWNLOAD_DELAY = 10
# Maximum number of seconds to wait between consecutive video downloads
MAX_VIDEO_DOWNLOAD_DELAY = 30
# Check if current OS/platform is Windows
IS_WINDOWS = sys.platform.startswith("win")
# Master Directory Path (the directory that contains this script file)
MASTER_DIRECTORY = os.path.dirname(os.path.realpath(__file__))
# Path of the archive text file to log all downloaded videos
ARCHIVE_FILE_PATH = os.path.join(MASTER_DIRECTORY, "archive.txt")
# Path of the directory where downloaded paths will be saved
SAVE_PATHS_DIRECTORY_PATH = os.path.join(MASTER_DIRECTORY, "Paths")
# Path of the directory where downloaded independent courses will be saved
SAVE_COURSES_DIRECTORY_PATH = os.path.join(MASTER_DIRECTORY, "Courses")
# JSON File (.json) containing the JSON of the paths/courses dump of Pluralsight
PATHS_JSON_LOCATION = os.path.join(MASTER_DIRECTORY, "pluralsight.json")
# Text File (.txt) containing the independent courses dump of Pluralsight
INDEPENDENT_COURSES_FILE_LOCATION = os.path.join(MASTER_DIRECTORY, "courses.txt")
# Text File (.txt) containing the list of links of the downloaded courses
DOWNLOADED_COURSES_FILE_LOCATION = os.path.join(MASTER_DIRECTORY, "downloaded.txt")
# Options for youtube-dl. For a complete list of options, check https://github.com/ytdl-org/youtube-dl/blob/3e4cedf9e8cd3157df2457df7274d0c842421945/youtube_dl/YoutubeDL.py#L137-L312
# NOTE(review): 'retries' is a literal 100 and is not tied to DOWNLOAD_RETRY_COUNT - confirm that is intended
ydl_options = {
    'writesubtitles': True,
    'nooverwrites': True,
    'sleep_interval': MIN_VIDEO_DOWNLOAD_DELAY,
    'max_sleep_interval': MAX_VIDEO_DOWNLOAD_DELAY,
    'retries': 100,
    'download_archive': ARCHIVE_FILE_PATH
}
# endregion
  62. # region Classes
  63. class PluralsightPath(object):
  64. def __init__(self, id: int, link: str, title: str, slug: str, course_links: List[str]):
  65. self.id = id
  66. self.link = link
  67. self.title = title
  68. self.slug = slug
  69. self.course_links = course_links
  70. class DisposableMail:
  71. """
  72. This class is used to generate random disposable emails
  73. """
  74. def __init__(self):
  75. self.email_address = requests.post("https://gmailnator.com/index/indexquery", {'action': 'GenerateEmail'}).text
  76. def get_mail_item_id(self) -> object:
  77. post_url = "https://gmailnator.com/mailbox/mailboxquery"
  78. post_data = {
  79. 'action': 'LoadMailList',
  80. 'Email_address': self.email_address
  81. }
  82. while True:
  83. try:
  84. time.sleep(1)
  85. response_text = requests.post(post_url, post_data).json()[0]['content']
  86. result = re.findall('#(.*)\\">', response_text)
  87. mail_id = result[0]
  88. return mail_id
  89. except:
  90. pass
  91. def get_verification_link(self) -> str:
  92. post_url = "https://gmailnator.com/mailbox/get_single_message/"
  93. post_data = {
  94. 'action': 'LoadMailList',
  95. 'message_id': self.get_mail_item_id(),
  96. 'email': self.email_address.split("+")[0]
  97. }
  98. response_data = requests.post(post_url, post_data).text
  99. soup = BeautifulSoup(response_data, 'html.parser')
  100. for link in soup.findAll('a', href=True):
  101. if "https://app.pluralsight.com/id/forgotpassword/reset?token" in link['href']:
  102. return link['href']
  103. class Pluralsight:
  104. """
  105. This class handles the registration and verification of new Pluralsight accounts
  106. """
  107. def __init__(self, email: str, password: str, is_headless: bool = True):
  108. if is_headless:
  109. options = Options()
  110. options.add_argument("--headless")
  111. self.driver = webdriver.Firefox(options=options)
  112. else:
  113. self.driver = webdriver.Firefox()
  114. self.email = email
  115. self.password = password
  116. def __enter__(self):
  117. return self
  118. def __exit__(self, exc_type, exc_val, exc_tb):
  119. self.driver.quit()
  120. def register(self) -> None:
  121. """
  122. Registers new Pluralsight account
  123. """
  124. self.driver.get("https://www.pluralsight.com/offer/2020/free-april-month")
  125. WebDriverWait(self.driver, SELENIUM_TIMEOUT_IN_SECONDS) \
  126. .until(expected_conditions
  127. .presence_of_element_located((By.CLASS_NAME, "cookie_notification--opt_in")))
  128. accept_cookie_button_element = self.driver.find_element_by_class_name("cookie_notification--opt_in")
  129. accept_cookie_button_element.click()
  130. sign_up_now_button_element = self.driver.find_element_by_xpath('//a[@data-aa-title="Free-April-Start-Now"]')
  131. sign_up_now_button_element.click()
  132. email_input_element = self.driver.find_element_by_name("email")
  133. firstname_input_element = self.driver.find_element_by_name("firstname")
  134. lastname_input_element = self.driver.find_element_by_name("lastname")
  135. tos_checkbox_element = self.driver.find_element_by_name("optInBox")
  136. firstname, lastname = get_name()
  137. email_input_element.send_keys(self.email)
  138. firstname_input_element.send_keys(firstname)
  139. lastname_input_element.send_keys(lastname)
  140. tos_checkbox_element.click()
  141. WebDriverWait(self.driver, SELENIUM_TIMEOUT_IN_SECONDS) \
  142. .until(expected_conditions
  143. .presence_of_element_located((By.XPATH, "//*[contains(text(), 'I agree, activate benefit')]")))
  144. create_account_button_element = self.driver.find_element_by_xpath(
  145. "//*[contains(text(), 'I agree, activate benefit')]")
  146. create_account_button_element.click()
  147. WebDriverWait(self.driver, SELENIUM_TIMEOUT_IN_SECONDS * 4) \
  148. .until(expected_conditions
  149. .presence_of_element_located((By.CLASS_NAME, "cancelButton---CKAut")))
  150. cancel_button_element = self.driver.find_element_by_class_name("cancelButton---CKAut")
  151. cancel_button_element.click()
  152. def set_password(self, verification_link: str) -> None:
  153. """
  154. Sets password in the given verification link
  155. Args:
  156. verification_link: The verification link (as string) to set up password
  157. """
  158. self.driver.get(verification_link)
  159. WebDriverWait(self.driver, SELENIUM_TIMEOUT_IN_SECONDS) \
  160. .until(expected_conditions
  161. .presence_of_element_located((By.ID, "Password")))
  162. password_input_element = self.driver.find_element_by_id("Password")
  163. password_confirm_input_element = self.driver.find_element_by_id("PasswordConfirmation")
  164. save_button_element = self.driver.find_element_by_class_name("psds-button--appearance-primary")
  165. password_input_element.send_keys(self.password)
  166. password_confirm_input_element.send_keys(self.password)
  167. save_button_element.click()
  168. WebDriverWait(self.driver, SELENIUM_TIMEOUT_IN_SECONDS) \
  169. .until(expected_conditions
  170. .presence_of_element_located((By.ID, "Password")))
  171. # endregion
  172. # region Utility Functions
  173. def get_name() -> Tuple[str, str]:
  174. """
  175. Generate a random string to be used as first or last name
  176. Returns:
  177. str: Generated string
  178. """
  179. request_url = "http://names.drycodes.com/1?nameOptions=boy_names" if random.choice([True, False]) \
  180. else "http://names.drycodes.com/1?nameOptions=girl_names"
  181. first_last_name = requests.get(request_url).text.strip('"[]').split('_')
  182. return first_last_name
  183. def get_password(min_length: int = 25, max_length: int = 50) -> str:
  184. """
  185. Generates a random password using ascii letters and numerical digits
  186. Args:
  187. min_length: Minimum length of the password, default is 25
  188. max_length: Minimum length of the password, default is 50
  189. Returns: Generated password as string
  190. """
  191. length = random.randint(min_length, max_length)
  192. alphabet = string.ascii_letters + string.digits
  193. password = ''.join(secrets.choice(alphabet) for _ in range(length))
  194. return password
  195. # endregion
  196. def create_pluralsight_account() -> Dict[str, str]:
  197. """
  198. Creates new Pluralsight account and returns the email/password as a dictionary
  199. Returns:
  200. Dict[str, str]: Dictionary containing email and password with eponymous pair keys
  201. """
  202. disposable_email = DisposableMail()
  203. password = get_password()
  204. with Pluralsight(email=disposable_email.email_address,
  205. password=password,
  206. is_headless=HIDE_SELENIUM_INSTANCES) as ps:
  207. ps.register()
  208. verification_link = disposable_email.get_verification_link()
  209. ps.set_password(verification_link=verification_link)
  210. return {'email': disposable_email.email_address, 'password': password}
  211. def get_paths_and_independent_courses(paths_file: str, courses_file: str) -> Tuple[List[PluralsightPath], List[str]]:
  212. if paths_file.startswith("http"):
  213. json_string = requests.get(paths_file).text
  214. else:
  215. json_string = Path(paths_file).read_text()
  216. pluralsight_paths_dicts_list = json.loads(json_string)
  217. pluralsight_paths_list = [PluralsightPath(**paths_dict) for paths_dict in pluralsight_paths_dicts_list]
  218. if courses_file.startswith("http"):
  219. courses_list_string = requests.get(paths_file).text
  220. else:
  221. courses_list_string = Path(paths_file).read_text()
  222. courses_list = list(filter(None, courses_list_string.split('\n')))
  223. return pluralsight_paths_list, courses_list
  224. def get_directory_full_path(root_directory_path: str, pluralsight_path: PluralsightPath) -> str:
  225. directory_name = f"{pluralsight_path.id:03d} - {pluralsight_path.title}" if not IS_WINDOWS \
  226. else f"{pluralsight_path.id:03d} - {pluralsight_path.slug}"
  227. directory_full_path = os.path.join(root_directory_path, directory_name)
  228. return directory_full_path
  229. def save_progress(course_link: str) -> None:
  230. with open(DOWNLOADED_COURSES_FILE_LOCATION, 'a+') as downloaded_file:
  231. downloaded_file.write(f"{course_link}\n")
  232. def is_downloaded(course_link: str) -> bool:
  233. if not os.path.isfile(DOWNLOADED_COURSES_FILE_LOCATION):
  234. return False
  235. downloaded_courses_list = list(filter(None, Path(DOWNLOADED_COURSES_FILE_LOCATION).read_text().split('\n')))
  236. return course_link in downloaded_courses_list
  237. def download_single_course(course_link: str, username: str, password: str, save_directory_path: str) -> bool:
  238. """
  239. Download the given course using the provided credential
  240. Args:
  241. course_link: The link of the course to download
  242. username: Username (Email) of the Pluralsight account to be used for download
  243. password: Password of the Pluralsight account to be used for download
  244. save_directory_path: Absolute path of Root save directory
  245. Returns: True/False bool value denoting the success status of the download
  246. """
  247. if is_downloaded(course_link):
  248. return True
  249. retry_count = 0
  250. while retry_count < DOWNLOAD_RETRY_COUNT:
  251. try:
  252. ydl_options['username'] = username
  253. ydl_options['password'] = password
  254. if IS_WINDOWS:
  255. ydl_options[
  256. 'outtmpl'] = f"{save_directory_path}\\%(playlist)s\\%(chapter_number)s - %(chapter)s\\%(playlist_index)s - %(title)s.%(ext)s"
  257. else:
  258. ydl_options[
  259. 'outtmpl'] = f"{save_directory_path}/%(playlist)s/%(chapter_number)s - %(chapter)s/%(playlist_index)s - %(title)s.%(ext)s"
  260. with youtube_dl.YoutubeDL(ydl_options) as ydl:
  261. ydl.download([course_link])
  262. save_progress(course_link)
  263. return True
  264. except KeyboardInterrupt:
  265. break
  266. except Exception as e:
  267. if "http error 429" in str(e).lower():
  268. time.sleep(300)
  269. elif "http error 403" in str(e).lower():
  270. return False
  271. retry_count += 1
  272. return False
  273. def download_batch_course(course_links_list: List[str], username: str, password: str, save_directory_path: str) -> bool:
  274. download_single_course_function = partial(download_single_course,
  275. username=username,
  276. password=password,
  277. save_directory_path=save_directory_path)
  278. with Pool(MAX_PARALLEL_COURSE_DOWNLOAD_COUNT) as p:
  279. download_results = map(lambda link: link, p.map(download_single_course_function, course_links_list))
  280. return False not in download_results
  281. def download_pluralsight_path(pluralsight_paths_list: List[PluralsightPath], path_id: int) -> bool:
  282. try:
  283. retry_count = 0
  284. while retry_count < MAX_ACCOUNT_PER_IP:
  285. credential_dict = create_pluralsight_account()
  286. for pluralsight_path in pluralsight_paths_list:
  287. if pluralsight_path.id == path_id:
  288. save_directory_path = get_directory_full_path(SAVE_PATHS_DIRECTORY_PATH, pluralsight_path)
  289. result = download_batch_course(course_links_list=pluralsight_path.course_links,
  290. username=credential_dict['email'],
  291. password=credential_dict['password'],
  292. save_directory_path=save_directory_path)
  293. if result:
  294. return True
  295. retry_count += 1
  296. except Exception as e:
  297. print(e)
  298. return False
  299. def download_independent_courses(course_list: List[str]) -> bool:
  300. try:
  301. retry_count = 0
  302. while retry_count < MAX_ACCOUNT_PER_IP:
  303. credential_dict = create_pluralsight_account()
  304. result = download_batch_course(course_links_list=course_list,
  305. username=credential_dict['email'],
  306. password=credential_dict['password'],
  307. save_directory_path=SAVE_COURSES_DIRECTORY_PATH)
  308. if result:
  309. return True
  310. retry_count += 1
  311. except Exception as e:
  312. print(e)
  313. return False
  314. def main():
  315. try:
  316. paths_list, individual_courses_list = get_paths_and_independent_courses(PATHS_JSON_LOCATION,
  317. INDEPENDENT_COURSES_FILE_LOCATION)
  318. if len(sys.argv) == 1:
  319. download_independent_courses(individual_courses_list)
  320. else:
  321. for arg in sys.argv[1:]:
  322. if 1 <= int(arg) <= len(paths_list):
  323. result = download_pluralsight_path(paths_list, int(arg))
  324. if result:
  325. print("SUCCESS")
  326. else:
  327. print("ERROR OCCURRED!!")
  328. except Exception as exception:
  329. print(exception)
# Run the downloader only when this file is executed as a script, not when it is imported
if __name__ == '__main__':
    main()