12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394 |
- import json
- from multiprocessing.pool import Pool
- from typing import Dict
-
- import requests
- from bs4 import BeautifulSoup
-
- from src.pluralsight.pluralsight import PluralsightPath
-
# region Global Constant(s) and Readonly Variable(s)

# Number of worker processes used to fetch path pages in parallel.
MAX_POOL_SIZE: int = 25

# Scheme + host prepended to the site-relative links found in the scraped pages.
ROOT_PLURALSIGHT_URL: str = "https://www.pluralsight.com"

# Next id to hand out to a discovered path.
# NOTE: despite living in this region, this value is NOT read-only --
# get_path_courses_json_string() increments it through a `global` statement.
PATH_ID_COUNTER: int = 1

# endregion
-
-
# Upper bound (in seconds) for downloading a single path page. `requests`
# waits indefinitely when no timeout is passed, which would stall a pool
# worker (and therefore the whole Pool.map call) on one dead connection.
_PATH_PAGE_TIMEOUT_SECONDS = 30


def process_path(link_dict: Dict) -> PluralsightPath:
    """
    Process a Pluralsight path page to extract its title and all course links.

    Args:
        link_dict: Dictionary with two keys: 'id' (convertible to int) and
            'link' (str, absolute URL of the path page)

    Returns:
        PluralsightPath: The processed PluralsightPath object. Its title is an
            empty string when the page has no usable <title> tag.

    Raises:
        requests.exceptions.RequestException: If the page cannot be downloaded
            (including when the download times out)
    """

    path_id = int(link_dict['id'])
    path_link = link_dict['link']

    path_index_html = requests.get(path_link, timeout=_PATH_PAGE_TIMEOUT_SECONDS).text
    soup = BeautifulSoup(path_index_html, 'html.parser')

    # A page without a <title>, or with a <title> that has no single string
    # child, yields None here; fall back to an empty title instead of crashing.
    title_node = soup.find('title')
    raw_title = title_node.string if title_node is not None else None
    title = (raw_title or "").replace("| Pluralsight", "").rstrip()

    # Keep only site-relative links whose path starts with "/course" and make
    # them absolute. Duplicates are kept, in document order.
    course_absolute_links = [
        f"{ROOT_PLURALSIGHT_URL}{href}"
        for href in (str(link_node['href']) for link_node in soup.find_all('a', href=True))
        if href.startswith("/course")
    ]

    # The slug is whatever remains of the link once the ".../paths/" prefix is removed.
    slug = path_link.replace(f"{ROOT_PLURALSIGHT_URL}/paths/", "")

    return PluralsightPath(id=path_id,
                           link=path_link,
                           title=title,
                           slug=slug,
                           course_links=course_absolute_links)
-
-
# Upper bound (in seconds) for downloading the paths index page; without an
# explicit timeout `requests` may block forever on an unresponsive server.
_INDEX_PAGE_TIMEOUT_SECONDS = 30


def get_path_courses_json_string() -> str:
    """
    Get all Pluralsight paths & courses as a JSON string

    Downloads the paths index page, collects the first link of every
    <div class="item"> element, processes each linked path page in a process
    pool via process_path, and serializes the results.

    Side effects:
        Increments the module-level PATH_ID_COUNTER once per discovered path,
        so ids keep growing across repeated calls within one process.

    Returns:
        str: JSON array with one object per path, built from each
            PluralsightPath's instance attributes (its __dict__)

    Raises:
        requests.exceptions.RequestException: If the index page cannot be
            downloaded (including when the download times out)
    """

    global PATH_ID_COUNTER

    index_html = requests.get(f"{ROOT_PLURALSIGHT_URL}/product/paths",
                              timeout=_INDEX_PAGE_TIMEOUT_SECONDS).text
    soup = BeautifulSoup(index_html, 'html.parser')

    path_dict_list = []
    for item_div in soup.find_all("div", {"class": "item"}):
        # Search the tag directly; re-parsing its HTML into a new soup is unnecessary.
        anchor = item_div.find('a', href=True)
        if anchor is None:
            # An item without a link cannot be processed; skip it rather than crash.
            continue

        path_dict_list.append({
            'id': PATH_ID_COUNTER,
            'link': f"{ROOT_PLURALSIGHT_URL}{anchor['href']}"
        })
        PATH_ID_COUNTER += 1

    # Pool.map blocks until every page is processed and returns the results
    # as a list in input order.
    with Pool(MAX_POOL_SIZE) as p:
        pluralsight_paths = p.map(process_path, path_dict_list)

    return json.dumps([vars(path) for path in pluralsight_paths])
-
-
def main():
    """
    Fetch all Pluralsight paths & courses and print them to stdout as JSON
    """

    print(get_path_courses_json_string())
-
-
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|