|
|
@@ -1,28 +1,22 @@ |
|
|
|
import json |
|
|
|
from multiprocessing.pool import Pool |
|
|
|
from typing import Dict |
|
|
|
|
|
|
|
import requests |
|
|
|
from bs4 import BeautifulSoup |
|
|
|
|
|
|
|
from src.pluralsight.pluralsight import PluralsightPath |
|
|
|
|
|
|
|
# region Global Constant(s) and Readonly Variable(s)

# Number of worker processes passed to multiprocessing's Pool when paths are
# processed in parallel.
MAX_POOL_SIZE = 25

# Site root that relative hrefs scraped from the pages are appended to in
# order to build absolute URLs.
ROOT_PLURALSIGHT_URL = "https://www.pluralsight.com"

# Running counters used to hand out the 'id' of each discovered path entry.
# Both names are declared ``global`` and incremented by the path-listing code,
# so both are kept.
# NOTE(review): ID_COUNTER looks like the older name for PATH_ID_COUNTER;
# confirm whether one of them can be retired.
ID_COUNTER = 1
PATH_ID_COUNTER = 1

# endregion
|
|
|
|
|
|
|
|
|
|
|
class PluralsightPath:
    """Plain data holder for one Pluralsight path and the courses it links to.

    The attributes are stored directly on the instance, so ``__dict__`` holds
    exactly ``id``, ``link``, ``title``, ``slug`` and ``course_links`` (in that
    order).
    """

    def __init__(self, id: int, link: str, title: str = "", slug: str = "",
                 course_links=None):
        """
        Create a path record.

        Args:
            id: Identifier of the path. The name shadows the builtin, but it
                is kept because callers pass it by keyword.
            link: URL of the path page.
            title: Title of the path; defaults to an empty string.
            slug: Slug of the path; defaults to an empty string.
            course_links: Optional iterable of course URLs. When omitted (or
                None) the path starts with an empty list.
        """
        self.id = id
        self.link = link
        self.title = title
        self.slug = slug
        # Always build a fresh list so instances never share one mutable
        # object, whether the caller passed a list or relied on the default.
        self.course_links = [] if course_links is None else list(course_links)
|
|
|
|
|
|
|
|
|
|
|
def process_path(link_dict) -> PluralsightPath: |
|
|
|
def process_path(link_dict: Dict) -> PluralsightPath: |
|
|
|
""" |
|
|
|
Process a pluralsight path to extract title and all courses and return it as PluralsightPath variable |
|
|
|
|
|
|
@@ -33,20 +27,25 @@ def process_path(link_dict) -> PluralsightPath: |
|
|
|
PluralsightPath: The processed PluralsightPath object |
|
|
|
""" |
|
|
|
|
|
|
|
pluralsight_path = PluralsightPath(id=link_dict['id'], link=link_dict['link']) |
|
|
|
path_id = int(link_dict['id']) |
|
|
|
path_link = link_dict['link'] |
|
|
|
|
|
|
|
path_index_html = requests.get(pluralsight_path.link).text |
|
|
|
path_index_html = requests.get(path_link).text |
|
|
|
soup = BeautifulSoup(path_index_html, 'html.parser') |
|
|
|
|
|
|
|
title = soup.find('title').string.replace("| Pluralsight", "").rstrip() |
|
|
|
|
|
|
|
pluralsight_path.title = title |
|
|
|
pluralsight_path.slug = pluralsight_path.link.replace("https://www.pluralsight.com/paths/","") |
|
|
|
|
|
|
|
course_absolute_links = [] |
|
|
|
for link_node in soup.findAll('a', href=True): |
|
|
|
link_dict = str(link_node['href']) |
|
|
|
if link_dict.startswith("/course"): |
|
|
|
pluralsight_path.course_links.append(f"{ROOT_PLURALSIGHT_URL}{link_dict}") |
|
|
|
course_relative_link = str(link_node['href']) |
|
|
|
if course_relative_link.startswith("/course"): |
|
|
|
course_absolute_links.append(f"{ROOT_PLURALSIGHT_URL}{course_relative_link}") |
|
|
|
|
|
|
|
pluralsight_path = PluralsightPath(id=path_id, |
|
|
|
link=path_link, |
|
|
|
title=title, |
|
|
|
slug=link_dict['link'].replace("https://www.pluralsight.com/paths/", ""), |
|
|
|
course_links=course_absolute_links) |
|
|
|
|
|
|
|
return pluralsight_path |
|
|
|
|
|
|
@@ -59,35 +58,36 @@ def get_path_courses_json_string() -> str: |
|
|
|
str: JSON string containing all the paths and courses of Pluralsight |
|
|
|
""" |
|
|
|
|
|
|
|
global ID_COUNTER |
|
|
|
global PATH_ID_COUNTER |
|
|
|
|
|
|
|
index_html = requests.get("https://www.pluralsight.com/product/paths").text |
|
|
|
soup = BeautifulSoup(index_html, 'html.parser') |
|
|
|
all_course_divs = soup.findAll("div", {"class": "item"}) |
|
|
|
|
|
|
|
link_list = [] |
|
|
|
path_dict_list = [] |
|
|
|
for course_div in all_course_divs: |
|
|
|
temp_soup = BeautifulSoup(str(course_div), 'html.parser') |
|
|
|
|
|
|
|
link_string = f"{ROOT_PLURALSIGHT_URL}{str(temp_soup.find('a', href=True)['href'])}" |
|
|
|
|
|
|
|
link_list.append({ |
|
|
|
'id': ID_COUNTER, |
|
|
|
path_dict_list.append({ |
|
|
|
'id': PATH_ID_COUNTER, |
|
|
|
'link': link_string |
|
|
|
}) |
|
|
|
|
|
|
|
ID_COUNTER += 1 |
|
|
|
PATH_ID_COUNTER += 1 |
|
|
|
|
|
|
|
with Pool(MAX_POOL_SIZE) as p: |
|
|
|
path_list = map(lambda link: link, p.map(process_path, link_list)) |
|
|
|
pluralsight_paths = map(lambda link: link, p.map(process_path, path_dict_list)) |
|
|
|
|
|
|
|
json_string = json.dumps([ob.__dict__ for ob in path_list]) |
|
|
|
json_string = json.dumps([ob.__dict__ for ob in pluralsight_paths]) |
|
|
|
|
|
|
|
return json_string |
|
|
|
|
|
|
|
|
|
|
|
def main():
    """Print the JSON string returned by get_path_courses_json_string()."""
    # Build the dump exactly once: each call to get_path_courses_json_string()
    # fetches pages over HTTP, so calling it a second time would repeat all of
    # that work and print the same dump twice.
    pluralsight_dump_as_json = get_path_courses_json_string()
    print(pluralsight_dump_as_json)
|
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__': |