You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

scraper.py 2.7KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394
  1. import json
  2. from multiprocessing.pool import Pool
  3. from typing import Dict
  4. import requests
  5. from bs4 import BeautifulSoup
  6. from src.pluralsight.pluralsight import PluralsightPath
  # region Global Constant(s) and Readonly Variable(s)

  # Number of worker processes handed to the multiprocessing Pool that
  # processes the scraped path links.
  MAX_POOL_SIZE = 25

  # Site root; prepended to the site-relative links scraped from the pages.
  ROOT_PLURALSIGHT_URL = "https://www.pluralsight.com"

  # Next id to assign to a scraped path. NOTE: despite living in this region it
  # is not constant - get_path_courses_json_string() increments it via `global`.
  PATH_ID_COUNTER = 1

  # endregion
  def process_path(link_dict: Dict, timeout: float = 30.0) -> PluralsightPath:
      """
      Process a pluralsight path to extract title and all courses and return it as PluralsightPath variable

      Args:
          link_dict: Dictionary containing id (int) and link (str) with two keys: 'id' & 'link'
          timeout: Seconds to wait for the server before giving up (passed to ``requests.get``)

      Returns:
          PluralsightPath: The processed PluralsightPath object. Its title is an empty
          string when the page has no usable <title> tag.

      Raises:
          requests.exceptions.Timeout: If the server does not respond within ``timeout`` seconds
      """
      path_id = int(link_dict['id'])
      path_link = link_dict['link']

      # Without a timeout a single stalled connection would block the caller indefinitely.
      path_index_html = requests.get(path_link, timeout=timeout).text
      soup = BeautifulSoup(path_index_html, 'html.parser')

      # The page may lack a <title>, or the tag may not hold a single string
      # (``.string`` is then None); fall back to an empty title instead of raising.
      title_node = soup.find('title')
      raw_title = title_node.string if title_node is not None else None
      title = (raw_title or "").replace("| Pluralsight", "").rstrip()

      # Keep only the site-relative course links and make them absolute.
      course_absolute_links = [
          f"{ROOT_PLURALSIGHT_URL}{href}"
          for href in (str(node['href']) for node in soup.find_all('a', href=True))
          if href.startswith("/course")
      ]

      return PluralsightPath(id=path_id,
                             link=path_link,
                             title=title,
                             # The slug is the link with the site's "/paths/" prefix removed.
                             slug=path_link.replace(f"{ROOT_PLURALSIGHT_URL}/paths/", ""),
                             course_links=course_absolute_links)
  def get_path_courses_json_string() -> str:
      """
      Get all Pluralsight paths & courses as a JSON string

      Downloads the paths index page, takes the first link found inside every
      ``div`` of class ``item``, processes those links in a process pool via
      ``process_path`` and serialises the ``__dict__`` of each result.

      Side effect: every collected link is numbered from the module-level
      ``PATH_ID_COUNTER``, which is advanced by one per link.

      Returns:
          str: JSON string containing all the paths and courses of Pluralsight

      Raises:
          requests.exceptions.Timeout: If the index page does not respond in time
      """
      global PATH_ID_COUNTER

      # Never wait indefinitely for the index page.
      index_timeout_seconds = 30
      index_html = requests.get(f"{ROOT_PLURALSIGHT_URL}/product/paths",
                                timeout=index_timeout_seconds).text
      soup = BeautifulSoup(index_html, 'html.parser')

      path_dict_list = []
      for item_div in soup.find_all("div", {"class": "item"}):
          # Search the already-parsed tag directly; re-parsing its markup is unnecessary.
          anchor = item_div.find('a', href=True)
          if anchor is None:
              # An item without a link carries no path to follow; skip it
              # (and do not consume an id for it) rather than crash.
              continue
          path_dict_list.append({
              'id': PATH_ID_COUNTER,
              'link': f"{ROOT_PLURALSIGHT_URL}{anchor['href']}",
          })
          PATH_ID_COUNTER += 1

      # Pool.map blocks until every link is processed and returns a list in input order.
      with Pool(MAX_POOL_SIZE) as p:
          pluralsight_paths = p.map(process_path, path_dict_list)

      return json.dumps([ob.__dict__ for ob in pluralsight_paths])
  def main():
      """Scrape every Pluralsight path and print the resulting JSON string to stdout."""
      print(get_path_courses_json_string())
  # Run the scraper only when this file is executed as a script, not on import.
  if __name__ == '__main__':
      main()