You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

scraper.py 2.5KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394
  1. import json
  2. from multiprocessing.pool import Pool
  3. import requests
  4. from bs4 import BeautifulSoup
  # region Global Constant(s) and Readonly Variable(s)
  # Size of the process pool used to fetch path pages in parallel.
  MAX_POOL_SIZE: int = 25
  # Site origin; relative hrefs scraped from pages are appended to it.
  ROOT_PLURALSIGHT_URL: str = "https://www.pluralsight.com"
  # Next id to assign to a discovered path; advanced by
  # get_path_courses_json_string() for every link it collects.
  ID_COUNTER: int = 1
  # endregion
  class PluralsightPath:
      """A Pluralsight path together with the course links found on its page.

      A new instance carries only its ``id`` and ``link``; ``title``, ``slug``
      and ``course_links`` start out empty and are filled in by the caller.
      """

      def __init__(self, id: int, link: str):
          # Attribute creation order is kept stable (id, link, title, slug,
          # course_links) because instances are serialised via ``__dict__``.
          self.id, self.link = id, link
          self.title = self.slug = ""
          self.course_links = []
  def process_path(link_dict, timeout: float = 30.0) -> PluralsightPath:
      """
      Process a pluralsight path to extract title and all courses and return it as PluralsightPath variable
      Args:
          link_dict: Dictionary containing id (int) and link (str) with two keys: 'id' & 'link'
          timeout: Seconds to wait for the path page before the request is abandoned
      Returns:
          PluralsightPath: The processed PluralsightPath object
      Raises:
          requests.exceptions.RequestException: If the path page cannot be fetched (including timeouts)
      """
      pluralsight_path = PluralsightPath(id=link_dict['id'], link=link_dict['link'])
      # Without a timeout a single stalled connection would block this worker forever.
      path_index_html = requests.get(pluralsight_path.link, timeout=timeout).text
      soup = BeautifulSoup(path_index_html, 'html.parser')

      # A page without a <title> (or with an empty/compound one) yields an empty
      # title instead of an AttributeError.
      title_node = soup.find('title')
      raw_title = title_node.string if title_node is not None else None
      pluralsight_path.title = (raw_title or "").replace("| Pluralsight", "").rstrip()

      # The slug is the link with the "<root>/paths/" prefix removed.
      pluralsight_path.slug = pluralsight_path.link.replace(f"{ROOT_PLURALSIGHT_URL}/paths/", "")

      # Collect every site-relative course link on the page as an absolute URL.
      for link_node in soup.find_all('a', href=True):
          href = str(link_node['href'])
          if href.startswith("/course"):
              pluralsight_path.course_links.append(f"{ROOT_PLURALSIGHT_URL}{href}")
      return pluralsight_path
  def get_path_courses_json_string() -> str:
      """
      Get all Pluralsight paths & courses as a JSON string
      Returns:
          str: JSON string containing all the paths and courses of Pluralsight
               ("[]" when the index page yields no path links)
      Raises:
          requests.exceptions.RequestException: If the paths index page cannot be fetched (including timeouts)
      """
      global ID_COUNTER
      # Bound the wait so a stalled connection cannot hang the whole run.
      index_html = requests.get(f"{ROOT_PLURALSIGHT_URL}/product/paths", timeout=30).text
      soup = BeautifulSoup(index_html, 'html.parser')

      link_list = []
      for course_div in soup.find_all("div", {"class": "item"}):
          # Search the already-parsed tag directly; re-parsing its markup is unnecessary.
          anchor = course_div.find('a', href=True)
          if anchor is None:
              # An item without a link has nothing to scrape; skip it rather than crash.
              continue
          link_list.append({
              'id': ID_COUNTER,
              'link': f"{ROOT_PLURALSIGHT_URL}{anchor['href']}",
          })
          ID_COUNTER += 1

      if not link_list:
          # Avoid spawning worker processes when there is no work to distribute.
          return json.dumps([])

      # Never start more workers than there are pages to fetch.
      with Pool(min(MAX_POOL_SIZE, len(link_list))) as p:
          path_list = p.map(process_path, link_list)
      return json.dumps([ob.__dict__ for ob in path_list])
  def main():
      """Script entry point: print the JSON string of all paths and courses."""
      json_output = get_path_courses_json_string()
      print(json_output)
  # Run the scraper only when this file is executed as a script, not on import.
  if __name__ == "__main__":
      main()