Browse Source

Add 'scraper.py'

Scrape the way!
master
parent
commit
f2d2be6616
1 changed files with 92 additions and 0 deletions
  1. 92
    0
      scraper.py

+ 92
- 0
scraper.py View File

@@ -0,0 +1,92 @@
import json
from multiprocessing.pool import Pool

import requests
from bs4 import BeautifulSoup

# region Global Constant(s) and Readonly Variable(s)

# Number of worker processes handed to multiprocessing's Pool when the
# individual path pages are fetched.
MAX_POOL_SIZE = 25
# Scheme + host that is prepended to the site-relative hrefs found in the
# scraped pages to build absolute URLs.
ROOT_PLURALSIGHT_URL = "https://www.pluralsight.com"
# Next id to assign to a discovered path. NOTE: despite the region title this
# is not read-only; get_path_courses_json_string() increments it through
# `global ID_COUNTER` once per path link it collects.
ID_COUNTER = 1

# endregion


class PluralsightPath:
    """
    Plain record describing one Pluralsight learning path.

    Attributes:
        id: Numeric identifier assigned to the path by the scraper.
        link: URL of the path page.
        title: Title of the path; empty until the page has been processed.
        course_links: URLs of the courses found on the path page; empty until
            the page has been processed.
    """

    def __init__(self, id: int, link: str):
        # `id` shadows the builtin, but the name is part of the constructor's
        # keyword interface (callers pass id=...), so it is kept as is.
        self.id = id
        self.link = link
        self.title: str = ""
        # A fresh list per instance, so instances never share course links.
        self.course_links: list = []

    def __repr__(self) -> str:
        return (
            f"{type(self).__name__}(id={self.id!r}, link={self.link!r}, "
            f"title={self.title!r}, course_links={self.course_links!r})"
        )


def process_path(link_dict, timeout: float = 30.0) -> PluralsightPath:
    """
    Fetch one Pluralsight path page and extract its title and course links.

    Args:
        link_dict: Dictionary containing id (int) and link (str) with two keys: 'id' & 'link'
        timeout: Seconds to wait for the HTTP response before requests gives up,
            so a stalled connection cannot block a worker indefinitely.

    Returns:
        PluralsightPath: Object carrying the given id and link, the page title
        with the "| Pluralsight" suffix removed (empty string when the page has
        no usable <title> text), and every href starting with "/course" found
        on the page, made absolute, in document order.
    """

    pluralsight_path = PluralsightPath(id=link_dict['id'], link=link_dict['link'])

    path_index_html = requests.get(pluralsight_path.link, timeout=timeout).text
    soup = BeautifulSoup(path_index_html, 'html.parser')

    # Both the <title> tag and its .string can be missing (no tag, or a tag
    # with nested markup), so guard each step instead of chaining blindly.
    title_node = soup.find('title')
    title_text = title_node.string if title_node is not None else None
    if title_text is not None:
        pluralsight_path.title = title_text.replace("| Pluralsight", "").rstrip()
    else:
        pluralsight_path.title = ""

    for link_node in soup.find_all('a', href=True):
        href = str(link_node['href'])
        # Only site-relative course links are of interest.
        if href.startswith("/course"):
            pluralsight_path.course_links.append(f"{ROOT_PLURALSIGHT_URL}{href}")

    return pluralsight_path


def get_path_courses_json_string(timeout: float = 30.0) -> str:
    """
    Get all Pluralsight paths & courses as a JSON string

    Downloads the paths index page, collects the first link inside every
    <div class="item">, processes each linked path page in a process pool and
    serialises the resulting objects.

    Args:
        timeout: Seconds to wait for the index page response before requests
            gives up.

    Returns:
        str: JSON array string with one object per path (the attribute
        dictionary of each processed PluralsightPath). "[]" when no path
        links are found.
    """

    # Ids keep counting across calls because the counter lives at module level.
    global ID_COUNTER

    index_html = requests.get("https://www.pluralsight.com/product/paths", timeout=timeout).text
    soup = BeautifulSoup(index_html, 'html.parser')

    link_list = []
    for course_div in soup.find_all("div", {"class": "item"}):
        # Search the already-parsed tag directly; re-parsing its string form
        # yields the same first anchor at extra cost.
        anchor = course_div.find('a', href=True)
        if anchor is None:
            # An item without a link carries nothing to scrape; skip it
            # instead of failing on a None subscript.
            continue

        link_list.append({
            'id': ID_COUNTER,
            'link': f"{ROOT_PLURALSIGHT_URL}{str(anchor['href'])}",
        })

        ID_COUNTER += 1

    if not link_list:
        # Nothing to process: avoid spawning worker processes for no work.
        return json.dumps([])

    # Never start more workers than there are pages to fetch.
    with Pool(min(MAX_POOL_SIZE, len(link_list))) as p:
        path_list = p.map(process_path, link_list)

    return json.dumps([ob.__dict__ for ob in path_list])


def main():
    """Scrape every Pluralsight path and write the resulting JSON string to stdout."""
    json_string = get_path_courses_json_string()
    print(json_string)


# Script entry point: run the scraper only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()

Loading…
Cancel
Save