Quellcode durchsuchen

Dunno, forgot

master
Ursprung
Commit
65bd6590fc
1 geänderte Dateien mit 27 neuen und 27 gelöschten Zeilen
  1. 27
    27
      scraper.py

+ 27
- 27
scraper.py Datei anzeigen

@@ -1,28 +1,22 @@
import json
from multiprocessing.pool import Pool
from typing import Dict

import requests
from bs4 import BeautifulSoup

from src.pluralsight.pluralsight import PluralsightPath

# region Global Constant(s) and Readonly Variable(s)

# Number of worker processes handed to multiprocessing's Pool when the path pages are processed.
MAX_POOL_SIZE = 25
# Site root that is prefixed to the relative hrefs found in the scraped pages.
ROOT_PLURALSIGHT_URL = "https://www.pluralsight.com"
# Running counters used as the 'id' of each discovered path; incremented once per path
# while the path index page is walked.
ID_COUNTER = 1
PATH_ID_COUNTER = 1

# endregion


class PluralsightPath:
    """
    Plain record describing one Pluralsight learning path.

    The scraper serialises instances through ``__dict__``, so the attributes are
    always assigned in the order id, link, title, slug, course_links.
    """

    # NOTE: ``id`` shadows the builtin, but callers pass it by keyword (``id=...``),
    # so the parameter name is part of the public interface and is kept.
    def __init__(self, id: int, link: str, title: str = "", slug: str = "", course_links=None):
        """
        Args:
            id (int): Identifier of the path
            link (str): URL of the path page
            title (str): Title of the path; defaults to an empty string
            slug (str): Slug of the path; defaults to an empty string
            course_links: Optional iterable of course URLs; defaults to an empty list
        """
        self.id = id
        self.link = link
        self.title = title
        self.slug = slug
        # Always store a fresh list so that no two instances (and no caller-owned
        # list) share the same mutable object.
        self.course_links = [] if course_links is None else list(course_links)


def process_path(link_dict: Dict) -> PluralsightPath:
    """
    Download one Pluralsight path page and extract its title, slug and course links.

    Args:
        link_dict (Dict): Mapping with an 'id' entry (converted with int()) and a
            'link' entry holding the URL of the path page

    Returns:
        PluralsightPath: The processed PluralsightPath object
    """

    path_id = int(link_dict['id'])
    path_link = link_dict['link']

    # Fetch and parse the page exactly once.
    path_index_html = requests.get(path_link).text
    soup = BeautifulSoup(path_index_html, 'html.parser')

    title = soup.find('title').string.replace("| Pluralsight", "").rstrip()

    # Keep only anchors whose href starts with "/course" and make them absolute.
    # The parameter ``link_dict`` is deliberately never rebound here, so it stays a mapping.
    hrefs = (str(link_node['href']) for link_node in soup.findAll('a', href=True))
    course_absolute_links = [
        f"{ROOT_PLURALSIGHT_URL}{course_relative_link}"
        for course_relative_link in hrefs
        if course_relative_link.startswith("/course")
    ]

    # Construct with id and link only, then fill in the remaining attributes; this is
    # the constructor form the PluralsightPath class defined in this file accepts.
    pluralsight_path = PluralsightPath(id=path_id, link=path_link)
    pluralsight_path.title = title
    pluralsight_path.slug = path_link.replace("https://www.pluralsight.com/paths/", "")
    pluralsight_path.course_links = course_absolute_links

    return pluralsight_path

@@ -59,35 +58,36 @@ def get_path_courses_json_string() -> str:
str: JSON string containing all the paths and courses of Pluralsight
"""

global ID_COUNTER
global PATH_ID_COUNTER

index_html = requests.get("https://www.pluralsight.com/product/paths").text
soup = BeautifulSoup(index_html, 'html.parser')
all_course_divs = soup.findAll("div", {"class": "item"})

link_list = []
path_dict_list = []
for course_div in all_course_divs:
temp_soup = BeautifulSoup(str(course_div), 'html.parser')

link_string = f"{ROOT_PLURALSIGHT_URL}{str(temp_soup.find('a', href=True)['href'])}"

link_list.append({
'id': ID_COUNTER,
path_dict_list.append({
'id': PATH_ID_COUNTER,
'link': link_string
})

ID_COUNTER += 1
PATH_ID_COUNTER += 1

with Pool(MAX_POOL_SIZE) as p:
path_list = map(lambda link: link, p.map(process_path, link_list))
pluralsight_paths = map(lambda link: link, p.map(process_path, path_dict_list))

json_string = json.dumps([ob.__dict__ for ob in path_list])
json_string = json.dumps([ob.__dict__ for ob in pluralsight_paths])

return json_string


def main():
    """Build the JSON dump of all Pluralsight paths once and print it to stdout."""
    # A single call: every invocation re-scrapes the whole site, so the result is
    # stored and printed rather than being requested a second time.
    pluralsight_dump_as_json = get_path_courses_json_string()
    print(pluralsight_dump_as_json)


if __name__ == '__main__':

Laden…
Abbrechen
Speichern