#!/usr/bin/env python3
# Provenance: recovered from the patch for commit
# 4aab4087dc97906d0b9890035401175cdaab32d4 (author "blackhao",
# Fri, 22 Aug 2025 02:51:50 -0500, subject "2.0"), which created
# scripts/fetch_uiuc_courses.py (1 file changed, 230 insertions, blob 0f38fdd).
import argparse
import concurrent.futures
import json
import re
import sys
import time
from dataclasses import dataclass, asdict
from typing import Dict, List, Optional, Tuple

import requests
from xml.etree import ElementTree as ET


BASE_URL = "https://courses.illinois.edu/cisapp/explorer/catalog"

# Element names that may carry the prerequisite text directly.
_PREREQUISITE_TAGS = ("prerequisites", "prerequisite", "Prerequisites", "Prerequisite")

# Elements whose free text may embed a "Prerequisite(s): ..." clause, in the
# order they are consulted.
_PREREQUISITE_FALLBACK_TAGS = ("courseSectionInformation", "description")

# Captures everything after "Prerequisite:" / "Prerequisites:" up to the end of
# the text (DOTALL, so the capture may span several lines). Compiled once at
# import time instead of on every call.
_PREREQUISITE_PATTERN = re.compile(
    r"Prerequisite[s]?:\s*(.*)$", flags=re.IGNORECASE | re.DOTALL
)


@dataclass
class CourseRecord:
    """One course entry assembled from the catalog XML."""

    # Course identifier. NOTE(review): its exact format is decided by code
    # outside this chunk -- confirm against get_course_details.
    index: str
    # Course title, or None when unavailable.
    name: Optional[str]
    # Catalog description, or None when unavailable.
    description: Optional[str]
    # Prerequisite text, or None when none could be extracted.
    prerequisites: Optional[str]


def parse_xml(content: bytes) -> ET.Element:
    """Parse raw XML bytes and return the root element.

    Raises:
        RuntimeError: if *content* is not well-formed XML. The underlying
            ``ET.ParseError`` is chained as ``__cause__``.
    """
    try:
        return ET.fromstring(content)
    except ET.ParseError as exc:
        # Explicit chaining keeps the parser's traceback attached to the
        # error callers actually see.
        raise RuntimeError(f"Failed to parse XML: {exc}") from exc


def fetch(session: requests.Session, url: str, timeout: float = 30) -> bytes:
    """GET *url* through *session* and return the raw response body.

    Args:
        session: Session used to issue the request.
        url: Absolute URL to retrieve.
        timeout: Value passed as ``timeout`` to ``session.get``; defaults to
            the previously hard-coded 30.

    Raises:
        RuntimeError: if the response status code is anything other than 200.
    """
    resp = session.get(url, timeout=timeout)
    if resp.status_code != 200:
        raise RuntimeError(f"GET {url} -> {resp.status_code}")
    return resp.content


def _collect_ids(root: ET.Element, tag: str) -> List[str]:
    """Return the non-empty ``id`` attributes of all *tag* descendants of *root*."""
    candidate_ids = (node.attrib.get("id") for node in root.findall(f".//{tag}"))
    return [node_id for node_id in candidate_ids if node_id]


def get_subject_ids(session: requests.Session, year: str, term: str) -> List[str]:
    """Return the ``id`` of every ``subject`` element listed for *year*/*term*.

    Raises:
        RuntimeError: propagated from ``fetch`` (non-200 response) or
            ``parse_xml`` (malformed XML).
    """
    url = f"{BASE_URL}/{year}/{term}.xml"
    return _collect_ids(parse_xml(fetch(session, url)), "subject")


def get_course_numbers_for_subject(
    session: requests.Session, year: str, term: str, subject: str
) -> List[str]:
    """Return the ``id`` of every ``course`` element listed under *subject*.

    Raises:
        RuntimeError: propagated from ``fetch`` (non-200 response) or
            ``parse_xml`` (malformed XML).
    """
    url = f"{BASE_URL}/{year}/{term}/{subject}.xml"
    return _collect_ids(parse_xml(fetch(session, url)), "course")


def extract_prerequisite_text(root: ET.Element) -> Optional[str]:
    """Return the prerequisite text found under *root*, or None.

    Lookup order:
      1. The first explicitly named prerequisite element with non-blank text.
      2. A "Prerequisite(s):" clause inside ``courseSectionInformation``.
      3. A "Prerequisite(s):" clause inside ``description``.

    Only each element's own leading ``.text`` is inspected. The returned
    string is stripped of surrounding whitespace.
    """
    # Prefer explicitly labeled prerequisite elements if present
    for tag in _PREREQUISITE_TAGS:
        found = root.find(f".//{tag}")
        if found is not None and found.text and found.text.strip():
            return found.text.strip()

    # Fallback: free text that contains a "Prerequisite:" clause
    for tag in _PREREQUISITE_FALLBACK_TAGS:
        node = root.find(f".//{tag}")
        if node is None or not node.text:
            continue
        match = _PREREQUISITE_PATTERN.search(node.text.strip())
        if match:
            return match.group(1).strip()

    return None


def get_course_details(session: requests.Session, year: str, term: str, subject: str, course_number: str) -> CourseRecord:
    url = f"{BASE_URL}/{year}/{term}/{subject}/{course_number}.xml"
    root = parse_xml(fetch(session, url))

    # Title/name may be in