#!/usr/bin/env python3
# Provenance: scripts/analyze_prereqs.py as added by commit
# 4aab4087dc97906d0b9890035401175cdaab32d4 ("2.0").
"""Sort course records into buckets based on their free-text prerequisites.

Each course is placed in exactly one of three buckets:

* ``none``        -- no prerequisite text, or text that says "none";
* ``course_only`` -- text made up solely of course references (for example
  ``CS 225``) joined by simple connectors such as "and", "or", "one of";
* ``remaining``   -- everything else (consent, standing, GPA, ...).

Run as a script, it reads a JSON array of course objects and writes one JSON
file per bucket into the output directory.
"""
import argparse
import json
import os
import re
from typing import Any, Dict, Iterable, List


# A course reference: 2-4 uppercase subject letters, optional whitespace, then
# a 2-3 digit number with an optional trailing uppercase letter.
COURSE_TOKEN = re.compile(r"\b([A-Z]{2,4})\s*(\d{2,3}[A-Z]?)\b")

# Phrasings that mean "this course has no prerequisites".
NONE_PATTERNS = [
    re.compile(r"^\s*none\.?\s*$", re.IGNORECASE),
    re.compile(r"no prerequisites", re.IGNORECASE),
    re.compile(r"prerequisite[s]?:\s*none\b", re.IGNORECASE),
]

# Regex fragments (searched against lower-cased text) that signal a
# requirement other than "has taken course X".
NON_COURSE_KEYWORDS = [
    r"consent", r"permission", r"approval",
    r"standing", r"senior", r"junior", r"sophomore", r"freshman",
    r"major", r"minor", r"program", r"restricted", r"enrollment", r"enrolled",
    r"gpa", r"grade", r"minimum", r"credit hour",
    # "credit in <course>" is a plain course requirement and is handled as a
    # connector in is_course_only(), so it must not be flagged here.
    r"credits?(?!\s+in\b)",
    r"hours?",
    r"registration", r"concurrent", r"co-requisite", r"corequisite",
    r"department", r"instructor",
]

# Helpers for is_course_only(), compiled once at import time.
_PUNCTUATION = re.compile(r"[(),.;]")
# "and/or" must precede "and"/"or": alternation is tried left to right, and
# matching "and" first would strand the "/" in the simplified text.
_CONNECTORS = re.compile(
    r"\b(and/or|and|or|either|both|one\s+of|two\s+of|with|credit\s+in)\b",
    re.IGNORECASE,
)
_QUANTIFIERS = re.compile(r"\b(at\s+least)\b", re.IGNORECASE)
_WHITESPACE = re.compile(r"\s+")
_ONLY_PLACEHOLDERS = re.compile(r"(COURSE\s*)+")


def is_none_text(text: str) -> bool:
    """Return True if *text* matches any of ``NONE_PATTERNS``."""
    t = text.strip()
    return any(p.search(t) for p in NONE_PATTERNS)


def extract_course_refs(text: str) -> List[str]:
    """Return every course reference in *text*, normalized to ``"SUBJ NUM"``.

    References are returned in order of appearance; duplicates are kept.
    """
    return [f"{m.group(1)} {m.group(2)}" for m in COURSE_TOKEN.finditer(text)]


def has_non_course_requirements(text: str) -> bool:
    """Return True if *text* contains any ``NON_COURSE_KEYWORDS`` pattern.

    Matching is done on the lower-cased text and is substring-based (the
    keyword patterns carry no word boundaries).
    """
    t = text.lower()
    return any(re.search(k, t) for k in NON_COURSE_KEYWORDS)


def is_course_only(text: str) -> bool:
    """Return True if *text* consists only of course references and connectors.

    The text is rejected outright when it mentions a non-course requirement.
    Otherwise course references are replaced by a placeholder, then
    punctuation, connectors ("and", "or", "and/or", "one of", "credit in",
    ...) and the quantifier "at least" are stripped. The text is course-only
    when nothing but placeholders (or nothing at all) remains.
    """
    t = text.strip()
    if has_non_course_requirements(t):
        return False
    simplified = COURSE_TOKEN.sub("COURSE", t)
    # Punctuation is removed before connectors so that, e.g., "CS 225;one of"
    # still exposes "one of" on word boundaries.
    simplified = _PUNCTUATION.sub(" ", simplified)
    simplified = _CONNECTORS.sub(" ", simplified)
    simplified = _QUANTIFIERS.sub(" ", simplified)
    simplified = _WHITESPACE.sub(" ", simplified).strip()
    return simplified == "" or _ONLY_PLACEHOLDERS.fullmatch(simplified) is not None


def analyze(courses: Iterable[Dict[str, Any]]) -> Dict[str, List[Dict[str, Any]]]:
    """Bucket *courses* by the kind of prerequisite text they carry.

    Args:
        courses: Course dicts. The ``"prerequisites"``, ``"index"`` and
            ``"name"`` keys are read with ``dict.get``; a missing or falsy
            ``"prerequisites"`` is treated as empty text.

    Returns:
        A dict with three lists:

        * ``"none"`` -- the original course dicts whose prerequisite text is
          blank or matches ``is_none_text``;
        * ``"course_only"`` -- summaries with ``index``, ``name``,
          ``prerequisites`` and the extracted ``courses`` list;
        * ``"remaining"`` -- summaries with ``index``, ``name`` and
          ``prerequisites``.
    """
    results: Dict[str, List[Dict[str, Any]]] = {
        "none": [],
        "course_only": [],
        "remaining": [],
    }

    for c in courses:
        prereq = c.get("prerequisites") or ""
        if not prereq.strip() or is_none_text(prereq):
            results["none"].append(c)
            continue
        if is_course_only(prereq):
            results["course_only"].append({
                "index": c.get("index"),
                "name": c.get("name"),
                "prerequisites": prereq,
                "courses": extract_course_refs(prereq),
            })
        else:
            results["remaining"].append({
                "index": c.get("index"),
                "name": c.get("name"),
                "prerequisites": prereq,
            })
    return results


def main() -> int:
    """Command-line entry point.

    Loads the input JSON, runs ``analyze`` on it, writes ``none.json``,
    ``course_only.json`` and ``remaining.json`` into the output directory
    (created if needed), prints the bucket sizes and returns 0.
    """
    ap = argparse.ArgumentParser(description="Analyze UIUC course prerequisite text")
    ap.add_argument("input", default="data/courses.json", nargs="?", help="Input courses JSON array")
    ap.add_argument("--outdir", default="data/analysis", help="Output directory")
    args = ap.parse_args()

    with open(args.input, "r", encoding="utf-8") as f:
        data = json.load(f)

    res = analyze(data)

    os.makedirs(args.outdir, exist_ok=True)
    buckets = ("none", "course_only", "remaining")
    for bucket in buckets:
        out_path = os.path.join(args.outdir, f"{bucket}.json")
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(res[bucket], f, ensure_ascii=False, indent=2)

    for bucket in buckets:
        print(f"{bucket}: {len(res[bucket])}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())