# scripts/analyze_prereqs.py

#!/usr/bin/env python3
import argparse
import json
import re
from typing import Any, Dict, Iterable, List


# A course reference: 2-4 uppercase letters (subject), optional whitespace,
# then 2-3 digits with an optional trailing uppercase letter (number).
COURSE_TOKEN = re.compile(
    r"\b([A-Z]{2,4})\s*(\d{2,3}[A-Z]?)\b"
)

# Case-insensitive patterns; a hit on any one of them marks a prerequisite
# text as saying "no prerequisites".
NONE_PATTERNS = [
    re.compile(source, re.IGNORECASE)
    for source in (
        r"^\s*none\.?\s*$",
        r"no prerequisites",
        r"prerequisite[s]?:\s*none\b",
    )
]


def is_none_text(text: str) -> bool:
    """Report whether *text* says that there are no prerequisites.

    Surrounding whitespace is stripped, then the text is searched with each
    pattern in ``NONE_PATTERNS``; a single hit is enough.

    Args:
        text: Raw prerequisite text.

    Returns:
        True if any pattern in ``NONE_PATTERNS`` matches, otherwise False.
    """
    candidate = text.strip()
    for pattern in NONE_PATTERNS:
        if pattern.search(candidate):
            return True
    return False


def extract_course_refs(text: str) -> List[str]:
    """Collect every course reference found in *text*.

    Each ``COURSE_TOKEN`` match is rendered as ``"<subject> <number>"`` with a
    single space, regardless of the spacing used in the source text. Matches
    are returned in order of appearance; repeats are kept.

    Args:
        text: Prerequisite text to scan.

    Returns:
        The normalized course references, possibly empty.
    """
    return [
        f"{match.group(1)} {match.group(2)}"
        for match in COURSE_TOKEN.finditer(text)
    ]


# Regex fragments (searched against lowercased text) whose presence signals a
# requirement that is not simply "take this course". Grouped by theme; the
# concatenation order matches the order in which they are tried.
NON_COURSE_KEYWORDS = (
    # Someone has to sign off.
    [r"consent", r"permission", r"approval"]
    # Class standing.
    + [r"standing", r"senior", r"junior", r"sophomore", r"freshman"]
    # Program membership and enrollment restrictions.
    + [r"major", r"minor", r"program", r"restricted", r"enrollment", r"enrolled"]
    # Grades and credit amounts.
    + [r"gpa", r"grade", r"minimum", r"credit hour", r"credits?", r"hours?"]
    # Registration timing.
    + [r"registration", r"concurrent", r"co-requisite", r"corequisite"]
    # Who grants the exception.
    + [r"department", r"instructor"]
)

def has_non_course_requirements(text: str) -> bool:
    """Report whether *text* mentions any non-course requirement keyword.

    The text is lowercased and searched with each regex fragment in
    ``NON_COURSE_KEYWORDS``. Fragments are searched as written, so they also
    match inside longer words.

    Args:
        text: Prerequisite text to screen.

    Returns:
        True if at least one keyword fragment matches, otherwise False.
    """
    lowered = text.lower()
    for keyword in NON_COURSE_KEYWORDS:
        if re.search(keyword, lowered) is not None:
            return True
    return False


def is_course_only(text: str) -> bool:
    """Report whether *text* consists solely of course references and connectors.

    The check runs in two stages:

    1. Screen the text with ``has_non_course_requirements``; any keyword hit
       means the requirement is not course-only.
    2. Replace every ``COURSE_TOKEN`` match with the placeholder ``COURSE``,
       strip punctuation, connector words and the quantifier "at least", and
       accept the text only if nothing but placeholders (or nothing at all)
       is left.

    Args:
        text: Prerequisite text to classify.

    Returns:
        True if only course tokens and connectors are present, otherwise False.
    """
    t = text.strip()

    # "credit in <course>" is a plain course connector, but the bare word
    # "credit" is also a non-course keyword. Screen a copy with the connector
    # removed so that it cannot veto an otherwise course-only requirement.
    screened = re.sub(r"\bcredit in\b", " ", t, flags=re.IGNORECASE)
    if has_non_course_requirements(screened):
        return False

    # Remove course tokens, then see if any nontrivial tokens remain besides
    # basic connectors.
    placeholder = COURSE_TOKEN.sub("COURSE", t)
    # Remove punctuation.
    simplified = re.sub(r"[(),.;]", " ", placeholder)
    # Remove connectors. Alternatives are tried left to right, so "and/or"
    # must precede "and"; otherwise "and" wins and a stray "/" is left behind.
    simplified = re.sub(
        r"\b(and/or|and|or|either|both|one of|two of|with|credit in)\b",
        " ",
        simplified,
        flags=re.IGNORECASE,
    )
    # Remove common quantifiers.
    simplified = re.sub(r"\b(at\s+least)\b", " ", simplified, flags=re.IGNORECASE)
    # Collapse whitespace.
    simplified = re.sub(r"\s+", " ", simplified).strip()
    # If empty or only COURSE placeholders are left, treat as course-only.
    return simplified == "" or re.fullmatch(r"(COURSE\s*)+", simplified) is not None


def analyze(courses: Iterable[Dict[str, Any]]) -> Dict[str, List[Dict[str, Any]]]:
    """Sort course records into buckets by the shape of their prerequisite text.

    Each record's ``"prerequisites"`` value (a missing or falsy value counts as
    an empty string) decides the bucket:

    * ``"none"``: blank text, or text accepted by ``is_none_text``. The
      original record is stored unchanged.
    * ``"course_only"``: text accepted by ``is_course_only``. A summary with
      ``index``, ``name``, ``prerequisites`` and the extracted ``courses`` is
      stored.
    * ``"remaining"``: everything else. A summary with ``index``, ``name``
      and ``prerequisites`` is stored.

    Args:
        courses: Course records to classify.

    Returns:
        A dict with the keys ``"none"``, ``"course_only"`` and ``"remaining"``,
        each mapping to a list in input order.
    """
    no_prereqs: List[Dict[str, Any]] = []
    course_only: List[Dict[str, Any]] = []
    remaining: List[Dict[str, Any]] = []

    for course in courses:
        prereq_text = course.get("prerequisites") or ""

        if not prereq_text.strip() or is_none_text(prereq_text):
            no_prereqs.append(course)
            continue

        entry = {
            "index": course.get("index"),
            "name": course.get("name"),
            "prerequisites": prereq_text,
        }
        if is_course_only(prereq_text):
            # Added last so that "courses" stays the final key of the entry.
            entry["courses"] = extract_course_refs(prereq_text)
            course_only.append(entry)
        else:
            remaining.append(entry)

    return {
        "none": no_prereqs,
        "course_only": course_only,
        "remaining": remaining,
    }


def main() -> int:
    """Command-line entry point: classify courses and write one JSON file per bucket.

    Reads a JSON document from the ``input`` argument (default
    ``data/courses.json``), passes it to ``analyze``, and writes ``none.json``,
    ``course_only.json`` and ``remaining.json`` into ``--outdir`` (default
    ``data/analysis``), creating the directory if needed. The size of each
    bucket is then printed.

    Returns:
        0 on completion.
    """
    import os

    parser = argparse.ArgumentParser(description="Analyze UIUC course prerequisite text")
    parser.add_argument("input", default="data/courses.json", nargs="?", help="Input courses JSON array")
    parser.add_argument("--outdir", default="data/analysis", help="Output directory")
    options = parser.parse_args()

    with open(options.input, "r", encoding="utf-8") as source:
        records = json.load(source)

    report = analyze(records)

    os.makedirs(options.outdir, exist_ok=True)
    outputs = (
        ("none", "none.json"),
        ("course_only", "course_only.json"),
        ("remaining", "remaining.json"),
    )
    for bucket, filename in outputs:
        target = os.path.join(options.outdir, filename)
        with open(target, "w", encoding="utf-8") as sink:
            json.dump(report[bucket], sink, ensure_ascii=False, indent=2)

    print(f"none: {len(report['none'])}")
    print(f"course_only: {len(report['course_only'])}")
    print(f"remaining: {len(report['remaining'])}")
    return 0


# Run the CLI only when this file is executed as a script; the value returned
# by main() is passed to SystemExit and so becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())