scripts/build_final_parsed.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107

#!/usr/bin/env python3
import argparse
import json
import os
import re
import sys
from typing import Any, Dict, List


# Ensure we can import sibling script: prepend this file's own directory to
# sys.path (only if it is not already listed) so that the
# parse_course_prereqs module imported below can be found there.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
if SCRIPT_DIR not in sys.path:
    sys.path.insert(0, SCRIPT_DIR)

# Must stay after the sys.path adjustment above.
from parse_course_prereqs import parse_prereq_text  # type: ignore


# A course reference: a 2-4 capital-letter prefix, optional whitespace, then a
# 2-3 digit number with an optional trailing capital letter. Group 1 is the
# prefix, group 2 the number.
COURSE_TOKEN = re.compile(r"\b([A-Z]{2,4})\s*(\d{2,3}[A-Z]?)\b")

# Prerequisite clauses are separated by one or more semicolons.
CLAUSE_SPLIT_RE = re.compile(r";+")

# Regex fragments for wording that, judging by the name, marks a requirement
# that is not a course. NOTE(review): nothing in the visible part of this
# script reads this list -- confirm whether another module imports it.
NON_COURSE_KEYWORDS = [
    r"consent",
    r"permission",
    r"approval",
    r"standing",
    r"senior",
    r"junior",
    r"sophomore",
    r"freshman",
    r"major",
    r"minor",
    r"program",
    r"restricted",
    r"enrollment",
    r"enrolled",
    r"gpa",
    r"grade",
    r"minimum",
    r"credit hour",
    r"credits?",
    r"hours?",
    r"registration",
    r"concurrent",
    r"co-requisite",
    r"corequisite",
    r"department",
    r"instructor",
]


def has_course_token(s: str) -> bool:
    """Report whether *s* contains at least one COURSE_TOKEN match."""
    found = COURSE_TOKEN.search(s)
    return found is not None


# (compiled pattern, flag name) pairs, built once at import time instead of on
# every call. Patterns are applied to lower-cased text, so they are written in
# lower case.
_FLAG_PATTERNS = (
    (re.compile(r"consent|permission|approval"), "CONSENT"),
    # The leading \b stops "understanding"/"outstanding" from being read as
    # class standing; freshm[ae]n also accepts the plural "freshmen".
    (re.compile(r"\bstanding|senior|junior|sophomore|freshm[ae]n"), "STANDING"),
    # (?!ming) stops "programming" from being read as a degree program.
    (
        re.compile(r"major|minor|program(?!ming)|restricted|enrollment|enrolled"),
        "MAJOR_OR_PROGRAM",
    ),
    (re.compile(r"gpa|grade|minimum"), "GRADE_OR_GPA"),
    (re.compile(r"concurrent|co-requisite|corequisite"), "COREQ_ALLOWED"),
    (re.compile(r"department|instructor"), "DEPT_OR_INSTRUCTOR"),
)


def detect_flags(text: str) -> List[str]:
    """Return the non-course requirement flags whose keywords occur in *text*.

    Matching is case-insensitive (the text is lower-cased first). Each flag is
    reported at most once.

    Args:
        text: Prerequisite text, or a single clause of it.

    Returns:
        Alphabetically sorted flag names; an empty list when nothing matches.
    """
    lowered = text.lower()
    return sorted(
        {name for pattern, name in _FLAG_PATTERNS if pattern.search(lowered)}
    )


def main() -> int:
    """Parse every course's prerequisite text and write the combined JSON.

    Loads the course list from the positional ``input`` path, runs
    ``parse_prereq_text`` on each entry's ``prerequisites`` string, and writes
    one record per course to ``--output`` (creating its directory if needed).
    A JSON object of counters is printed to stdout at the end.

    Returns:
        Always 0, which the caller uses as the process exit status.
    """
    parser = argparse.ArgumentParser(description="Build final parsed JSON for all courses")
    parser.add_argument("input", nargs="?", default="data/courses.json", help="Input courses.json")
    parser.add_argument("--output", default="data/courses_parsed.json", help="Output JSON path")
    args = parser.parse_args()

    with open(args.input, "r", encoding="utf-8") as src:
        courses = json.load(src)

    stats = {"total": 0, "hard_nonempty": 0, "coreq_nonempty": 0}
    records: List[Dict[str, Any]] = []

    for course in courses:
        stats["total"] += 1
        raw = (course.get("prerequisites") or "").strip()
        parsed = parse_prereq_text(raw)

        # Anything other than a dict from the parser is treated as "no prerequisites".
        if isinstance(parsed, dict):
            hard = parsed.get("hard")
            coreq_ok = parsed.get("coreq_ok")
        else:
            hard = {"op": "EMPTY"}
            coreq_ok = {"op": "EMPTY"}

        # Count only trees that are present and whose op is not "EMPTY".
        for tree, counter in ((hard, "hard_nonempty"), (coreq_ok, "coreq_nonempty")):
            if tree and tree.get("op") != "EMPTY":
                stats[counter] += 1

        # Keep, for reference, each non-blank clause that either names no
        # course or triggers at least one flag.
        stripped_parts = (part.strip() for part in CLAUSE_SPLIT_RE.split(raw))
        notes: List[str] = [
            clause
            for clause in stripped_parts
            if clause and (not has_course_token(clause) or detect_flags(clause))
        ]

        prereq_info = {
            "raw": raw or None,
            "hard": hard,
            "coreq_ok": coreq_ok,
            "flags": detect_flags(raw) if raw else [],
            "notes": notes,
        }
        records.append({
            "index": course.get("index"),
            "name": course.get("name"),
            "description": course.get("description"),
            "prerequisites": prereq_info,
        })

    # A bare filename has an empty dirname; fall back to the current directory.
    out_dir = os.path.dirname(args.output) or "."
    os.makedirs(out_dir, exist_ok=True)
    with open(args.output, "w", encoding="utf-8") as dst:
        json.dump(records, dst, ensure_ascii=False, indent=2)

    print(json.dumps(stats))
    return 0


# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())