summaryrefslogtreecommitdiff
path: root/scripts/parse_course_prereqs.py
blob: 609303c312d63fe055dd7a1aa2f1ed4dcb7b6ebd (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
#!/usr/bin/env python3
import argparse
import json
import re
from typing import Any, Dict, List, Optional, Tuple


# Matches one course reference: a 2-4 uppercase-letter subject code, optional
# whitespace, then a 2-3 digit number with an optional trailing uppercase
# letter. Group 1 captures the subject code, group 2 the number (with suffix).
COURSE_RE = re.compile(r"\b([A-Z]{2,4})\s*(\d{2,3}[A-Z]?)\b")

# Clause boundaries: semicolons are strong AND separators at UIUC.
# A run of consecutive semicolons is treated as a single boundary.
CLAUSE_SPLIT_RE = re.compile(r";+")


def find_course_spans(text: str) -> List[Tuple[str, int, int]]:
    """Locate every course reference in *text*.

    Returns a list of ``(course, start, end)`` tuples in order of appearance,
    where ``course`` is the subject code and number joined by a single space
    and ``start``/``end`` are the match offsets within *text*.
    """
    return [
        (" ".join(hit.group(1, 2)), hit.start(), hit.end())
        for hit in COURSE_RE.finditer(text)
    ]


def normalize_space(s: str) -> str:
    """Collapse each whitespace run in *s* to one space and trim both ends."""
    collapsed = re.sub(r"\s+", " ", s)
    return collapsed.strip()


def parse_clause_into_group(clause: str) -> Dict[str, Any]:
    """Turn one semicolon-free clause into an AND/OR tree of course nodes.

    The clause is whitespace-normalized and scanned for course references.
    The result is one of:

    * ``{"op": "EMPTY"}`` when the clause names no course;
    * ``{"op": "COURSE", "course": ...}`` for a lone course;
    * ``{"op": "AND" | "OR", "items": [...]}`` otherwise.

    Heuristics, applied in order:

    1. A "one of" / "any of" phrase followed by courses: the courses after the
       phrase form an OR group; courses before it are ANDed with that group.
    2. The text between adjacent courses is classified as AND, OR, LIST
       (comma only) or UNKNOWN. If only one of AND/OR is present, it governs
       the whole clause. If neither is present, a comma list becomes OR and
       anything else becomes AND (or the bare course if there is only one).
    3. If both AND and OR occur, the clause is split on commas and each
       segment becomes its own group, which are then combined.
    """

    def leaf(name: str) -> Dict[str, Any]:
        # Single-course node of the tree.
        return {"op": "COURSE", "course": name}

    text = normalize_space(clause)
    spans = find_course_spans(text)
    if not spans:
        return {"op": "EMPTY"}

    # --- Rule 1: explicit "one of" / "any of" window -----------------------
    phrase = re.search(r"\b(one of|any of)\b", text, flags=re.IGNORECASE)
    if phrase is not None:
        cut = phrase.end()
        alternatives = [leaf(name) for name, start, _ in spans if start >= cut]
        if alternatives:
            # Courses named before the phrase are separate required terms.
            required = [leaf(name) for name, start, _ in spans if start < cut]
            required.append({"op": "OR", "items": alternatives})
            if len(required) == 1:
                return required[0]
            return {"op": "AND", "items": required}
        # No course follows the phrase: fall through to connector inference.

    # --- Rule 2: classify the text between each adjacent pair of courses ---
    # Only the presence of each kind matters below, so a set is sufficient.
    kinds = set()
    for (_, _, prev_end), (_, next_start, _) in zip(spans, spans[1:]):
        gap = text[prev_end:next_start].lower()
        if "and/or" in gap:
            # Checked first so that "and/or" is not mistaken for a plain "and".
            kinds.add("OR")
        elif re.search(r"\band\b", gap):
            kinds.add("AND")
        elif re.search(r"\bor\b", gap):
            kinds.add("OR")
        elif "," in gap:
            kinds.add("LIST")
        else:
            kinds.add("UNKNOWN")

    leaves = [leaf(name) for name, _, _ in spans]
    has_and = "AND" in kinds
    has_or = "OR" in kinds

    if has_and != has_or:
        # Exactly one explicit conjunction kind: it governs the whole clause.
        return {"op": "AND" if has_and else "OR", "items": leaves}

    if not has_and:
        # Neither AND nor OR was seen.
        if "LIST" in kinds:
            # Bare comma list: OR is the chosen default.
            return {"op": "OR", "items": leaves}
        if len(leaves) > 1:
            return {"op": "AND", "items": leaves}
        return leaves[0]

    # --- Rule 3: both AND and OR appear; split on commas --------------------
    groups: List[Dict[str, Any]] = []
    for piece in re.split(r",+", text):
        piece = normalize_space(piece)
        if not piece:
            continue
        names = [name for name, _, _ in find_course_spans(piece)]
        if not names:
            continue
        lowered = piece.lower()
        # A segment is an AND group only when it says "and" and never "or";
        # every other segment (explicit "or", both, or neither) becomes OR.
        and_only = re.search(r"\band\b", lowered) and not re.search(r"\bor\b", lowered)
        groups.append(
            {"op": "AND" if and_only else "OR", "items": [leaf(n) for n in names]}
        )

    if not groups:
        groups = leaves

    # Segments are ORed together only when every one of them is an OR group;
    # any AND group (or the bare-course fallback) yields an AND over segments.
    ops = {group["op"] for group in groups}
    if "OR" in ops and "AND" not in ops:
        return {"op": "OR", "items": groups}
    return {"op": "AND", "items": groups}


def parse_prereq_text(text: str) -> Dict[str, Any]:
    """Parse a full prerequisite string into hard and co-requisite trees.

    The text is split on semicolons; each non-blank clause is parsed with
    ``parse_clause_into_group``. Clauses that yield no courses are dropped.
    Clauses mentioning concurrency / co-requisites go into ``"coreq_ok"``,
    all others into ``"hard"``. Within each bucket, multiple clauses are
    joined by AND, a single clause is returned as-is, and an empty bucket is
    ``{"op": "EMPTY"}``.

    Returns a dict with exactly the keys ``"hard"`` and ``"coreq_ok"``.
    """
    normalized = (normalize_space(part) for part in CLAUSE_SPLIT_RE.split(text))
    clauses = [part for part in normalized if part]
    if not clauses:
        return {"hard": {"op": "EMPTY"}, "coreq_ok": {"op": "EMPTY"}}

    def is_coreq_clause(c: str) -> bool:
        # True when the clause wording allows concurrent enrollment.
        lowered = c.lower()
        if any(word in lowered for word in ("concurrent", "co-requisite", "corequisite")):
            return True
        pattern = r"credit\s+or\s+concurrent\s+(enrollment|registration)\s+in"
        return re.search(pattern, lowered) is not None

    def fold(groups: List[Dict[str, Any]]) -> Dict[str, Any]:
        # Combine the clause groups of one bucket into a single node.
        if len(groups) > 1:
            return {"op": "AND", "items": groups}
        if groups:
            return groups[0]
        return {"op": "EMPTY"}

    hard_groups: List[Dict[str, Any]] = []
    coreq_groups: List[Dict[str, Any]] = []
    for clause in clauses:
        group = parse_clause_into_group(clause)
        if group.get("op") == "EMPTY":
            continue
        bucket = coreq_groups if is_coreq_clause(clause) else hard_groups
        bucket.append(group)

    return {"hard": fold(hard_groups), "coreq_ok": fold(coreq_groups)}


def main() -> int:
    """Command-line entry point: parse prerequisite text for every input record.

    Reads a JSON array from the ``input`` path, runs ``parse_prereq_text`` on
    each item's ``"prerequisites"`` value (a missing or falsy value is treated
    as an empty string), and writes two JSON files:

    * ``--output``: records for which at least one of the ``hard`` /
      ``coreq_ok`` trees is non-empty;
    * ``--unparsed-output``: records for which both trees are ``EMPTY``.

    Each record carries the item's ``index`` and ``name``, the raw text and
    the parsed ``ast``. Counts of both lists are printed to stdout.

    Returns:
        0 on success (used as the process exit status).
    """
    import os

    ap = argparse.ArgumentParser(description="Parse course-only prerequisite text into AND/OR groups")
    ap.add_argument("input", default="data/analysis/course_only.json", nargs="?", help="Input JSON array of course-only prereqs")
    ap.add_argument("--output", default="data/parsed/course_only_parsed.json", help="Output JSON path")
    ap.add_argument("--unparsed-output", default="data/parsed/course_only_unparsed.json", help="Unparsed/empty output JSON path")
    args = ap.parse_args()

    with open(args.input, "r", encoding="utf-8") as f:
        data = json.load(f)

    parsed: List[Dict[str, Any]] = []
    unparsed: List[Dict[str, Any]] = []

    for item in data:
        raw = item.get("prerequisites") or ""
        ast = parse_prereq_text(raw)
        record = {
            "index": item.get("index"),
            "name": item.get("name"),
            "raw": raw,
            "ast": ast,
        }
        # Consider unparsed only if both hard and coreq_ok are EMPTY
        nothing_found = (
            ast.get("hard", {}).get("op") == "EMPTY"
            and ast.get("coreq_ok", {}).get("op") == "EMPTY"
        )
        (unparsed if nothing_found else parsed).append(record)

    # Create the directories of the paths actually being written, rather than
    # a fixed "data/parsed", so custom --output / --unparsed-output locations
    # work. A bare filename has no parent directory and needs nothing created.
    for out_path in (args.output, args.unparsed_output):
        parent = os.path.dirname(out_path)
        if parent:
            os.makedirs(parent, exist_ok=True)

    with open(args.output, "w", encoding="utf-8") as f:
        json.dump(parsed, f, ensure_ascii=False, indent=2)
    with open(args.unparsed_output, "w", encoding="utf-8") as f:
        json.dump(unparsed, f, ensure_ascii=False, indent=2)

    print(f"parsed: {len(parsed)}")
    print(f"unparsed: {len(unparsed)}")
    return 0


# Run the CLI only when this file is executed as a script; the integer
# returned by main() is passed to SystemExit and becomes the exit status.
if __name__ == "__main__":
    raise SystemExit(main())