#!/usr/bin/env python3 import argparse import json import re from typing import Any, Dict, List, Optional, Tuple COURSE_RE = re.compile(r"\b([A-Z]{2,4})\s*(\d{2,3}[A-Z]?)\b") # Clause boundaries: semicolons are strong AND separators at UIUC CLAUSE_SPLIT_RE = re.compile(r";+") def find_course_spans(text: str) -> List[Tuple[str, int, int]]: spans: List[Tuple[str, int, int]] = [] for m in COURSE_RE.finditer(text): course = f"{m.group(1)} {m.group(2)}" spans.append((course, m.start(), m.end())) return spans def normalize_space(s: str) -> str: return re.sub(r"\s+", " ", s).strip() def parse_clause_into_group(clause: str) -> Dict[str, Any]: clause_clean = normalize_space(clause) courses = find_course_spans(clause_clean) if not courses: return {"op": "EMPTY"} # Detect "one of" window: treat everything until boundary as OR one_of_match = re.search(r"\b(one of|any of)\b", clause_clean, flags=re.IGNORECASE) if one_of_match: # Take all courses in the clause as OR if they appear after the phrase start_idx = one_of_match.end() or_list = [c for (c, s, e) in courses if s >= start_idx] if or_list: # Also include any course tokens that appear BEFORE the one-of phrase as separate AND terms prior_courses = [c for (c, s, e) in courses if s < start_idx] items: List[Dict[str, Any]] = [] for c in prior_courses: items.append({"op": "COURSE", "course": c}) items.append({"op": "OR", "items": [{"op": "COURSE", "course": c} for c in or_list]}) return {"op": "AND", "items": items} if len(items) > 1 else items[0] # Otherwise, infer connectors between adjacent course tokens # Build pairwise connectors from text between tokens connectors: List[str] = [] for i in range(len(courses) - 1): _, _, end_prev = courses[i] _, start_next, _ = courses[i + 1] between = clause_clean[end_prev:start_next].lower() if "and/or" in between: connectors.append("OR") elif re.search(r"\band\b", between): connectors.append("AND") elif re.search(r"\bor\b", between): connectors.append("OR") else: # Default: comma-only separation; lean towards OR if followed by or earlier in span if "," in between: connectors.append("LIST") else: connectors.append("UNKNOWN") course_items = [{"op": "COURSE", "course": c} for (c, _, _) in courses] # If there is any explicit AND, group AND chunks; otherwise treat as OR if any OR, else LIST->OR if "AND" in connectors and "OR" not in connectors: return {"op": "AND", "items": course_items} if "OR" in connectors and "AND" not in connectors: return {"op": "OR", "items": course_items} if "AND" not in connectors and "OR" not in connectors: # All LIST/UNKNOWN: choose OR as a safer default for admissions like "A, B, or C" where last token has or if any(k == "LIST" for k in connectors): return {"op": "OR", "items": course_items} return {"op": "AND", "items": course_items} if len(course_items) > 1 else course_items[0] # Mixed AND and OR: build small AST by splitting on commas and respecting local conjunctions # Simple heuristic: split clause by commas, parse each segment for explicit AND/OR segments = [normalize_space(s) for s in re.split(r",+", clause_clean) if normalize_space(s)] subitems: List[Dict[str, Any]] = [] for seg in segments: seg_courses = find_course_spans(seg) if not seg_courses: continue if re.search(r"\band\b", seg.lower()) and not re.search(r"\bor\b", seg.lower()): subitems.append({"op": "AND", "items": [{"op": "COURSE", "course": c} for (c, _, _) in seg_courses]}) elif re.search(r"\bor\b", seg.lower()) and not re.search(r"\band\b", seg.lower()): subitems.append({"op": "OR", "items": [{"op": "COURSE", "course": c} for (c, _, _) in seg_courses]}) else: # ambiguous within segment; default to OR subitems.append({"op": "OR", "items": [{"op": "COURSE", "course": c} for (c, _, _) in seg_courses]}) if not subitems: subitems = [{"op": "COURSE", "course": c} for (c, _, _) in courses] # Combine segments with AND if split by semicolons at higher level; here stay at clause level # For mixed case within one clause, default to OR-over-segments unless explicit AND dominates and_count = sum(1 for s in subitems if s.get("op") == "AND") or_count = sum(1 for s in subitems if s.get("op") == "OR") if and_count and not or_count: return {"op": "AND", "items": subitems} if or_count and not and_count: return {"op": "OR", "items": subitems} # Mixed: wrap in AND of items that are groups; treat OR groups as single requirements groups return {"op": "AND", "items": subitems} def parse_prereq_text(text: str) -> Dict[str, Any]: # Split by semicolons into top-level AND clauses clauses = [normalize_space(c) for c in CLAUSE_SPLIT_RE.split(text) if normalize_space(c)] if not clauses: return {"hard": {"op": "EMPTY"}, "coreq_ok": {"op": "EMPTY"}} def is_coreq_clause(c: str) -> bool: c_low = c.lower() return ( ("concurrent" in c_low) or ("co-requisite" in c_low) or ("corequisite" in c_low) or re.search(r"credit\s+or\s+concurrent\s+(enrollment|registration)\s+in", c_low) is not None ) hard_groups: List[Dict[str, Any]] = [] coreq_groups: List[Dict[str, Any]] = [] for clause in clauses: grp = parse_clause_into_group(clause) if grp.get("op") == "EMPTY": continue if is_coreq_clause(clause): coreq_groups.append(grp) else: hard_groups.append(grp) def fold(groups: List[Dict[str, Any]]) -> Dict[str, Any]: if not groups: return {"op": "EMPTY"} if len(groups) == 1: return groups[0] return {"op": "AND", "items": groups} return {"hard": fold(hard_groups), "coreq_ok": fold(coreq_groups)} def main() -> int: ap = argparse.ArgumentParser(description="Parse course-only prerequisite text into AND/OR groups") ap.add_argument("input", default="data/analysis/course_only.json", nargs="?", help="Input JSON array of course-only prereqs") ap.add_argument("--output", default="data/parsed/course_only_parsed.json", help="Output JSON path") ap.add_argument("--unparsed-output", default="data/parsed/course_only_unparsed.json", help="Unparsed/empty output JSON path") args = ap.parse_args() with open(args.input, "r", encoding="utf-8") as f: data = json.load(f) parsed: List[Dict[str, Any]] = [] unparsed: List[Dict[str, Any]] = [] for item in data: raw = item.get("prerequisites") or "" ast = parse_prereq_text(raw) record = { "index": item.get("index"), "name": item.get("name"), "raw": raw, "ast": ast, } # Consider unparsed only if both hard and coreq_ok are EMPTY if (isinstance(ast, dict) and ast.get("hard", {}).get("op") == "EMPTY" and ast.get("coreq_ok", {}).get("op") == "EMPTY"): unparsed.append(record) else: parsed.append(record) import os os.makedirs("data/parsed", exist_ok=True) with open(args.output, "w", encoding="utf-8") as f: json.dump(parsed, f, ensure_ascii=False, indent=2) with open(args.unparsed_output, "w", encoding="utf-8") as f: json.dump(unparsed, f, ensure_ascii=False, indent=2) print(f"parsed: {len(parsed)}") print(f"unparsed: {len(unparsed)}") return 0 if __name__ == "__main__": raise SystemExit(main())