diff options
| author | blackhao <13851610112@163.com> | 2025-08-22 02:51:50 -0500 |
|---|---|---|
| committer | blackhao <13851610112@163.com> | 2025-08-22 02:51:50 -0500 |
| commit | 4aab4087dc97906d0b9890035401175cdaab32d4 (patch) | |
| tree | 4e2e9d88a711ec5b1cfa02e8ac72a55183b99123 /scripts/parse_course_prereqs.py | |
| parent | afa8f50d1d21c721dabcb31ad244610946ab65a3 (diff) | |
2.0
Diffstat (limited to 'scripts/parse_course_prereqs.py')
| -rw-r--r-- | scripts/parse_course_prereqs.py | 190 |
1 files changed, 190 insertions, 0 deletions
#!/usr/bin/env python3
"""Parse course-only prerequisite text into nested AND/OR groups.

The input is a JSON array of objects; for each object the ``index``,
``name`` and ``prerequisites`` keys are read.  The prerequisite string is
split into clauses on semicolons, each clause is turned into a small tree of
``{"op": ...}`` nodes, and the result is written to two JSON files: records
that produced at least one course, and records that produced none.

Node shapes produced by this module:

* ``{"op": "EMPTY"}`` -- no course token was found.
* ``{"op": "COURSE", "course": "CS 225"}`` -- a single course.
* ``{"op": "AND" | "OR", "items": [...]}`` -- a group of child nodes.
"""
import argparse
import json
import os
import re
from typing import Any, Dict, List, Optional, Sequence, Tuple


COURSE_RE = re.compile(r"\b([A-Z]{2,4})\s*(\d{2,3}[A-Z]?)\b")

# Clause boundaries: semicolons are strong AND separators at UIUC
CLAUSE_SPLIT_RE = re.compile(r";+")

# Substrings (matched case-insensitively) that mark a clause as a co-requisite.
# "concurrent" also covers phrases such as
# "credit or concurrent enrollment/registration in".
_COREQ_KEYWORDS = ("concurrent", "co-requisite", "corequisite")


def find_course_spans(text: str) -> List[Tuple[str, int, int]]:
    """Return every course token in *text* with its character span.

    Each result is ``(course, start, end)`` where ``course`` is the subject
    and number joined by a single space (``"CS225"`` -> ``"CS 225"``) and
    ``start``/``end`` are the match offsets in *text*.
    """
    return [
        (f"{m.group(1)} {m.group(2)}", m.start(), m.end())
        for m in COURSE_RE.finditer(text)
    ]


def normalize_space(s: str) -> str:
    """Collapse every run of whitespace in *s* to one space and strip the ends."""
    return re.sub(r"\s+", " ", s).strip()


def _course_node(course: str) -> Dict[str, Any]:
    """Build the leaf node for a single course."""
    return {"op": "COURSE", "course": course}


def _group(op: str, items: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Build an AND/OR group node over *items*."""
    return {"op": op, "items": items}


def _classify_connector(between: str) -> str:
    """Classify the text between two adjacent course tokens.

    Returns ``"OR"``, ``"AND"``, ``"LIST"`` (comma only) or ``"UNKNOWN"``.
    """
    text = between.lower()
    # "and/or" must be tested first: "/" is a word boundary, so the plain
    # "and" pattern below would otherwise claim it.
    if "and/or" in text:
        return "OR"
    if re.search(r"\band\b", text):
        return "AND"
    if re.search(r"\bor\b", text):
        return "OR"
    return "LIST" if "," in text else "UNKNOWN"


def _parse_mixed_clause(
    clause_clean: str, courses: List[Tuple[str, int, int]]
) -> Dict[str, Any]:
    """Parse a clause whose connectors contain both AND and OR.

    The clause is split on commas; each comma-separated segment that contains
    courses becomes its own group (AND only when the segment says "and" and
    not "or", otherwise OR).  The segments are then combined with OR when
    every segment group is an OR, and with AND in all other cases.
    """
    subitems: List[Dict[str, Any]] = []
    for raw_segment in re.split(r",+", clause_clean):
        segment = normalize_space(raw_segment)
        if not segment:
            continue
        seg_courses = find_course_spans(segment)
        if not seg_courses:
            continue
        seg_low = segment.lower()
        has_and = re.search(r"\band\b", seg_low) is not None
        has_or = re.search(r"\bor\b", seg_low) is not None
        # Ambiguous segments (both or neither conjunction) default to OR.
        seg_op = "AND" if has_and and not has_or else "OR"
        subitems.append(_group(seg_op, [_course_node(c) for c, _, _ in seg_courses]))

    if not subitems:
        subitems = [_course_node(c) for c, _, _ in courses]

    and_count = sum(1 for node in subitems if node.get("op") == "AND")
    or_count = sum(1 for node in subitems if node.get("op") == "OR")
    combined_op = "OR" if or_count and not and_count else "AND"
    return _group(combined_op, subitems)


def parse_clause_into_group(clause: str) -> Dict[str, Any]:
    """Turn one semicolon-free clause into an AND/OR tree of courses.

    Heuristics, in order:

    1. No course token -> ``{"op": "EMPTY"}``.
    2. A "one of"/"any of" phrase: courses after the phrase form one OR
       group; courses before it are separate required terms AND-ed with it.
    3. Otherwise the text between adjacent courses is classified; a clause
       with only AND connectors becomes AND, only OR connectors becomes OR,
       a comma-only list becomes OR, and a lone course is returned bare.
    4. A clause mixing AND and OR is split on commas and parsed per segment.
    """
    clause_clean = normalize_space(clause)
    courses = find_course_spans(clause_clean)
    if not courses:
        return {"op": "EMPTY"}

    one_of_match = re.search(r"\b(one of|any of)\b", clause_clean, flags=re.IGNORECASE)
    if one_of_match:
        pivot = one_of_match.end()
        or_list = [c for c, start, _ in courses if start >= pivot]
        if or_list:
            items = [_course_node(c) for c, start, _ in courses if start < pivot]
            items.append(_group("OR", [_course_node(c) for c in or_list]))
            return _group("AND", items) if len(items) > 1 else items[0]
        # The phrase came after every course: fall through to connector logic.

    connectors = [
        _classify_connector(clause_clean[prev_end:next_start])
        for (_, _, prev_end), (_, next_start, _) in zip(courses, courses[1:])
    ]
    course_items = [_course_node(c) for c, _, _ in courses]
    has_and = "AND" in connectors
    has_or = "OR" in connectors

    if has_and and has_or:
        return _parse_mixed_clause(clause_clean, courses)
    if has_and:
        return _group("AND", course_items)
    if has_or:
        return _group("OR", course_items)

    # Only LIST/UNKNOWN connectors (or a single course) remain.
    if "LIST" in connectors:
        return _group("OR", course_items)
    if len(course_items) > 1:
        return _group("AND", course_items)
    return course_items[0]


def parse_prereq_text(text: str) -> Dict[str, Any]:
    """Parse a full prerequisite string.

    The text is split on semicolons into clauses that are AND-ed together.
    Clauses mentioning concurrent enrollment / co-requisites are collected
    under ``"coreq_ok"``; all other clauses go under ``"hard"``.  Clauses
    without any course token are dropped.

    Returns a dict ``{"hard": node, "coreq_ok": node}`` where each node is
    ``{"op": "EMPTY"}``, the single parsed clause, or an AND over the clauses.
    """
    hard_groups: List[Dict[str, Any]] = []
    coreq_groups: List[Dict[str, Any]] = []

    for piece in CLAUSE_SPLIT_RE.split(text):
        clause = normalize_space(piece)
        if not clause:
            continue
        group = parse_clause_into_group(clause)
        if group.get("op") == "EMPTY":
            continue
        clause_low = clause.lower()
        is_coreq = any(keyword in clause_low for keyword in _COREQ_KEYWORDS)
        (coreq_groups if is_coreq else hard_groups).append(group)

    def fold(groups: List[Dict[str, Any]]) -> Dict[str, Any]:
        if not groups:
            return {"op": "EMPTY"}
        if len(groups) == 1:
            return groups[0]
        return {"op": "AND", "items": groups}

    return {"hard": fold(hard_groups), "coreq_ok": fold(coreq_groups)}


def main(argv: Optional[Sequence[str]] = None) -> int:
    """Command-line entry point.

    Args:
        argv: Argument list to parse; ``None`` (the default) makes argparse
            read ``sys.argv[1:]``.

    Returns:
        ``0`` on success.

    Reads the input JSON array, parses each item's ``prerequisites`` string,
    and writes parsed and unparsed records to the two output paths.
    """
    ap = argparse.ArgumentParser(description="Parse course-only prerequisite text into AND/OR groups")
    ap.add_argument("input", default="data/analysis/course_only.json", nargs="?", help="Input JSON array of course-only prereqs")
    ap.add_argument("--output", default="data/parsed/course_only_parsed.json", help="Output JSON path")
    ap.add_argument("--unparsed-output", default="data/parsed/course_only_unparsed.json", help="Unparsed/empty output JSON path")
    args = ap.parse_args(argv)

    with open(args.input, "r", encoding="utf-8") as f:
        data = json.load(f)

    parsed: List[Dict[str, Any]] = []
    unparsed: List[Dict[str, Any]] = []

    for item in data:
        raw = item.get("prerequisites") or ""
        ast = parse_prereq_text(raw)
        record = {
            "index": item.get("index"),
            "name": item.get("name"),
            "raw": raw,
            "ast": ast,
        }
        # Consider unparsed only if both hard and coreq_ok are EMPTY
        is_empty = ast["hard"].get("op") == "EMPTY" and ast["coreq_ok"].get("op") == "EMPTY"
        (unparsed if is_empty else parsed).append(record)

    for path, records in ((args.output, parsed), (args.unparsed_output, unparsed)):
        # Create the directory of the path actually being written, so custom
        # --output / --unparsed-output locations work.
        os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
        with open(path, "w", encoding="utf-8") as f:
            json.dump(records, f, ensure_ascii=False, indent=2)

    print(f"parsed: {len(parsed)}")
    print(f"unparsed: {len(unparsed)}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
