summaryrefslogtreecommitdiff
path: root/scripts/analyze_prereqs.py
diff options
context:
space:
mode:
Diffstat (limited to 'scripts/analyze_prereqs.py')
-rw-r--r--scripts/analyze_prereqs.py120
1 files changed, 120 insertions, 0 deletions
diff --git a/scripts/analyze_prereqs.py b/scripts/analyze_prereqs.py
new file mode 100644
index 0000000..7c580f7
--- /dev/null
+++ b/scripts/analyze_prereqs.py
@@ -0,0 +1,120 @@
+#!/usr/bin/env python3
+import argparse
+import json
+import re
+from typing import Any, Dict, Iterable, List
+
+
+COURSE_TOKEN = re.compile(r"\b([A-Z]{2,4})\s*(\d{2,3}[A-Z]?)\b")
+NONE_PATTERNS = [
+ re.compile(r"^\s*none\.?\s*$", re.IGNORECASE),
+ re.compile(r"no prerequisites", re.IGNORECASE),
+ re.compile(r"prerequisite[s]?:\s*none\b", re.IGNORECASE),
+]
+
+
+def is_none_text(text: str) -> bool:
+ t = text.strip()
+ return any(p.search(t) for p in NONE_PATTERNS)
+
+
+def extract_course_refs(text: str) -> List[str]:
+ refs = []
+ for m in COURSE_TOKEN.finditer(text):
+ subject, number = m.group(1), m.group(2)
+ refs.append(f"{subject} {number}")
+ return refs
+
+
+NON_COURSE_KEYWORDS = [
+ r"consent", r"permission", r"approval",
+ r"standing", r"senior", r"junior", r"sophomore", r"freshman",
+ r"major", r"minor", r"program", r"restricted", r"enrollment", r"enrolled",
+ r"gpa", r"grade", r"minimum", r"credit hour", r"credits?", r"hours?",
+ r"registration", r"concurrent", r"co-requisite", r"corequisite",
+ r"department", r"instructor",
+]
+
+def has_non_course_requirements(text: str) -> bool:
+ t = text.lower()
+ return any(re.search(k, t) for k in NON_COURSE_KEYWORDS)
+
+
+def is_course_only(text: str) -> bool:
+ t = text.strip()
+ if has_non_course_requirements(t):
+ return False
+ # Remove course tokens, then see if any nontrivial tokens remain besides basic connectors
+ placeholder = COURSE_TOKEN.sub("COURSE", t)
+ # Remove conjunctions and punctuation
+ simplified = re.sub(r"[(),.;]", " ", placeholder)
+ simplified = re.sub(r"\b(and|or|and/or|either|both|one of|two of|with|credit in)\b", " ", simplified, flags=re.IGNORECASE)
+ # Remove common quantifiers
+ simplified = re.sub(r"\b(at\s+least)\b", " ", simplified, flags=re.IGNORECASE)
+ # Collapse whitespace
+ simplified = re.sub(r"\s+", " ", simplified).strip()
+ # If empty or only words like COURSE left, treat as course-only
+ return simplified == "" or re.fullmatch(r"(COURSE\s*)+", simplified) is not None
+
+
+def analyze(courses: Iterable[Dict[str, Any]]) -> Dict[str, List[Dict[str, Any]]]:
+ results = {
+ "none": [],
+ "course_only": [],
+ "remaining": [],
+ }
+
+ for c in courses:
+ prereq = c.get("prerequisites") or ""
+ if not prereq.strip():
+ results["none"].append(c)
+ continue
+ if is_none_text(prereq):
+ results["none"].append(c)
+ continue
+ if is_course_only(prereq):
+ results["course_only"].append({
+ "index": c.get("index"),
+ "name": c.get("name"),
+ "prerequisites": prereq,
+ "courses": extract_course_refs(prereq),
+ })
+ else:
+ results["remaining"].append({
+ "index": c.get("index"),
+ "name": c.get("name"),
+ "prerequisites": prereq,
+ })
+ return results
+
+
+def main() -> int:
+ ap = argparse.ArgumentParser(description="Analyze UIUC course prerequisite text")
+ ap.add_argument("input", default="data/courses.json", nargs="?", help="Input courses JSON array")
+ ap.add_argument("--outdir", default="data/analysis", help="Output directory")
+ args = ap.parse_args()
+
+ with open(args.input, "r", encoding="utf-8") as f:
+ data = json.load(f)
+
+ res = analyze(data)
+
+ import os
+ os.makedirs(args.outdir, exist_ok=True)
+ with open(os.path.join(args.outdir, "none.json"), "w", encoding="utf-8") as f:
+ json.dump(res["none"], f, ensure_ascii=False, indent=2)
+ with open(os.path.join(args.outdir, "course_only.json"), "w", encoding="utf-8") as f:
+ json.dump(res["course_only"], f, ensure_ascii=False, indent=2)
+ with open(os.path.join(args.outdir, "remaining.json"), "w", encoding="utf-8") as f:
+ json.dump(res["remaining"], f, ensure_ascii=False, indent=2)
+
+ print(f"none: {len(res['none'])}")
+ print(f"course_only: {len(res['course_only'])}")
+ print(f"remaining: {len(res['remaining'])}")
+ return 0
+
+
+if __name__ == "__main__":
+ raise SystemExit(main())
+
+