|  |  |  |
|---|---|---|
| author | blackhao <13851610112@163.com> | 2025-08-22 02:51:50 -0500 |
| committer | blackhao <13851610112@163.com> | 2025-08-22 02:51:50 -0500 |
| commit | 4aab4087dc97906d0b9890035401175cdaab32d4 (patch) | |
| tree | 4e2e9d88a711ec5b1cfa02e8ac72a55183b99123 /scripts/scrape.js | |
| parent | afa8f50d1d21c721dabcb31ad244610946ab65a3 (diff) | |
2.0
Diffstat (limited to 'scripts/scrape.js')
| mode | file | lines |
|---|---|---|
| -rw-r--r-- | scripts/scrape.js | 63 |

1 file changed, 0 insertions, 63 deletions
diff --git a/scripts/scrape.js b/scripts/scrape.js
deleted file mode 100644
index d4b2ecb..0000000
--- a/scripts/scrape.js
+++ /dev/null
@@ -1,63 +0,0 @@
-import { XMLParser } from "fast-xml-parser";
-import fs from "fs/promises";
-import path from "path";
-import dns from "node:dns";
-
-import { ProxyAgent } from "undici";
-
-dns.setDefaultResultOrder("ipv4first");
-
-const proxy = process.env.HTTPS_PROXY || process.env.https_proxy || process.env.HTTP_PROXY || process.env.http_proxy;
-const dispatcher = proxy ? new ProxyAgent(proxy) : undefined;
-
-const [ , , YEAR = "2025", TERM = "fall" ] = process.argv;
-// UIUC's API uses XML namespaces (e.g. `<ns2:term>`). In order for the
-// returned object to have plain keys like `term` and `subject`, we instruct
-// fast-xml-parser to strip the namespace prefixes.
-const parser = new XMLParser({ ignoreAttributes: false, removeNSPrefix: true });
-const BASE = `https://courses.illinois.edu/cisapp/explorer`;
-
-async function getXML(url) {
-  const res = await fetch(url, { dispatcher });
-  if (!res.ok) throw new Error(`Request failed: ${res.status} ${url}`);
-  return parser.parse(await res.text());
-
-}
-
-async function scrapeSchedule(year, term) {
-  const catalog = {};
-  const termRoot = await getXML(`${BASE}/schedule/${year}/${term}.xml`);
-
-  const subjects = termRoot.term?.subjects?.subject;
-  if (!subjects) throw new Error(`Unexpected XML structure for ${year} ${term}`);
-
-  const subjHrefs = Array.isArray(subjects) ? subjects.map(s => s['@_href']) : [subjects['@_href']];
-
-  for (const subjURL of subjHrefs) {
-    const subjXML = await getXML(subjURL);
-
-    const courses = subjXML.subject?.courses?.course || [];
-
-    const courseList = Array.isArray(courses) ? courses : [courses];
-    for (const c of courseList) {
-      const courseURL = c['@_href'];
-      const courseXML = await getXML(courseURL);
-      const id = courseXML.course['@_id'];
-      const desc = courseXML.course.description ?? "";
-      const m = desc.match(/Prerequisite[s]?:\s*([^.;]*)/i);
-      if (!m) continue;
-      const prereqs = m[1]
-        .match(/[A-Z]{2,4}\s?\d{2,3}[A-Z]?/g)
-        ?.map(s => s.replace(/\s+/, "")) ?? [];
-      if (prereqs.length) catalog[id.replace(/\s+/, "")] = prereqs;
    }
-    await new Promise(r => setTimeout(r, 300));
-  }
-  return catalog;
-}
-
-const data = await scrapeSchedule(YEAR, TERM);
-const outDir = path.resolve("data");
-await fs.mkdir(outDir, { recursive: true });
-await fs.writeFile(path.join(outDir, `catalog_${YEAR}_${TERM}.json`), JSON.stringify(data, null, 2));
-console.log(`Saved ${Object.keys(data).length} courses`);
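
The namespace-stripping comment is the one non-obvious parser choice in the deleted file, so here is a minimal standalone sketch of what `removeNSPrefix` does to the keys the scraper walks. The XML sample is invented for illustration (real explorer responses are larger, and the namespace URI and `id` value here are made up); only the `XMLParser` options mirror the deleted code.

```js
import { XMLParser } from "fast-xml-parser";

// Invented miniature of an explorer term response; the ns2 prefix on the
// root element is the part that matters.
const xml = `<ns2:term xmlns:ns2="http://rest.cis.illinois.edu" id="120258">
  <subjects>
    <subject id="CS" href="https://courses.illinois.edu/cisapp/explorer/schedule/2025/fall/CS.xml"/>
  </subjects>
</ns2:term>`;

// Same options as the deleted scraper: keep attributes (exposed under the
// default "@_" prefix) and strip namespace prefixes from element names.
const parser = new XMLParser({ ignoreAttributes: false, removeNSPrefix: true });
const doc = parser.parse(xml);

console.log(Object.keys(doc));                    // [ "term" ]  (not "ns2:term")
console.log(doc.term.subjects.subject["@_href"]); // the CS.xml URL above
```

Without `removeNSPrefix: true`, the root key would be `"ns2:term"` and every optional-chained lookup in `scrapeSchedule` (`termRoot.term?.subjects?.subject`) would come back `undefined`.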

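The prerequisite extraction is easiest to read as a two-step pipeline, traced below against a made-up catalog description (the sample string, and therefore the output, is illustrative only; the two regular expressions are copied verbatim from the deleted file).

```js
// Made-up description in the style of a UIUC catalog entry.
const desc =
  "Credit is not given for both courses. Prerequisite: CS 173 and one of CS 225 or MATH 241.";

// Step 1: capture the clause after "Prerequisite(s):" up to the first
// period or semicolon.
const m = desc.match(/Prerequisite[s]?:\s*([^.;]*)/i);
// m[1] === "CS 173 and one of CS 225 or MATH 241"

// Step 2: pull out course codes (2-4 capital letters, optional space,
// 2-3 digits, optional trailing letter) and squeeze out the space.
const prereqs = m[1]
  .match(/[A-Z]{2,4}\s?\d{2,3}[A-Z]?/g)
  ?.map(s => s.replace(/\s+/, "")) ?? [];

console.log(prereqs); // [ "CS173", "CS225", "MATH241" ]
```

Note that `replace(/\s+/, "")` lacks the `g` flag, so it only removes the first whitespace run, which is enough here because a matched course code contains at most one space.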