diff options
| author | Yuren Hao <97327730+YurenHao0426@users.noreply.github.com> | 2025-08-06 02:11:42 -0700 |
|---|---|---|
| committer | Yuren Hao <97327730+YurenHao0426@users.noreply.github.com> | 2025-08-06 02:11:42 -0700 |
| commit | 80e315ec800feeb91b858c2ad7816071725eeaee (patch) | |
| tree | e6cf818423afd2c9b6ae168f3150c7d36021ecdf /scripts | |
| parent | 79ec5fd1f0188a88f937737185c542e644cf98e4 (diff) | |
fix: handle namespace and add proxy support
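The Course Explorer API returns namespaced XML (e.g. `<ns2:term>`), so the parser is constructed with `removeNSPrefix: true`; outbound requests honor the standard `HTTPS_PROXY`/`HTTP_PROXY` environment variables via an undici `ProxyAgent` passed as fetch's `dispatcher`.

A minimal sketch of the namespace handling, using an invented XML fragment (the real feed is larger and its namespace URI differs, but the root element is namespaced the same way):

    import { XMLParser } from "fast-xml-parser";

    const xml = `<ns2:term xmlns:ns2="urn:example:cis">
      <subjects><subject id="CS" href="https://example.invalid/CS.xml"/></subjects>
    </ns2:term>`;

    // Without removeNSPrefix the root key would be "ns2:term", which the
    // scraper's `termRoot.term?...` lookups would never find.
    const parser = new XMLParser({ ignoreAttributes: false, removeNSPrefix: true });
    const doc = parser.parse(xml);
    console.log("term" in doc);                       // true
    console.log(doc.term.subjects.subject["@_href"]); // the subject URL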
Diffstat (limited to 'scripts')
| -rw-r--r-- | scripts/scrape.js | 57 |
1 file changed, 57 insertions, 0 deletions
diff --git a/scripts/scrape.js b/scripts/scrape.js
new file mode 100644
index 0000000..93f6a41
--- /dev/null
+++ b/scripts/scrape.js
@@ -0,0 +1,57 @@
+import { XMLParser } from "fast-xml-parser";
+import fs from "fs/promises";
+import path from "path";
+import dns from "node:dns";
+import { ProxyAgent } from "undici";
+
+dns.setDefaultResultOrder("ipv4first");
+
+const proxy = process.env.HTTPS_PROXY || process.env.https_proxy || process.env.HTTP_PROXY || process.env.http_proxy;
+const dispatcher = proxy ? new ProxyAgent(proxy) : undefined;
+
+const [ , , YEAR = "2025", TERM = "fall" ] = process.argv;
+// UIUC's API uses XML namespaces (e.g. `<ns2:term>`). In order for the
+// returned object to have plain keys like `term` and `subject`, we instruct
+// fast-xml-parser to strip the namespace prefixes.
+const parser = new XMLParser({ ignoreAttributes: false, removeNSPrefix: true });
+const BASE = `https://courses.illinois.edu/cisapp/explorer`;
+
+async function getXML(url) {
+  const res = await fetch(url, { dispatcher });
+  if (!res.ok) throw new Error(`Request failed: ${res.status} ${url}`);
+  return parser.parse(await res.text());
+}
+
+async function scrapeSchedule(year, term) {
+  const catalog = {};
+  const termRoot = await getXML(`${BASE}/schedule/${year}/${term}.xml`);
+  const subjects = termRoot.term?.subjects?.subject;
+  if (!subjects) throw new Error(`Unexpected XML structure for ${year} ${term}`);
+  const subjHrefs = Array.isArray(subjects) ? subjects.map(s => s['@_href']) : [subjects['@_href']];
+
+  for (const subjURL of subjHrefs) {
+    const subjXML = await getXML(subjURL);
+    const courses = subjXML.subject?.courses?.course || [];
+    const courseList = Array.isArray(courses) ? courses : [courses];
+    for (const c of courseList) {
+      const courseURL = c['@_href'];
+      const courseXML = await getXML(courseURL);
+      const id = courseXML.course['@_id'];
+      const desc = courseXML.course.description ?? "";
+      const m = desc.match(/Prerequisite[s]?:\s*([^.;]*)/i);
+      if (!m) continue;
+      const prereqs = m[1]
+        .match(/[A-Z]{2,4}\s?\d{2,3}[A-Z]?/g)
+        ?.map(s => s.replace(/\s+/, "")) ?? [];
+      if (prereqs.length) catalog[id.replace(/\s+/, "")] = prereqs;
+    }
+    await new Promise(r => setTimeout(r, 300));
+  }
+  return catalog;
+}
+
+const data = await scrapeSchedule(YEAR, TERM);
+const outDir = path.resolve("data");
+await fs.mkdir(outDir, { recursive: true });
+await fs.writeFile(path.join(outDir, `catalog_${YEAR}_${TERM}.json`), JSON.stringify(data, null, 2));
+console.log(`Saved ${Object.keys(data).length} courses`);
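Usage: run as `node scripts/scrape.js 2025 fall` (year and term default to "2025" and "fall"); the result is written to `data/catalog_2025_fall.json`.

The prerequisite extraction is two regex passes over the course description: one to isolate the prerequisite clause, one to pull out course codes. A sketch on an invented description (real text comes from the API's `description` field):

    const desc = "Advanced data structures. Prerequisite: CS 225 and MATH 241; or consent of instructor.";
    const m = desc.match(/Prerequisite[s]?:\s*([^.;]*)/i);  // clause up to the first "." or ";"
    const prereqs = m[1]
      .match(/[A-Z]{2,4}\s?\d{2,3}[A-Z]?/g)                 // course codes like "CS 225"
      ?.map(s => s.replace(/\s+/, "")) ?? [];
    console.log(prereqs); // ["CS225", "MATH241"]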
