diff options
| -rw-r--r-- | package-lock.json | 13 | ||||
| -rw-r--r-- | package.json | 4 | ||||
| -rw-r--r-- | scripts/scrape.js | 24 |
3 files changed, 35 insertions, 6 deletions
diff --git a/package-lock.json b/package-lock.json index c3c715c..0f0ed5d 100644 --- a/package-lock.json +++ b/package-lock.json @@ -13,7 +13,8 @@ "fast-xml-parser": "^4.4.2", "react": "^18.3.0", "react-cytoscapejs": "^2.0.0", - "react-dom": "^18.3.0" + "react-dom": "^18.3.0", + "undici": "^6.21.3" }, "devDependencies": { "@types/node": "^24.2.0", @@ -1671,6 +1672,16 @@ "node": ">=14.17" } }, + + "node_modules/undici": { + "version": "6.21.3", + "resolved": "https://registry.npmjs.org/undici/-/undici-6.21.3.tgz", + "integrity": "sha512-gBLkYIlEnSp8pFbT64yFgGE6UIB9tAkhukC23PmMDCe5Nd+cRqKxSjw5y54MK2AZMgZfJWMaNE4nYUHgi1XEOw==", + "license": "MIT", + "engines": { + "node": ">=18.17" + } + }, "node_modules/undici-types": { "version": "7.10.0", "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-7.10.0.tgz", diff --git a/package.json b/package.json index 4844bba..daf0c3b 100644 --- a/package.json +++ b/package.json @@ -16,7 +16,9 @@ "fast-xml-parser": "^4.4.2", "react": "^18.3.0", "react-cytoscapejs": "^2.0.0", - "react-dom": "^18.3.0" + + "react-dom": "^18.3.0", + "undici": "^6.21.3" }, "devDependencies": { "@types/node": "^24.2.0", diff --git a/scripts/scrape.js b/scripts/scrape.js index 2bfca5e..d4b2ecb 100644 --- a/scripts/scrape.js +++ b/scripts/scrape.js @@ -3,25 +3,41 @@ import fs from "fs/promises"; import path from "path"; import dns from "node:dns"; +import { ProxyAgent } from "undici"; + dns.setDefaultResultOrder("ipv4first"); +const proxy = process.env.HTTPS_PROXY || process.env.https_proxy || process.env.HTTP_PROXY || process.env.http_proxy; +const dispatcher = proxy ? new ProxyAgent(proxy) : undefined; + const [ , , YEAR = "2025", TERM = "fall" ] = process.argv; -const parser = new XMLParser({ ignoreAttributes: false }); +// UIUC's API uses XML namespaces (e.g. `<ns2:term>`). In order for the +// returned object to have plain keys like `term` and `subject`, we instruct +// fast-xml-parser to strip the namespace prefixes. +const parser = new XMLParser({ ignoreAttributes: false, removeNSPrefix: true }); const BASE = `https://courses.illinois.edu/cisapp/explorer`; async function getXML(url) { - return parser.parse(await (await fetch(url)).text()); + const res = await fetch(url, { dispatcher }); + if (!res.ok) throw new Error(`Request failed: ${res.status} ${url}`); + return parser.parse(await res.text()); + } async function scrapeSchedule(year, term) { const catalog = {}; const termRoot = await getXML(`${BASE}/schedule/${year}/${term}.xml`); - const subjects = termRoot.term.subject; + + const subjects = termRoot.term?.subjects?.subject; + if (!subjects) throw new Error(`Unexpected XML structure for ${year} ${term}`); + const subjHrefs = Array.isArray(subjects) ? subjects.map(s => s['@_href']) : [subjects['@_href']]; for (const subjURL of subjHrefs) { const subjXML = await getXML(subjURL); - const courses = subjXML.subject.course || []; + + const courses = subjXML.subject?.courses?.course || []; + const courseList = Array.isArray(courses) ? courses : [courses]; for (const c of courseList) { const courseURL = c['@_href']; |
