WEAT data

author: Yuren Hao <yurenh2@timan108.cs.illinois.edu> 2025-09-10 11:49:28 -0500
committer: Yuren Hao <yurenh2@timan108.cs.illinois.edu> 2025-09-10 11:49:28 -0500
commit: 523b1747ee27b60d06424dcabd47a309cda80536 (patch)
tree: 6c86be6c50b013833dc4baa4e0830356b3c0d67b /scripts/extract_bad_seeds.py
parent: 78978afd0a132c8ecc491f3e9e275fe02eed3fba (diff)
1 files changed, 62 insertions, 0 deletions
diff --git a/scripts/extract_bad_seeds.py b/scripts/extract_bad_seeds.py
new file mode 100644
index 0000000..aa6340f
--- /dev/null
+++ b/scripts/extract_bad_seeds.py
@@ -0,0 +1,62 @@
+#!/usr/bin/env python3
+import json, sys, pathlib
+
+# Input: JSON file of gathered seed sets, read by load_all().
+# The path is relative, so it is resolved against the current working directory.
+IN = pathlib.Path("third_party/bad_seeds/gathered_seeds.json")
+# Output directory that receives the exported word-list text files.
+OUT = pathlib.Path("assets/groups")
+# Runs at import time: create OUT (and any missing parents); no error if it already exists.
+OUT.mkdir(parents=True, exist_ok=True)
+
+def load_all():
+    """Load and return the parsed JSON content of the seed file at ``IN``.
+
+    The file is decoded explicitly as UTF-8 (the JSON interchange encoding)
+    rather than with the platform's locale default, so non-ASCII entries are
+    read the same way on every system.
+
+    Returns:
+        Whatever ``json.loads`` produces for the file; the rest of this
+        script iterates it as a sequence of dict entries.
+
+    Raises:
+        SystemExit: with status 1, after an ``[ERR]`` line on stderr, when
+            the file does not exist.
+    """
+    try:
+        return json.loads(IN.read_text(encoding="utf-8"))
+    except FileNotFoundError:
+        sys.stderr.write(f"[ERR] Missing file: {IN}\n")
+        sys.exit(1)
+
+def pick(data, id_exact=None, contains=None):
+    """Look up one entry of *data* and return its ``"Seeds"`` value.
+
+    An entry's ID is read from the first truthy value among the keys
+    ``"Seeds ID"``, ``"Seeds_ID"`` and ``"SeedsID"``.
+
+    Lookup order:
+      1. If *id_exact* is given, the first entry whose ID equals it wins.
+      2. Otherwise, if *contains* is a non-empty iterable of substrings, the
+         first entry whose ID contains every substring (case-insensitively)
+         wins.
+
+    Returns ``None`` when nothing matches (or when the matched entry has no
+    ``"Seeds"`` key).
+    """
+    id_keys = ("Seeds ID", "Seeds_ID", "SeedsID")
+
+    def entry_id(entry):
+        # First truthy ID among the accepted key spellings, else None.
+        for key in id_keys:
+            value = entry.get(key)
+            if value:
+                return value
+        return None
+
+    # Pass 1: exact ID comparison.
+    for entry in data:
+        found = entry_id(entry)
+        if found and id_exact is not None and found == id_exact:
+            return entry.get("Seeds")
+
+    # Pass 2: every requested substring must occur in the lower-cased ID.
+    if not contains:
+        return None
+    for entry in data:
+        haystack = (entry_id(entry) or "").lower()
+        if all(needle.lower() in haystack for needle in contains):
+            return entry.get("Seeds")
+    return None
+
+def dump(words, path):
+    """Write the distinct, stripped, non-empty strings from *words* to *path*.
+
+    Tokens are de-duplicated, sorted, and written one per line as UTF-8.
+    Non-string items (``None``, numbers, ...) and blank strings are skipped.
+    When no token survives, the file is written empty.
+
+    Args:
+        words: iterable of candidate tokens. A single ``str`` is also
+            accepted: it is decoded as a JSON list when possible, otherwise
+            treated as one token.
+        path: ``pathlib.Path``-like object providing ``write_text``.
+
+    Returns:
+        The number of tokens written.
+    """
+    if isinstance(words, str):
+        # Iterating a bare string would export its individual characters.
+        # NOTE(review): seed lists may arrive as JSON-encoded strings in some
+        # dumps -- confirm against the actual data file.
+        try:
+            decoded = json.loads(words)
+        except ValueError:  # json.JSONDecodeError is a ValueError subclass
+            decoded = None
+        words = decoded if isinstance(decoded, list) else [words]
+    toks = sorted({w.strip() for w in words if isinstance(w, str) and w.strip()})
+    # Explicit UTF-8 so the output does not depend on the locale encoding;
+    # each token gets its own newline, so an empty set yields an empty file.
+    path.write_text("".join(tok + "\n" for tok in toks), encoding="utf-8")
+    return len(toks)
+
+def _entry_id(obj):
+    """Return the entry's seed-set ID under any accepted key spelling, or ""."""
+    return obj.get("Seeds ID") or obj.get("Seeds_ID") or obj.get("SeedsID") or ""
+
+
+def _fallback_male_names(data):
+    """Return ``Seeds`` of the first entry whose ID looks like a male-name set.
+
+    "male" is a substring of "female", so a plain substring search for
+    ["male", "name"] can return the female list. Occurrences of "female" are
+    therefore removed from the ID before testing for "male".
+    """
+    for obj in data:
+        sid = _entry_id(obj).lower()
+        if "name" in sid and "male" in sid.replace("female", ""):
+            return obj.get("Seeds")
+    return None
+
+
+def main():
+    """Export the WEAT name and word lists from the seed file into ``OUT``.
+
+    Always writes ``weat_male_names.txt`` and ``weat_female_names.txt``.
+    Also writes ``weat_career_words.txt`` and ``weat_family_words.txt`` when
+    those sets are found. Each set is looked up by its exact ID first, then
+    by a substring fallback. A one-line summary of the exported counts is
+    printed at the end.
+
+    Raises:
+        SystemExit: with status 2 when the male or female name set cannot be
+            located; the available IDs are listed on stderr first.
+    """
+    data = load_all()
+
+    # Canonical WEAT name sets (Caliskan et al. 2017)
+    # The male fallback must not match "female_names..." entries.
+    male = pick(data, id_exact="male_names_1-Caliskan_et_al_2017") or _fallback_male_names(data)
+    female = pick(data, id_exact="female_names_1-Caliskan_et_al_2017") or pick(data, contains=["female", "name"])
+
+    if not male or not female:
+        # Help debug if schema changed
+        sys.stderr.write("[ERR] Could not locate WEAT male/female name sets. Available IDs:\n")
+        for obj in data:
+            sid = _entry_id(obj)
+            if sid:
+                sys.stderr.write("  - " + sid + "\n")
+        sys.exit(2)
+
+    n_m = dump(male, OUT / "weat_male_names.txt")
+    n_f = dump(female, OUT / "weat_female_names.txt")
+
+    # Optional: career/family word sets (also from Caliskan et al. 2017)
+    career = pick(data, id_exact="career_words_1-Caliskan_et_al_2017") or pick(data, contains=["career"])
+    family = pick(data, id_exact="family_words_1-Caliskan_et_al_2017") or pick(data, contains=["family"])
+    n_c = n_fam = 0
+    if career:
+        n_c = dump(career, OUT / "weat_career_words.txt")
+    if family:
+        n_fam = dump(family, OUT / "weat_family_words.txt")
+
+    print(f"Exported: male_names={n_m}, female_names={n_f}, career={n_c}, family={n_fam}")
+
+
+if __name__ == "__main__":
+    main()
author	Yuren Hao <yurenh2@timan108.cs.illinois.edu>	2025-09-10 11:49:28 -0500
committer	Yuren Hao <yurenh2@timan108.cs.illinois.edu>	2025-09-10 11:49:28 -0500
commit	523b1747ee27b60d06424dcabd47a309cda80536 (patch)
tree	6c86be6c50b013833dc4baa4e0830356b3c0d67b /scripts/extract_bad_seeds.py
parent	78978afd0a132c8ecc491f3e9e275fe02eed3fba (diff)