From f35d5c8b7380b906a6819ef436a9d808041731fb Mon Sep 17 00:00:00 2001 From: haoyuren <13851610112@163.com> Date: Fri, 4 Jul 2025 13:44:31 -0700 Subject: add genderbench --- explore_genderbench.py | 213 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 213 insertions(+) create mode 100644 explore_genderbench.py (limited to 'explore_genderbench.py') diff --git a/explore_genderbench.py b/explore_genderbench.py new file mode 100644 index 0000000..d90a1fa --- /dev/null +++ b/explore_genderbench.py @@ -0,0 +1,213 @@ +#!/usr/bin/env python3 +""" +GenderBench数据探索脚本 +分析不同探测器的数据格式,为masked LLM转换做准备 +""" + +import sys +import os +from pathlib import Path + +# 添加genderbench到路径 +sys.path.append('./genderbench') + +def explore_direct_probe(): + """探索DirectProbe:刻板印象陈述""" + print("🔍 === DirectProbe探索 ===") + + from genderbench.probes.direct.direct_probe import DirectProbe + + probe = DirectProbe() + print(f"📊 Probe类型: {type(probe).__name__}") + print(f"🎯 模板: {probe.template}") + + # 查看stereotype数据 + try: + items = probe._create_probe_items() + print(f"📝 总stereotype数量: {len(items)}") + + # 显示前几个例子 + print("\n📋 Stereotype样本:") + for i, item in enumerate(items[:10]): + if hasattr(item, 'prompts') and item.prompts: + prompt_text = item.prompts[0].text + print(f" {i+1}. {prompt_text}") + + # 按来源分析 + sources = {} + for item in items: + source = item.metadata.get('source', 'unknown') + sources[source] = sources.get(source, 0) + 1 + + print(f"\n📈 按来源统计:") + for source, count in sources.items(): + print(f" {source}: {count}个") + + except Exception as e: + print(f"❌ 加载DirectProbe数据失败: {e}") + +def explore_jobs_probe(): + """探索JobsLumProbe:职业相关偏见""" + print("\n🔍 === JobsLumProbe探索 ===") + + try: + from genderbench.probes.jobs_lum.jobs_lum_probe import JobsLumProbe + + probe = JobsLumProbe() + print(f"📊 Probe类型: {type(probe).__name__}") + + items = probe._create_probe_items() + print(f"🏢 总职业数量: {len(items)}") + + # 显示前几个职业例子 + print("\n💼 职业样本:") + for i, item in enumerate(items[:10]): + if hasattr(item, 'prompts') and item.prompts: + prompt_text = item.prompts[0].text + print(f" {i+1}. {prompt_text}") + + except Exception as e: + print(f"❌ 加载JobsLumProbe失败: {e}") + +def explore_gest_probe(): + """探索GestProbe:性别刻板印象归属""" + print("\n🔍 === GestProbe探索 ===") + + try: + from genderbench.probes.gest.gest_probe import GestProbe + + probe = GestProbe() + print(f"📊 Probe类型: {type(probe).__name__}") + + items = probe._create_probe_items() + print(f"🎭 总测试项数量: {len(items)}") + + # 显示前几个例子 + print("\n🗣️ GEST样本:") + for i, item in enumerate(items[:5]): + if hasattr(item, 'prompts') and item.prompts: + prompt_text = item.prompts[0].text + print(f" {i+1}. {prompt_text}") + + except Exception as e: + print(f"❌ 加载GestProbe失败: {e}") + +def explore_resources(): + """探索资源文件,了解原始数据""" + print("\n🔍 === 资源文件探索 ===") + + resources_path = Path("./genderbench/genderbench/resources") + if resources_path.exists(): + print(f"📁 资源目录: {resources_path}") + + # 探索stereotype文件 + sbic_file = resources_path / "sbic_stereotypes" / "stereotypes.txt" + if sbic_file.exists(): + with open(sbic_file, 'r') as f: + sbic_lines = f.readlines() + print(f"📄 SBIC stereotypes: {len(sbic_lines)}行") + + print("\n🔸 SBIC样本 (前10个):") + for i, line in enumerate(sbic_lines[:10]): + print(f" {i+1}. {line.strip()}") + + gest_file = resources_path / "gest_stereotypes" / "stereotypes.txt" + if gest_file.exists(): + with open(gest_file, 'r') as f: + gest_lines = f.readlines() + print(f"\n📄 GEST stereotypes: {len(gest_lines)}行") + + print("\n🔸 GEST样本 (前10个):") + for i, line in enumerate(gest_lines[:10]): + print(f" {i+1}. {line.strip()}") + + # 探索其他资源 + print(f"\n📂 所有资源目录:") + for subdir in resources_path.iterdir(): + if subdir.is_dir(): + files = list(subdir.glob("*")) + print(f" 📁 {subdir.name}/: {len(files)}个文件") + for file in files[:3]: # 显示前3个文件 + print(f" 📄 {file.name}") + + else: + print("❌ 资源目录不存在") + +def analyze_for_masked_llm(): + """分析数据,为转换为masked LLM格式做准备""" + print("\n🔍 === Masked LLM转换分析 ===") + + # 分析stereotype模式 + resources_path = Path("./genderbench/genderbench/resources") + sbic_file = resources_path / "sbic_stereotypes" / "stereotypes.txt" + + if sbic_file.exists(): + with open(sbic_file, 'r') as f: + stereotypes = [line.strip() for line in f.readlines()] + + # 分析性别词汇模式 + gender_patterns = { + 'men': 0, 'women': 0, 'man': 0, 'woman': 0, + 'male': 0, 'female': 0, 'trans': 0, 'nonbinary': 0, + 'he': 0, 'she': 0, 'his': 0, 'her': 0 + } + + for stereotype in stereotypes: + lower_text = stereotype.lower() + for pattern in gender_patterns: + if pattern in lower_text: + gender_patterns[pattern] += 1 + + print("🎯 性别词汇出现频次:") + for pattern, count in sorted(gender_patterns.items(), key=lambda x: x[1], reverse=True): + if count > 0: + print(f" {pattern}: {count}次") + + # 找出适合转换的stereotype + print("\n🔄 适合Masked LLM转换的stereotype样本:") + convertible = [] + for stereotype in stereotypes[:20]: + if any(word in stereotype.lower() for word in ['men are', 'women are', 'man is', 'woman is']): + convertible.append(stereotype) + + for i, stereotype in enumerate(convertible[:5]): + print(f" 原文: {stereotype}") + # 示例转换 + masked = stereotype.replace('men', '[GENDER]').replace('women', '[GENDER]') + masked = masked.replace('man', '[GENDER]').replace('woman', '[GENDER]') + print(f" 转换: {masked}") + print() + +def main(): + """主函数:运行所有探索""" + print("🚀 GenderBench数据探索开始") + print("=" * 50) + + # 基础信息 + print(f"📍 当前目录: {os.getcwd()}") + print(f"🐍 Python版本: {sys.version}") + + # 检查genderbench是否可用 + try: + import genderbench + print(f"✅ GenderBench版本: {genderbench.__version__ if hasattr(genderbench, '__version__') else '已安装'}") + except ImportError: + print("❌ GenderBench未安装") + return + + # 探索不同探测器 + explore_direct_probe() + explore_jobs_probe() + explore_gest_probe() + + # 探索资源文件 + explore_resources() + + # 分析转换可能性 + analyze_for_masked_llm() + + print("\n🎉 探索完成!") + print("💡 建议: 基于以上分析,我们可以设计数据转换和约束生成策略") + +if __name__ == "__main__": + main() \ No newline at end of file -- cgit v1.2.3