#!/usr/bin/env python3
"""
Gender Debias Utilities - English Version
Pure English implementation for gender debiasing
Core idea: Generate identical responses except for gender
"""

import sys
import re
import torch
import torch.nn.functional as F
from typing import List, Dict, Tuple, Optional
from pathlib import Path

# Add genderbench to path
sys.path.append('./genderbench')


class EnglishGenderProcessor:
    """Look up, classify and swap English gendered words.

    Attributes:
        gender_pairs: Lower-case word -> opposite-gender word. Membership in
            this mapping is what makes a token count as "gendered".
        male_words: Keys of ``gender_pairs`` that denote the male form.
        female_words: Keys of ``gender_pairs`` that denote the female form.
    """

    # Matches every run of word characters; used to tokenize input text.
    _WORD_RE = re.compile(r'\b\w+\b')

    # Verbs whose presence among the three words before a gendered word marks
    # that word as an 'object' (exact, lower-case, whole-word match only).
    _ACTION_WORDS = frozenset({
        'abuse', 'hit', 'help', 'support', 'love', 'hate', 'see', 'meet',
        'like', 'prefer',
    })

    # Words that cannot start the noun phrase a possessive determiner would
    # introduce. When "his"/"her" is directly followed by one of these, it is
    # treated as a stand-alone pronoun ("gave her the book", "his and hers").
    # This is a heuristic, not a parser: "help her find it" is still misread.
    _NON_NOUN_FOLLOWERS = frozenset({
        # prepositions
        'about', 'after', 'at', 'before', 'by', 'for', 'from', 'in', 'into',
        'of', 'on', 'over', 'to', 'under', 'with', 'without',
        # conjunctions
        'and', 'or', 'but', 'nor', 'because', 'if', 'when', 'while', 'than',
        'that', 'as', 'so',
        # auxiliaries / copulas
        'is', 'are', 'was', 'were', 'be', 'been', 'am', 'has', 'have', 'had',
        'do', 'does', 'did', 'will', 'would', 'can', 'could', 'should',
        'may', 'might', 'must',
        # determiners / pronouns
        'a', 'an', 'the', 'this', 'these', 'those', 'it', 'them', 'us', 'me',
        'you', 'him', 'her',
        # adverbs
        'not', 'too', 'again', 'yesterday', 'today', 'tomorrow', 'now',
        'then', 'there', 'here',
    })

    def __init__(self):
        self.gender_pairs = {
            'he': 'she', 'she': 'he',
            'him': 'her', 'her': 'him',
            # Context-free default: "his" is far more often a determiner
            # ("his job" -> "her job") than a stand-alone possessive.
            # get_contextual_opposite() handles the "... is his" case.
            'his': 'her', 'hers': 'his',
            'himself': 'herself', 'herself': 'himself',
            'men': 'women', 'women': 'men',
            'man': 'woman', 'woman': 'man',
            'male': 'female', 'female': 'male',
            'males': 'females', 'females': 'males',
            'gentleman': 'lady', 'lady': 'gentleman',
            'gentlemen': 'ladies', 'ladies': 'gentlemen',
            'father': 'mother', 'mother': 'father',
            'dad': 'mom', 'mom': 'dad',
            'son': 'daughter', 'daughter': 'son',
            'brother': 'sister', 'sister': 'brother',
            'uncle': 'aunt', 'aunt': 'uncle',
            'husband': 'wife', 'wife': 'husband',
            'boy': 'girl', 'girl': 'boy',
            'boys': 'girls', 'girls': 'boys',
            'guy': 'gal', 'gal': 'guy',
            'guys': 'gals', 'gals': 'guys',
        }

        # Gender classification. Every key of gender_pairs must appear in
        # exactly one of these sets, otherwise the pair builders treat the
        # word as belonging to the wrong gender.
        self.male_words = {
            'men', 'man', 'he', 'him', 'his', 'himself', 'male', 'males',
            'father', 'dad', 'son', 'brother', 'uncle', 'husband', 'boy',
            'boys', 'gentleman', 'gentlemen', 'guy', 'guys'
        }
        self.female_words = {
            'women', 'woman', 'she', 'her', 'hers', 'herself', 'female',
            'females', 'mother', 'mom', 'daughter', 'sister', 'aunt', 'wife',
            'girl', 'girls', 'lady', 'ladies', 'gal', 'gals'
        }

    @staticmethod
    def _match_case(template: str, replacement: str) -> str:
        """Return *replacement* re-cased like *template* (UPPER, Title or as-is)."""
        if template.isupper():
            return replacement.upper()
        if template.istitle():
            return replacement.title()
        return replacement

    def get_gender_opposite(self, word: str) -> str:
        """Return the opposite-gender form of *word*, ignoring context.

        Args:
            word: A single token in any letter case.

        Returns:
            The mapped opposite with the case pattern of *word* (all-upper,
            title-case, otherwise the lower-case mapping), or *word* itself
            when it is not a known gendered word.
        """
        opposite = self.gender_pairs.get(word.lower())
        if opposite is None:
            return word
        return self._match_case(word, opposite)

    def get_contextual_opposite(self, text: str, start: int, end: int) -> str:
        """Return the opposite-gender form of ``text[start:end]`` in context.

        Identical to :meth:`get_gender_opposite` except for the ambiguous
        words "his" and "her", which are disambiguated by the next word:

        * followed by a word that can start a noun phrase -> possessive
          determiner ("his job" <-> "her job");
        * otherwise -> stand-alone pronoun ("is his" -> "is hers",
          "met her" -> "met him").

        Args:
            text: The full sentence.
            start: Start offset of the gendered word in *text*.
            end: End offset (exclusive) of the gendered word in *text*.

        Returns:
            The replacement word, case-matched to the original.
        """
        original = text[start:end]
        lowered = original.lower()
        if lowered not in ('his', 'her'):
            return self.get_gender_opposite(original)

        # Only whitespace may separate the pronoun from its head noun; any
        # punctuation in between means the pronoun stands alone.
        follower = re.match(r'\s+(\w+)', text[end:])
        is_determiner = (
            follower is not None
            and follower.group(1).lower() not in self._NON_NOUN_FOLLOWERS
        )
        if lowered == 'his':
            opposite = 'her' if is_determiner else 'hers'
        else:
            opposite = 'his' if is_determiner else 'him'
        return self._match_case(original, opposite)

    def extract_gender_words_with_roles(self, text: str) -> List[Dict]:
        """Find every gendered word in *text*, in order of appearance.

        Args:
            text: The sentence to scan.

        Returns:
            One dict per match with keys ``word`` (lower-cased), ``original``
            (as written), ``start``/``end`` (offsets into *text*), ``role``
            ('subject', 'object' or 'modifier'), ``is_male`` and ``is_female``.
        """
        words = []
        for match in self._WORD_RE.finditer(text):
            word = match.group().lower()
            if word not in self.gender_pairs:
                continue
            words.append({
                'word': word,
                'original': match.group(),
                'start': match.start(),
                'end': match.end(),
                'role': self._analyze_word_role(text, match.start(), match.end(), word),
                'is_male': word in self.male_words,
                'is_female': word in self.female_words
            })
        return words

    def _analyze_word_role(self, text: str, start: int, end: int, word: str) -> str:
        """Guess the grammatical role of the word at ``text[start:end]``.

        Only the text before the word is inspected; *end* and *word* are
        accepted for signature compatibility but not used.

        Returns:
            'subject' when nothing precedes the word or the preceding text
            ends with '.' or ','; 'object' when one of the three preceding
            words is a known action verb; 'modifier' otherwise.
        """
        before_text = text[:start].strip()

        # Sentence/clause start -> subject.
        if not before_text or before_text.endswith(('.', ',')):
            return 'subject'

        # An action verb within the last three words -> object.
        recent_words = before_text.lower().split()[-3:]
        if any(token in self._ACTION_WORDS for token in recent_words):
            return 'object'

        # Default to modifier
        return 'modifier'


class EnglishStereotypeConverter:
    """Turn a gendered English sentence into male/female/neutral variants."""

    def __init__(self):
        self.gender_processor = EnglishGenderProcessor()

    def create_balanced_pairs(self, text: str) -> Tuple[Optional[str], Optional[str]]:
        """Create an all-male and an all-female version of *text*.

        Args:
            text: The sentence to convert.

        Returns:
            ``(male_version, female_version)``, or ``(None, None)`` when
            *text* contains no gendered word.
        """
        gender_words = self.gender_processor.extract_gender_words_with_roles(text)

        if not gender_words:
            return None, None

        # Strategy 1: Single gender - create symmetric versions
        if len(gender_words) == 1:
            return self._create_single_gender_pairs(text, gender_words[0])

        # Strategy 2: Multiple genders - rewrite every word to the target gender
        return self._create_multi_gender_pairs(text, gender_words)

    def _render_version(self, text: str, gender_words: List[Dict], keep_male: bool) -> str:
        """Rebuild *text* so that every gendered word has the target gender.

        Args:
            text: The original sentence.
            gender_words: Entries from ``extract_gender_words_with_roles``.
            keep_male: True to produce the male version, False for female.

        Returns:
            The rewritten sentence. Words already of the target gender are
            copied verbatim; the others are swapped with case preserved.
        """
        pieces = []
        cursor = 0
        for info in sorted(gender_words, key=lambda w: w['start']):
            start, end = info['start'], info['end']
            pieces.append(text[cursor:start])
            if info['is_male'] == keep_male:
                pieces.append(text[start:end])
            else:
                # Context is read from the untouched original text, so earlier
                # replacements cannot disturb the lookahead.
                pieces.append(
                    self.gender_processor.get_contextual_opposite(text, start, end)
                )
            cursor = end
        pieces.append(text[cursor:])
        return ''.join(pieces)

    def _create_single_gender_pairs(self, text: str, gender_word: Dict) -> Tuple[str, str]:
        """Handle a sentence with exactly one gendered word.

        The original sentence is kept as the version matching the word's own
        gender; the other version has that word swapped.

        Returns:
            ``(male_version, female_version)``.
        """
        words = [gender_word]
        return (
            self._render_version(text, words, keep_male=True),
            self._render_version(text, words, keep_male=False),
        )

    def _create_multi_gender_pairs(self, text: str, gender_words: List[Dict]) -> Tuple[str, str]:
        """Handle a sentence with several gendered words.

        Male words are kept in the male-dominant version and swapped in the
        female-dominant one; female words are treated the other way round.
        The original letter case of every swapped word is preserved.

        Returns:
            ``(male_dominant, female_dominant)``.
        """
        male_dominant = self._render_version(text, gender_words, keep_male=True)
        female_dominant = self._render_version(text, gender_words, keep_male=False)
        return male_dominant, female_dominant

    def create_neutral_template(self, text: str) -> str:
        """Replace every gendered word in *text* with the literal ``[GENDER]``.

        Returns:
            The templated sentence, or *text* unchanged when it contains no
            gendered word.
        """
        gender_words = self.gender_processor.extract_gender_words_with_roles(text)

        if not gender_words:
            return text

        pieces = []
        cursor = 0
        for word_info in gender_words:  # already in ascending offset order
            pieces.append(text[cursor:word_info['start']])
            pieces.append('[GENDER]')
            cursor = word_info['end']
        pieces.append(text[cursor:])
        return ''.join(pieces)


class EnglishDebiasDataLoader:
    """Build debiasing prompt pairs from GenderBench probes."""

    def __init__(self):
        self.converter = EnglishStereotypeConverter()
        self.gender_processor = EnglishGenderProcessor()

    def load_direct_probe_data(self) -> List[Dict]:
        """Load GenderBench DirectProbe items and convert them to gender pairs.

        The stereotype is taken from the first double-quoted span of each
        item's first prompt. Items without such a span, without gendered
        words, or whose two versions come out identical are skipped.

        Returns:
            Dicts with keys ``type``, ``original``, ``neutral_template``,
            ``version1`` (male), ``version2`` (female) and ``source``.

        Raises:
            ImportError: If ``genderbench`` cannot be imported.
        """
        from genderbench.probes.direct.direct_probe import DirectProbe

        probe = DirectProbe()
        items = probe._create_probe_items()

        converted_data = []
        for item in items:
            if not (hasattr(item, 'prompts') and item.prompts):
                continue
            match = re.search(r'"([^"]+)"', item.prompts[0].text)
            if not match:
                continue
            stereotype = match.group(1)

            # Smart conversion
            version1, version2 = self.converter.create_balanced_pairs(stereotype)
            if not (version1 and version2 and version1 != version2):
                continue

            # Guarded like 'prompts' above: tolerate items without metadata.
            metadata = getattr(item, 'metadata', None) or {}
            converted_data.append({
                'type': 'stereotype',
                'original': stereotype,
                'neutral_template': self.converter.create_neutral_template(stereotype),
                'version1': version1,
                'version2': version2,
                'source': metadata.get('source', 'unknown')
            })

        return converted_data

    def load_jobs_probe_data(self) -> List[Dict]:
        """Load GenderBench JobsLumProbe items and build per-job prompts.

        The job title is extracted from the first prompt of each item with
        the pattern ``works as a/an <job>. Please``.

        Returns:
            Dicts with keys ``type``, ``job`` and ``prompts`` (``base``,
            ``male_prompt``, ``female_prompt``, ``neutral_prompt``). On any
            failure the error is printed and an empty list is returned.
        """
        # Deliberately best-effort: job data is optional, so any failure
        # (missing package, probe API change) degrades to an empty list.
        try:
            from genderbench.probes.jobs_lum.jobs_lum_probe import JobsLumProbe

            probe = JobsLumProbe()
            items = probe._create_probe_items()

            converted_data = []
            for item in items:
                if not (hasattr(item, 'prompts') and item.prompts):
                    continue
                match = re.search(r'works as a/an (.+?)\. Please', item.prompts[0].text)
                if not match:
                    continue
                job = match.group(1)

                converted_data.append({
                    'type': 'job',
                    'job': job,
                    'prompts': {
                        'base': f"Describe this {job}:",
                        'male_prompt': f"This {job} is a man. Describe his characteristics:",
                        'female_prompt': f"This {job} is a woman. Describe her characteristics:",
                        'neutral_prompt': f"This {job} is a professional. Describe their characteristics:"
                    }
                })

            return converted_data
        except Exception as e:
            print(f"Failed to load Jobs data: {e}")
            return []

    def create_training_data_for_em(self) -> List[Dict]:
        """Create training data specifically for Entropy Minimization.

        Returns:
            One dict per prompt pair with keys ``type`` ('stereotype_pair' or
            'job_pair'), ``prompt_male``, ``prompt_female``, ``constraint``
            and either ``original`` or ``job``.

        Raises:
            ImportError: If ``genderbench`` cannot be imported (propagated
                from ``load_direct_probe_data``).
        """
        # Load both stereotype and job data
        stereotype_data = self.load_direct_probe_data()
        job_data = self.load_jobs_probe_data()

        training_data = []

        # Process stereotype data
        for item in stereotype_data:
            training_data.append({
                'type': 'stereotype_pair',
                'prompt_male': f"Do you agree with this statement: '{item['version1']}'? Please explain your reasoning.",
                'prompt_female': f"Do you agree with this statement: '{item['version2']}'? Please explain your reasoning.",
                'constraint': 'responses_must_be_identical_except_pronouns',
                'original': item['original']
            })

        # Process job data
        for item in job_data:
            training_data.append({
                'type': 'job_pair',
                'prompt_male': item['prompts']['male_prompt'],
                'prompt_female': item['prompts']['female_prompt'],
                'constraint': 'responses_must_be_identical_except_pronouns',
                'job': item['job']
            })

        return training_data


def demonstrate_english_conversion():
    """Print a walkthrough of stereotype, job and training-pair generation."""
    print("🚀 === English Gender Debias Conversion Demo ===")

    # Load data
    loader = EnglishDebiasDataLoader()

    # Load stereotype data
    print("📊 Loading stereotype data...")
    stereotype_data = loader.load_direct_probe_data()
    print(f"✅ Successfully converted {len(stereotype_data)} stereotypes")

    # Show conversion examples
    print("\n🎯 English Conversion Examples:")
    for i, item in enumerate(stereotype_data[:8]):
        print(f" {i+1}. Original: {item['original']}")
        print(f" Template: {item['neutral_template']}")
        print(f" Version 1: {item['version1']}")
        print(f" Version 2: {item['version2']}")

        # Analyze differences
        words1 = set(item['version1'].lower().split())
        words2 = set(item['version2'].lower().split())
        diff = words1.symmetric_difference(words2)
        print(f" Different words: {diff}")
        print()

    # Load job data
    print("📊 Loading job data...")
    job_data = loader.load_jobs_probe_data()
    print(f"✅ Successfully loaded {len(job_data)} jobs")

    # Show job examples
    print("\n💼 English Job Examples:")
    for i, item in enumerate(job_data[:5]):
        print(f" {i+1}. Job: {item['job']}")
        print(f" Base: {item['prompts']['base']}")
        print(f" Male: {item['prompts']['male_prompt']}")
        print(f" Female: {item['prompts']['female_prompt']}")
        print(f" Neutral: {item['prompts']['neutral_prompt']}")
        print()

    # Create training data
    print("📊 Creating training data for EM...")
    training_data = loader.create_training_data_for_em()
    print(f"✅ Created {len(training_data)} training pairs")

    # Show training data examples
    print("\n🎯 Training Data Examples:")
    for i, item in enumerate(training_data[:3]):
        print(f" {i+1}. Type: {item['type']}")
        print(f" Male prompt: {item['prompt_male']}")
        print(f" Female prompt: {item['prompt_female']}")
        print(f" Constraint: {item['constraint']}")
        print()

    # Analysis
    print("📊 Analysis:")
    stereotype_pairs = sum(1 for item in training_data if item['type'] == 'stereotype_pair')
    job_pairs = sum(1 for item in training_data if item['type'] == 'job_pair')
    print(f" - Stereotype pairs: {stereotype_pairs}")
    print(f" - Job pairs: {job_pairs}")
    print(f" - Total pairs: {len(training_data)}")
    print(" - Ready for EM training: ✅")


if __name__ == "__main__":
    demonstrate_english_conversion()