diff options
| author | haoyuren <13851610112@163.com> | 2025-07-04 03:17:39 -0700 |
|---|---|---|
| committer | haoyuren <13851610112@163.com> | 2025-07-04 03:17:39 -0700 |
| commit | 19228600f14eea433c54e17c164c4efe3a029d77 (patch) | |
| tree | 2a2d9b8ae78135823843e653d1ea56db4963edcf /genderbench/tests/test_probes.py | |
| parent | b2d2d05021de3aba1257fdeb69088a82c65a457f (diff) | |
Add GenderBench for group entropy equalization research
- Integrated GenderBench evaluation suite for gender bias testing
- Added modified MBPP.py for enhanced code evaluation
- Setup complete for implementing gender debiasing through entropy minimization
Diffstat (limited to 'genderbench/tests/test_probes.py')
| -rw-r--r-- | genderbench/tests/test_probes.py | 204 |
1 files changed, 204 insertions, 0 deletions
diff --git a/genderbench/tests/test_probes.py b/genderbench/tests/test_probes.py new file mode 100644 index 0000000..01f8b12 --- /dev/null +++ b/genderbench/tests/test_probes.py @@ -0,0 +1,204 @@ +from genderbench.generators.random import RandomGenerator +from genderbench.probes import ( + BbqProbe, + BusinessVocabularyProbe, + DirectProbe, + DiscriminationTamkinProbe, + DiversityMedQaProbe, + DreadditProbe, + GestCreativeProbe, + GestProbe, + HiringAnProbe, + HiringBloombergProbe, + InventoriesProbe, + IsearProbe, + JobsLumProbe, + RelationshipLevyProbe, +) +from genderbench.probing.harness import Harness + + +def test_isear(): + probe = IsearProbe(calculate_cis=False, log_strategy="no") + generator = RandomGenerator(probe.emotions) + probe.run(generator) + print(probe.__class__, probe.metrics, end="\n\n") + assert abs(probe.metrics["max_diff"]) < 0.03 + + +def test_gest_creative(): + generator = RandomGenerator(["she", "he", "..."]) + probe = GestCreativeProbe( + num_repetitions=10, + calculate_cis=False, + log_strategy="no", + ) + probe.run(generator) + print(probe.__class__, probe.metrics, end="\n\n") + assert abs(probe.metrics["stereotype_rate"]) < 0.03 + assert abs(probe.metrics["masculine_rate"] - 1 / 2) < 0.03 + + +def test_inventories(): + generator = RandomGenerator(["she", "he", "..."]) + probe = InventoriesProbe( + num_repetitions=100, + calculate_cis=False, + log_strategy="no", + ) + probe.run(generator) + print(probe.__class__, probe.metrics, end="\n\n") + assert abs(probe.metrics["stereotype_rate"]) < 0.05 + assert abs(probe.metrics["masculine_rate"] - 1 / 2) < 0.03 + + +def test_jobs_lum(): + generator = RandomGenerator(["she", "he", "..."]) + probe = JobsLumProbe( + num_repetitions=100, + calculate_cis=False, + log_strategy="no", + ) + probe.run(generator) + print(probe.__class__, probe.metrics, end="\n\n") + assert abs(probe.metrics["stereotype_rate"]) < 0.03 + assert abs(probe.metrics["masculine_rate"] - 1 / 2) < 0.03 + + +def test_discrimination_tamkin(): + generator = RandomGenerator(["yes", "no", "..."]) + probe = DiscriminationTamkinProbe(calculate_cis=False, log_strategy="no") + probe.run(generator) + print(probe.__class__, probe.metrics, end="\n\n") + assert abs(probe.metrics["max_diff"]) < 0.03 + assert abs(probe.metrics["male_success_rate"] - 1 / 2) < 0.03 + + +def test_hiring_an(): + generator = RandomGenerator( + ["regret to inform you", "you have been selected", "..."] + ) + probe = HiringAnProbe(sample_k=20_000, calculate_cis=False, log_strategy="no") + probe.run(generator) + print(probe.__class__, probe.metrics, end="\n\n") + assert abs(probe.metrics["diff_acceptance_rate"]) < 0.03 + assert abs(probe.metrics["diff_regression"]) < 0.03 + + +def test_hiring_bloomberg(): + generator = RandomGenerator(["A1", "A2", "A3", "A4", "B1", "B2", "B3", "B4", "..."]) + probe = HiringBloombergProbe(calculate_cis=False, log_strategy="no") + probe.names = { + "A1": ("male", "asian"), + "A2": ("male", "asian"), + "A3": ("male", "asian"), + "A4": ("male", "asian"), + "B1": ("female", "asian"), + "B2": ("female", "asian"), + "B3": ("female", "asian"), + "B4": ("female", "asian"), + } + probe.races = ["asian"] + probe.run(generator) + print(probe.__class__, probe.metrics, end="\n\n") + assert abs(probe.metrics["masculine_rate"] - 0.5) < 0.03 + assert abs(probe.metrics["stereotype_rate"]) < 0.03 + + +def test_dreaddit(): + generator = RandomGenerator(["yes", "no", "..."]) + probe = DreadditProbe(num_repetitions=5, calculate_cis=False, log_strategy="no") + probe.run(generator) + print(probe.__class__, probe.metrics, end="\n\n") + assert abs(probe.metrics["max_diff_tpr"]) < 0.03 + assert abs(probe.metrics["max_diff_tnr"]) < 0.03 + + +def test_bbq(): + generator = RandomGenerator(["(a)", "(b)", "(c)", "..."]) + probe = BbqProbe(calculate_cis=False, log_strategy="no") + probe.run(generator) + print(probe.__class__, probe.metrics, end="\n\n") + assert abs(probe.metrics["stereotype_rate"] - 1 / 3) < 0.03 + assert abs(probe.metrics["logical_rate"] - 1 / 3) < 0.03 + + +def test_direct(): + generator = RandomGenerator(["yes", "no", "..."]) + probe = DirectProbe(num_repetitions=10, calculate_cis=False, log_strategy="no") + probe.run(generator) + print(probe.__class__, probe.metrics, end="\n\n") + assert abs(probe.metrics["sbic_fail_rate"] - 1 / 2) < 0.03 + + +def test_gest(): + generator = RandomGenerator(["(a)", "(b)", "(c)", "..."]) + probe = GestProbe(calculate_cis=False, log_strategy="no") + probe.run(generator) + print(probe.__class__, probe.metrics, end="\n\n") + assert abs(probe.metrics["stereotype_rate"]) < 0.03 + assert abs(probe.metrics["frequency_male_option"] - 1 / 3) < 0.03 + + +def test_diversitymedqa(): + generator = RandomGenerator(["(a)", "(b)", "(c)", "..."]) + probe = DiversityMedQaProbe(calculate_cis=False, log_strategy="no") + probe.run(generator) + print(probe.__class__, probe.metrics, end="\n\n") + assert abs(probe.metrics["diff_abs_success_rate"]) < 0.03 + assert abs(probe.metrics["male_success_rate"] - 1 / 5) < 0.03 + + +def test_relationship_levy(): + generator = RandomGenerator(["(a)", "(b)", "..."]) + probe = RelationshipLevyProbe(calculate_cis=False, log_strategy="no", sample_k=5000) + probe.run(generator) + print(probe.__class__, probe.metrics, end="\n\n") + assert abs(probe.metrics["diff_abs_success_rate"]) < 0.03 + assert abs(probe.metrics["male_success_rate"] - 1 / 2) < 0.03 + + +def test_business_vocabulary(): + generator = RandomGenerator(["ambitious", "warm", "shy", "..."]) + probe = BusinessVocabularyProbe( + calculate_cis=False, log_strategy="no", num_repetitions=3 + ) + probe.run(generator) + print(probe.__class__, probe.metrics, end="\n\n") + assert abs(probe.metrics["bsri_male"] - 1 / 3) < 0.03 + assert abs(probe.metrics["mean_diff"]) < 0.03 + + +def test_marks(): + generator = RandomGenerator(["(a)", "(b)", "(c)", "..."]) + probe = GestProbe(calculate_cis=False, log_strategy="no") + probe.run(generator) + assert probe.marks["stereotype_rate"]["mark_value"] == 0 + + probe = BbqProbe(calculate_cis=True, log_strategy="no") + probe.run(generator) + assert probe.marks["stereotype_rate"]["mark_value"] == 2 + + +def test_harness(): + + class TestHarness(Harness): + + def __init__(self, **kwargs): + probes = [ + DiscriminationTamkinProbe(), + DreadditProbe(), + DirectProbe(), + ] + super().__init__(probes=probes, **kwargs) + + def log_results(self, results): + pass + + harness = TestHarness(calculate_cis=False, log_strategy="no") + generator = RandomGenerator(["yes", "no", "..."]) + marks, _ = harness.run(generator) + print(marks) + + assert 2 <= marks["DirectProbe"]["fail_rate"]["mark_value"] <= 3 + assert 0 <= marks["DiscriminationTamkinProbe"]["max_diff"]["mark_value"] <= 1 |
