1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
|
"""
Extended datasets for challenging personalization evaluation.
New datasets added:
- GPQA: PhD-level science questions
- TheoremQA: Theorem-based math proofs
- LiveCodeBench: Recent competitive programming
- AIME: American Invitational Mathematics Examination
- SciCode: Scientific computing problems
All datasets encourage step-by-step problem solving for longer sessions.
"""
from abc import ABC, abstractmethod
from typing import List, Dict, Any, Optional
from dataclasses import dataclass
import json
from pathlib import Path
try:
from datasets import load_dataset
HF_AVAILABLE = True
except ImportError:
HF_AVAILABLE = False
print("Warning: huggingface datasets not available")
@dataclass
class DatasetSample:
    """A single sample from a dataset."""
    # Problem statement shown to the user/assistant.
    problem: str
    # Reference answer or solution used for grading.
    solution: str
    # Unique identifier, e.g. "math500_3" or an upstream task id.
    problem_id: str
    # Coarse domain tag, e.g. "math", "code", "reasoning".
    domain: str
    # Dataset-specific difficulty label (e.g. "Level 5", "phd"), if any.
    difficulty: Optional[str] = None
    # Extra dataset-specific fields (subject, tests, tags, ...).
    metadata: Optional[Dict] = None
class BaseDataset(ABC):
    """Base class for all datasets.

    Subclasses implement `_load_data` plus the `name`/`domain`/
    `task_description` properties; split loading is lazy and cached.
    """
    def __init__(self, eval_size: int = 100, train_size: int = 100, cache_dir: Optional[str] = None):
        # Caps on how many samples get_testset()/get_trainset() return.
        self.eval_size = eval_size
        self.train_size = train_size
        # Not read by any loader visible in this module; kept for subclass use.
        self.cache_dir = cache_dir
        # Lazy per-split caches, populated on first access.
        self._test_data: Optional[List[DatasetSample]] = None
        self._train_data: Optional[List[DatasetSample]] = None

    @property
    @abstractmethod
    def name(self) -> str:
        """Short identifier of the dataset (used as the registry key)."""
        pass

    @property
    @abstractmethod
    def domain(self) -> str:
        """Coarse domain tag, e.g. "math" or "code"."""
        pass

    @property
    @abstractmethod
    def task_description(self) -> str:
        """Description of the task for user simulator."""
        pass

    @abstractmethod
    def _load_data(self, split: str) -> List[DatasetSample]:
        """Load all samples for `split` ("train"/"test"); return [] on failure."""
        pass

    def get_testset(self) -> List[DatasetSample]:
        """Return (and cache) up to `eval_size` test samples."""
        if self._test_data is None:
            self._test_data = self._load_data("test")[:self.eval_size]
        return self._test_data

    def get_trainset(self) -> List[DatasetSample]:
        """Return (and cache) up to `train_size` train samples."""
        if self._train_data is None:
            self._train_data = self._load_data("train")[:self.train_size]
        return self._train_data
# =============================================================================
# Existing Datasets (Enhanced with step-by-step prompts)
# =============================================================================
class MATH500Dataset(BaseDataset):
    """MATH-500 dataset with step-by-step encouragement."""

    @property
    def name(self) -> str:
        return "math-500"

    @property
    def domain(self) -> str:
        return "math"

    @property
    def task_description(self) -> str:
        return """You are trying to solve a mathematics problem. The problem requires careful
reasoning and step-by-step work. You will collaborate with an AI assistant to understand
and solve the problem. Break the problem into parts and work through each step carefully.
Ask the assistant to explain their reasoning at each step."""

    def _load_data(self, split: str) -> List[DatasetSample]:
        # MATH-500 only ships a "test" split, so the requested split is ignored.
        if not HF_AVAILABLE:
            return []
        records = load_dataset("HuggingFaceH4/MATH-500", split="test")
        return [
            DatasetSample(
                problem=record["problem"],
                solution=record["answer"],
                problem_id=f"math500_{idx}",
                domain="math",
                difficulty=record.get("level"),
                metadata={"type": record.get("type")},
            )
            for idx, record in enumerate(records)
        ]
class MATHHardDataset(BaseDataset):
    """MATH-Hard (Level 4-5 only)."""

    @property
    def name(self) -> str:
        return "math-hard"

    @property
    def domain(self) -> str:
        return "math"

    @property
    def task_description(self) -> str:
        return """You are working on a challenging mathematics competition problem. These problems
require deep mathematical insight and careful reasoning. Work through the problem step by step,
explaining your approach clearly. Don't hesitate to ask for hints or verification of your reasoning."""

    def _load_data(self, split: str) -> List[DatasetSample]:
        if not HF_AVAILABLE:
            return []
        hf_split = "test" if split == "test" else "train"
        records = load_dataset("lighteval/MATH-Hard", split=hf_split)
        # Keep only the two hardest levels.
        hard_levels = {"Level 4", "Level 5"}
        return [
            DatasetSample(
                problem=record["problem"],
                solution=record.get("answer", record.get("solution", "")),
                problem_id=f"mathhard_{idx}",
                domain="math",
                difficulty=record.get("level", ""),
                metadata={"type": record.get("type")},
            )
            for idx, record in enumerate(records)
            if record.get("level", "") in hard_levels
        ]
class HumanEvalDataset(BaseDataset):
    """HumanEval code generation."""

    @property
    def name(self) -> str:
        return "humaneval"

    @property
    def domain(self) -> str:
        return "code"

    @property
    def task_description(self) -> str:
        return """You are implementing a Python function. Think through the problem carefully,
consider edge cases, and implement the solution step by step. Ask for clarification on any
ambiguous requirements. Discuss your approach before writing code."""

    def _load_data(self, split: str) -> List[DatasetSample]:
        # HumanEval only provides a "test" split; `split` is ignored.
        if not HF_AVAILABLE:
            return []
        records = load_dataset("openai/openai_humaneval", split="test")
        return [
            DatasetSample(
                problem=record["prompt"],
                solution=record["canonical_solution"],
                problem_id=record["task_id"],
                domain="code",
                metadata={"entry_point": record["entry_point"], "test": record["test"]},
            )
            for record in records
        ]
class BigCodeBenchDataset(BaseDataset):
    """BigCodeBench - harder code generation."""

    @property
    def name(self) -> str:
        return "bigcodebench"

    @property
    def domain(self) -> str:
        return "code"

    @property
    def task_description(self) -> str:
        return """You are working on a complex programming task that requires multiple libraries
and careful implementation. Break down the problem, discuss the approach, and implement step by step.
Ask about library choices and implementation details."""

    def _load_data(self, split: str) -> List[DatasetSample]:
        if not HF_AVAILABLE:
            return []
        # BigCodeBench names its splits after dataset versions, not train/test.
        records = load_dataset("bigcode/bigcodebench", split="v0.1.2")
        return [
            DatasetSample(
                problem=record["instruct_prompt"],
                solution=record["canonical_solution"],
                problem_id=record["task_id"],
                domain="code",
                difficulty="hard",
                metadata={"libs": record.get("libs", [])},
            )
            for record in records
        ]
class LogiQADataset(BaseDataset):
    """LogiQA logical reasoning."""

    @property
    def name(self) -> str:
        return "logiqa"

    @property
    def domain(self) -> str:
        return "reasoning"

    @property
    def task_description(self) -> str:
        return """You are solving a logical reasoning problem. Read the passage carefully,
analyze each answer choice, and reason through the logic step by step. Explain your
reasoning process clearly."""

    def _load_data(self, split: str) -> List[DatasetSample]:
        if not HF_AVAILABLE:
            return []
        hf_split = split if split in ("train", "test") else "test"
        records = load_dataset("lucasmccabe/logiqa", split=hf_split)
        collected: List[DatasetSample] = []
        for idx, record in enumerate(records):
            # Label options A, B, C, ... in order.
            option_lines = [f"{chr(65 + pos)}. {choice}" for pos, choice in enumerate(record["options"])]
            question_text = (
                f"{record['context']}\n\nQuestion: {record['query']}\n\nOptions:\n"
                + "\n".join(option_lines)
            )
            collected.append(
                DatasetSample(
                    problem=question_text,
                    solution=chr(65 + record["correct_option"]),
                    problem_id=f"logiqa_{idx}",
                    domain="reasoning",
                )
            )
        return collected
class MMLUDataset(BaseDataset):
    """MMLU multi-domain knowledge."""

    @property
    def name(self) -> str:
        return "mmlu"

    @property
    def domain(self) -> str:
        return "knowledge"

    @property
    def task_description(self) -> str:
        return """You are answering a knowledge question that requires domain expertise.
Think through the question carefully, consider what you know about the topic, and
reason to the correct answer. Explain your thought process."""

    def _load_data(self, split: str) -> List[DatasetSample]:
        if not HF_AVAILABLE:
            return []
        # Restrict to the harder technical subjects.
        hard_subjects = ["abstract_algebra", "college_mathematics", "college_physics",
                         "formal_logic", "high_school_physics", "machine_learning"]
        collected: List[DatasetSample] = []
        for subject in hard_subjects:
            # A subject that fails to download or parse is skipped entirely.
            try:
                subject_records = load_dataset("cais/mmlu", subject, split="test")
                for idx, record in enumerate(subject_records):
                    option_lines = [f"{chr(65 + pos)}. {choice}" for pos, choice in enumerate(record["choices"])]
                    collected.append(
                        DatasetSample(
                            problem=f"{record['question']}\n\nOptions:\n" + "\n".join(option_lines),
                            solution=chr(65 + record["answer"]),
                            problem_id=f"mmlu_{subject}_{idx}",
                            domain="knowledge",
                            metadata={"subject": subject},
                        )
                    )
            except Exception:
                continue
        return collected
class MedQADataset(BaseDataset):
    """MedQA medical knowledge."""

    @property
    def name(self) -> str:
        return "medqa"

    @property
    def domain(self) -> str:
        return "medical"

    @property
    def task_description(self) -> str:
        return """You are answering a medical knowledge question. Consider the clinical
scenario carefully, think through the pathophysiology, and reason to the correct answer.
Explain your medical reasoning step by step."""

    def _load_data(self, split: str) -> List[DatasetSample]:
        if not HF_AVAILABLE:
            return []
        hf_split = split if split in ("train", "test") else "test"
        records = load_dataset("bigbio/med_qa", "med_qa_en_source", split=hf_split)
        collected: List[DatasetSample] = []
        for idx, record in enumerate(records):
            choices = record["options"]
            # Options may arrive either pre-keyed (dict) or as a plain list.
            if isinstance(choices, dict):
                option_lines = [f"{key}. {text}" for key, text in choices.items()]
            else:
                option_lines = [f"{chr(65 + pos)}. {text}" for pos, text in enumerate(choices)]
            collected.append(
                DatasetSample(
                    problem=f"{record['question']}\n\nOptions:\n" + "\n".join(option_lines),
                    solution=record["answer_idx"],
                    problem_id=f"medqa_{idx}",
                    domain="medical",
                )
            )
        return collected
# =============================================================================
# NEW Challenging Datasets
# =============================================================================
class GPQADataset(BaseDataset):
    """GPQA - Graduate-level PhD science questions.

    Extremely challenging questions that require deep domain expertise.
    Perfect for testing complex, multi-step reasoning preferences.
    """
    @property
    def name(self) -> str:
        return "gpqa"

    @property
    def domain(self) -> str:
        return "science"

    @property
    def task_description(self) -> str:
        return """You are working on a PhD-level science question that requires deep domain expertise.
These questions are extremely challenging and require careful, methodical reasoning.
Break the problem into parts, discuss the relevant concepts, and work through each step.
Don't hesitate to ask for clarification or verification of your reasoning at each step.
Consider multiple approaches before committing to an answer."""

    def _load_data(self, split: str) -> List[DatasetSample]:
        """Load GPQA-diamond; returns [] when the datasets lib or download is unavailable."""
        if not HF_AVAILABLE:
            return []
        # GPQA diamond is the hardest subset
        try:
            ds = load_dataset("Idavidrein/gpqa", "gpqa_diamond", split="train")
        except Exception:
            return []
        samples = []
        for i, item in enumerate(ds):
            # Format the multiple choice
            # NOTE(review): the published Idavidrein/gpqa card lists columns such as
            # "Question", "Correct Answer" and "Incorrect Answer 1..3" — not
            # "choice_A".."choice_D" / "question" / "correct_answer" as read here.
            # If that schema applies, `choices` ends up empty, item['question'] raises
            # KeyError, and the solution silently defaults to "A". Verify the field
            # names against the dataset before relying on this loader.
            choices = [item.get(f"choice_{c}", "") for c in ["A", "B", "C", "D"] if item.get(f"choice_{c}")]
            options_str = "\n".join([f"{chr(65+j)}. {opt}" for j, opt in enumerate(choices)])
            problem = f"{item['question']}\n\nOptions:\n{options_str}"
            samples.append(DatasetSample(
                problem=problem,
                solution=item.get("correct_answer", "A"),
                problem_id=f"gpqa_{i}",
                domain="science",
                difficulty="phd",
                metadata={"subdomain": item.get("subdomain", "unknown")}
            ))
        return samples
class TheoremQADataset(BaseDataset):
    """TheoremQA - Theorem-based mathematical reasoning.

    Requires applying mathematical theorems to solve problems.
    Tests formal mathematical reasoning and explanation preferences.
    """

    @property
    def name(self) -> str:
        return "theoremqa"

    @property
    def domain(self) -> str:
        return "math"

    @property
    def task_description(self) -> str:
        return """You are solving a theorem-based mathematics problem. This requires identifying
the relevant mathematical theorems, understanding their conditions, and applying them correctly.
Work through the problem step by step:
1. Identify what theorems might apply
2. Verify the conditions are met
3. Apply the theorem carefully
4. Verify the result
Discuss your reasoning at each step and ask for verification when needed."""

    def _load_data(self, split: str) -> List[DatasetSample]:
        if not HF_AVAILABLE:
            return []
        try:
            records = load_dataset("TIGER-Lab/TheoremQA", split="test")
        except Exception:
            return []
        return [
            DatasetSample(
                problem=record["question"],
                solution=str(record.get("answer", "")),
                problem_id=f"theoremqa_{idx}",
                domain="math",
                difficulty="hard",
                metadata={
                    "theorem": record.get("theorem", ""),
                    "field": record.get("field", ""),
                },
            )
            for idx, record in enumerate(records)
        ]
class AIMEDataset(BaseDataset):
    """AIME - American Invitational Mathematics Examination.

    Competition-level math problems requiring creative problem-solving.
    Answers are integers from 0-999.
    """

    @property
    def name(self) -> str:
        return "aime"

    @property
    def domain(self) -> str:
        return "math"

    @property
    def task_description(self) -> str:
        return """You are working on an AIME (American Invitational Mathematics Examination) problem.
These are competition math problems that require creative problem-solving approaches.
The answer is always an integer from 000 to 999.
Work through the problem systematically:
1. Understand what the problem is asking
2. Explore different approaches
3. Calculate carefully
4. Verify your answer
Discuss your thought process and ask for hints if you're stuck."""

    def _load_data(self, split: str) -> List[DatasetSample]:
        if not HF_AVAILABLE:
            return []
        # Preferred source first; fall back to filtering AIME items out of the
        # general competition-math dump if it is unavailable.
        try:
            records = load_dataset("AI-MO/aimo-validation-aime", split="train")
        except Exception:
            try:
                all_math = load_dataset("hendrycks/competition_math", split="test")
                records = [r for r in all_math if "AIME" in r.get("source", "")]
            except Exception:
                return []
        return [
            DatasetSample(
                problem=record.get("problem", record.get("question", "")),
                solution=str(record.get("answer", record.get("solution", ""))),
                problem_id=f"aime_{idx}",
                domain="math",
                difficulty="competition",
                metadata={"year": record.get("year", ""), "problem_num": record.get("problem_number", "")},
            )
            for idx, record in enumerate(records)
        ]
class LiveCodeBenchDataset(BaseDataset):
    """LiveCodeBench - Recent competitive programming problems.

    Problems from recent programming contests (post-training cutoff).
    Tests code generation on truly novel problems.
    """

    @property
    def name(self) -> str:
        return "livecodebench"

    @property
    def domain(self) -> str:
        return "code"

    @property
    def task_description(self) -> str:
        return """You are solving a competitive programming problem from recent contests.
These problems require careful algorithm design and implementation.
Approach systematically:
1. Understand the problem constraints
2. Identify the algorithm pattern (DP, graphs, greedy, etc.)
3. Design the solution approach
4. Implement carefully with attention to edge cases
5. Analyze time/space complexity
Discuss your approach before coding and verify your logic at each step."""

    def _load_data(self, split: str) -> List[DatasetSample]:
        if not HF_AVAILABLE:
            return []
        try:
            records = load_dataset("livecodebench/livecodebench", split="test")
        except Exception:
            return []
        return [
            DatasetSample(
                problem=record.get("question_content", record.get("problem", "")),
                solution=record.get("solution", ""),
                problem_id=record.get("question_id", f"lcb_{idx}"),
                domain="code",
                difficulty=record.get("difficulty", "unknown"),
                metadata={
                    "contest": record.get("contest_name", ""),
                    "date": record.get("contest_date", ""),
                    "tags": record.get("tags", []),
                },
            )
            for idx, record in enumerate(records)
        ]
class SciCodeDataset(BaseDataset):
    """SciCode - Scientific computing problems.

    Requires domain knowledge + coding ability.
    Tests both scientific reasoning and implementation preferences.
    """

    @property
    def name(self) -> str:
        return "scicode"

    @property
    def domain(self) -> str:
        return "science-code"

    @property
    def task_description(self) -> str:
        return """You are implementing a scientific computing solution. This requires both
domain knowledge (physics, chemistry, biology, etc.) and programming expertise.
Approach the problem by:
1. Understanding the scientific concepts involved
2. Formulating the mathematical model
3. Designing the computational approach
4. Implementing with proper numerical methods
5. Validating the results make scientific sense
Discuss the science and the code at each step."""

    def _load_data(self, split: str) -> List[DatasetSample]:
        if not HF_AVAILABLE:
            return []
        try:
            records = load_dataset("xlangai/SciCode", split="test")
        except Exception:
            return []
        return [
            DatasetSample(
                problem=record.get("problem", ""),
                solution=record.get("solution", ""),
                problem_id=f"scicode_{idx}",
                domain="science-code",
                difficulty="hard",
                metadata={
                    "discipline": record.get("discipline", ""),
                    "libraries": record.get("libraries", []),
                },
            )
            for idx, record in enumerate(records)
        ]
# =============================================================================
# Dataset Registry
# =============================================================================
# Maps registry key -> dataset class. Each key must match the class's
# `name` property so lookups and self-reported names stay consistent.
DATASET_REGISTRY = {
    # Existing (enhanced)
    "math-500": MATH500Dataset,
    "math-hard": MATHHardDataset,
    "humaneval": HumanEvalDataset,
    "bigcodebench": BigCodeBenchDataset,
    "logiqa": LogiQADataset,
    "mmlu": MMLUDataset,
    "medqa": MedQADataset,
    # New challenging datasets
    "gpqa": GPQADataset,
    "theoremqa": TheoremQADataset,
    "aime": AIMEDataset,
    "livecodebench": LiveCodeBenchDataset,
    "scicode": SciCodeDataset,
}
def get_dataset(name: str, **kwargs) -> BaseDataset:
    """Instantiate the registered dataset called `name`, forwarding kwargs."""
    factory = DATASET_REGISTRY.get(name)
    if factory is None:
        raise ValueError(f"Unknown dataset: {name}. Available: {list(DATASET_REGISTRY.keys())}")
    return factory(**kwargs)
def get_all_datasets(**kwargs) -> Dict[str, BaseDataset]:
    """Instantiate every registered dataset, keyed by registry name."""
    instances: Dict[str, BaseDataset] = {}
    for registry_name, dataset_cls in DATASET_REGISTRY.items():
        instances[registry_name] = dataset_cls(**kwargs)
    return instances
def get_challenging_datasets(**kwargs) -> Dict[str, BaseDataset]:
    """Get only the new challenging datasets."""
    selected = ("gpqa", "theoremqa", "aime", "livecodebench", "scicode")
    return {key: DATASET_REGISTRY[key](**kwargs) for key in selected}
# =============================================================================
# Step-by-Step Query Wrapper
# =============================================================================
def wrap_with_step_by_step_prompt(sample: "DatasetSample") -> str:
    """Wrap a problem with prompts encouraging step-by-step interaction.

    This makes sessions longer and creates more opportunities for
    preference expression/violation.

    Args:
        sample: The sample to wrap; only its ``domain`` and ``problem``
            attributes are read.

    Returns:
        The domain-specific template with the problem text spliced in.
    """
    domain_prompts = {
        "math": """Let's solve this step by step. Please:
1. First, help me understand what the problem is asking
2. Then, let's identify the key concepts/formulas needed
3. Work through the solution one step at a time
4. Verify our answer at the end
Problem:
{problem}
Let's start by understanding the problem. What is it asking?""",
        "code": """Let's implement this systematically. Please:
1. First, clarify the requirements and edge cases
2. Discuss the algorithm approach before coding
3. Implement step by step, explaining each part
4. Test with examples
Problem:
{problem}
Let's start by understanding the requirements. What are the inputs, outputs, and edge cases?""",
        "reasoning": """Let's think through this carefully. Please:
1. Break down the key information in the passage
2. Analyze each answer choice
3. Eliminate wrong answers with clear reasoning
4. Verify the correct answer
Problem:
{problem}
Let's start by identifying the key facts in this passage.""",
        "science": """Let's approach this PhD-level problem systematically. Please:
1. Identify the domain and key concepts involved
2. Recall relevant theories/equations
3. Work through the reasoning step by step
4. Verify our conclusion is scientifically sound
Problem:
{problem}
Let's start by identifying what field this question is from and what concepts we'll need.""",
        "science-code": """This combines scientific knowledge with coding. Let's:
1. Understand the scientific concepts first
2. Formulate the mathematical approach
3. Design the algorithm
4. Implement and validate
Problem:
{problem}
Let's start by understanding the science behind this problem.""",
    }
    # Generic fallback for domains without a dedicated template.
    default_template = """Let's work through this step by step:
1. Understand the problem
2. Plan our approach
3. Execute carefully
4. Verify our answer
Problem:
{problem}
Let's start by understanding what we need to do."""
    template = domain_prompts.get(sample.domain, default_template)
    # Use str.replace rather than str.format: problem statements routinely
    # contain literal braces (code snippets, LaTeX sets like \{1,2\}, dicts),
    # which would make .format() raise KeyError/ValueError or mangle the text.
    return template.replace("{problem}", sample.problem)
# =============================================================================
# Conflict-Inducing Query Augmentation
# =============================================================================
def augment_for_conflict_testing(sample: "DatasetSample", conflict_type: str) -> str:
    """Augment a query to trigger specific preference conflicts.

    Args:
        sample: The base problem; only its ``problem`` attribute is read.
        conflict_type: Type of conflict to trigger (see keys below).

    Returns:
        Augmented query that triggers the conflict, or the unmodified
        problem text if ``conflict_type`` is not recognized.
    """
    conflict_augmentations = {
        # Verbosity conflict: "quick" + complex problem
        "verbosity": "Quick question - {problem}",
        # Format conflict: asks for both structure types
        "format": "Can you explain this with examples and also give me a summary? {problem}",
        # Tone conflict: frustrated + learning context
        "tone": "I'm so frustrated with this! But I really want to understand it properly. {problem}",
        # Code style conflict: multi-language context
        "code_style": "I need this in Python first, then JavaScript. {problem}",
        # Detail conflict: overview + specifics requested
        "detail": "Give me the big picture but also the specific details. {problem}",
        # Guidance conflict: incremental + full solution
        "guidance": "Walk me through this but also just show me the answer if it's simple. {problem}",
        # Rushed + thorough
        "time_pressure": "I'm in a hurry but this is important so don't skip anything. {problem}",
        # My attempt + fresh perspective
        "approach": "I tried [some approach] but maybe start fresh with a better way? {problem}",
    }
    template = conflict_augmentations.get(conflict_type)
    if template is None:
        return sample.problem
    # str.replace instead of str.format: problem text may contain literal
    # braces (code, LaTeX), which would make .format() raise or mangle it.
    return template.replace("{problem}", sample.problem)
if __name__ == "__main__":
    # Test loading datasets
    # Smoke test: instantiate each registered dataset with a tiny eval size,
    # pull its test set, and report per-dataset success or the failure reason.
    print("Testing dataset loading...")
    for name, cls in DATASET_REGISTRY.items():
        try:
            ds = cls(eval_size=5)
            samples = ds.get_testset()
            print(f"{name}: {len(samples)} samples loaded")
            if samples:
                # Show a short preview of the first problem for eyeballing.
                print(f" Sample: {samples[0].problem[:100]}...")
        except Exception as e:
            # Network/schema failures are expected for some sources; keep going.
            print(f"{name}: Failed - {e}")
|