1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
|
import json
from itertools import zip_longest
import textwrap
def load_data(filepath):
    """Load user records from a JSONL file.

    Args:
        filepath: Path to a JSON-Lines file (one JSON object per line).

    Returns:
        list: One parsed object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Explicit encoding avoids platform-dependent decoding of the log files.
    with open(filepath, 'r', encoding='utf-8') as f:
        # Blank lines (e.g. a trailing newline) are skipped.
        return [json.loads(line) for line in f if line.strip()]
def format_conversation(conv, file_label):
    """Format a conversation into a list of display lines.

    Args:
        conv: Conversation dict; may contain 'conversation',
            'full_conversation_log', and 'evaluation' keys (all optional).
        file_label: Header label identifying the source file/condition.

    Returns:
        list[str]: Lines ready for column printing (no trailing newlines).
    """
    def _is_enforced(value):
        # The logs store this flag inconsistently as a bool or a string.
        return value is True or value == "True" or value == "true"

    # Default to an empty log so placeholder conversations (which have no
    # 'full_conversation_log' key) don't raise KeyError.
    log = conv.get('full_conversation_log', [])
    lines = []
    lines.append(f">>> {file_label} <<<")
    lines.append("")
    if 'conversation' in conv:
        for i, msg in enumerate(conv['conversation'], 1):
            role = msg.get('role', 'unknown').upper()
            content = msg.get('content', '')
            lines.append(f"[{i}] {role}:")
            # Split content into lines and indent
            for content_line in content.split('\n'):
                lines.append(f" {content_line}")
            # Log entry i-2 corresponds to message i; the explicit lower
            # bound prevents i-2 == -1 from wrapping around to the LAST log
            # entry for the first message (previous off-by-one bug).
            if 0 <= i - 2 < len(log):
                log_msg = log[i - 2]
                if _is_enforced(log_msg.get('enforce_preferences')):
                    lines.append(f"<<<<< Enforced preference >>>>>")
            lines.append("")  # Empty line after each message
    # Format evaluation
    if 'evaluation' in conv:
        lines.append("[EVALUATION]")
        eval_data = conv['evaluation']
        if 'final_answer' in eval_data:
            lines.append(f"• Final Answer: {eval_data['final_answer']}")
        if 'accuracy' in eval_data:
            acc = eval_data['accuracy']['accuracy']
            acc_symbol = "✓" if acc == 1 else "✗"
            lines.append(f"• Accuracy: {acc} {acc_symbol}")
            num_enforced_preferences = sum(
                1 for message in log
                if _is_enforced(message.get('enforce_preferences'))
            )
            lines.append(f"• Number of enforced preferences: {num_enforced_preferences}")
        if 'conversation_length' in eval_data:
            lines.append(f"• Length: {eval_data['conversation_length']} msgs")
    return lines
def wrap_lines(lines, width):
    """Return the given lines with any line longer than `width` wrapped.

    Lines that already fit (including empty lines) pass through unchanged.
    """
    wrapped = []
    for text in lines:
        if len(text) > width:
            # Break long unbroken words so no output line exceeds the column;
            # hyphens are not treated as break points.
            wrapped += textwrap.wrap(text, width=width,
                                     break_long_words=True,
                                     break_on_hyphens=False)
        else:
            wrapped.append(text)
    return wrapped
def calculate_aggregate_stats(users_data):
    """Calculate aggregate statistics across all users.

    Returns a (avg_accuracy, avg_length, avg_enforced) tuple; each average
    is 0 when no data points were found for that metric.
    """
    accuracies, lengths, enforced_counts = [], [], []
    for user in users_data:
        for conv in user.get('generated_conversations', []):
            evaluation = conv.get('evaluation', {})
            if 'accuracy' in evaluation:
                accuracies.append(evaluation['accuracy']['accuracy'])
            if 'conversation_length' in evaluation:
                lengths.append(evaluation['conversation_length'])
            if 'full_conversation_log' in conv:
                # The enforce flag may be stored as a bool or the string "True".
                enforced = [
                    msg for msg in conv['full_conversation_log']
                    if 'enforce_preferences' in msg
                    and (msg['enforce_preferences'] == True
                         or msg['enforce_preferences'] == "True")
                ]
                enforced_counts.append(len(enforced))

    def _mean(values):
        return sum(values) / len(values) if values else 0

    return _mean(accuracies), _mean(lengths), _mean(enforced_counts)
def print_side_by_side(conv1, conv2, label1, label2, col_width=60):
    """Print two conversations as parallel columns separated by ' | '."""
    # Format each conversation and wrap it to the column width.
    left = wrap_lines(format_conversation(conv1, label1), col_width)
    right = wrap_lines(format_conversation(conv2, label2), col_width)
    # Header row plus a horizontal rule under each column.
    print(f"\n{label1:<{col_width}} | {label2}")
    print(f"{'-'*col_width} | {'-'*col_width}")
    # Pad the left column so the separator stays vertically aligned; the
    # shorter column is padded with empty lines.
    for left_line, right_line in zip_longest(left, right, fillvalue=''):
        print(f"{left_line.ljust(col_width)} | {right_line}")
def print_side_by_side_3(conv1, conv2, conv3, label1, label2, label3, col_width=42):
    """Print three conversations as parallel columns separated by ' | '."""
    # Format and wrap each conversation to its column width.
    columns = [
        wrap_lines(format_conversation(conv, label), col_width)
        for conv, label in ((conv1, label1), (conv2, label2), (conv3, label3))
    ]
    # Header row plus a horizontal rule under every column.
    print(f"\n{label1:<{col_width}} | {label2:<{col_width}} | {label3}")
    print(f"{'-'*col_width} | {'-'*col_width} | {'-'*col_width}")
    # The first two columns are padded so the separators stay aligned.
    for col_a, col_b, col_c in zip_longest(*columns, fillvalue=''):
        print(f"{col_a.ljust(col_width)} | {col_b.ljust(col_width)} | {col_c}")
def format_detailed_full_log(conv, file_label):
    """Format detailed conversation including all fields from full_conversation_log.

    Args:
        conv: Conversation dict; reads 'full_conversation_log' (list of
            per-turn dicts) and 'evaluation' if present.
        file_label: Header label identifying the source file/condition.

    Returns:
        list[str]: Display lines: per-turn response text, an enforcement
        tag when applicable, selected keys as bullets, remaining keys under
        "Other fields", and finally the pretty-printed evaluation.
    """
    lines = []
    lines.append(f">>> {file_label} — FULL LOG <<<")
    lines.append("")
    if 'full_conversation_log' in conv and conv['full_conversation_log']:
        for j, msg in enumerate(conv['full_conversation_log'], 1):
            # Alternate roles starting with USER
            # NOTE(review): assumes log entries strictly alternate user/assistant
            # turns — confirm against the producer of full_conversation_log.
            role_label = 'USER' if j % 2 == 1 else 'ASSISTANT'
            lines.append(f"[{j}] {role_label}:")
            def is_enforced(value):
                # The flag is stored inconsistently as a bool or string.
                return value is True or value == "True" or value == "true"
            # 1) Response first (as plain text)
            response_text = msg.get('response')
            if response_text is not None:
                for line in str(response_text).split('\n'):
                    lines.append(f"{line}")
            # 1a) Enforcement tag if applicable
            if 'enforce_preferences' in msg and is_enforced(msg['enforce_preferences']):
                lines.append("<<<<< Preferences Enforced >>>>>")
            # 2) Ordered keys as bulleted items
            # These well-known keys are shown first, in this fixed order.
            ordered_keys = [
                'preference_1_satisfied',
                'preference_2_satisfied',
                'preference_3_satisfied',
                'enforce_preferences',
                'draft_answer',
                'reasoning',
                'should_terminate',
            ]
            def append_bullet(key, value):
                # Nested structures are pretty-printed as indented JSON;
                # scalars go inline on the bullet with continuation lines
                # indented slightly further.
                if isinstance(value, (dict, list)):
                    try:
                        pretty_value = json.dumps(value, indent=2, sort_keys=True, ensure_ascii=False)
                    except Exception:
                        # Fall back to repr-style text for non-serializable values.
                        pretty_value = str(value)
                    lines.append(f" - {key}:")
                    for ln in pretty_value.split('\n'):
                        lines.append(f" {ln}")
                else:
                    value_str = str(value) if value is not None else ""
                    value_lines = value_str.split('\n') if value_str else [""]
                    # First line on the bullet
                    lines.append(f" - {key}: {value_lines[0]}")
                    # Continuation lines indented slightly further
                    for cont in value_lines[1:]:
                        lines.append(f" {cont}")
            for key in ordered_keys:
                if key in msg:
                    append_bullet(key, msg.get(key))
            # 3) Remaining keys grouped under Other fields
            # 'response' was already rendered above, so it is excluded here.
            shown_keys = set(['response'] + ordered_keys)
            remaining_keys = [k for k in msg.keys() if k not in shown_keys]
            if remaining_keys:
                lines.append(" - Other fields:")
                for k in sorted(remaining_keys):
                    v = msg[k]
                    if isinstance(v, (dict, list)):
                        try:
                            pretty_v = json.dumps(v, indent=2, sort_keys=True, ensure_ascii=False)
                        except Exception:
                            pretty_v = str(v)
                        lines.append(f" {k}:")
                        for ln in pretty_v.split('\n'):
                            lines.append(f" {ln}")
                    else:
                        v_str = str(v)
                        v_lines = v_str.split('\n') if v_str else [""]
                        lines.append(f" {k}: {v_lines[0]}")
                        for cont in v_lines[1:]:
                            lines.append(f" {cont}")
            lines.append("")
    else:
        lines.append("[No full_conversation_log available]")
    # Include evaluation details if present
    if 'evaluation' in conv:
        lines.append("[EVALUATION — FULL]")
        try:
            eval_pretty = json.dumps(conv['evaluation'], indent=2, sort_keys=True, ensure_ascii=False)
        except Exception:
            eval_pretty = str(conv['evaluation'])
        for line in eval_pretty.split('\n'):
            lines.append(f" {line}")
    return lines
def print_detailed_logs_3(conv1, conv2, conv3, label1, label2, label3, col_width=42):
    """Print detailed full logs for three conversations side by side."""
    # Build the three wrapped columns of detailed log output.
    wrapped = [
        wrap_lines(format_detailed_full_log(conv, label), col_width)
        for conv, label in ((conv1, label1), (conv2, label2), (conv3, label3))
    ]
    # Header row plus a rule under every column.
    print(f"\n{label1:<{col_width}} | {label2:<{col_width}} | {label3}")
    print(f"{'-'*col_width} | {'-'*col_width} | {'-'*col_width}")
    for col_a, col_b, col_c in zip_longest(*wrapped, fillvalue=''):
        print(f"{col_a.ljust(col_width)} | {col_b.ljust(col_width)} | {col_c}")
def print_user_info(user_data):
    """Print the user's profile: id, persona, and preference list.

    Every field is optional; only keys present in `user_data` are printed.
    """
    print(f"\n[USER PROFILE]")
    # Simple scalar fields, printed as "Label: value" when present.
    for key, header in (('i', 'User ID'), ('persona', 'Persona')):
        if key in user_data:
            print(f"{header}: {user_data[key]}")
    # Preferences are printed one per line as a dashed list.
    if 'preferences' in user_data:
        print(f"Preferences:")
        for preference in user_data['preferences']:
            print(f" - {preference}")
# Driver: for each task, load the four experiment conditions, then print
# per-user stats and side-by-side conversation comparisons.
for task in ["bigcodebench"]: # ["math_500", "logiqa", "math_hard", "medqa", "mmlu"]:
    # NOTE(review): the loop variable `task` is never interpolated into these
    # f-strings — all four paths hard-code "logiqa" even though the loop says
    # "bigcodebench". Confirm which dataset is actually intended.
    user_profiles_without_preferences_path = f"/shared/storage-01/users/mehri2/mem/collaborativeagents/scripts/runs/llama70b_temp_1/user_profiles_without_preferences/logiqa_llama70b_user_llama70b_agent_user_profiles_without_preferences_eval_size_20.jsonl"
    user_profiles_with_preferences_path = f"/shared/storage-01/users/mehri2/mem/collaborativeagents/scripts/runs/llama70b_temp_1/user_profiles_with_preferences/logiqa_llama70b_user_llama70b_agent_user_profiles_with_preferences_eval_size_20.jsonl"
    agent_with_userpreferences_path = f"/shared/storage-01/users/mehri2/mem/collaborativeagents/scripts/runs/llama70b_temp_1/agent_with_user_preferences/logiqa_llama70b_user_llama70b_agent_agent_with_user_preferences_eval_size_20_v2.jsonl"
    # NOTE(review): "agnet" is a typo for "agent" (local name only).
    agnet_with_reflection_path = f"/shared/storage-01/users/mehri2/mem/collaborativeagents/scripts/runs/llama70b_temp_1/agent_with_reflection_v3/logiqa_llama70b_user_llama70b_agent_agent_with_reflection_eval_size_20.jsonl"
    file1_path = user_profiles_without_preferences_path
    file2_path = user_profiles_with_preferences_path
    file3_path = agent_with_userpreferences_path
    file4_path = agnet_with_reflection_path
    # Load users from all three files
    # NOTE(review): comment says "three" but four files are loaded.
    data1 = load_data(file1_path)
    data2 = load_data(file2_path)
    data3 = load_data(file3_path)
    data4 = load_data(file4_path)
    # NOTE(review): despite its name, id_to_user_data1 is built from ALL four
    # datasets merged, so for any shared user id the entry from data4 wins
    # (later files overwrite earlier ones). Confirm this is intentional.
    id_to_user_data1 = {elem['i']: elem for elem in data1+data2+data3+data4}
    id_to_user_data2 = {elem['i']: elem for elem in data2}
    id_to_user_data3 = {elem['i']: elem for elem in data3}
    id_to_user_data4 = {elem['i']: elem for elem in data4}
    # NOTE(review): `id` shadows the builtin of the same name.
    for id in id_to_user_data1:
        # NOTE(review): leftover debug filter — only user 23 is processed.
        # Remove this line to log all users. Also note the dict lookups below
        # will KeyError for an id missing from any of datasets 2-4.
        if id != 23: continue
        # Per-user averages for each of the four conditions.
        user_avg_acc1, user_avg_len1, user_avg_enf1 = calculate_aggregate_stats([id_to_user_data1[id]])
        user_avg_acc2, user_avg_len2, user_avg_enf2 = calculate_aggregate_stats([id_to_user_data2[id]])
        user_avg_acc3, user_avg_len3, user_avg_enf3 = calculate_aggregate_stats([id_to_user_data3[id]])
        user_avg_acc4, user_avg_len4, user_avg_enf4 = calculate_aggregate_stats([id_to_user_data4[id]])
        # print user info
        print("\n" + "="*125 + "\n")
        print(f"### Task: {task}\n")
        print("LOGGING FOR USER ID: ", id)
        print_user_info(id_to_user_data1[id])
        # Print the average performance for id_to_user_data1[id]
        # Print the average performance for id_to_user_data2[id]
        print("\n" + "-"*125)
        print("COMPARISON FOR THIS USER")
        print("-"*125)
        print("\nUser Without Preferences:")
        print(f"  Average Accuracy: {user_avg_acc1:.2f}")
        print(f"  Average # Messages: {user_avg_len1:.2f}")
        print(f"  Average # Enforced Preferences: {user_avg_enf1:.2f}")
        print("\nUser With Preferences:")
        print(f"  Average Accuracy: {user_avg_acc2:.2f}")
        print(f"  Average # Messages: {user_avg_len2:.2f}")
        print(f"  Average # Enforced Preferences: {user_avg_enf2:.2f}")
        print("\nAgent With User Preferences:")
        print(f"  Average Accuracy: {user_avg_acc3:.2f}")
        print(f"  Average # Messages: {user_avg_len3:.2f}")
        print(f"  Average # Enforced Preferences: {user_avg_enf3:.2f}")
        print("\nAgent With Reflection:")
        print(f"  Average Accuracy: {user_avg_acc4:.2f}")
        print(f"  Average # Messages: {user_avg_len4:.2f}")
        print(f"  Average # Enforced Preferences: {user_avg_enf4:.2f}")
        # print conversations
        # Key conversations by problem text so conditions can be matched up.
        problem_to_conversation1 = {conv['sample']['problem']: conv for conv in id_to_user_data1[id]['generated_conversations']}
        problem_to_conversation2 = {conv['sample']['problem']: conv for conv in id_to_user_data2[id]['generated_conversations']}
        problem_to_conversation3 = {conv['sample']['problem']: conv for conv in id_to_user_data3[id]['generated_conversations']}
        for problem in problem_to_conversation1:
            print("\n" + "="*125)
            print(f"\n[PROBLEM]")
            print(problem)
            print(f"\n[SOLUTION]")
            print(problem_to_conversation1[problem]['sample']['solution'])
            print("\n" + "="*125)
            # File 3 may lack this problem; an empty placeholder keeps the
            # three-column layout intact.
            print_side_by_side_3(
                problem_to_conversation1[problem],
                problem_to_conversation2[problem],
                problem_to_conversation3.get(problem, {'conversation': [], 'evaluation': {}}),
                "FILE 1 (WITHOUT PREFERENCES)",
                "FILE 2 (WITH PREFERENCES)",
                "FILE 3 (AGENT WITH USER PREFS)",
                col_width=55
            )
            # Detailed logs below with all fields
            print("\n" + "-"*125)
            print("DETAILED FULL LOGS")
            print("-"*125)
            print_detailed_logs_3(
                problem_to_conversation1[problem],
                problem_to_conversation2[problem],
                problem_to_conversation3.get(problem, {'conversation': [], 'evaluation': {}}),
                "FILE 1 (WITHOUT PREFERENCES)",
                "FILE 2 (WITH PREFERENCES)",
                "FILE 3 (AGENT WITH USER PREFS)",
                col_width=55
            )
            # break
# # ==============================================================================
# # SEPARATE SECTION: Per-User Statistics Averaged Over All Tasks
# # ==============================================================================
# print("\n" + "="*125)
# print("="*125)
# print("STATISTICS FOR EACH USER, AVERAGED OVER ALL TASKS")
# print("="*125)
# print("="*125 + "\n")
# # Dictionary to store all data for each user across all tasks
# user_to_all_data = {}
# # Collect data for all users across all tasks
# for task in ["math_500", "logiqa", "math_hard", "medqa", "mmlu"]:
# user_profiles_without_preferences_path = f"/shared/storage-01/users/mehri2/mem/collaborativeagents/scripts/runs/llama70b/user_profiles_without_preferences/{task}_llama70b_user_llama70b_agent_user_profiles_without_preferences_eval_size_20.jsonl"
# user_profiles_with_preferences_path = f"/shared/storage-01/users/mehri2/mem/collaborativeagents/scripts/runs/llama70b/user_profiles_with_preferences/{task}_llama70b_user_llama70b_agent_user_profiles_with_preferences_eval_size_20.jsonl"
# agent_with_userpreferences_path = f"/shared/storage-01/users/mehri2/mem/collaborativeagents/scripts/runs/llama70b/agent_with_user_preferences/{task}_llama70b_user_llama70b_agent_agent_with_user_preferences_eval_size_20_v2.jsonl"
# agent_with_reflection_path = f"/shared/storage-01/users/mehri2/mem/collaborativeagents/scripts/runs/llama70b/agent_with_reflection/{task}_llama70b_user_llama70b_agent_agent_with_reflection_eval_size_20.jsonl"
# data1 = load_data(user_profiles_without_preferences_path)
# data2 = load_data(user_profiles_with_preferences_path)
# data3 = load_data(agent_with_userpreferences_path)
# data4 = load_data(agent_with_reflection_path)
# # For each user in this task, store their data
# for user_data in data1:
# user_id = user_data['i']
# if user_id not in user_to_all_data:
# user_to_all_data[user_id] = {
# 'persona': user_data.get('persona'),
# 'preferences': user_data.get('preferences'),
# 'data1': [], # without preferences
# 'data2': [], # with preferences
# 'data3': [], # agent with user preferences
# 'data4': [] # agent with reflection
# }
# user_to_all_data[user_id]['data1'].append(user_data)
# for user_data in data2:
# user_id = user_data['i']
# if user_id in user_to_all_data:
# user_to_all_data[user_id]['data2'].append(user_data)
# for user_data in data3:
# user_id = user_data['i']
# if user_id in user_to_all_data:
# user_to_all_data[user_id]['data3'].append(user_data)
# for user_data in data4:
# user_id = user_data['i']
# if user_id in user_to_all_data:
# user_to_all_data[user_id]['data4'].append(user_data)
# # Now print statistics for each user, averaged over all tasks
# for user_id in sorted(user_to_all_data.keys()):
# user_info = user_to_all_data[user_id]
# # Calculate aggregate stats across all tasks for this user
# user_avg_acc1, user_avg_len1, user_avg_enf1 = calculate_aggregate_stats(user_info['data1'])
# user_avg_acc2, user_avg_len2, user_avg_enf2 = calculate_aggregate_stats(user_info['data2'])
# user_avg_acc3, user_avg_len3, user_avg_enf3 = calculate_aggregate_stats(user_info['data3'])
# user_avg_acc4, user_avg_len4, user_avg_enf4 = calculate_aggregate_stats(user_info['data4'])
# print("\n" + "="*125)
# print(f"USER ID: {user_id}")
# print("="*125)
# # Print user profile info
# if user_info['persona']:
# print(f"Persona: {user_info['persona']}")
# if user_info['preferences']:
# print(f"Preferences:")
# for preference in user_info['preferences']:
# print(f" - {preference}")
# print("\n" + "-"*125)
# print("STATISTICS AVERAGED OVER ALL TASKS")
# print("-"*125)
# print("\nUser Without Preferences:")
# print(f" Average Accuracy: {user_avg_acc1:.2f}")
# print(f" Average # Messages: {user_avg_len1:.2f}")
# print(f" Average # Enforced Preferences: {user_avg_enf1:.2f}")
# print("\nUser With Preferences:")
# print(f" Average Accuracy: {user_avg_acc2:.2f}")
# print(f" Average # Messages: {user_avg_len2:.2f}")
# print(f" Average # Enforced Preferences: {user_avg_enf2:.2f}")
# print("\nAgent With User Preferences:")
# print(f" Average Accuracy: {user_avg_acc3:.2f}")
# print(f" Average # Messages: {user_avg_len3:.2f}")
# print(f" Average # Enforced Preferences: {user_avg_enf3:.2f}")
# print("\nAgent With Reflection:")
# print(f" Average Accuracy: {user_avg_acc4:.2f}")
# print(f" Average # Messages: {user_avg_len4:.2f}")
# print(f" Average # Enforced Preferences: {user_avg_enf4:.2f}")
# print("\n" + "="*125)
# print("END OF PER-USER STATISTICS")
# print("="*125 + "\n")
|