"""Side-by-side comparison of conversation runs stored as JSONL.

Loads four run variants (user profiles with/without preferences, agent with
user preferences, agent with reflection), prints per-user aggregate stats,
and renders conversations and detailed logs in parallel columns.
"""

import json
import textwrap
from itertools import zip_longest


def _is_enforced(value):
    """Return True when an ``enforce_preferences`` field is flagged.

    The data stores the flag inconsistently (bool ``True`` or the strings
    ``"True"`` / ``"true"``); this helper accepts all three uniformly.
    """
    return value is True or value in ("True", "true")


def load_data(filepath):
    """Load user records from a JSONL file, skipping blank lines.

    Args:
        filepath: path to a JSON-Lines file, one record per line.

    Returns:
        list of parsed records (dicts).
    """
    users = []
    with open(filepath, "r", encoding="utf-8") as f:
        for line in f:
            if line.strip():
                users.append(json.loads(line))
    return users


def format_conversation(conv, file_label):
    """Render a conversation plus its evaluation summary as display lines.

    Args:
        conv: record with optional ``conversation``, ``full_conversation_log``
            and ``evaluation`` keys.
        file_label: heading shown at the top of the column.

    Returns:
        list[str] of lines ready for wrapping / column printing.
    """
    lines = [f">>> {file_label} <<<", ""]
    log = conv.get('full_conversation_log', [])
    for i, msg in enumerate(conv.get('conversation', []), 1):
        role = msg.get('role', 'unknown').upper()
        lines.append(f"[{i}] {role}:")
        # Indent message content under its header.
        for content_line in msg.get('content', '').split('\n'):
            lines.append(f"    {content_line}")
        # The log entry for message i lives at offset i-2 (assumes the log
        # is shifted by two relative to the conversation — TODO confirm).
        # The lower-bound check prevents i == 1 from silently reading
        # log[-1] via Python negative indexing (bug in the original).
        if 0 <= i - 2 < len(log) and _is_enforced(log[i - 2].get('enforce_preferences')):
            lines.append("<<<<< Enforced preference >>>>>")
        lines.append("")  # blank separator after each message

    if 'evaluation' in conv:
        lines.append("[EVALUATION]")
        eval_data = conv['evaluation']
        if 'final_answer' in eval_data:
            lines.append(f"• Final Answer: {eval_data['final_answer']}")
        if 'accuracy' in eval_data:
            acc = eval_data['accuracy']['accuracy']
            acc_symbol = "✓" if acc == 1 else "✗"
            lines.append(f"• Accuracy: {acc} {acc_symbol}")
            num_enforced_preferences = sum(
                1 for message in conv.get('full_conversation_log', [])
                if _is_enforced(message.get('enforce_preferences'))
            )
            lines.append(f"• Number of enforced preferences: {num_enforced_preferences}")
        if 'conversation_length' in eval_data:
            lines.append(f"• Length: {eval_data['conversation_length']} msgs")
    return lines


def wrap_lines(lines, width):
    """Wrap each line to ``width`` columns; short lines pass through as-is."""
    wrapped = []
    for line in lines:
        if len(line) <= width:
            wrapped.append(line)
        else:
            wrapped.extend(textwrap.wrap(
                line, width=width,
                break_long_words=True, break_on_hyphens=False,
            ))
    return wrapped


def calculate_aggregate_stats(users_data):
    """Average accuracy, conversation length and enforced-preference count.

    Args:
        users_data: iterable of user records, each optionally carrying a
            ``generated_conversations`` list.

    Returns:
        tuple ``(avg_accuracy, avg_length, avg_enforced)``; each component
        is 0 when no data points exist for it.
    """
    accuracies, lengths, enforced_counts = [], [], []
    for user_data in users_data:
        for conv in user_data.get('generated_conversations', []):
            evaluation = conv.get('evaluation', {})
            if 'accuracy' in evaluation:
                accuracies.append(evaluation['accuracy']['accuracy'])
            if 'conversation_length' in evaluation:
                lengths.append(evaluation['conversation_length'])
            if 'full_conversation_log' in conv:
                enforced_counts.append(sum(
                    1 for msg in conv['full_conversation_log']
                    if _is_enforced(msg.get('enforce_preferences'))
                ))

    def _avg(xs):
        return sum(xs) / len(xs) if xs else 0

    return _avg(accuracies), _avg(lengths), _avg(enforced_counts)


def _print_header(labels, col_width):
    """Print the column labels followed by a dashed separator row."""
    padded = [f"{label:<{col_width}}" for label in labels[:-1]]
    print("\n" + " | ".join(padded + [labels[-1]]))
    print(" | ".join(["-" * col_width] * len(labels)))


def _print_columns(line_lists, col_width):
    """Print pre-wrapped line lists side by side, ' | '-separated.

    The last column is left unpadded so trailing whitespace is not emitted.
    """
    for row in zip_longest(*line_lists, fillvalue=''):
        cells = [cell.ljust(col_width) for cell in row[:-1]]
        print(" | ".join(cells + [row[-1]]))


def print_side_by_side(conv1, conv2, label1, label2, col_width=60):
    """Print two conversations side by side."""
    columns = [
        wrap_lines(format_conversation(conv, label), col_width)
        for conv, label in ((conv1, label1), (conv2, label2))
    ]
    _print_header([label1, label2], col_width)
    _print_columns(columns, col_width)


def print_side_by_side_3(conv1, conv2, conv3, label1, label2, label3, col_width=42):
    """Print three conversations side by side."""
    columns = [
        wrap_lines(format_conversation(conv, label), col_width)
        for conv, label in ((conv1, label1), (conv2, label2), (conv3, label3))
    ]
    _print_header([label1, label2, label3], col_width)
    _print_columns(columns, col_width)


def format_detailed_full_log(conv, file_label):
    """Render every field of each ``full_conversation_log`` entry.

    Layout per entry: free-text response first, an enforcement tag when
    applicable, then well-known keys in a fixed order as bullets, then any
    remaining keys grouped under "Other fields". Ends with the full
    evaluation dict, pretty-printed, when present.
    """
    lines = [f">>> {file_label} — FULL LOG <<<", ""]
    ordered_keys = [
        'preference_1_satisfied',
        'preference_2_satisfied',
        'preference_3_satisfied',
        'enforce_preferences',
        'draft_answer',
        'reasoning',
        'should_terminate',
    ]

    def _pretty(value):
        # json.dumps for readable nesting; str() fallback for anything
        # that is not JSON-serializable.
        try:
            return json.dumps(value, indent=2, sort_keys=True, ensure_ascii=False)
        except Exception:
            return str(value)

    def _append_bullet(key, value):
        # Containers get a multi-line pretty block; scalars go inline with
        # continuation lines indented slightly further.
        if isinstance(value, (dict, list)):
            lines.append(f" - {key}:")
            for ln in _pretty(value).split('\n'):
                lines.append(f"   {ln}")
        else:
            value_str = str(value) if value is not None else ""
            value_lines = value_str.split('\n') if value_str else [""]
            lines.append(f" - {key}: {value_lines[0]}")
            for cont in value_lines[1:]:
                lines.append(f"   {cont}")

    log = conv.get('full_conversation_log')
    if log:
        for j, msg in enumerate(log, 1):
            # Roles alternate; assumes the log starts with USER — TODO confirm.
            role_label = 'USER' if j % 2 == 1 else 'ASSISTANT'
            lines.append(f"[{j}] {role_label}:")

            # 1) Response first, as plain text.
            response_text = msg.get('response')
            if response_text is not None:
                lines.extend(str(response_text).split('\n'))

            # 1a) Enforcement tag if applicable.
            if _is_enforced(msg.get('enforce_preferences')):
                lines.append("<<<<< Preferences Enforced >>>>>")

            # 2) Well-known keys in the fixed order above.
            for key in ordered_keys:
                if key in msg:
                    _append_bullet(key, msg[key])

            # 3) Remaining keys grouped under "Other fields".
            remaining_keys = sorted(
                k for k in msg if k != 'response' and k not in ordered_keys
            )
            if remaining_keys:
                lines.append(" - Other fields:")
                for k in remaining_keys:
                    v = msg[k]
                    if isinstance(v, (dict, list)):
                        lines.append(f"   {k}:")
                        for ln in _pretty(v).split('\n'):
                            lines.append(f"     {ln}")
                    else:
                        v_str = str(v)
                        v_lines = v_str.split('\n') if v_str else [""]
                        lines.append(f"   {k}: {v_lines[0]}")
                        for cont in v_lines[1:]:
                            lines.append(f"     {cont}")
            lines.append("")
    else:
        lines.append("[No full_conversation_log available]")

    if 'evaluation' in conv:
        lines.append("[EVALUATION — FULL]")
        for line in _pretty(conv['evaluation']).split('\n'):
            lines.append(f"  {line}")
    return lines


def print_detailed_logs_3(conv1, conv2, conv3, label1, label2, label3, col_width=42):
    """Print detailed full logs for three conversations side by side."""
    columns = [
        wrap_lines(format_detailed_full_log(conv, label), col_width)
        for conv, label in ((conv1, label1), (conv2, label2), (conv3, label3))
    ]
    _print_header([label1, label2, label3], col_width)
    _print_columns(columns, col_width)


def print_user_info(user_data):
    """Print a user's id, persona and preference list."""
    print("\n[USER PROFILE]")
    if 'i' in user_data:
        print(f"User ID: {user_data['i']}")
    if 'persona' in user_data:
        print(f"Persona: {user_data['persona']}")
    if 'preferences' in user_data:
        print("Preferences:")
        for preference in user_data['preferences']:
            print(f"  - {preference}")


def _print_stats(title, stats):
    """Print one (accuracy, length, enforced) triple under a heading."""
    avg_acc, avg_len, avg_enf = stats
    print(f"\n{title}:")
    print(f"  Average Accuracy: {avg_acc:.2f}")
    print(f"  Average # Messages: {avg_len:.2f}")
    print(f"  Average # Enforced Preferences: {avg_enf:.2f}")


def main():
    """Compare the four run variants side by side for the selected user(s)."""
    for task in ["bigcodebench"]:  # ["math_500", "logiqa", "math_hard", "medqa", "mmlu"]
        # NOTE(review): `task` is never interpolated below — every path is
        # hard-coded to the "logiqa" run. Confirm whether {task} was intended.
        base = "/shared/storage-01/users/mehri2/mem/collaborativeagents/scripts/runs/llama70b_temp_1"
        user_profiles_without_preferences_path = f"{base}/user_profiles_without_preferences/logiqa_llama70b_user_llama70b_agent_user_profiles_without_preferences_eval_size_20.jsonl"
        user_profiles_with_preferences_path = f"{base}/user_profiles_with_preferences/logiqa_llama70b_user_llama70b_agent_user_profiles_with_preferences_eval_size_20.jsonl"
        agent_with_userpreferences_path = f"{base}/agent_with_user_preferences/logiqa_llama70b_user_llama70b_agent_agent_with_user_preferences_eval_size_20_v2.jsonl"
        agent_with_reflection_path = f"{base}/agent_with_reflection_v3/logiqa_llama70b_user_llama70b_agent_agent_with_reflection_eval_size_20.jsonl"

        # Load users from all four files.
        data1 = load_data(user_profiles_without_preferences_path)
        data2 = load_data(user_profiles_with_preferences_path)
        data3 = load_data(agent_with_userpreferences_path)
        data4 = load_data(agent_with_reflection_path)

        # NOTE(review): later datasets overwrite earlier ones for shared ids,
        # so id_to_user_data1[uid] holds the LAST file's record even though it
        # is labelled "without preferences" below — verify this is intended.
        id_to_user_data1 = {elem['i']: elem for elem in data1 + data2 + data3 + data4}
        id_to_user_data2 = {elem['i']: elem for elem in data2}
        id_to_user_data3 = {elem['i']: elem for elem in data3}
        id_to_user_data4 = {elem['i']: elem for elem in data4}

        for user_id in id_to_user_data1:
            if user_id != 23:  # debug filter: only user 23 is reported
                continue

            stats1 = calculate_aggregate_stats([id_to_user_data1[user_id]])
            stats2 = calculate_aggregate_stats([id_to_user_data2[user_id]])
            stats3 = calculate_aggregate_stats([id_to_user_data3[user_id]])
            stats4 = calculate_aggregate_stats([id_to_user_data4[user_id]])

            # Per-user header and profile.
            print("\n" + "=" * 125 + "\n")
            print(f"### Task: {task}\n")
            print("LOGGING FOR USER ID: ", user_id)
            print_user_info(id_to_user_data1[user_id])

            # Aggregate comparison for this user across the four variants.
            print("\n" + "-" * 125)
            print("COMPARISON FOR THIS USER")
            print("-" * 125)
            _print_stats("User Without Preferences", stats1)
            _print_stats("User With Preferences", stats2)
            _print_stats("Agent With User Preferences", stats3)
            _print_stats("Agent With Reflection", stats4)

            # Index conversations by problem text for alignment across files.
            problem_to_conversation1 = {
                conv['sample']['problem']: conv
                for conv in id_to_user_data1[user_id]['generated_conversations']
            }
            problem_to_conversation2 = {
                conv['sample']['problem']: conv
                for conv in id_to_user_data2[user_id]['generated_conversations']
            }
            problem_to_conversation3 = {
                conv['sample']['problem']: conv
                for conv in id_to_user_data3[user_id]['generated_conversations']
            }

            # Placeholder for problems missing from file 3.
            empty_conv = {'conversation': [], 'evaluation': {}}
            for problem in problem_to_conversation1:
                print("\n" + "=" * 125)
                print("\n[PROBLEM]")
                print(problem)
                print("\n[SOLUTION]")
                print(problem_to_conversation1[problem]['sample']['solution'])
                print("\n" + "=" * 125)
                print_side_by_side_3(
                    problem_to_conversation1[problem],
                    problem_to_conversation2[problem],
                    problem_to_conversation3.get(problem, empty_conv),
                    "FILE 1 (WITHOUT PREFERENCES)",
                    "FILE 2 (WITH PREFERENCES)",
                    "FILE 3 (AGENT WITH USER PREFS)",
                    col_width=55,
                )
                # Detailed logs below with all fields.
                print("\n" + "-" * 125)
                print("DETAILED FULL LOGS")
                print("-" * 125)
                print_detailed_logs_3(
                    problem_to_conversation1[problem],
                    problem_to_conversation2[problem],
                    problem_to_conversation3.get(problem, empty_conv),
                    "FILE 1 (WITHOUT PREFERENCES)",
                    "FILE 2 (WITH PREFERENCES)",
                    "FILE 3 (AGENT WITH USER PREFS)",
                    col_width=55,
                )


# NOTE(review): a large commented-out section ("per-user statistics averaged
# over all tasks") was removed during cleanup; recover it from version control
# if it is still needed.

if __name__ == "__main__":
    main()