#!/usr/bin/env python3
"""
Generate additional profiles by remixing preferences from existing profiles.
This creates diverse profile combinations without requiring LLM calls.
"""

import json
import random
import hashlib
from pathlib import Path
from typing import List, Dict
import argparse


def load_profiles(path: Path) -> List[Dict]:
    """Load profiles from JSONL file."""
    profiles = []
    with open(path) as f:
        for line in f:
            profiles.append(json.loads(line.strip()))
    return profiles


def extract_all_preferences(profiles: List[Dict]) -> Dict[str, List[Dict]]:
    """Extract all unique preferences grouped by category (prefix)."""
    categories = {}
    seen_ids = set()

    for profile in profiles:
        for pref in profile.get("preferences", []):
            pref_id = pref.get("pref_id", "unknown")
            if pref_id in seen_ids:
                continue
            seen_ids.add(pref_id)

            # Extract category from prefix (e.g., "rf_001" -> "rf")
            prefix = pref_id.split("_")[0] if "_" in pref_id else "other"
            if prefix not in categories:
                categories[prefix] = []
            categories[prefix].append(pref)

    return categories


def extract_personas(profiles: List[Dict]) -> List[str]:
    """Extract unique personas from profiles."""
    personas = []
    seen = set()
    for profile in profiles:
        persona = profile.get("persona", "")
        if persona and persona not in seen:
            personas.append(persona)
            seen.add(persona)
    return personas


def generate_new_profile(
    user_id: str,
    preference_pool: Dict[str, List[Dict]],
    personas: List[str],
    target_prefs: int = 43,
    rng: random.Random = None
) -> Dict:
    """Generate a new profile by sampling from preference pool."""
    if rng is None:
        rng = random.Random()

    selected_prefs = []

    # Sample from each category to maintain diversity
    prefs_per_cat = max(1, target_prefs // len(preference_pool))

    for cat, prefs in preference_pool.items():
        # Sample with some randomness
        n_sample = min(len(prefs), prefs_per_cat + rng.randint(-1, 2))
        n_sample = max(1, n_sample)
        sampled = rng.sample(prefs, min(n_sample, len(prefs)))
        selected_prefs.extend(sampled)

    # Add/remove to hit target
    all_prefs = []
    for prefs in preference_pool.values():
        all_prefs.extend(prefs)

    while len(selected_prefs) < target_prefs:
        remaining = [p for p in all_prefs if p not in selected_prefs]
        if not remaining:
            break
        selected_prefs.append(rng.choice(remaining))

    while len(selected_prefs) > target_prefs:
        selected_prefs.pop(rng.randint(0, len(selected_prefs) - 1))

    # Build conflict groups
    conflict_groups = {}
    for pref in selected_prefs:
        cg = pref.get("conflict_group")
        if cg:
            if cg not in conflict_groups:
                conflict_groups[cg] = []
            conflict_groups[cg].append(pref["pref_id"])

    return {
        "user_id": user_id,
        "persona": rng.choice(personas),
        "preferences": selected_prefs,
        "conflict_groups": conflict_groups,
        "meta": {
            "total_preferences": len(selected_prefs),
            "total_conflict_groups": len(conflict_groups),
            "generator": "extend_profiles.py"
        }
    }


def main():
    parser = argparse.ArgumentParser(
        description="Generate additional profiles by remixing existing ones"
    )
    parser.add_argument("--input", type=str, required=True,
                        help="Path to existing profiles JSONL")
    parser.add_argument("--output", type=str, required=True,
                        help="Path for output profiles JSONL")
    parser.add_argument("--num-new", type=int, default=100,
                        help="Number of new profiles to generate")
    parser.add_argument("--seed", type=int, default=142,
                        help="Random seed (use different from original)")
    parser.add_argument("--target-prefs", type=int, default=43,
                        help="Target number of preferences per profile")
    parser.add_argument("--merge", action="store_true",
                        help="Merge with existing profiles in output")

    args = parser.parse_args()

    input_path = Path(args.input)
    output_path = Path(args.output)

    print(f"Loading profiles from: {input_path}")
    profiles = load_profiles(input_path)
    print(f"  Loaded {len(profiles)} profiles")

    # Extract preference pool and personas
    pref_pool = extract_all_preferences(profiles)
    personas = extract_personas(profiles)

    print(f"\nPreference pool:")
    for cat, prefs in pref_pool.items():
        print(f"  {cat}: {len(prefs)} preferences")
    print(f"  Total unique preferences: {sum(len(p) for p in pref_pool.values())}")
    print(f"  Unique personas: {len(personas)}")

    # Generate new profiles
    rng = random.Random(args.seed)
    new_profiles = []

    print(f"\nGenerating {args.num_new} new profiles...")
    for i in range(args.num_new):
        user_id = f"user_{hashlib.md5(f'{args.seed}_{i}'.encode()).hexdigest()[:8]}"
        profile = generate_new_profile(
            user_id=user_id,
            preference_pool=pref_pool,
            personas=personas,
            target_prefs=args.target_prefs,
            rng=rng
        )
        new_profiles.append(profile)

        if (i + 1) % 20 == 0:
            print(f"  Generated {i + 1}/{args.num_new}")

    # Optionally merge with original
    if args.merge:
        output_profiles = profiles + new_profiles
        print(f"\nMerging: {len(profiles)} original + {len(new_profiles)} new = {len(output_profiles)}")
    else:
        output_profiles = new_profiles

    # Save
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w') as f:
        for profile in output_profiles:
            f.write(json.dumps(profile) + '\n')

    print(f"\nSaved {len(output_profiles)} profiles to: {output_path}")

    # Summary stats
    pref_counts = [p["meta"]["total_preferences"] for p in output_profiles]
    print(f"\nProfile statistics:")
    print(f"  Min preferences: {min(pref_counts)}")
    print(f"  Max preferences: {max(pref_counts)}")
    print(f"  Avg preferences: {sum(pref_counts)/len(pref_counts):.1f}")


if __name__ == "__main__":
    main()