summaryrefslogtreecommitdiff
path: root/data/longlamp.py
diff options
context:
space:
mode:
authorYurenHao0426 <Blackhao0426@gmail.com>2026-04-03 15:12:34 -0500
committerYurenHao0426 <Blackhao0426@gmail.com>2026-04-03 15:12:34 -0500
commit8fe28101366dd32562b8c5534d7fe359b252bdf3 (patch)
treec92a92184fb2f46f265ab84c1f754c3d5d6597bc /data/longlamp.py
Initial commit: UPH project codebase and experiment results
Includes model code, evaluation scripts, configs, analysis outputs, and experiment results for the User Prior Head personalization method. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Diffstat (limited to 'data/longlamp.py')
-rw-r--r--data/longlamp.py101
1 files changed, 101 insertions, 0 deletions
diff --git a/data/longlamp.py b/data/longlamp.py
new file mode 100644
index 0000000..05b3939
--- /dev/null
+++ b/data/longlamp.py
@@ -0,0 +1,101 @@
+"""LongLaMP dataset loader for Review Writing and Topic Writing tasks."""
+
+import random
+from datasets import load_dataset
+
+
def load_longlamp(config_name: str, split: str = "val"):
    """Load one LongLaMP configuration and normalise its rows into dicts.

    Args:
        config_name: One of product_review_user, product_review_temporal,
            topic_writing_user, topic_writing_temporal. A name containing
            "review" is handled as the review task, anything else as the
            topic task; a name containing "user" selects the user setting,
            anything else the temporal setting.
        split: Split name forwarded to ``load_dataset`` (train, val, or test).

    Returns:
        A list with one dict per dataset row, carrying the keys ``task``,
        ``setting``, ``query_input``, ``target_output``, ``profile_items``,
        ``user_id`` and ``example_id``.
    """
    dataset = load_dataset("LongLaMP/LongLaMP", config_name, split=split)

    is_review = "review" in config_name
    task = "review" if is_review else "topic"
    setting = "user" if "user" in config_name else "temporal"

    # The two tasks differ only in how the support prompt is built and in
    # which profile field holds the text the user actually wrote.
    if is_review:
        make_support_input = _build_review_support_input
        output_field = "reviewText"
    else:
        make_support_input = _build_topic_support_input
        output_field = "content"

    examples = []
    for idx, row in enumerate(dataset):
        processed_profile = [
            {
                "support_input": make_support_input(entry),
                "support_output": entry[output_field],
                "raw": entry,
            }
            for entry in row["profile"]
        ]

        # Prefer the reviewer id, then the author, then a positional
        # placeholder when the row carries neither key.
        placeholder_id = f"user_{idx}"
        author_or_placeholder = row.get("author", placeholder_id)
        user_id = row.get("reviewerId", author_or_placeholder)

        record = {
            "task": task,
            "setting": setting,
            "query_input": row["input"],
            "target_output": row["output"],
            "profile_items": processed_profile,
            "user_id": user_id,
            "example_id": f"{config_name}_{split}_{idx}",
        }
        examples.append(record)

    return examples
+
+
+def _build_review_support_input(profile_item: dict) -> str:
+ """Build the input text for a review support example."""
+ overall = profile_item.get("overall", "5.0")
+ description = profile_item.get("description", "")
+ summary = profile_item.get("summary", "")
+ return (
+ f'Generate the review text written by a reviewer who has a given an overall '
+ f'rating of "{overall}" for a product with description "{description}". '
+ f'The summary of the review text is "{summary}".'
+ )
+
+
+def _build_topic_support_input(profile_item: dict) -> str:
+ """Build the input text for a topic support example."""
+ summary = profile_item.get("summary", "")
+ return f"Generate the content for a reddit post {summary}"
+
+
def select_k_profile_items(profile_items: list, K: int, seed: int = 0) -> list:
    """Select up to K profile items from the available profile.

    Args:
        profile_items: The candidate profile entries.
        K: Maximum number of entries to return.
        seed: Seed for the private random generator, so the same inputs
            always yield the same selection.

    Returns:
        A new list. When at most K items are available it holds all of them
        in their original order; otherwise it holds K items drawn without
        replacement, in the order the sampler produced them.

    Raises:
        ValueError: If K is negative and the profile is non-empty
            (raised by ``random.Random.sample``).
    """
    if len(profile_items) <= K:
        # Copy so both branches hand back a fresh list: callers can mutate
        # the result without touching the profile stored on the example.
        return list(profile_items)
    rng = random.Random(seed)
    return rng.sample(profile_items, K)
+
+
if __name__ == "__main__":
    # Quick smoke test: load one configuration and print a sample example.
    # Uses "val", the split name documented by load_longlamp (train, val,
    # or test), instead of "validation".
    examples = load_longlamp("product_review_user", split="val")
    print(f"Loaded {len(examples)} review user validation examples")
    # Guard against an empty split so the smoke test reports the count
    # instead of failing with an IndexError.
    if examples:
        ex = examples[0]
        print(f"User: {ex['user_id']}")
        print(f"Query: {ex['query_input'][:200]}...")
        print(f"Target: {ex['target_output'][:200]}...")
        print(f"Profile items: {len(ex['profile_items'])}")
        if ex['profile_items']:
            p = ex['profile_items'][0]
            print(f" Support input: {p['support_input'][:200]}...")
            print(f" Support output: {p['support_output'][:200]}...")