diff options
Diffstat (limited to 'data/longlamp.py')
| -rw-r--r-- | data/longlamp.py | 101 |
1 files changed, 101 insertions, 0 deletions
"""LongLaMP dataset loader for Review Writing and Topic Writing tasks."""

import random

try:
    from datasets import load_dataset
except ImportError:
    # Keep the pure helpers below importable when `datasets` is not installed;
    # load_longlamp() reports the missing dependency when it is actually called.
    load_dataset = None


def load_longlamp(config_name: str, split: str = "val"):
    """Load a LongLaMP dataset configuration as a list of unified dicts.

    Args:
        config_name: One of product_review_user, product_review_temporal,
            topic_writing_user, topic_writing_temporal. The task ("review" or
            "topic") and the setting ("user" or "temporal") are derived from
            substrings of this name.
        split: train, val, or test.

    Returns:
        List of dicts with the keys "task", "setting", "query_input",
        "target_output", "profile_items", "user_id" and "example_id".

    Raises:
        ImportError: If the `datasets` package is not installed.
    """
    if load_dataset is None:
        raise ImportError(
            "load_longlamp requires the 'datasets' package to be installed"
        )

    ds = load_dataset("LongLaMP/LongLaMP", config_name, split=split)

    task = "review" if "review" in config_name else "topic"
    setting = "user" if "user" in config_name else "temporal"

    return [
        _unify_row(row, idx, task, setting, config_name, split)
        for idx, row in enumerate(ds)
    ]


def _unify_row(
    row: dict, idx: int, task: str, setting: str, config_name: str, split: str
) -> dict:
    """Convert one raw dataset row into the unified example dict."""
    # The two tasks differ only in how the support prompt is built and in
    # which profile field holds the text the user actually wrote.
    if task == "review":
        build_input, output_key = _build_review_support_input, "reviewText"
    else:  # topic
        build_input, output_key = _build_topic_support_input, "content"

    processed_profile = [
        {
            "support_input": build_input(p),
            "support_output": p[output_key],
            "raw": p,
        }
        for p in row["profile"]
    ]

    # Prefer the reviewer id, then the author, then a synthetic per-row id.
    user_id = row.get("reviewerId", row.get("author", f"user_{idx}"))

    return {
        "task": task,
        "setting": setting,
        "query_input": row["input"],
        "target_output": row["output"],
        "profile_items": processed_profile,
        "user_id": user_id,
        "example_id": f"{config_name}_{split}_{idx}",
    }


def _build_review_support_input(profile_item: dict) -> str:
    """Build the input text for a review support example.

    Missing fields fall back to "5.0" for the rating and to empty strings for
    the description and summary.
    """
    overall = profile_item.get("overall", "5.0")
    description = profile_item.get("description", "")
    summary = profile_item.get("summary", "")
    # NOTE: the wording (including "has a given an") is kept verbatim; it is
    # runtime prompt text, presumably matching the dataset's own query format.
    return (
        f'Generate the review text written by a reviewer who has a given an overall '
        f'rating of "{overall}" for a product with description "{description}". '
        f'The summary of the review text is "{summary}".'
    )


def _build_topic_support_input(profile_item: dict) -> str:
    """Build the input text for a topic support example."""
    summary = profile_item.get("summary", "")
    return f"Generate the content for a reddit post {summary}"


def select_k_profile_items(profile_items: list, K: int, seed: int = 0) -> list:
    """Select K profile items from the available profile.

    If K or fewer items are available, all of them are returned in their
    original order. Otherwise K items are drawn without replacement using a
    `random.Random(seed)` instance, so the selection is reproducible.

    Args:
        profile_items: Candidate profile items.
        K: Maximum number of items to return; must be non-negative.
        seed: Seed for the private random generator.

    Returns:
        A new list; the input list is never returned or modified, so callers
        may mutate the result freely.

    Raises:
        ValueError: If K is negative.
    """
    if K < 0:
        raise ValueError(f"K must be non-negative, got {K}")
    if len(profile_items) <= K:
        # Copy so that both branches hand back a list independent of the input.
        return list(profile_items)
    rng = random.Random(seed)
    return rng.sample(profile_items, K)


if __name__ == "__main__":
    # Quick test. Uses the split name documented by load_longlamp ("val");
    # "validation" is not one of the split names this loader documents.
    examples = load_longlamp("product_review_user", split="val")
    print(f"Loaded {len(examples)} review user validation examples")
    ex = examples[0]
    print(f"User: {ex['user_id']}")
    print(f"Query: {ex['query_input'][:200]}...")
    print(f"Target: {ex['target_output'][:200]}...")
    print(f"Profile items: {len(ex['profile_items'])}")
    if ex['profile_items']:
        p = ex['profile_items'][0]
        print(f"  Support input: {p['support_input'][:200]}...")
        print(f"  Support output: {p['support_output'][:200]}...")
