# src/personalization/models/reranker/base.py
from typing import List, Protocol


class Reranker(Protocol):
    """Structural interface for models that rank documents against a query."""

    def score(self, query: str, docs: List[str], **kwargs) -> List[float]:
        """Return one relevance score per candidate document.

        All documents in ``docs`` are scored against the same ``query``.
        A larger score means the document is more relevant.

        Args:
            query: The query every document is compared with.
            docs: Candidate documents to score.
            **kwargs: Implementation-specific options.

        Returns:
            A list of floats whose length equals ``len(docs)``.
        """
        ...
# src/personalization/models/reranker/qwen3_reranker.py
from typing import List

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from .base import Reranker
from personalization.config.settings import LocalModelsConfig
from personalization.config.registry import choose_dtype, choose_device_map


class Qwen3Reranker(Reranker):
    """Reranker that scores a (query, note) pair with a causal language model.

    Each pair is rendered into a yes/no prompt; the score is the
    log-probability the model assigns to the token ``yes`` as the next token
    after the prompt.
    """

    def __init__(self, model_path: str, device_map: str = "auto", dtype: torch.dtype = torch.bfloat16):
        """Load the tokenizer and model found at ``model_path``.

        Args:
            model_path: Path or identifier passed to ``from_pretrained``.
            device_map: Either a specific device string starting with
                ``"cuda:"`` (the model is loaded without a device map and then
                moved there), or any value forwarded as ``device_map`` to
                ``from_pretrained``.
            dtype: Torch dtype used to load the model weights.
        """
        # Ensure we pass trust_remote_code=True for Qwen models
        self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

        # Handle specific device assignment (e.g., "cuda:0", "cuda:1")
        if device_map and device_map.startswith("cuda:"):
            # Load without a device map first, then move to the requested GPU
            self.model = AutoModelForCausalLM.from_pretrained(
                model_path,
                torch_dtype=dtype,
                device_map=None,
                trust_remote_code=True,
                low_cpu_mem_usage=True,
            )
            self.model = self.model.to(device_map)
        else:
            # Forward the device map (e.g. "auto") to from_pretrained
            self.model = AutoModelForCausalLM.from_pretrained(
                model_path,
                torch_dtype=dtype,
                device_map=device_map,
                trust_remote_code=True,
            )

        # First token id produced for the string "yes"; its log-prob is the score.
        self.yes_token_id = self.tokenizer("yes", add_special_tokens=False).input_ids[0]

    @classmethod
    def from_config(cls, cfg: LocalModelsConfig) -> "Qwen3Reranker":
        """Build a reranker from the ``reranker.qwen3_8b`` entry of ``cfg``.

        Raises:
            ValueError: If ``cfg.reranker`` or ``cfg.reranker.qwen3_8b`` is
                missing or falsy.
        """
        if not cfg.reranker or not cfg.reranker.qwen3_8b:
            raise ValueError("Reranker config for qwen3_8b is missing")
        spec = cfg.reranker.qwen3_8b
        dtype = choose_dtype(spec.dtype)
        device_map = choose_device_map(spec.device_map)
        return cls(spec.local_path, device_map=device_map, dtype=dtype)

    def _build_prompt(self, query: str, doc: str) -> str:
        """Render the yes/no relevance prompt for one (query, note) pair."""
        return (
            "You are a reranker. "
            "Given a user query and a memory note, answer 'yes' if the note is helpful "
            "for answering the query, otherwise answer 'no'.\n\n"
            f"Query: {query}\n"
            f"Note: {doc}\n"
            "Answer with a single token: yes or no."
        )

    @staticmethod
    def _last_token_index(attention_mask: torch.Tensor) -> torch.Tensor:
        """Return, per row, the index of the last position whose mask is 1.

        Works for both left- and right-padded batches: the reversed mask's
        first 1 gives the number of trailing pad positions.
        """
        seq_len = attention_mask.size(1)
        trailing_pad = attention_mask.flip(dims=[1]).to(torch.long).argmax(dim=1)
        return seq_len - 1 - trailing_pad

    @torch.inference_mode()
    def score(self, query: str, docs: List[str], batch_size: int = 8, **kwargs) -> List[float]:
        """Score every document in ``docs`` against ``query``.

        Args:
            query: The user query.
            docs: Candidate notes to score.
            batch_size: Number of prompts per forward pass; must be >= 1.
            **kwargs: Accepted for interface compatibility; unused here.

        Returns:
            One float per document, in input order: the log-probability of
            the ``yes`` token being the next token after the prompt. Higher
            means more relevant. An empty ``docs`` yields ``[]``.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            # A negative step would make range() empty and silently return [].
            raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

        scores: List[float] = []
        for start in range(0, len(docs), batch_size):
            prompts = [self._build_prompt(query, doc) for doc in docs[start : start + batch_size]]

            # NOTE(review): truncation at 512 tokens may cut the trailing
            # "Answer with a single token" instruction for long notes
            # (assuming the tokenizer truncates on the right) - confirm.
            inputs = self.tokenizer(
                prompts,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=512,
            ).to(self.model.device)

            # shape: [batch, seq_len, vocab_size]
            logits = self.model(**inputs).logits

            # The next-token prediction lives at the last *real* token of each
            # prompt. With a right-padding tokenizer, position -1 is a pad
            # token for every prompt shorter than the longest one in the
            # batch, so index through the attention mask instead of using
            # logits[:, -1, :] unconditionally.
            attention_mask = inputs.get("attention_mask")
            if attention_mask is None:
                last_token_logits = logits[:, -1, :]
            else:
                last_idx = self._last_token_index(attention_mask).to(logits.device)
                rows = torch.arange(logits.size(0), device=logits.device)
                last_token_logits = logits[rows, last_idx]

            # Normalise over the vocabulary in float32; doing this in a
            # reduced-precision model dtype loses accuracy in the log-probs.
            log_probs = torch.log_softmax(last_token_logits.float(), dim=-1)
            scores.extend(log_probs[:, self.yes_token_id].cpu().tolist())

        return scores