From 5f26ab0f361e06e0a792c82436241883a11d80c5 Mon Sep 17 00:00:00 2001 From: blackhao <13851610112@163.com> Date: Sun, 13 Apr 2025 23:40:30 -0500 Subject: use gpt-4o --- scripts/fetch_papers.py | 178 +++++++++++++----------------------------------- 1 file changed, 49 insertions(+), 129 deletions(-) (limited to 'scripts/fetch_papers.py') diff --git a/scripts/fetch_papers.py b/scripts/fetch_papers.py index 79246c9..2c7d177 100644 --- a/scripts/fetch_papers.py +++ b/scripts/fetch_papers.py @@ -3,29 +3,30 @@ import requests import feedparser import datetime from github import Github +from openai import OpenAI ALLOWED_CATEGORIES = [ "cs.AI", "cs.CL", "cs.CV", "cs.LG", "cs.NE", "cs.RO", "cs.IR", "stat.ML" ] +SYSTEM_PROMPT = ( + "You are a helpful assistant. The user will give you a paper title and abstract. " + "Your task: Decide if this paper is about large language models (or generative text models) AND about bias/fairness. " + "If yes, respond with just a single character: 1. Otherwise, respond with a single character: 0. " + "No extra explanation, no punctuation—only the number." +) + def advanced_filter(entry): - """ - 基于标题+摘要,本地进行“正面关键词 + 负面关键词”筛选 - """ title = getattr(entry, 'title', '').lower() summary = getattr(entry, 'summary', '').lower() full_text = title + " " + summary - # 正面关键词 general_terms = ["bias", "fairness"] model_terms = ["llm", "language model", "transformer", "gpt", "nlp", "pretrained", "embedding", "generation", "alignment", "ai"] - # 负面关键词 - negative_terms = [ - "estimation", "variance", "quantum", "physics", - "sensor", "circuit", "electronics", "hardware" - ] + negative_terms = ["estimation", "variance", "quantum", "physics", + "sensor", "circuit", "electronics", "hardware"] has_general = any(term in full_text for term in general_terms) has_model = any(term in full_text for term in model_terms) @@ -33,62 +34,31 @@ def advanced_filter(entry): return (has_general and has_model) and (not has_negative) -API_URL = "https://uiuc.chat/api/chat-api/chat" -MODEL_NAME = "qwen2.5:14b-instruct-fp16" -SYSTEM_PROMPT = ( - "You are a helpful assistant. The user will give you a paper title and abstract. " - "Your task: Decide if this paper is about large language models (or generative text models) AND about bias/fairness. " - "If yes, respond with just a single character: 1. Otherwise, respond with a single character: 0. " - "No extra explanation, no punctuation—only the number." -) - -def is_relevant_by_api(title, summary, api_key): - """ - 调用外部API,根据title+summary判别是否相关(返回 True/False), - 并打印调试信息。 - """ - headers = {"Content-Type": "application/json"} - data = { - "model": MODEL_NAME, - "messages": [ - { - "role": "system", - "content": SYSTEM_PROMPT - }, - { - "role": "user", - "content": SYSTEM_PROMPT + f"Title: {title}\nAbstract: {summary}" - } - ], - "api_key": api_key, - "course_name": "llm-bias-papers", - "stream": False, - "temperature": 0.0 - } +def is_relevant_by_api(title, summary, client, model="gpt-4-turbo"): + prompt = f"Title: {title}\nAbstract: {summary}" try: - resp = requests.post(API_URL, headers=headers, json=data, timeout=30) - resp.raise_for_status() - full_json = resp.json() - # 获取API返回的message - response_msg = full_json.get("message", "") - print(f"[DEBUG][API] return message='{response_msg.strip()}' for paper title='{title[:60]}...'") - return (response_msg.strip() == "1") + dialogue = client.chat.completions.create( + model=model, + messages=[ + {"role": "system", "content": SYSTEM_PROMPT}, + {"role": "user", "content": prompt} + ], + temperature=0.0, + max_tokens=1 + ) + response_msg = dialogue.choices[0].message.content.strip() + print(f"[DEBUG][API] return message='{response_msg}' for paper title='{title[:60]}...'") + return response_msg == "1" except Exception as e: - print("[ERROR][API] calling external API:", e) + print("[ERROR][API] calling OpenAI API:", e) return False def fetch_papers_combined(days=1): - """ - 1) 抓过去days天 arXiv论文(宽松) - 2) 本地先过滤(分类 + advanced_filter) - 3) 对“通过本地筛”的候选,调用API二次判定 + debug输出 - """ now_utc = datetime.datetime.now(datetime.timezone.utc) start_utc = now_utc - datetime.timedelta(days=days) start_str = start_utc.strftime("%Y%m%d%H%M") end_str = now_utc.strftime("%Y%m%d%H%M") - print(f"[DEBUG] date range (UTC): {start_str} ~ {end_str}, days={days}") search_query = f"submittedDate:[{start_str} TO {end_str}]" base_url = "http://export.arxiv.org/api/query" @@ -104,84 +74,43 @@ def fetch_papers_combined(days=1): "start": start, "max_results": step } - print(f"[DEBUG] fetching: {start} -> {start+step}") - try: - resp = requests.get(base_url, params=params, timeout=30) - if resp.status_code != 200: - print("[ERROR] HTTP Status:", resp.status_code) - break - feed = feedparser.parse(resp.content) - except Exception as e: - print("[ERROR] fetching arXiv:", e) + resp = requests.get(base_url, params=params, timeout=30) + if resp.status_code != 200: break - + feed = feedparser.parse(resp.content) batch = feed.entries - got_count = len(batch) - print(f"[DEBUG] got {got_count} entries in this batch.") - if got_count == 0: + if not batch: break all_entries.extend(batch) start += step if start >= 3000: - print("[DEBUG] reached 3000, stop.") break - print(f"[DEBUG] total retrieved in date range: {len(all_entries)}") - - # --- 本地过滤1: 分类 + advanced_filter --- - local_candidates = [] - for e in all_entries: - title = getattr(e, "title", "") - summary = getattr(e, "summary", "") - published = getattr(e, "published", "") - link = getattr(e, "link", "") - categories = [t.term for t in e.tags] if hasattr(e, 'tags') else [] - - if not any(cat in ALLOWED_CATEGORIES for cat in categories): - continue - - if advanced_filter(e): - local_candidates.append({ - "title": title, - "summary": summary, - "published": published, - "link": link, - "categories": categories - }) - - print(f"[DEBUG] local_candidates = {len(local_candidates)} after local filter") - - # Debug: 打印所有local_candidates的标题,看看是不是你预期的那几篇 - for idx, paper in enumerate(local_candidates, 1): - print(f"[DEBUG][LOCAL] #{idx}, title='{paper['title']}' cat={paper['categories']}") - - # --- 2) 调API二次判定 --- - api_key = os.getenv("UIUC_API_KEY") # 你在Secrets中配置 - if not api_key: - print("[WARNING] No UIUC_API_KEY found. Skip second filter.") + local_candidates = [ + { + "title": e.title, + "summary": e.summary, + "published": e.published, + "link": e.link, + "categories": [t.term for t in e.tags] + } + for e in all_entries + if any(cat in ALLOWED_CATEGORIES for cat in [t.term for t in e.tags]) and advanced_filter(e) + ] + + openai_api_key = os.getenv("OPENAI_API_KEY") + if not openai_api_key: + print("[WARNING] No OPENAI_API_KEY found. Skip second filter.") return local_candidates - final_matched = [] - for idx, paper in enumerate(local_candidates, 1): - relevant = is_relevant_by_api(paper["title"], paper["summary"], api_key) - if relevant: - final_matched.append({ - "title": paper["title"], - "published": paper["published"], - "link": paper["link"], - "categories": paper["categories"] - }) - else: - # 如果不相关,就打印个提示 - print(f"[DEBUG][API] => '0' => exclude paper #{idx}, title='{paper['title'][:60]}...'") - - print(f"[DEBUG] final_matched = {len(final_matched)} after API check") + client = OpenAI(api_key=openai_api_key) + final_matched = [p for p in local_candidates if is_relevant_by_api(p["title"], p["summary"], client)] + return final_matched def update_readme_in_repo(papers, token, repo_name): if not papers: - print("[INFO] No matched papers, skip README update.") return g = Github(token) @@ -208,24 +137,15 @@ def update_readme_in_repo(papers, token, repo_name): sha=readme_file.sha, branch="main" ) - print(f"[INFO] README updated with {len(papers)} papers.") def main(): - # 抓过去5天(你例子里是5) 或根据需要改 - days = 5 + days = 1 papers = fetch_papers_combined(days=days) - print(f"\n[RESULT] matched {len(papers)} papers total after double filter. Now update README if not empty...") github_token = os.getenv("TARGET_REPO_TOKEN") target_repo_name = os.getenv("TARGET_REPO_NAME") - if not github_token or not target_repo_name: - print("[ERROR] Missing environment variables: TARGET_REPO_TOKEN / TARGET_REPO_NAME.") - return - - if papers: + if github_token and target_repo_name and papers: update_readme_in_repo(papers, github_token, target_repo_name) - else: - print("[INFO] No matched papers, done without update.") if __name__ == "__main__": main() -- cgit v1.2.3