From 1a953287ba5ccf5c21be37259a70cb8cb5adc5cd Mon Sep 17 00:00:00 2001
From: Yuren Hao <97327730+YurenHao0426@users.noreply.github.com>
Date: Sun, 30 Mar 2025 20:03:57 -0700
Subject: Update fetch_papers.py

---
 scripts/fetch_papers.py | 155 +++++++++++++++++++++++++++---------------------
 1 file changed, 88 insertions(+), 67 deletions(-)

(limited to 'scripts/fetch_papers.py')

diff --git a/scripts/fetch_papers.py b/scripts/fetch_papers.py
index a847518..1d2417c 100644
--- a/scripts/fetch_papers.py
+++ b/scripts/fetch_papers.py
@@ -4,32 +4,51 @@ import feedparser
 import datetime
 from github import Github
 
-#####################
-# 1. 配置/常量
-#####################
-
+# Step 1: category restrictions (arXiv categories to keep)
 ALLOWED_CATEGORIES = [
     "cs.AI", "cs.CL", "cs.CV", "cs.LG", "cs.NE", "cs.RO",
     "cs.IR", "stat.ML"
 ]
 
+# Step 2: local advanced matching (positive and negative keywords)
+def advanced_filter(entry):
+    """
+    Local keyword screen on title+summary: needs a bias/fairness term AND a model term, and no negative term.
+    """
+    import re  # local import; only needed for the whole-word "ai" check
+    title = (getattr(entry, 'title', '') or '').lower()
+    summary = (getattr(entry, 'summary', '') or '').lower()
+    full_text = title + " " + summary
+
+    # Positive keywords, matched as substrings so plurals/derivations ("biased", "llms") also hit
+    general_terms = ["bias", "fairness"]
+    model_terms = ["llm", "language model", "transformer", "gpt", "nlp",
+                   "pretrained", "embedding", "generation", "alignment"]
+    # Negative keywords: any occurrence rejects the entry
+    negative_terms = [
+        "estimation", "variance", "quantum", "physics",
+        "sensor", "circuit", "electronics", "hardware"
+    ]
+
+    has_general = any(term in full_text for term in general_terms)
+    # "ai" must be a whole word: as a substring it occurs inside "fairness" itself,
+    # which made the model-term requirement vacuous for every fairness paper.
+    has_model = (any(term in full_text for term in model_terms)
+                 or re.search(r"\bai\b", full_text) is not None)
+    return has_general and has_model and not any(term in full_text for term in negative_terms)
+
+# Step 3: relevance check via the external API
 API_URL = "https://uiuc.chat/api/chat-api/chat"
-API_KEY = os.getenv("UIUC_API_KEY")  # 你自己的密钥
-MODEL_NAME = "qwen2.5:14b-instruct-fp16"       # 你的model
-
+MODEL_NAME = "qwen2.5:14b-instruct-fp16"
 SYSTEM_PROMPT = (
     "Based on the given title and abstract, please determine if the paper "
     "is relevant to both language models and bias (or fairness). "
     "If yes, respond 1; otherwise respond 0."
 )
 
-#####################
-# 2. 函数: 调用外部API 判别
-#####################
-
-def is_relevant_by_api(title, abstract):
+def is_relevant_by_api(title, summary, api_key):
     """
-    调用外部API, 给一段title+abstract, 返回 True(1) or False(0).
+    Call the external API to judge relevance from title+summary (returns True/False)
     """
     headers = {"Content-Type": "application/json"}
     data = {
@@ -41,39 +60,32 @@ def is_relevant_by_api(title, abstract):
             },
             {
                 "role": "user",
-                # 填入我们的标题+摘要, 作为"content"
-                "content": f"Title: {title}\nAbstract: {abstract}"
+                "content": SYSTEM_PROMPT + f"Title: {title}\nAbstract: {summary}"
             }
         ],
-        "api_key": API_KEY,
+        "api_key": api_key,
         "course_name": "llm-bias-papers",
         "stream": False,
-        "temperature": 0.0,
-        "retrieval_only": False
+        "temperature": 0.0
     }
     try:
         resp = requests.post(API_URL, headers=headers, json=data, timeout=30)
         resp.raise_for_status()
-        # resp.json() 应该包含 'message'
-        response_msg = resp.json().get('message','')
-        # 如果 message="1", 就 True, 否则 False
+        response_msg = resp.json().get("message", "")
         return (response_msg.strip() == "1")
-    except requests.RequestException as e:
+    except Exception as e:
         print("[ERROR] calling external API:", e)
-        # 如果出错, 默认返回 False or do something
         return False
 
-#####################
-# 3. 函数: 抓论文, 调API判别
-#####################
-
-def fetch_arxiv_papers_with_api(days=1):
+# Step 4: fetch from arXiv, filter locally first, then filter via the API
+def fetch_papers_combined(days=1):
     """
-    宽松抓 + 本地分类过滤 + 外部API做判别
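+    1) Fetch arXiv papers from the past `days` days (loose query)
+    2) Filter locally first (category + advanced_filter)
+    3) For candidates that pass the local filter, call the API for a second check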
     """
     now_utc = datetime.datetime.now(datetime.timezone.utc)
     start_utc = now_utc - datetime.timedelta(days=days)
-
     start_str = start_utc.strftime("%Y%m%d%H%M")
     end_str = now_utc.strftime("%Y%m%d%H%M")
 
@@ -118,40 +130,55 @@ def fetch_arxiv_papers_with_api(days=1):
 
     print(f"[DEBUG] total retrieved in date range: {len(all_entries)}")
 
-    matched = []
-    for entry in all_entries:
-        title = getattr(entry, 'title', '')
-        summary = getattr(entry, 'summary', '')
-        published = getattr(entry, 'published', '')
-        link = getattr(entry, 'link', '')
-        # 先检查分类
-        if hasattr(entry, 'tags'):
-            categories = [t.term for t in entry.tags]
-        else:
-            categories = []
-
-        # 是否有至少一个分类在 ALLOWED_CATEGORIES 里
-        in_allowed_cat = any(cat in ALLOWED_CATEGORIES for cat in categories)
-        if not in_allowed_cat:
+    # --- Local filter 1: category + advanced_filter ---
+    local_candidates = []
+    for e in all_entries:
+        title = getattr(e, "title", "")
+        summary = getattr(e, "summary", "")
+        published = getattr(e, "published", "")
+        link = getattr(e, "link", "")
+        categories = [t.term for t in e.tags] if hasattr(e, 'tags') else []
+
+        # Skip entries with no allowed category
+        if not any(cat in ALLOWED_CATEGORIES for cat in categories):
+            continue
+
+        # Skip entries that do not pass advanced_filter
+        if not advanced_filter(e):
             continue
 
-        # 调用外部 API 判别: relevant or not
-        relevant = is_relevant_by_api(title, summary)
+        local_candidates.append({
+            "title": title,
+            "summary": summary,
+            "published": published,
+            "link": link,
+            "categories": categories
+        })
+
+    print(f"[DEBUG] local_candidates = {len(local_candidates)} after local filter")
+
+    # --- 2) Second check via the API ---
+    api_key = os.getenv("UIUC_API_KEY")  # 你在Secrets中配置
+    if not api_key:
+        print("[WARNING] No UIUC_API_KEY found. Skip second filter.")
+        # Without an API key, return the local candidates directly
+        return local_candidates
+
+    final_matched = []
+    for paper in local_candidates:
+        relevant = is_relevant_by_api(paper["title"], paper["summary"], api_key)
         if relevant:
-            matched.append({
-                "title": title,
-                "published": published,
-                "link": link,
-                "categories": categories
+            final_matched.append({
+                "title": paper["title"],
+                "published": paper["published"],
+                "link": paper["link"],
+                "categories": paper["categories"]
             })
 
-    print(f"[DEBUG] matched {len(matched)} papers after external API check.")
-    return matched
-
-#####################
-# 4. 函数: update README
-#####################
+    print(f"[DEBUG] final_matched = {len(final_matched)} after API check")
+    return final_matched
 
+# Step 5: write the README
 def update_readme_in_repo(papers, token, repo_name):
     if not papers:
         print("[INFO] No matched papers, skip README update.")
@@ -160,7 +187,6 @@ def update_readme_in_repo(papers, token, repo_name):
     g = Github(token)
     repo = g.get_repo(repo_name)
 
-    # 获取 README
     readme_file = repo.get_contents("README.md", ref="main")
     old_content = readme_file.decoded_content.decode("utf-8")
 
@@ -184,18 +210,13 @@ def update_readme_in_repo(papers, token, repo_name):
     )
     print(f"[INFO] README updated with {len(papers)} papers.")
 
-#####################
-# 5. main
-#####################
-
 def main():
-    days = 7
-    papers = fetch_arxiv_papers_with_api(days=days)
-    print(f"[RESULT] matched {len(papers)} papers. Will update README if not empty.")
+    days = 1
+    papers = fetch_papers_combined(days=days)
+    print(f"\n[RESULT] matched {len(papers)} papers total after double filter. Now update README if not empty...")
 
     github_token = os.getenv("TARGET_REPO_TOKEN")
     target_repo_name = os.getenv("TARGET_REPO_NAME")
-
     if not github_token or not target_repo_name:
         print("[ERROR] Missing environment variables: TARGET_REPO_TOKEN / TARGET_REPO_NAME.")
         return
-- 
cgit v1.2.3