From f64820f164a1c727e499c8f1353a63e5a04ce158 Mon Sep 17 00:00:00 2001
From: blackhao <13851610112@163.com>
Date: Sun, 30 Mar 2025 04:28:29 -0500
Subject: test

---
 scripts/fetch_papers.py | 87 +++++++++++++++++++++++++++++++------------------
 1 file changed, 56 insertions(+), 31 deletions(-)

diff --git a/scripts/fetch_papers.py b/scripts/fetch_papers.py
index 713e6b4..1b86d58 100644
--- a/scripts/fetch_papers.py
+++ b/scripts/fetch_papers.py
@@ -4,13 +4,28 @@ import feedparser
 import datetime
 from github import Github
 
+# Categories you want to keep
+ALLOWED_CATEGORIES = [
+    "cs.AI",   # Artificial Intelligence
+    "cs.CL",   # Computation and Language
+    "cs.CV",   # Computer Vision and Pattern Recognition
+    "cs.LG",   # Machine Learning
+    "cs.NE",   # Neural and Evolutionary Computing
+    "cs.RO",   # Robotics
+    "cs.CY",   # Computers and Society
+    "cs.HC",   # Human-Computer Interaction
+    "cs.IR",   # Information Retrieval
+    "cs.GL",   # General Literature
+    "cs.SI",   # Social and Information Networks
+    "stat.ML"  # Machine Learning (Statistics)
+]
+
 def fetch_papers_wide_then_filter(days=1, keywords=None):
     """
-    Fetch papers from the past N days (restricted only by submittedDate), then check locally:
-    - whether the paper is cs.* or stat.*
-    - whether the title/abstract contains the keywords
-    Returns a list whose elements are dicts:
-    { 'title':..., 'published':..., 'link':..., 'categories':[...] }
+    Fetch all papers submitted to arXiv in the past N days (restricted only by submittedDate),
+    then filter locally:
+    1) keep only papers whose tags include one of ALLOWED_CATEGORIES (for multi-category papers, any single match counts)
+    2) keep only papers whose title or abstract contains one of the given keywords
     """
     if keywords is None:
         keywords = ["bias", "fairness"]
@@ -20,15 +35,16 @@ def fetch_papers_wide_then_filter(days=1, keywords=None):
     start_str = start_utc.strftime("%Y%m%d%H%M")
     end_str = now_utc.strftime("%Y%m%d%H%M")
 
+    print(f"[DEBUG] date range (UTC): {start_str} ~ {end_str} (past {days} days)")
+
+    # build the search_query from the time window only
     search_query = f"submittedDate:[{start_str} TO {end_str}]"
-    base_url = "http://export.arxiv.org/api/query"
+    base_url = "http://export.arxiv.org/api/query"
 
     step = 100
     start = 0
     all_entries = []
 
-    print(f"[DEBUG] Time range: {start_str} ~ {end_str}, days={days}")
     while True:
         params = {
             "search_query": search_query,
@@ -38,45 +54,54 @@ def fetch_papers_wide_then_filter(days=1, keywords=None):
             "max_results": step
         }
         print(f"[DEBUG] fetching: {start} -> {start+step}")
-        r = requests.get(base_url, params=params)
-        if r.status_code != 200:
-            print("[ERROR] HTTP status:", r.status_code)
+        resp = requests.get(base_url, params=params)
+        if resp.status_code != 200:
+            print("[ERROR] HTTP Status:", resp.status_code)
             break
-        feed = feedparser.parse(r.content)
-        got = len(feed.entries)
-        print(f"[DEBUG] got {got} entries this batch.")
-        if got == 0:
+        feed = feedparser.parse(resp.content)
+        batch = feed.entries
+        got_count = len(batch)
+        print(f"[DEBUG] got {got_count} entries in this batch")
+        if got_count == 0:
+            # no more results
             break
-        all_entries.extend(feed.entries)
+        all_entries.extend(batch)
         start += step
+        # safety cap
         if start >= 3000:
             print("[DEBUG] reached 3000, stop.")
             break
 
-    print(f"[DEBUG] total in date range: {len(all_entries)}")
+    print(f"[DEBUG] total retrieved in date range: {len(all_entries)}")
 
+    # -- local filtering --
     matched = []
     for e in all_entries:
         title = getattr(e, 'title', '')
         summary = getattr(e, 'summary', '')
         published = getattr(e, 'published', '')
         link = getattr(e, 'link', '')
+
         if hasattr(e, 'tags'):
+            # e.tags: a list of objects with .term
             categories = [t.term for t in e.tags]
         else:
             categories = []
 
-        # category check
-        has_cs_stat = any(c.startswith("cs.") or c.startswith("stat.") for c in categories)
-        if not has_cs_stat:
+        # 1) does the paper belong to ALLOWED_CATEGORIES?
+        #    some papers carry several categories; any single one in ALLOWED_CATEGORIES is enough,
+        #    e.g. "cs.IR", "cs.AI"
+        in_allowed_cat = any(cat in ALLOWED_CATEGORIES for cat in categories)
+        if not in_allowed_cat:
             continue
 
-        # keyword check
+        # 2) does it contain a keyword?
         text_lower = (title + " " + summary).lower()
-        if any(kw.lower() in text_lower for kw in keywords):
+        has_keyword = any(kw.lower() in text_lower for kw in keywords)
+        if has_keyword:
             matched.append({
                 "title": title,
                 "published": published,
@@ -84,12 +109,12 @@ def fetch_papers_wide_then_filter(days=1, keywords=None):
                 "categories": categories
             })
 
-    print(f"[DEBUG] matched {len(matched)} papers after local filter (cs./stat.+keywords)")
+    print(f"[DEBUG] matched {len(matched)} papers after local filtering (categories + keywords)")
     return matched
 
 def update_readme_in_repo(papers, token, repo_name):
     """
-    Write the list of matched papers into the target repo's README.md
+    Append the list of matched papers to the target repo's README.md (main branch)
     """
     if not papers:
         print("[INFO] No matched papers, skip README update.")
@@ -98,9 +123,9 @@ def update_readme_in_repo(papers, token, repo_name):
     g = Github(token)
     repo = g.get_repo(repo_name)
 
-    # fetch the README contents
+    # read the existing README
     readme_file = repo.get_contents("README.md", ref="main")
-    readme_content = readme_file.decoded_content.decode("utf-8")
+    old_content = readme_file.decoded_content.decode("utf-8")
 
     now_utc_str = datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
     new_section = f"\n\n### Auto-captured papers on {now_utc_str}\n"
@@ -110,7 +135,7 @@ def update_readme_in_repo(papers, token, repo_name):
         new_section += f" - Categories: {cat_str} \n"
         new_section += f" - Link: {p['link']}\n\n"
 
-    updated_content = readme_content + new_section
+    updated_content = old_content + new_section
 
     commit_msg = f"Auto update README with {len(papers)} new papers"
     repo.update_file(
@@ -123,16 +148,16 @@ def update_readme_in_repo(papers, token, repo_name):
     print(f"[INFO] README updated with {len(papers)} papers.")
 
 def main():
-    # 1. fetch papers from the past 3 days with keywords=["bias","fairness"]
-    days = 1
+    # 1) fetch the past 3 days, keywords=["bias","fairness"]
+    days = 3
     keywords = ["bias", "fairness"]
     papers = fetch_papers_wide_then_filter(days=days, keywords=keywords)
-    print(f"[RESULT] matched {len(papers)} papers. Now let's update README in target repo if any.")
+    print(f"\n[RESULT] matched {len(papers)} papers. Will update README if not empty.")
 
-    # 2. if any papers matched, update the README
-    # the token and repo name must come from secrets or env
+    # 2) update the README
     github_token = os.getenv("TARGET_REPO_TOKEN")
     target_repo_name = os.getenv("TARGET_REPO_NAME")
+
     if not github_token or not target_repo_name:
         print("[ERROR] Missing environment variables: TARGET_REPO_TOKEN / TARGET_REPO_NAME.")
         return
-- 
cgit v1.2.3