import os import requests import feedparser import datetime from github import Github # 你想要的分类列表 ALLOWED_CATEGORIES = [ "cs.AI", # Artificial Intelligence "cs.CL", # Computation and Language "cs.CV", # Computer Vision and Pattern Recognition "cs.LG", # Machine Learning "cs.NE", # Neural and Evolutionary Computing "cs.RO", # Robotics "cs.CY", # Computers and Society "cs.HC", # Human-Computer Interaction "cs.IR", # Information Retrieval "cs.GL", # General Literature "cs.SI", # Social and Information Networks "stat.ML" # Stat.ML ] def fetch_papers_wide_then_filter(days=1, keywords=None): """ 从 arXiv 中抓取过去 N 天内提交的所有论文(只限制时间 submittedDate), 然后在本地过滤: 1) 只保留 tags 中包含 ALLOWED_CATEGORIES(若论文有多分类,只要有任意一个符合就OK) 2) 标题或摘要里包含指定关键词 """ if keywords is None: keywords = ["bias", "fairness"] now_utc = datetime.datetime.now(datetime.timezone.utc) start_utc = now_utc - datetime.timedelta(days=days) start_str = start_utc.strftime("%Y%m%d%H%M") end_str = now_utc.strftime("%Y%m%d%H%M") print(f"[DEBUG] date range (UTC): {start_str} ~ {end_str} (past {days} days)") # 构造 search_query,仅用时间 search_query = f"submittedDate:[{start_str} TO {end_str}]" base_url = "http://export.arxiv.org/api/query" step = 100 start = 0 all_entries = [] while True: params = { "search_query": search_query, "sortBy": "submittedDate", "sortOrder": "descending", "start": start, "max_results": step } print(f"[DEBUG] fetching: {start} -> {start+step}") resp = requests.get(base_url, params=params) if resp.status_code != 200: print("[ERROR] HTTP Status:", resp.status_code) break feed = feedparser.parse(resp.content) batch = feed.entries got_count = len(batch) print(f"[DEBUG] got {got_count} entries in this batch") if got_count == 0: # 没有更多了 break all_entries.extend(batch) start += step # 安全上限 if start >= 3000: print("[DEBUG] reached 3000, stop.") break print(f"[DEBUG] total retrieved in date range: {len(all_entries)}") # -- 本地过滤 -- matched = [] for e in all_entries: title = getattr(e, 'title', '') summary = getattr(e, 'summary', '') published = getattr(e, 'published', '') link = getattr(e, 'link', '') if hasattr(e, 'tags'): # e.tags: a list of objects with .term categories = [t.term for t in e.tags] else: categories = [] # 1) 是否属于 ALLOWED_CATEGORIES # 有些论文有多分类,只要其中一个在 ALLOWED_CATEGORIES 里就OK # 例如 "cs.IR", "cs.AI" in_allowed_cat = any(cat in ALLOWED_CATEGORIES for cat in categories) if not in_allowed_cat: continue # 2) 是否含关键词 text_lower = (title + " " + summary).lower() has_keyword = any(kw.lower() in text_lower for kw in keywords) if has_keyword: matched.append({ "title": title, "published": published, "link": link, "categories": categories }) print(f"[DEBUG] matched {len(matched)} papers after local filtering (categories + keywords)") return matched def update_readme_in_repo(papers, token, repo_name): """ 将匹配到的论文列表追加到目标repo的 README.md (main分支) """ if not papers: print("[INFO] No matched papers, skip README update.") return g = Github(token) repo = g.get_repo(repo_name) # 读取现有 README readme_file = repo.get_contents("README.md", ref="main") old_content = readme_file.decoded_content.decode("utf-8") now_utc_str = datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%d %H:%M UTC") new_section = f"\n\n### Auto-captured papers on {now_utc_str}\n" for p in papers: cat_str = ", ".join(p["categories"]) new_section += f"- **{p['title']}** (Published={p['published']}) \n" new_section += f" - Categories: {cat_str} \n" new_section += f" - Link: {p['link']}\n\n" updated_content = old_content + new_section commit_msg = f"Auto update README with {len(papers)} new papers" repo.update_file( path="README.md", message=commit_msg, content=updated_content, sha=readme_file.sha, branch="main" ) print(f"[INFO] README updated with {len(papers)} papers.") def main(): # 1) 抓取过去3天, 关键词=["bias","fairness"] days = 3 keywords = ["bias", "fairness"] papers = fetch_papers_wide_then_filter(days=days, keywords=keywords) print(f"\n[RESULT] matched {len(papers)} papers. Will update README if not empty.") # 2) 更新README github_token = os.getenv("TARGET_REPO_TOKEN") target_repo_name = os.getenv("TARGET_REPO_NAME") if not github_token or not target_repo_name: print("[ERROR] Missing environment variables: TARGET_REPO_TOKEN / TARGET_REPO_NAME.") return if papers: update_readme_in_repo(papers, github_token, target_repo_name) else: print("[INFO] No matched papers, done without update.") if __name__ == "__main__": main()