"""
Fetch recent arXiv papers, keep those in allowed CS/ML categories, ask an
external LLM API whether each paper is about language models and bias/fairness,
and append the matched papers to the README of a target GitHub repository.
"""

import os
import datetime

import requests
import feedparser
from github import Github

#####################
# 1. Configuration / constants
#####################
ALLOWED_CATEGORIES = [
    "cs.AI", "cs.CL", "cs.CV", "cs.LG",
    "cs.NE", "cs.RO", "cs.IR", "stat.ML"
]

API_URL = "https://uiuc.chat/api/chat-api/chat"
API_KEY = os.getenv("UIUC_API_KEY")        # your own API key
MODEL_NAME = "qwen2.5:14b-instruct-fp16"   # your model

SYSTEM_PROMPT = (
    "Based on the given title and abstract, please determine if the paper "
    "is relevant to both language models and bias (or fairness). "
    "If yes, respond 1; otherwise respond 0."
)

#####################
# 2. Function: call the external API to judge relevance
#####################
def is_relevant_by_api(title, abstract):
    """
    Send the title + abstract to the external API and return True (1) or False (0).
    """
    headers = {"Content-Type": "application/json"}
    data = {
        "model": MODEL_NAME,
        "messages": [
            {
                "role": "system",
                "content": SYSTEM_PROMPT
            },
            {
                "role": "user",
                # Put our title + abstract into "content"
                "content": f"Title: {title}\nAbstract: {abstract}"
            }
        ],
        "api_key": API_KEY,
        "course_name": "llm-bias-papers",
        "stream": False,
        "temperature": 0.0,
        "retrieval_only": False
    }

    try:
        resp = requests.post(API_URL, headers=headers, json=data, timeout=30)
        resp.raise_for_status()
        # resp.json() should contain 'message'
        response_msg = resp.json().get('message', '')
        # If message == "1", return True; otherwise False
        return response_msg.strip() == "1"
    except requests.RequestException as e:
        print("[ERROR] calling external API:", e)
        # On error, default to False
        return False

#####################
# 3. Function: fetch papers and classify them via the API
#####################
def fetch_arxiv_papers_with_api(days=1):
    """
    Fetch broadly by date, filter by category locally, then let the external API judge relevance.
    """
    now_utc = datetime.datetime.now(datetime.timezone.utc)
    start_utc = now_utc - datetime.timedelta(days=days)

    start_str = start_utc.strftime("%Y%m%d%H%M")
    end_str = now_utc.strftime("%Y%m%d%H%M")
    print(f"[DEBUG] date range (UTC): {start_str} ~ {end_str}, days={days}")

    search_query = f"submittedDate:[{start_str} TO {end_str}]"
    base_url = "http://export.arxiv.org/api/query"
    step = 100
    start = 0
    all_entries = []

    # Page through the arXiv API results, `step` entries at a time.
    while True:
        params = {
            "search_query": search_query,
            "sortBy": "submittedDate",
            "sortOrder": "descending",
            "start": start,
            "max_results": step
        }
        print(f"[DEBUG] fetching: {start} -> {start + step}")
        try:
            resp = requests.get(base_url, params=params, timeout=30)
            if resp.status_code != 200:
                print("[ERROR] HTTP Status:", resp.status_code)
                break
            feed = feedparser.parse(resp.content)
        except Exception as e:
            print("[ERROR] fetching arXiv:", e)
            break

        batch = feed.entries
        got_count = len(batch)
        print(f"[DEBUG] got {got_count} entries in this batch.")
        if got_count == 0:
            break

        all_entries.extend(batch)
        start += step
        if start >= 3000:
            print("[DEBUG] reached 3000, stop.")
            break

    print(f"[DEBUG] total retrieved in date range: {len(all_entries)}")

    matched = []
    for entry in all_entries:
        title = getattr(entry, 'title', '')
        summary = getattr(entry, 'summary', '')
        published = getattr(entry, 'published', '')
        link = getattr(entry, 'link', '')

        # Check categories first
        if hasattr(entry, 'tags'):
            categories = [t.term for t in entry.tags]
        else:
            categories = []

        # Keep the entry only if at least one category is in ALLOWED_CATEGORIES
        in_allowed_cat = any(cat in ALLOWED_CATEGORIES for cat in categories)
        if not in_allowed_cat:
            continue

        # Call the external API to judge: relevant or not
        relevant = is_relevant_by_api(title, summary)
        if relevant:
            matched.append({
                "title": title,
                "published": published,
                "link": link,
                "categories": categories
            })

    print(f"[DEBUG] matched {len(matched)} papers after external API check.")
    return matched

#####################
# 4. Function: update README
#####################
def update_readme_in_repo(papers, token, repo_name):
    if not papers:
        print("[INFO] No matched papers, skip README update.")
        return

    g = Github(token)
    repo = g.get_repo(repo_name)

    # Fetch the current README
    readme_file = repo.get_contents("README.md", ref="main")
    old_content = readme_file.decoded_content.decode("utf-8")

    now_utc_str = datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
    new_section = f"\n\n### Auto-captured papers on {now_utc_str}\n"
    for p in papers:
        cat_str = ", ".join(p["categories"])
        new_section += f"- **{p['title']}** (Published={p['published']})  \n"
        new_section += f"  - Categories: {cat_str}  \n"
        new_section += f"  - Link: {p['link']}\n\n"

    updated_content = old_content + new_section
    commit_msg = f"Auto update README with {len(papers)} new papers"

    repo.update_file(
        path="README.md",
        message=commit_msg,
        content=updated_content,
        sha=readme_file.sha,
        branch="main"
    )
    print(f"[INFO] README updated with {len(papers)} papers.")

#####################
# 5. main
#####################
def main():
    days = 1
    papers = fetch_arxiv_papers_with_api(days=days)
    print(f"[RESULT] matched {len(papers)} papers. Will update README if not empty.")

    github_token = os.getenv("TARGET_REPO_TOKEN")
    target_repo_name = os.getenv("TARGET_REPO_NAME")
    if not github_token or not target_repo_name:
        print("[ERROR] Missing environment variables: TARGET_REPO_TOKEN / TARGET_REPO_NAME.")
        return

    if papers:
        update_readme_in_repo(papers, github_token, target_repo_name)
    else:
        print("[INFO] No matched papers, done without update.")


if __name__ == "__main__":
    main()