"""Collect recent arXiv papers about bias/fairness in language models.

Pipeline:
  1. Query the arXiv API for everything submitted in the last N days.
  2. Filter locally by arXiv category and by positive/negative keywords.
  3. If an API key is configured, ask an external chat API to confirm each
     remaining candidate.
  4. Append the matched papers to README.md of a target GitHub repository.
"""

import datetime
import os
import re
import time

import feedparser
import requests
from github import Github

ALLOWED_CATEGORIES = [
    "cs.AI", "cs.CL", "cs.CV", "cs.LG",
    "cs.NE", "cs.RO", "cs.IR", "stat.ML",
]

# Positive keywords: a paper must hit one "general" AND one "model" term.
_GENERAL_TERMS = ("bias", "fairness")
_MODEL_TERMS = (
    "llm", "language model", "transformer", "gpt", "nlp",
    "pretrained", "embedding", "generation", "alignment", "ai",
)
# Negative keywords: any hit rejects the paper.
_NEGATIVE_TERMS = (
    "estimation", "variance", "quantum", "physics",
    "sensor", "circuit", "electronics", "hardware",
)
# Terms that must match as whole words. "ai" as a raw substring occurs inside
# "fairness", "training", "main", ... which would make the model-term check
# succeed for practically every abstract.
_WHOLE_WORD_TERMS = frozenset({"ai"})


def _compile_terms(terms):
    """Return one compiled regex matching any of *terms* in lowercased text."""
    alternatives = []
    for term in terms:
        pattern = re.escape(term)
        if term in _WHOLE_WORD_TERMS:
            pattern = rf"\b{pattern}\b"
        alternatives.append(pattern)
    return re.compile("|".join(alternatives))


_GENERAL_RE = _compile_terms(_GENERAL_TERMS)
_MODEL_RE = _compile_terms(_MODEL_TERMS)
_NEGATIVE_RE = _compile_terms(_NEGATIVE_TERMS)


def advanced_filter(entry) -> bool:
    """Keyword-screen a feed entry using its title and summary.

    Args:
        entry: Object exposing optional ``title`` and ``summary`` string
            attributes (missing attributes are treated as empty).

    Returns:
        True when the lowercased title+summary contains at least one general
        term and at least one model term and no negative term.
    """
    title = getattr(entry, "title", "").lower()
    summary = getattr(entry, "summary", "").lower()
    full_text = f"{title} {summary}"

    if _NEGATIVE_RE.search(full_text):
        return False
    has_general = _GENERAL_RE.search(full_text) is not None
    has_model = _MODEL_RE.search(full_text) is not None
    return has_general and has_model


API_URL = "https://uiuc.chat/api/chat-api/chat"
MODEL_NAME = "qwen2.5:14b-instruct-fp16"

SYSTEM_PROMPT = (
    "Based on the given title and abstract, please determine if the paper "
    "is relevant to both language models and bias (or fairness). "
    "If yes, respond 1; otherwise respond 0."
)


def is_relevant_by_api(title, summary, api_key) -> bool:
    """Ask the external chat API whether a paper is relevant.

    Sends the title and abstract together with ``SYSTEM_PROMPT`` and prints
    the returned message for debugging.

    Args:
        title: Paper title.
        summary: Paper abstract.
        api_key: Key placed in the request payload.

    Returns:
        True only when the stripped ``message`` field of the JSON response is
        exactly ``"1"``. Any failure (network, HTTP status, malformed
        response) is printed and treated as "not relevant".
    """
    payload = {
        "model": MODEL_NAME,
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": f"Title: {title}\nAbstract: {summary}"},
        ],
        "api_key": api_key,
        "course_name": "llm-bias-papers",
        "stream": False,
        "temperature": 0.0,
    }
    try:
        resp = requests.post(
            API_URL,
            headers={"Content-Type": "application/json"},
            json=payload,
            timeout=30,
        )
        resp.raise_for_status()
        response_msg = resp.json().get("message", "")
        print(
            f"[DEBUG][API] return message='{response_msg.strip()}' "
            f"for paper title='{title[:60]}...'"
        )
        return response_msg.strip() == "1"
    except Exception as e:
        # Deliberately best-effort: one failing call must not abort the run.
        print("[ERROR][API] calling external API:", e)
        return False


ARXIV_API_URL = "http://export.arxiv.org/api/query"
_ARXIV_PAGE_SIZE = 100
# Stop paginating once the next offset would reach this value.
_ARXIV_MAX_START = 3000
# The arXiv API manual asks clients to wait about 3 seconds between
# consecutive calls.
_ARXIV_PAGE_DELAY_SECONDS = 3


def _fetch_arxiv_entries(search_query):
    """Page through the arXiv API and return all feed entries retrieved.

    Stops on the first empty page, non-200 status, request/parse error, or
    when the offset cap is reached; entries gathered so far are returned.
    """
    all_entries = []
    start = 0
    while True:
        params = {
            "search_query": search_query,
            "sortBy": "submittedDate",
            "sortOrder": "descending",
            "start": start,
            "max_results": _ARXIV_PAGE_SIZE,
        }
        print(f"[DEBUG] fetching: {start} -> {start + _ARXIV_PAGE_SIZE}")
        try:
            resp = requests.get(ARXIV_API_URL, params=params, timeout=30)
            if resp.status_code != 200:
                print("[ERROR] HTTP Status:", resp.status_code)
                break
            feed = feedparser.parse(resp.content)
        except Exception as e:
            # Best-effort: keep whatever pages were already collected.
            print("[ERROR] fetching arXiv:", e)
            break

        batch = feed.entries
        print(f"[DEBUG] got {len(batch)} entries in this batch.")
        if not batch:
            break

        all_entries.extend(batch)
        start += _ARXIV_PAGE_SIZE
        if start >= _ARXIV_MAX_START:
            print(f"[DEBUG] reached {_ARXIV_MAX_START}, stop.")
            break

        # Be polite to the arXiv servers before requesting the next page.
        time.sleep(_ARXIV_PAGE_DELAY_SECONDS)
    return all_entries


def _select_local_candidates(entries):
    """Keep entries in an allowed category that also pass advanced_filter."""
    candidates = []
    for entry in entries:
        categories = (
            [tag.term for tag in entry.tags] if hasattr(entry, "tags") else []
        )
        if not any(cat in ALLOWED_CATEGORIES for cat in categories):
            continue
        if not advanced_filter(entry):
            continue
        candidates.append({
            "title": getattr(entry, "title", ""),
            "summary": getattr(entry, "summary", ""),
            "published": getattr(entry, "published", ""),
            "link": getattr(entry, "link", ""),
            "categories": categories,
        })
    return candidates


def fetch_papers_combined(days=1):
    """Fetch, locally filter and API-check recent arXiv papers.

    1) Fetch arXiv papers submitted in the past ``days`` days (broad query).
    2) Filter locally (category + ``advanced_filter``).
    3) For candidates that pass, call the external API for a second opinion,
       printing debug output along the way.

    Args:
        days: Size of the look-back window in days, counted from now (UTC).

    Returns:
        A list of dicts with ``title``, ``published``, ``link`` and
        ``categories``. When the ``UIUC_API_KEY`` environment variable is not
        set, the second pass is skipped and the local candidates (which also
        carry ``summary``) are returned instead.
    """
    now_utc = datetime.datetime.now(datetime.timezone.utc)
    start_utc = now_utc - datetime.timedelta(days=days)
    start_str = start_utc.strftime("%Y%m%d%H%M")
    end_str = now_utc.strftime("%Y%m%d%H%M")
    print(f"[DEBUG] date range (UTC): {start_str} ~ {end_str}, days={days}")

    all_entries = _fetch_arxiv_entries(
        f"submittedDate:[{start_str} TO {end_str}]"
    )
    print(f"[DEBUG] total retrieved in date range: {len(all_entries)}")

    # --- Local filter: category + advanced_filter ---
    local_candidates = _select_local_candidates(all_entries)
    print(f"[DEBUG] local_candidates = {len(local_candidates)} after local filter")

    # Debug: list every local candidate so the selection can be eyeballed.
    for idx, paper in enumerate(local_candidates, 1):
        print(f"[DEBUG][LOCAL] #{idx}, title='{paper['title']}' cat={paper['categories']}")

    # --- Second pass: external API ---
    api_key = os.getenv("UIUC_API_KEY")  # expected to be configured as a secret
    if not api_key:
        print("[WARNING] No UIUC_API_KEY found. Skip second filter.")
        return local_candidates

    final_matched = []
    for idx, paper in enumerate(local_candidates, 1):
        if is_relevant_by_api(paper["title"], paper["summary"], api_key):
            final_matched.append({
                "title": paper["title"],
                "published": paper["published"],
                "link": paper["link"],
                "categories": paper["categories"],
            })
        else:
            # Not relevant (or the API call failed): report the exclusion.
            print(f"[DEBUG][API] => '0' => exclude paper #{idx}, title='{paper['title'][:60]}...'")

    print(f"[DEBUG] final_matched = {len(final_matched)} after API check")
    return final_matched


def update_readme_in_repo(papers, token, repo_name, branch="main"):
    """Append a dated section listing ``papers`` to the repository README.

    Args:
        papers: Dicts with ``title``, ``published``, ``link`` and
            ``categories`` keys. Nothing is committed when this is empty.
        token: GitHub access token passed to ``Github``.
        repo_name: Repository identifier passed to ``Github.get_repo``.
        branch: Branch whose README.md is read and updated.
    """
    if not papers:
        print("[INFO] No matched papers, skip README update.")
        return

    repo = Github(token).get_repo(repo_name)
    readme_file = repo.get_contents("README.md", ref=branch)
    old_content = readme_file.decoded_content.decode("utf-8")

    now_utc_str = datetime.datetime.now(datetime.timezone.utc).strftime(
        "%Y-%m-%d %H:%M UTC"
    )
    parts = [f"\n\n### Auto-captured papers on {now_utc_str}\n"]
    for p in papers:
        cat_str = ", ".join(p["categories"])
        parts.append(f"- **{p['title']}** (Published={p['published']}) \n")
        # Two-space indent so Markdown nests these under the title bullet;
        # with a single space they render as sibling list items.
        parts.append(f"  - Categories: {cat_str} \n")
        parts.append(f"  - Link: {p['link']}\n\n")

    repo.update_file(
        path="README.md",
        message=f"Auto update README with {len(papers)} new papers",
        content=old_content + "".join(parts),
        sha=readme_file.sha,
        branch=branch,
    )
    print(f"[INFO] README updated with {len(papers)} papers.")


def main():
    """Run the pipeline for the last 5 days and update the target README."""
    # Look-back window in days; adjust as needed.
    days = 5
    papers = fetch_papers_combined(days=days)
    print(
        f"\n[RESULT] matched {len(papers)} papers total after double filter. "
        "Now update README if not empty..."
    )

    github_token = os.getenv("TARGET_REPO_TOKEN")
    target_repo_name = os.getenv("TARGET_REPO_NAME")
    if not github_token or not target_repo_name:
        print("[ERROR] Missing environment variables: TARGET_REPO_TOKEN / TARGET_REPO_NAME.")
        return

    if papers:
        update_readme_in_repo(papers, github_token, target_repo_name)
    else:
        print("[INFO] No matched papers, done without update.")


if __name__ == "__main__":
    main()