import os import requests import feedparser import datetime from github import Github from openai import OpenAI ALLOWED_CATEGORIES = [ "cs.AI", "cs.CL", "cs.CV", "cs.LG", "cs.NE", "cs.RO", "cs.IR", "stat.ML" ] SYSTEM_PROMPT = ( "You are a helpful assistant. The user will give you a paper title and abstract. " "Your task: Decide if this paper is about large language models (or generative text models) AND about bias/fairness. " "If yes, respond with just a single character: 1. Otherwise, respond with a single character: 0. " "No extra explanation, no punctuation—only the number." ) def advanced_filter(entry): title = getattr(entry, 'title', '').lower() summary = getattr(entry, 'summary', '').lower() full_text = title + " " + summary general_terms = ["bias", "fairness"] model_terms = ["llm", "language model", "transformer", "gpt", "nlp", "pretrained", "embedding", "generation", "alignment", "ai"] negative_terms = ["estimation", "variance", "quantum", "physics", "sensor", "circuit", "electronics", "hardware"] has_general = any(term in full_text for term in general_terms) has_model = any(term in full_text for term in model_terms) has_negative = any(term in full_text for term in negative_terms) return (has_general and has_model) and (not has_negative) def is_relevant_by_api(title, summary, client, model="gpt-4-turbo"): prompt = f"Title: {title}\nAbstract: {summary}" try: dialogue = client.chat.completions.create( model=model, messages=[ {"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": prompt} ], temperature=0.0, max_tokens=1 ) response_msg = dialogue.choices[0].message.content.strip() print(f"[DEBUG][API] return message='{response_msg}' for paper title='{title[:60]}...'") return response_msg == "1" except Exception as e: print("[ERROR][API] calling OpenAI API:", e) return False def fetch_papers_combined(days=1): now_utc = datetime.datetime.now(datetime.timezone.utc) start_utc = now_utc - datetime.timedelta(days=days) base_url = "http://export.arxiv.org/api/query" step = 100 start = 0 all_entries = [] while True: params = { "search_query": "cat:cs.* OR cat:stat.ML", "sortBy": "submittedDate", "sortOrder": "descending", "start": start, "max_results": step } resp = requests.get(base_url, params=params, timeout=30) if resp.status_code != 200: print(f"[ERROR] Failed fetching from arXiv. Status code: {resp.status_code}") break feed = feedparser.parse(resp.content) batch = feed.entries if not batch: break # 本地过滤日期 for e in batch: published_dt = datetime.datetime.strptime(e.published, "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=datetime.timezone.utc) if published_dt < start_utc: continue # 超出日期范围 all_entries.append(e) if len(batch) < step: break # 已经抓取到底了 start += step if start >= 3000: break print(f"[DEBUG] arXiv returned total {len(all_entries)} papers after filtering by published date.") local_candidates = [ { "title": e.title, "summary": e.summary, "published": e.published, "link": e.link, "categories": [t.term for t in e.tags] } for e in all_entries if any(cat in ALLOWED_CATEGORIES for cat in [t.term for t in e.tags]) and advanced_filter(e) ] print(f"[DEBUG] Number of papers after local filtering: {len(local_candidates)}") openai_api_key = os.getenv("OPENAI_API_KEY") if not openai_api_key: print("[WARNING] No OPENAI_API_KEY found. Skip second filter.") return local_candidates client = OpenAI(api_key=openai_api_key) final_matched = [] for p in local_candidates: if is_relevant_by_api(p["title"], p["summary"], client): final_matched.append(p) print(f"[DEBUG] Number of papers after OpenAI API filtering: {len(final_matched)}") return final_matched def update_readme_in_repo(papers, token, repo_name): if not papers: return g = Github(token) repo = g.get_repo(repo_name) readme_file = repo.get_contents("README.md", ref="main") old_content = readme_file.decoded_content.decode("utf-8") now_utc_str = datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%d %H:%M UTC") new_section = f"\n\n### Auto-captured papers on {now_utc_str}\n" for p in papers: cat_str = ", ".join(p["categories"]) new_section += f"- **{p['title']}** (Published={p['published']}) \n" new_section += f" - Categories: {cat_str} \n" new_section += f" - Link: {p['link']}\n\n" updated_content = old_content + new_section commit_msg = f"Auto update README with {len(papers)} new papers" repo.update_file( path="README.md", message=commit_msg, content=updated_content, sha=readme_file.sha, branch="main" ) def main(): days = 1 print(f"[DEBUG] Starting fetch_papers_combined with days={days}") papers = fetch_papers_combined(days=days) print(f"[DEBUG] After fetch_papers_combined: {len(papers)} papers matched.") if not papers: print("[DEBUG] No papers matched after both local and API filters.") github_token = os.getenv("TARGET_REPO_TOKEN") target_repo_name = os.getenv("TARGET_REPO_NAME") print(f"[DEBUG] Github Token Set: {'Yes' if github_token else 'No'}") print(f"[DEBUG] Target Repo Name: {target_repo_name if target_repo_name else 'Not Set'}") if github_token and target_repo_name and papers: print("[DEBUG] Proceeding to update README in repo...") update_readme_in_repo(papers, github_token, target_repo_name) print("[DEBUG] README update completed.") else: print("[INFO] Skipped README update due to missing credentials or no papers matched.") if __name__ == "__main__": main()