import datetime
import os

import feedparser
import requests
from github import Github
from openai import OpenAI

# arXiv categories considered in scope for the tracker.
ALLOWED_CATEGORIES = [
    "cs.AI", "cs.CL", "cs.CV", "cs.LG", "cs.NE", "cs.RO", "cs.IR", "stat.ML"
]

# Instructs the model to act as a binary classifier: "1" for papers about
# LLMs/generative text models AND bias/fairness, "0" otherwise.
SYSTEM_PROMPT = (
    "You are a helpful assistant. The user will give you a paper title and abstract. "
    "Your task: Decide if this paper is about large language models (or generative text models) "
    "AND about bias/fairness. "
    "If yes, respond with just a single character: 1. Otherwise, respond with a single character: 0. "
    "No extra explanation, no punctuation—only the number."
)


def advanced_filter(entry):
    """Cheap keyword pre-filter: require a bias/fairness term plus a model
    term, and reject papers that look like they belong to other fields."""
    title = getattr(entry, "title", "").lower()
    summary = getattr(entry, "summary", "").lower()
    full_text = title + " " + summary

    general_terms = ["bias", "fairness"]
    model_terms = ["llm", "language model", "transformer", "gpt", "nlp",
                   "pretrained", "embedding", "generation", "alignment", "ai"]
    negative_terms = ["estimation", "variance", "quantum", "physics",
                      "sensor", "circuit", "electronics", "hardware"]

    has_general = any(term in full_text for term in general_terms)
    has_model = any(term in full_text for term in model_terms)
    has_negative = any(term in full_text for term in negative_terms)
    return has_general and has_model and not has_negative


def is_relevant_by_api(title, summary, client, model="gpt-4-turbo"):
    """Second-stage filter: ask the OpenAI model for a 1/0 relevance verdict."""
    prompt = f"Title: {title}\nAbstract: {summary}"
    try:
        dialogue = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": prompt},
            ],
            temperature=0.0,
            max_tokens=1,  # the verdict is a single character
        )
        response_msg = dialogue.choices[0].message.content.strip()
        print(f"[DEBUG][API] OpenAI response='{response_msg}' for paper '{title[:60]}...'")
        return response_msg == "1"
    except Exception as e:
        print("[ERROR][API] calling OpenAI API:", e)
        return False  # fail closed: treat API errors as "not relevant"


def fetch_papers_combined(days=1):
    """Fetch recent arXiv submissions, then apply the keyword filter and,
    if an API key is available, the OpenAI filter."""
    now_utc = datetime.datetime.now(datetime.timezone.utc)
    start_utc = now_utc - datetime.timedelta(days=days)
    start_str = start_utc.strftime("%Y%m%d%H%M")
    end_str = now_utc.strftime("%Y%m%d%H%M")
    search_query = f"submittedDate:[{start_str} TO {end_str}]"

    base_url = "http://export.arxiv.org/api/query"
    step = 100  # arXiv API page size
    start = 0
    all_entries = []

    # Page through the results until an empty batch or the hard cap.
    while True:
        params = {
            "search_query": search_query,
            "sortBy": "submittedDate",
            "sortOrder": "descending",
            "start": start,
            "max_results": step,
        }
        print(f"[DEBUG] fetching arXiv entries: {start} to {start + step}")
        try:
            resp = requests.get(base_url, params=params, timeout=30)
            if resp.status_code != 200:
                print(f"[ERROR] HTTP Status Code: {resp.status_code}")
                break
            feed = feedparser.parse(resp.content)
            batch = feed.entries
            print(f"[DEBUG] fetched batch size: {len(batch)}")
            if not batch:
                break
            all_entries.extend(batch)
            start += step
            if start >= 3000:
                print("[DEBUG] Reached 3000 entries limit, stopping.")
                break
        except Exception as e:
            print("[ERROR] Exception during fetching from arXiv:", e)
            break

    print(f"[DEBUG] total fetched papers: {len(all_entries)}")

    # First filter: category whitelist plus keyword heuristics.
    local_candidates = []
    for e in all_entries:
        categories = [t.term for t in e.tags] if hasattr(e, "tags") else []
        if not any(cat in ALLOWED_CATEGORIES for cat in categories):
            continue
        if advanced_filter(e):
            local_candidates.append({
                "title": e.title,
                "summary": e.summary,
                "published": e.published,
                "link": e.link,
                "categories": categories,
            })
    print(f"[DEBUG] candidates after local filter: {len(local_candidates)}")

    # Second filter: OpenAI classification (skipped when no key is set).
    openai_api_key = os.getenv("OPENAI_API_KEY")
    if not openai_api_key:
        print("[WARNING] No OPENAI_API_KEY found. Skip second filter.")
        return local_candidates

    client = OpenAI(api_key=openai_api_key)
    final_matched = []
    for idx, paper in enumerate(local_candidates, 1):
        if is_relevant_by_api(paper["title"], paper["summary"], client):
            final_matched.append(paper)
        else:
            print(f"[DEBUG][API] Excluded paper #{idx}: {paper['title'][:60]}...")
    print(f"[DEBUG] final matched papers after OpenAI filter: {len(final_matched)}")
    return final_matched


def update_readme_in_repo(papers, token, repo_name):
    """Append a dated section listing the matched papers to the target repo's README."""
    if not papers:
        print("[INFO] No matched papers, skip README update.")
        return

    g = Github(token)
    repo = g.get_repo(repo_name)
    readme_file = repo.get_contents("README.md", ref="main")
    old_content = readme_file.decoded_content.decode("utf-8")

    now_utc_str = datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
    new_section = f"\n\n### Auto-captured papers on {now_utc_str}\n"
    for p in papers:
        cat_str = ", ".join(p["categories"])
        new_section += f"- **{p['title']}** (Published={p['published']}) \n"
        new_section += f"  - Categories: {cat_str} \n"
        new_section += f"  - Link: {p['link']}\n\n"

    updated_content = old_content + new_section
    commit_msg = f"Auto update README with {len(papers)} new papers"
    repo.update_file(
        path="README.md",
        message=commit_msg,
        content=updated_content,
        sha=readme_file.sha,
        branch="main",
    )
    print(f"[INFO] README updated with {len(papers)} papers.")


def main():
    days = 1
    print(f"[DEBUG] Starting fetch_papers_combined with days={days}")
    papers = fetch_papers_combined(days=days)
    print(f"[DEBUG] After fetch_papers_combined: {len(papers)} papers matched.")

    github_token = os.getenv("TARGET_REPO_TOKEN")
    target_repo_name = os.getenv("TARGET_REPO_NAME")
    print(f"[DEBUG] Github Token Set: {'Yes' if github_token else 'No'}")
    print(f"[DEBUG] Target Repo Name: {target_repo_name}")

    if github_token and target_repo_name and papers:
        update_readme_in_repo(papers, github_token, target_repo_name)
    else:
        print("[INFO] Skipped README update due to missing credentials or no papers matched.")


if __name__ == "__main__":
    main()