Diffstat (limited to 'scripts/fetch_papers.py')
| -rw-r--r-- | scripts/fetch_papers.py | 91 |
1 file changed, 48 insertions, 43 deletions
diff --git a/scripts/fetch_papers.py b/scripts/fetch_papers.py
index 00da4f2..7de3f12 100644
--- a/scripts/fetch_papers.py
+++ b/scripts/fetch_papers.py
@@ -54,76 +54,81 @@ def is_relevant_by_api(title, summary, client, model="gpt-4-turbo"):
     return False
 
 def fetch_papers_combined(days=1):
-    now_utc = datetime.datetime.now(datetime.timezone.utc)
-    start_utc = now_utc - datetime.timedelta(days=days)
+    import datetime
+    import requests
+    import feedparser
 
-    start_str = start_utc.strftime("%Y%m%d%H%M")
-    end_str = now_utc.strftime("%Y%m%d%H%M")
+    now_utc = datetime.datetime.now(datetime.timezone.utc)
+    cutoff_utc = now_utc - datetime.timedelta(days=days)
 
-    search_query = f"submittedDate:[{start_str} TO {end_str}]"
-    base_url = "http://export.arxiv.org/api/query"
+    # 1. Build an OR-joined category filter:
+    cat_query = " OR ".join(f"cat:{c}" for c in ALLOWED_CATEGORIES)
+    # If you really want *no* category filtering, just set: cat_query = "all:*"
 
-    step = 100
-    start = 0
+    base_url = "http://export.arxiv.org/api/query"
+    step = 100
+    start = 0
     all_entries = []
 
     while True:
         params = {
-            "search_query": search_query,
-            "sortBy": "submittedDate",
-            "sortOrder": "descending",
-            "start": start,
-            "max_results": step
+            "search_query": cat_query,
+            "sortBy": "submittedDate",
+            "sortOrder": "descending",
+            "start": start,
+            "max_results": step
         }
         print(f"[DEBUG] fetching arXiv entries: {start} to {start+step}")
+        resp = requests.get(base_url, params=params, timeout=30)
+        resp.raise_for_status()
+        feed = feedparser.parse(resp.content)
+        batch = feed.entries
+        print(f"[DEBUG] fetched batch size: {len(batch)}")
 
-        try:
-            resp = requests.get(base_url, params=params, timeout=30)
-            if resp.status_code != 200:
-                print(f"[ERROR] HTTP Status Code: {resp.status_code}")
-                break
-            feed = feedparser.parse(resp.content)
-            batch = feed.entries
-            print(f"[DEBUG] fetched batch size: {len(batch)}")
-
-            if not batch:
-                break
-
-            all_entries.extend(batch)
-            start += step
+        if not batch:
+            break
 
-            if start >= 3000:
-                print("[DEBUG] Reached 3000 entries limit, stopping.")
+        # 2. Filter by published date >= cutoff
+        for entry in batch:
+            published = datetime.datetime.fromisoformat(entry.published)
+            if published >= cutoff_utc:
+                all_entries.append(entry)
+            else:
+                # since sorted descending, once we hit older papers we can stop entirely
+                start = None
                 break
-        except Exception as e:
-            print("[ERROR] Exception during fetching from arXiv:", e)
+
+        if start is None or len(batch) < step:
             break
 
-    print(f"[DEBUG] total fetched papers from arXiv: {len(all_entries)}")
+        start += step
 
+    print(f"[DEBUG] total fetched papers from arXiv in last {days} day(s): {len(all_entries)}")
+
+    # …then proceed with OpenAI filtering exactly as before…
     openai_api_key = os.getenv("OPENAI_API_KEY")
     if not openai_api_key:
         print("[ERROR] OPENAI_API_KEY missing, aborting.")
         return []
 
     client = OpenAI(api_key=openai_api_key)
-
     final_matched = []
-    for idx, entry in enumerate(all_entries, 1):
-        title = entry.title
-        summary = entry.summary
-        categories = [t.term for t in entry.tags] if hasattr(entry, 'tags') else []
+    for idx, entry in enumerate(all_entries, 1):
+        title = entry.title
+        summary = entry.summary
+        # if you *really* want to disable *all* filtering aside from the LLM check,
+        # you can comment out the category check below:
+        categories = [t.term for t in getattr(entry, 'tags', [])]
         if not any(cat in ALLOWED_CATEGORIES for cat in categories):
-            continue  # keep the category filter: more precise and efficient (can also be removed)
+            continue
 
         if is_relevant_by_api(title, summary, client):
             final_matched.append({
-                "title": title,
-                "summary": summary,
-                "published": entry.published,
-                "link": entry.link,
+                "title": title,
+                "summary": summary,
+                "published": entry.published,
+                "link": entry.link,
                 "categories": categories
             })
             print(f"[DEBUG][API] Included #{idx}: {title[:60]}...")
@@ -131,10 +136,10 @@ def fetch_papers_combined(days=1):
             print(f"[DEBUG][API] Excluded #{idx}: {title[:60]}...")
 
     print(f"[DEBUG] final matched papers after OpenAI filter: {len(final_matched)}")
-    return final_matched
+    return final_matched
 
 def update_readme_in_repo(papers, token, repo_name):
     if not papers:
         print("[INFO] No matched papers, skip README update.")
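A note on the new fetch loop: `datetime.datetime.fromisoformat(entry.published)` only accepts the trailing `Z` in arXiv's Atom timestamps on Python 3.11+; older interpreters raise `ValueError`. The sketch below shows the same pattern the diff introduces (OR-joined category query, descending sort, client-side date cutoff with early stop), with the timestamp normalized first. `ARXIV_API`, the two-entry category list, and `parse_arxiv_timestamp` are illustrative stand-ins, not identifiers from this script.

```python
import datetime

import feedparser
import requests

ARXIV_API = "http://export.arxiv.org/api/query"
ALLOWED_CATEGORIES = ["cs.CL", "cs.LG"]  # placeholder; the real list lives elsewhere in the script


def parse_arxiv_timestamp(value):
    # arXiv emits e.g. '2024-05-01T12:34:56Z'; normalize the 'Z' so
    # fromisoformat() also works on Python < 3.11.
    return datetime.datetime.fromisoformat(value.replace("Z", "+00:00"))


def fetch_recent(days=1, step=100):
    cutoff = datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(days=days)
    query = " OR ".join(f"cat:{c}" for c in ALLOWED_CATEGORIES)
    entries, start = [], 0
    while True:
        resp = requests.get(ARXIV_API, params={
            "search_query": query,
            "sortBy": "submittedDate",
            "sortOrder": "descending",
            "start": start,
            "max_results": step,
        }, timeout=30)
        resp.raise_for_status()
        batch = feedparser.parse(resp.content).entries
        if not batch:
            break
        for entry in batch:
            if parse_arxiv_timestamp(entry.published) < cutoff:
                # Sorted newest-first: the first entry older than the cutoff
                # means every remaining page is older too, so stop here.
                return entries
            entries.append(entry)
        if len(batch) < step:
            break
        start += step
    return entries
```

The early return leans on `sortOrder=descending`: once one entry predates the cutoff, no later page can contain a newer paper, so no further requests are needed.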

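The hunk context shows only the signature and the `return False` fallback of `is_relevant_by_api`; the prompt and relevance criteria live outside this diff. Below is a hedged sketch of one plausible shape for that check, written against the OpenAI v1 chat-completions client that the surrounding code already constructs with `OpenAI(api_key=...)`; the prompt wording is an assumption.

```python
from openai import OpenAI


def is_relevant_by_api(title, summary, client, model="gpt-4-turbo"):
    # Ask the model for a strict one-word verdict on a single paper.
    # The prompt below is illustrative; the script's actual prompt is not shown in this diff.
    prompt = (
        "Decide whether the following arXiv paper is relevant to my research "
        "interests. Answer with exactly one word: yes or no.\n\n"
        f"Title: {title}\nAbstract: {summary}"
    )
    try:
        resp = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            temperature=0,
            max_tokens=3,
        )
        return resp.choices[0].message.content.strip().lower().startswith("yes")
    except Exception as exc:
        # On any API failure, exclude the paper, matching the
        # `return False` fallback visible in the hunk context.
        print("[ERROR] OpenAI call failed:", exc)
        return False
```

`temperature=0` and a small `max_tokens` keep the verdict deterministic and cheap across hundreds of abstracts per run.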