From 42820c9c646c499f95aabf9a2b8e3de8d293ee2d Mon Sep 17 00:00:00 2001
From: Yuren Hao <97327730+YurenHao0426@users.noreply.github.com>
Date: Sun, 13 Apr 2025 22:04:40 -0700
Subject: Update fetch_papers.py

---
 scripts/fetch_papers.py | 54 ++++++++++++++++++++++++++++++++-----------------
 1 file changed, 35 insertions(+), 19 deletions(-)

diff --git a/scripts/fetch_papers.py b/scripts/fetch_papers.py
index c91f95d..c592805 100644
--- a/scripts/fetch_papers.py
+++ b/scripts/fetch_papers.py
@@ -70,29 +70,42 @@ def fetch_papers_combined(days=1):
             "start": start,
             "max_results": step
         }
-        resp = requests.get(base_url, params=params, timeout=30)
-        if resp.status_code != 200:
-            print(f"[ERROR] Failed fetching from arXiv. Status code: {resp.status_code}")
-            break
-        feed = feedparser.parse(resp.content)
-        batch = feed.entries
-        if not batch:
-            break
+        print(f"[DEBUG] Fetching batch {start} to {start+step}")
+        try:
+            resp = requests.get(base_url, params=params, timeout=30)
+            if resp.status_code != 200:
+                print(f"[ERROR] HTTP Status Code: {resp.status_code}")
+                break
+            feed = feedparser.parse(resp.content)
+            batch = feed.entries
+            if not batch:
+                print("[DEBUG] No entries returned, stopping fetch.")
+                break
+
+            for e in batch:
+                published_dt = datetime.datetime.strptime(
+                    e.published, "%Y-%m-%dT%H:%M:%SZ"
+                ).replace(tzinfo=datetime.timezone.utc)
+
+                if published_dt >= start_utc:
+                    all_entries.append(e)
+                else:
+                    print("[DEBUG] Reached older entries beyond date range.")
+                    break  # Outside the date range; stop fetching further
 
-        # Filter by published date locally
-        for e in batch:
-            published_dt = datetime.datetime.strptime(e.published, "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=datetime.timezone.utc)
             if published_dt < start_utc:
-                continue  # Outside the date range
-            all_entries.append(e)
+                break  # Date range exceeded; stop the outer loop entirely
 
-        if len(batch) < step:
-            break  # Reached the end of the results
-        start += step
-        if start >= 3000:
+            start += step
+            if start >= 3000:
+                print("[DEBUG] Reached max limit (3000), stopping fetch.")
+                break
+
+        except Exception as e:
+            print(f"[ERROR] Exception during fetching: {e}")
             break
 
-    print(f"[DEBUG] arXiv returned total {len(all_entries)} papers after filtering by published date.")
+    print(f"[DEBUG] Total papers fetched: {len(all_entries)} after date filtering.")
 
     local_candidates = [
         {
@@ -115,9 +128,11 @@ def fetch_papers_combined(days=1):
     client = OpenAI(api_key=openai_api_key)
 
     final_matched = []
-    for p in local_candidates:
+    for idx, p in enumerate(local_candidates, 1):
         if is_relevant_by_api(p["title"], p["summary"], client):
             final_matched.append(p)
+        else:
+            print(f"[DEBUG][API] Paper #{idx} excluded by API.")
 
     print(f"[DEBUG] Number of papers after OpenAI API filtering: {len(final_matched)}")
 
@@ -125,6 +140,7 @@ def fetch_papers_combined(days=1):
 
+
 def update_readme_in_repo(papers, token, repo_name):
     if not papers:
         return
-- 
cgit v1.2.3