diff options
| author | Yuren Hao <97327730+YurenHao0426@users.noreply.github.com> | 2025-04-13 22:14:51 -0700 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2025-04-13 22:14:51 -0700 |
| commit | afd449584d56e62e2658fc2afba6020edaed83a5 (patch) | |
| tree | 25fab83a8d2e6b9837d99cf8772d72a3213ff09a | |
| parent | 42820c9c646c499f95aabf9a2b8e3de8d293ee2d (diff) | |
Update fetch_papers.py
| -rw-r--r-- | scripts/fetch_papers.py | 78 |
1 files changed, 16 insertions, 62 deletions
diff --git a/scripts/fetch_papers.py b/scripts/fetch_papers.py index c592805..2c7d177 100644 --- a/scripts/fetch_papers.py +++ b/scripts/fetch_papers.py @@ -56,6 +56,10 @@ def is_relevant_by_api(title, summary, client, model="gpt-4-turbo"): def fetch_papers_combined(days=1): now_utc = datetime.datetime.now(datetime.timezone.utc) start_utc = now_utc - datetime.timedelta(days=days) + start_str = start_utc.strftime("%Y%m%d%H%M") + end_str = now_utc.strftime("%Y%m%d%H%M") + + search_query = f"submittedDate:[{start_str} TO {end_str}]" base_url = "http://export.arxiv.org/api/query" step = 100 @@ -64,48 +68,24 @@ def fetch_papers_combined(days=1): while True: params = { - "search_query": "cat:cs.* OR cat:stat.ML", + "search_query": search_query, "sortBy": "submittedDate", "sortOrder": "descending", "start": start, "max_results": step } - print(f"[DEBUG] Fetching batch {start} to {start+step}") - try: - resp = requests.get(base_url, params=params, timeout=30) - if resp.status_code != 200: - print(f"[ERROR] HTTP Status Code: {resp.status_code}") - break - feed = feedparser.parse(resp.content) - batch = feed.entries - if not batch: - print("[DEBUG] No entries returned, stopping fetch.") - break - - for e in batch: - published_dt = datetime.datetime.strptime( - e.published, "%Y-%m-%dT%H:%M:%SZ" - ).replace(tzinfo=datetime.timezone.utc) - - if published_dt >= start_utc: - all_entries.append(e) - else: - print("[DEBUG] Reached older entries beyond date range.") - break # 超出范围,停止继续获取 - - if published_dt < start_utc: - break # 日期已经超过,完全停止外层循环 - - start += step - if start >= 3000: - print("[DEBUG] Reached max limit (3000), stopping fetch.") - break - - except Exception as e: - print(f"[ERROR] Exception during fetching: {e}") + resp = requests.get(base_url, params=params, timeout=30) + if resp.status_code != 200: + break + feed = feedparser.parse(resp.content) + batch = feed.entries + if not batch: break - print(f"[DEBUG] Total papers fetched: {len(all_entries)} after date filtering.") + all_entries.extend(batch) + start += step + if start >= 3000: + break local_candidates = [ { @@ -119,28 +99,16 @@ def fetch_papers_combined(days=1): if any(cat in ALLOWED_CATEGORIES for cat in [t.term for t in e.tags]) and advanced_filter(e) ] - print(f"[DEBUG] Number of papers after local filtering: {len(local_candidates)}") - openai_api_key = os.getenv("OPENAI_API_KEY") if not openai_api_key: print("[WARNING] No OPENAI_API_KEY found. Skip second filter.") return local_candidates client = OpenAI(api_key=openai_api_key) - final_matched = [] - for idx, p in enumerate(local_candidates, 1): - if is_relevant_by_api(p["title"], p["summary"], client): - final_matched.append(p) - else: - print(f"[DEBUG][API] Paper #{idx} excluded by API.") - - print(f"[DEBUG] Number of papers after OpenAI API filtering: {len(final_matched)}") + final_matched = [p for p in local_candidates if is_relevant_by_api(p["title"], p["summary"], client)] return final_matched - - - def update_readme_in_repo(papers, token, repo_name): if not papers: return @@ -172,26 +140,12 @@ def update_readme_in_repo(papers, token, repo_name): def main(): days = 1 - print(f"[DEBUG] Starting fetch_papers_combined with days={days}") papers = fetch_papers_combined(days=days) - print(f"[DEBUG] After fetch_papers_combined: {len(papers)} papers matched.") - if not papers: - print("[DEBUG] No papers matched after both local and API filters.") - github_token = os.getenv("TARGET_REPO_TOKEN") target_repo_name = os.getenv("TARGET_REPO_NAME") - - print(f"[DEBUG] Github Token Set: {'Yes' if github_token else 'No'}") - print(f"[DEBUG] Target Repo Name: {target_repo_name if target_repo_name else 'Not Set'}") - if github_token and target_repo_name and papers: - print("[DEBUG] Proceeding to update README in repo...") update_readme_in_repo(papers, github_token, target_repo_name) - print("[DEBUG] README update completed.") - else: - print("[INFO] Skipped README update due to missing credentials or no papers matched.") - if __name__ == "__main__": main() |
