diff options
Diffstat (limited to 'scripts')
| -rw-r--r-- | scripts/fetch_papers.py | 18 |
1 file changed, 12 insertions, 6 deletions
diff --git a/scripts/fetch_papers.py b/scripts/fetch_papers.py
index 2f45e7a..c91f95d 100644
--- a/scripts/fetch_papers.py
+++ b/scripts/fetch_papers.py
@@ -56,9 +56,6 @@ def is_relevant_by_api(title, summary, client, model="gpt-4-turbo"):
 def fetch_papers_combined(days=1):
     now_utc = datetime.datetime.now(datetime.timezone.utc)
     start_utc = now_utc - datetime.timedelta(days=days)
-    start_str = start_utc.strftime("%Y%m%d%H%M")
-    end_str = now_utc.strftime("%Y%m%d%H%M")
-    search_query = f"submittedDate:[{start_str} TO {end_str}]"
 
     base_url = "http://export.arxiv.org/api/query"
     step = 100
@@ -67,7 +64,7 @@ def fetch_papers_combined(days=1):
 
     while True:
         params = {
-            "search_query": search_query,
+            "search_query": "cat:cs.* OR cat:stat.ML",
             "sortBy": "submittedDate",
             "sortOrder": "descending",
             "start": start,
@@ -82,12 +79,20 @@ def fetch_papers_combined(days=1):
         if not batch:
             break
 
-        all_entries.extend(batch)
+        # 本地过滤日期
+        for e in batch:
+            published_dt = datetime.datetime.strptime(e.published, "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=datetime.timezone.utc)
+            if published_dt < start_utc:
+                continue # 超出日期范围
+            all_entries.append(e)
+
+        if len(batch) < step:
+            break # 已经抓取到底了
         start += step
         if start >= 3000:
             break
 
-    print(f"[DEBUG] arXiv returned total {len(all_entries)} papers from initial fetch.")
+    print(f"[DEBUG] arXiv returned total {len(all_entries)} papers after filtering by published date.")
 
     local_candidates = [
         {
@@ -119,6 +124,7 @@ def fetch_papers_combined(days=1):
 
     return final_matched
 
+
 def update_readme_in_repo(papers, token, repo_name):
     if not papers:
         return
