summary | refs | log | tree | commit | diff
path: root/scripts/fetch_papers.py
diff options
context:
space:
mode:
author: Yuren Hao <97327730+YurenHao0426@users.noreply.github.com> 2025-04-13 22:00:30 -0700
committer: GitHub <noreply@github.com> 2025-04-13 22:00:30 -0700
commit: 54a3aa1f3b45b2f7a7033332e1a1b9fe15a6f5fa (patch)
tree: 9f6496b0cdd13660b46b8674318ca6eb5a23f9c3 /scripts/fetch_papers.py
parent: 5dc7cc9432d6950bae69a4e75137a9858ad327c2 (diff)
Update fetch_papers.py
Diffstat (limited to 'scripts/fetch_papers.py')
-rw-r--r-- scripts/fetch_papers.py 18
1 files changed, 12 insertions, 6 deletions
diff --git a/scripts/fetch_papers.py b/scripts/fetch_papers.py
index 2f45e7a..c91f95d 100644
--- a/scripts/fetch_papers.py
+++ b/scripts/fetch_papers.py
@@ -56,9 +56,6 @@ def is_relevant_by_api(title, summary, client, model="gpt-4-turbo"):
def fetch_papers_combined(days=1):
now_utc = datetime.datetime.now(datetime.timezone.utc)
start_utc = now_utc - datetime.timedelta(days=days)
- start_str = start_utc.strftime("%Y%m%d%H%M")
- end_str = now_utc.strftime("%Y%m%d%H%M")
- search_query = f"submittedDate:[{start_str} TO {end_str}]"
base_url = "http://export.arxiv.org/api/query"
step = 100
@@ -67,7 +64,7 @@ def fetch_papers_combined(days=1):
while True:
params = {
- "search_query": search_query,
+ "search_query": "cat:cs.* OR cat:stat.ML",
"sortBy": "submittedDate",
"sortOrder": "descending",
"start": start,
@@ -82,12 +79,20 @@ def fetch_papers_combined(days=1):
if not batch:
break
- all_entries.extend(batch)
+ # 本地过滤日期
+ for e in batch:
+ published_dt = datetime.datetime.strptime(e.published, "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=datetime.timezone.utc)
+ if published_dt < start_utc:
+ continue # 超出日期范围
+ all_entries.append(e)
+
+ if len(batch) < step:
+ break # 已经抓取到底了
start += step
if start >= 3000:
break
- print(f"[DEBUG] arXiv returned total {len(all_entries)} papers from initial fetch.")
+ print(f"[DEBUG] arXiv returned total {len(all_entries)} papers after filtering by published date.")
local_candidates = [
{
@@ -119,6 +124,7 @@ def fetch_papers_combined(days=1):
return final_matched
+
def update_readme_in_repo(papers, token, repo_name):
if not papers:
return