import requests
import feedparser


def fetch_arxiv_full_range():
    """
    Query the arXiv API without restricting category or keywords, using only a
    broad submittedDate window.

    Results are fetched in batches of 100 until no new entries come back or a
    safety cap is reached. The loop also leaves an obvious place for an early
    exit once published dates fall outside the window (one possible check is
    sketched below).
    """
    base_url = "http://export.arxiv.org/api/query"

    # Loose date range [202503250000 TO 202504020000];
    # make it broader or more precise as needed.
    start_date_str = "202503250000"
    end_date_str = "202504020000"
    search_query = f"submittedDate:[{start_date_str} TO {end_date_str}]"

    # Fetch in batches.
    step = 100
    start = 0
    all_entries = []

    while True:
        params = {
            "search_query": search_query,
            "sortBy": "submittedDate",
            "sortOrder": "descending",
            "start": start,
            "max_results": step,
        }
        print(f"[DEBUG] Fetching index {start} to {start + step}, "
              f"date range = {start_date_str} ~ {end_date_str}")

        resp = requests.get(base_url, params=params)
        if resp.status_code != 200:
            print("[ERROR] HTTP status:", resp.status_code)
            break

        feed = feedparser.parse(resp.content)
        entries = feed.entries
        got_count = len(entries)
        print(f"[DEBUG] Got {got_count} entries this batch.")

        if got_count == 0:
            # No more data.
            break

        # Add this batch to the running list.
        all_entries.extend(entries)

        # Move on to the next batch.
        start += step

        # Safety cap to avoid an endless loop or an extremely large download.
        if start >= 3000:
            # 3000 is just an example.
            print("[DEBUG] Over 3000 entries, stopping to avoid an extremely large dataset.")
            break

    print("[DEBUG] Total retrieved:", len(all_entries))

    # all_entries now holds everything we fetched.
    # Check whether "Bias-Aware Agent..." is in there, or do further processing.
    found = False
    for idx, e in enumerate(all_entries, 1):
        title_lower = e.title.lower()
        if "bias-aware agent" in title_lower:
            found = True
            print(f"\n[FOUND] Index={idx}, Title={e.title}, "
                  f"published={e.published}, updated={e.updated}")
            break

    if not found:
        print("\n[INFO] 'Bias-Aware Agent...' not found in the entire set.")


if __name__ == "__main__":
    fetch_arxiv_full_range()
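
The docstring mentions exiting early once published dates fall outside the window, but the loop above never performs that check, and it also fires requests back-to-back. Below is a minimal sketch of both refinements, assuming feedparser's published_parsed field (a time.struct_time) and the YYYYMMDDHHMM format used for start_date_str; the helper name oldest_entry_before_window is my own, not an arXiv or feedparser API.

import time
import datetime

def oldest_entry_before_window(entries, start_date_str):
    """Return True once the oldest entry in a batch predates the window start.

    Assumes results are sorted by submittedDate descending, so the last entry
    of the batch is the oldest one. Name and signature are illustrative only.
    """
    window_start = datetime.datetime.strptime(start_date_str, "%Y%m%d%H%M")
    oldest_published = datetime.datetime(*entries[-1].published_parsed[:6])
    return oldest_published < window_start

# Inside the while loop, after all_entries.extend(entries):
#
#     if oldest_entry_before_window(entries, start_date_str):
#         print("[DEBUG] Oldest entry predates the window start, stopping early.")
#         break
#
#     time.sleep(3)  # arXiv asks API clients to space out requests; ~3 s is the commonly cited interval
#     start += step

With the submittedDate filter already in the query, the early-exit check is largely a belt-and-braces measure, but it becomes useful if you widen the window or drop the date filter entirely. And if the only goal is to locate the "Bias-Aware Agent..." paper, a narrower title-field query (the arXiv API's ti: prefix) is usually a cheaper alternative to paging through a whole date range.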