# scripts/test_arxiv_widest.py
import datetime
import time

import feedparser
import requests

def fetch_arxiv_full_range():
    """
    不限制分类、关键词,仅根据 submittedDate 做一个宽区间。
    分批次抓取,每批 100 条,直到再也拿不到新条目或达到我们设定的安全上限。
    同时演示如何在循环中检测如果发布时间超过了上限,就可以提前退出。
    """

    base_url = "http://export.arxiv.org/api/query"

    # Wide date range: [202503250000 TO 202504020000].
    # Widen or narrow as needed.
    start_date_str = "202503250000"
    end_date_str = "202504020000"

    search_query = f"submittedDate:[{start_date_str} TO {end_date_str}]"
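
    # Note (per arXiv's API docs): submittedDate bounds use the
    # YYYYMMDDHHMM form in GMT, and the field can be combined with
    # others, e.g. "cat:cs.CL AND submittedDate:[... TO ...]".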

    # Fetch in batches.
    step = 100
    start = 0
    all_entries = []

    while True:
        params = {
            "search_query": search_query,
            "sortBy": "submittedDate",
            "sortOrder": "descending",
            "start": start,
            "max_results": step
        }
        print(f"[DEBUG] Fetching from index={start} to {start+step}, date range = {start_date_str} ~ {end_date_str}")
        resp = requests.get(base_url, params=params, timeout=30)
        if resp.status_code != 200:
            print("[ERROR] HTTP status:", resp.status_code)
            break

        feed = feedparser.parse(resp.content)
        entries = feed.entries
        got_count = len(entries)
        print(f"[DEBUG] Got {got_count} entries this batch.")

        if got_count == 0:
            # No more results.
            break

        # Append this batch to the running list.
        all_entries.extend(entries)
        # Advance to the next page.
        start += step
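
        # Early-exit sketch (the pattern mentioned in the docstring):
        # we asked for submittedDate descending, so the oldest entry in
        # a batch comes last; once it predates the window's lower bound,
        # later pages cannot contain anything newer, and we can stop.
        # Relies on feedparser's `published_parsed` (a time.struct_time).
        oldest = entries[-1]
        if getattr(oldest, "published_parsed", None):
            oldest_dt = datetime.datetime(*oldest.published_parsed[:6])
            if oldest_dt < datetime.datetime(2025, 3, 25):
                print("[DEBUG] Oldest entry predates the window; stopping early.")
                break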

        # Safety cap to guard against an unbounded loop or a huge download.
        if start >= 3000:
            # 3000 is just an example; tune as needed.
            print("[DEBUG] Over 3000 entries, stopping to avoid an extremely large dataset.")
            break

        # Be polite: arXiv's API guidelines ask for ~3 seconds between requests.
        time.sleep(3)

    print("[DEBUG] total retrieved:", len(all_entries))

    # all_entries now holds everything we fetched.
    # Check whether it contains "Bias-Aware Agent..." or do further processing.

    found = False
    for idx, e in enumerate(all_entries, 1):
        # Normalize whitespace: arXiv titles may contain newlines.
        title_lower = " ".join(e.title.split()).lower()
        if "bias-aware agent" in title_lower:
            found = True
            print(f"\n[FOUND]  Index={idx}, Title={e.title}, published={e.published}, updated={e.updated}")
            break

    if not found:
        print("\n[INFO] 'Bias-Aware Agent...' not found in the entire set.")


if __name__ == "__main__":
    fetch_arxiv_full_range()