path: root/scripts/test_arxiv_widest.py
author    blackhao <13851610112@163.com>  2025-03-30 04:15:12 -0500
committer blackhao <13851610112@163.com>  2025-03-30 04:15:12 -0500
commit    d5b56b1a018c13f3f55e5808f671210e40d8d5e0
tree      8cb0b6d6557ce44819a21d6dad3b051d6d4a5bdf  /scripts/test_arxiv_widest.py
parent    2e73e07df97231838e94addac37ac8484fb4d08e
fuck arxiv api
Diffstat (limited to 'scripts/test_arxiv_widest.py')
-rw-r--r--  scripts/test_arxiv_widest.py  78
1 file changed, 78 insertions, 0 deletions
diff --git a/scripts/test_arxiv_widest.py b/scripts/test_arxiv_widest.py
new file mode 100644
index 0000000..466c62d
--- /dev/null
+++ b/scripts/test_arxiv_widest.py
@@ -0,0 +1,78 @@
+import requests
+import feedparser
+import datetime
+
+def fetch_arxiv_full_range():
+ """
+ 不限制分类、关键词,仅根据 submittedDate 做一个宽区间。
+ 分批次抓取,每批 100 条,直到再也拿不到新条目或达到我们设定的安全上限。
+ 同时演示如何在循环中检测如果发布时间超过了上限,就可以提前退出。
+ """
+
+    base_url = "http://export.arxiv.org/api/query"
+
+    # A loose date window: [202503250000 TO 202504020000].
+    # Make it wider or more precise as needed.
+    start_date_str = "202503250000"
+    end_date_str = "202504020000"
+
+    search_query = f"submittedDate:[{start_date_str} TO {end_date_str}]"
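+
+    # These strings use arXiv's YYYYMMDDHHMM timestamp format. One way to
+    # derive them with the datetime module (a sketch, not used by this
+    # script; adjust the window to taste):
+    #
+    #     now = datetime.datetime.utcnow()
+    #     start_date_str = (now - datetime.timedelta(days=8)).strftime("%Y%m%d%H%M")
+    #     end_date_str = now.strftime("%Y%m%d%H%M")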
+
+    # Fetch in batches
+    step = 100
+    start = 0
+    all_entries = []
+
+    while True:
+        params = {
+            "search_query": search_query,
+            "sortBy": "submittedDate",
+            "sortOrder": "descending",
+            "start": start,
+            "max_results": step
+        }
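+        # The resulting GET (with requests handling the URL encoding) looks
+        # roughly like:
+        #   http://export.arxiv.org/api/query?search_query=submittedDate:
+        #     [202503250000 TO 202504020000]&sortBy=submittedDate
+        #     &sortOrder=descending&start=0&max_results=100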
+ print(f"[DEBUG] Fetching from index={start} to {start+step}, date range = {start_date_str} ~ {end_date_str}")
+ resp = requests.get(base_url, params=params)
+ if resp.status_code != 200:
+ print("[ERROR] HTTP status:", resp.status_code)
+ break
+
+        feed = feedparser.parse(resp.content)
+        entries = feed.entries
+        got_count = len(entries)
+        print(f"[DEBUG] Got {got_count} entries this batch.")
+
+        if got_count == 0:
+            # No more results
+            break
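+        # Caveat: the arXiv API has been observed to occasionally return an
+        # empty page even when more results exist further on. A short pause
+        # and a bounded number of retries before giving up would make this
+        # loop more robust; that is left out here to keep the test minimal.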
+
+        # Add this batch to the running list
+        all_entries.extend(entries)
+        # Next batch
+        start += step
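+
+        # Because results are sorted by submittedDate descending, once the
+        # oldest entry in a batch predates the window start, every later
+        # batch is older still, so we could break early. A sketch, assuming
+        # arXiv's Atom timestamps look like "2025-03-28T12:34:56Z":
+        #
+        #     oldest = datetime.datetime.strptime(
+        #         entries[-1].published, "%Y-%m-%dT%H:%M:%SZ")
+        #     if oldest < datetime.datetime.strptime(start_date_str, "%Y%m%d%H%M"):
+        #         break
+        #
+        # arXiv also asks clients to pause about 3 seconds between requests;
+        # a time.sleep(3) here (plus "import time" up top) would be polite.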
+
+        # A safety cap so we do not loop forever or pull an enormous dataset
+        if start >= 3000:
+            # 3000 is just an example cap
+            print("[DEBUG] Over 3000 entries, stopping to avoid extremely large dataset.")
+            break
+
+ print("[DEBUG] total retrieved:", len(all_entries))
+
+    # all_entries now holds everything we fetched.
+    # We can check whether it contains "Bias-Aware Agent..." or do further processing.
+
+    found = False
+    for idx, e in enumerate(all_entries, 1):
+        title_lower = e.title.lower()
+        if "bias-aware agent" in title_lower:
+            found = True
+            print(f"\n[FOUND] Index={idx}, Title={e.title}, published={e.published}, updated={e.updated}")
+            break
+
+    if not found:
+        print("\n[INFO] 'Bias-Aware Agent...' not found in the entire set.")
+
+
+if __name__ == "__main__":
+    fetch_arxiv_full_range()