summaryrefslogtreecommitdiff
path: root/scripts/test_arxiv_widest.py
diff options
context:
space:
mode:
Diffstat (limited to 'scripts/test_arxiv_widest.py')
-rw-r--r--  scripts/test_arxiv_widest.py  78
1 file changed, 0 insertions, 78 deletions
diff --git a/scripts/test_arxiv_widest.py b/scripts/test_arxiv_widest.py
deleted file mode 100644
index 466c62d..0000000
--- a/scripts/test_arxiv_widest.py
+++ /dev/null
@@ -1,78 +0,0 @@
-import requests
-import feedparser
-import datetime
-
-def fetch_arxiv_full_range():
- """
- 不限制分类、关键词,仅根据 submittedDate 做一个宽区间。
- 分批次抓取,每批 100 条,直到再也拿不到新条目或达到我们设定的安全上限。
- 同时演示如何在循环中检测如果发布时间超过了上限,就可以提前退出。
- """
-
- base_url = "http://export.arxiv.org/api/query"
-
- # 宽松的日期范围 [202503250000 TO 202504020000]
- # 你可以改成更广或更精确
- start_date_str = "202503250000"
- end_date_str = "202504020000"
-
- search_query = f"submittedDate:[{start_date_str} TO {end_date_str}]"
-
- # 分批抓取
- step = 100
- start = 0
- all_entries = []
-
- while True:
- params = {
- "search_query": search_query,
- "sortBy": "submittedDate",
- "sortOrder": "descending",
- "start": start,
- "max_results": step
- }
- print(f"[DEBUG] Fetching from index={start} to {start+step}, date range = {start_date_str} ~ {end_date_str}")
- resp = requests.get(base_url, params=params)
- if resp.status_code != 200:
- print("[ERROR] HTTP status:", resp.status_code)
- break
-
- feed = feedparser.parse(resp.content)
- entries = feed.entries
- got_count = len(entries)
- print(f"[DEBUG] Got {got_count} entries this batch.")
-
- if got_count == 0:
- # 没有更多数据了
- break
-
- # 把本批加入总list
- all_entries.extend(entries)
- # 下一批
- start += step
-
- # 自定义一个安全上限,防止无限循环或极大数据
- if start >= 3000:
- # 3k 只是举例
- print("[DEBUG] Over 3000 entries, stopping to avoid extremely large dataset.")
- break
-
- print("[DEBUG] total retrieved:", len(all_entries))
-
- # 现在 all_entries 就是我们抓到的全部。
- # 可以查看是否包含 "Bias-Aware Agent..." 或做后续处理
-
- found = False
- for idx, e in enumerate(all_entries, 1):
- title_lower = e.title.lower()
- if "bias-aware agent" in title_lower:
- found = True
- print(f"\n[FOUND] Index={idx}, Title={e.title}, published={e.published}, updated={e.updated}")
- break
-
- if not found:
- print("\n[INFO] 'Bias-Aware Agent...' not found in the entire set.")
-
-
-if __name__ == "__main__":
- fetch_arxiv_full_range()