diff options
Diffstat (limited to 'scripts/test_arxiv_widest.py')
| -rw-r--r-- | scripts/test_arxiv_widest.py | 78 |
1 files changed, 78 insertions, 0 deletions
diff --git a/scripts/test_arxiv_widest.py b/scripts/test_arxiv_widest.py new file mode 100644 index 0000000..466c62d --- /dev/null +++ b/scripts/test_arxiv_widest.py @@ -0,0 +1,78 @@ +import requests +import feedparser +import datetime + +def fetch_arxiv_full_range(): + """ + 不限制分类、关键词,仅根据 submittedDate 做一个宽区间。 + 分批次抓取,每批 100 条,直到再也拿不到新条目或达到我们设定的安全上限。 + 同时演示如何在循环中检测如果发布时间超过了上限,就可以提前退出。 + """ + + base_url = "http://export.arxiv.org/api/query" + + # 宽松的日期范围 [202503250000 TO 202504020000] + # 你可以改成更广或更精确 + start_date_str = "202503250000" + end_date_str = "202504020000" + + search_query = f"submittedDate:[{start_date_str} TO {end_date_str}]" + + # 分批抓取 + step = 100 + start = 0 + all_entries = [] + + while True: + params = { + "search_query": search_query, + "sortBy": "submittedDate", + "sortOrder": "descending", + "start": start, + "max_results": step + } + print(f"[DEBUG] Fetching from index={start} to {start+step}, date range = {start_date_str} ~ {end_date_str}") + resp = requests.get(base_url, params=params) + if resp.status_code != 200: + print("[ERROR] HTTP status:", resp.status_code) + break + + feed = feedparser.parse(resp.content) + entries = feed.entries + got_count = len(entries) + print(f"[DEBUG] Got {got_count} entries this batch.") + + if got_count == 0: + # 没有更多数据了 + break + + # 把本批加入总list + all_entries.extend(entries) + # 下一批 + start += step + + # 自定义一个安全上限,防止无限循环或极大数据 + if start >= 3000: + # 3k 只是举例 + print("[DEBUG] Over 3000 entries, stopping to avoid extremely large dataset.") + break + + print("[DEBUG] total retrieved:", len(all_entries)) + + # 现在 all_entries 就是我们抓到的全部。 + # 可以查看是否包含 "Bias-Aware Agent..." 或做后续处理 + + found = False + for idx, e in enumerate(all_entries, 1): + title_lower = e.title.lower() + if "bias-aware agent" in title_lower: + found = True + print(f"\n[FOUND] Index={idx}, Title={e.title}, published={e.published}, updated={e.updated}") + break + + if not found: + print("\n[INFO] 'Bias-Aware Agent...' not found in the entire set.") + + +if __name__ == "__main__": + fetch_arxiv_full_range() |
