| author | haoyuren <13851610112@163.com> | 2025-06-29 16:19:06 -0700 |
|---|---|---|
| committer | haoyuren <13851610112@163.com> | 2025-06-29 16:19:06 -0700 |
| commit | 388f0407ef8c9f68866509f722491fcfd44afa11 (patch) | |
| tree | e90b933967670d08be1134e80bdee0187ec606f3 /scripts/test_arxiv_widest.py | |
| parent | da629fd626deb304ac8c562bfafbf4922c997b18 (diff) | |
fix bugs
Diffstat (limited to 'scripts/test_arxiv_widest.py')
| -rw-r--r-- | scripts/test_arxiv_widest.py | 78 |
1 file changed, 0 insertions, 78 deletions
```diff
diff --git a/scripts/test_arxiv_widest.py b/scripts/test_arxiv_widest.py
deleted file mode 100644
index 466c62d..0000000
--- a/scripts/test_arxiv_widest.py
+++ /dev/null
@@ -1,78 +0,0 @@
-import requests
-import feedparser
-import datetime
-
-def fetch_arxiv_full_range():
-    """
-    No category or keyword filter; just a wide window on submittedDate.
-    Fetch in batches of 100 until no new entries come back or our safety cap is reached.
-    Also demonstrates how to check in the loop whether the published date has passed the upper bound, so we can exit early.
-    """
-
-    base_url = "http://export.arxiv.org/api/query"
-
-    # Loose date range [202503250000 TO 202504020000]
-    # You can make this wider or more precise
-    start_date_str = "202503250000"
-    end_date_str = "202504020000"
-
-    search_query = f"submittedDate:[{start_date_str} TO {end_date_str}]"
-
-    # Fetch in batches
-    step = 100
-    start = 0
-    all_entries = []
-
-    while True:
-        params = {
-            "search_query": search_query,
-            "sortBy": "submittedDate",
-            "sortOrder": "descending",
-            "start": start,
-            "max_results": step
-        }
-        print(f"[DEBUG] Fetching from index={start} to {start+step}, date range = {start_date_str} ~ {end_date_str}")
-        resp = requests.get(base_url, params=params)
-        if resp.status_code != 200:
-            print("[ERROR] HTTP status:", resp.status_code)
-            break
-
-        feed = feedparser.parse(resp.content)
-        entries = feed.entries
-        got_count = len(entries)
-        print(f"[DEBUG] Got {got_count} entries this batch.")
-
-        if got_count == 0:
-            # No more data
-            break
-
-        # Add this batch to the overall list
-        all_entries.extend(entries)
-        # Next batch
-        start += step
-
-        # Custom safety cap to prevent an infinite loop or a huge dataset
-        if start >= 3000:
-            # 3k is just an example
-            print("[DEBUG] Over 3000 entries, stopping to avoid extremely large dataset.")
-            break
-
-    print("[DEBUG] total retrieved:", len(all_entries))
-
-    # all_entries now holds everything we fetched.
-    # Check whether it contains "Bias-Aware Agent..." or do further processing
-
-    found = False
-    for idx, e in enumerate(all_entries, 1):
-        title_lower = e.title.lower()
-        if "bias-aware agent" in title_lower:
-            found = True
-            print(f"\n[FOUND] Index={idx}, Title={e.title}, published={e.published}, updated={e.updated}")
-            break
-
-    if not found:
-        print("\n[INFO] 'Bias-Aware Agent...' not found in the entire set.")
-
-
-if __name__ == "__main__":
-    fetch_arxiv_full_range()
```
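For context, a minimal single-request sketch of the arXiv API query the deleted script was built around; the endpoint, parameters, and date window are taken directly from the code above, with the pagination loop and title search stripped out:

```python
import requests
import feedparser

# Single-request version of the deleted script's query: same endpoint,
# same wide submittedDate window, one batch of up to 100 entries.
resp = requests.get(
    "http://export.arxiv.org/api/query",
    params={
        "search_query": "submittedDate:[202503250000 TO 202504020000]",
        "sortBy": "submittedDate",
        "sortOrder": "descending",
        "start": 0,
        "max_results": 100,
    },
)
resp.raise_for_status()

# The arXiv API returns an Atom feed; feedparser handles the parsing.
feed = feedparser.parse(resp.content)
for entry in feed.entries:
    print(entry.published, entry.title)
```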
