| author | blackhao <13851610112@163.com> | 2025-03-30 04:15:12 -0500 |
|---|---|---|
| committer | blackhao <13851610112@163.com> | 2025-03-30 04:15:12 -0500 |
| commit | d5b56b1a018c13f3f55e5808f671210e40d8d5e0 (patch) | |
| tree | 8cb0b6d6557ce44819a21d6dad3b051d6d4a5bdf | |
| parent | 2e73e07df97231838e94addac37ac8484fb4d08e (diff) | |
fuck arxiv api
| -rw-r--r-- | scripts/fetch_papers.py | 187 |
| -rw-r--r-- | scripts/single_run_test.py | 36 |
| -rw-r--r-- | scripts/test_arxiv_widest.py | 78 |
3 files changed, 246 insertions, 55 deletions
diff --git a/scripts/fetch_papers.py b/scripts/fetch_papers.py
index 70b7b6b..6666d72 100644
--- a/scripts/fetch_papers.py
+++ b/scripts/fetch_papers.py
@@ -1,69 +1,146 @@
+import os
 import requests
 import feedparser
 import datetime
+from github import Github
 
-def fetch_arxiv_bias_fairness(days=3):
+def fetch_papers_wide_then_filter(days=1, keywords=None):
     """
-    Search arXiv for papers from the past N days containing keywords such as 'bias' OR 'fairness'.
-    Category limited to cs.IR (adjust as needed); uses the all: field + a submittedDate range + local filtering.
+    Fetch papers from the past N days (restricting only submittedDate), then check locally:
+      - whether the paper is in cs.* or stat.*
+      - whether the title/abstract contains any of the keywords
+    Returns a list where each element is a dict:
+      { 'title':..., 'published':..., 'link':..., 'categories':[...] }
     """
 
-    now = datetime.datetime.utcnow()
-    start_day = now - datetime.timedelta(days=days)
-    # Build the date range (day precision is enough)
-    # Format: [YYYYMMDD0000 TO YYYYMMDD2359]
-    start_str = start_day.strftime("%Y%m%d0000")
-    end_str = now.strftime("%Y%m%d2359")
-
-    # arXiv boolean search expression
-    # Two keywords shown here: bias, fairness
-    # Uses (all:bias OR all:fairness)
-    # Also restricts the category: cat:cs.IR
-    # Also restricts the date: submittedDate:[start_str TO end_str]
-    # And specifies sortBy=submittedDate
-    search_query = f"(all:bias+OR+all:fairness)+AND+cat:cs.IR+AND+submittedDate:[{start_str}+TO+{end_str}]"
+    if keywords is None:
+        keywords = ["bias", "fairness"]
+    now_utc = datetime.datetime.now(datetime.timezone.utc)
+    start_utc = now_utc - datetime.timedelta(days=days)
+
+    start_str = start_utc.strftime("%Y%m%d%H%M")
+    end_str = now_utc.strftime("%Y%m%d%H%M")
+
+    search_query = f"submittedDate:[{start_str} TO {end_str}]"
     base_url = "http://export.arxiv.org/api/query"
-    params = {
-        "search_query": search_query,
-        "sortBy": "submittedDate",
-        "sortOrder": "descending",
-        "max_results": 100
-    }
-    print("[DEBUG] search_query=", search_query)
-
-    response = requests.get(base_url, params=params)
-    print("[DEBUG] Full URL =", response.url)
-    if response.status_code != 200:
-        print("[ERROR] HTTP Status:", response.status_code)
-        return []
-
-    feed = feedparser.parse(response.content)
-    entries = feed.entries
-    print("[DEBUG] arXiv returned entries:", len(entries))
-
-    papers = []
-    for e in entries:
-        title = e.title
-        summary = e.summary
-        published = e.published
-        link = e.link
-
-        # Do a stricter match locally:
-        # check whether the title or abstract really contains bias/fairness,
-        # so that all:bias does not hit other fields
-        text = (title + " " + summary).lower()
-        if ("bias" in text) or ("fairness" in text):
-            papers.append({
+
+    step = 100
+    start = 0
+    all_entries = []
+
+    print(f"[DEBUG] Time range: {start_str} ~ {end_str}, days={days}")
+    while True:
+        params = {
+            "search_query": search_query,
+            "sortBy": "submittedDate",
+            "sortOrder": "descending",
+            "start": start,
+            "max_results": step
+        }
+        print(f"[DEBUG] fetching: {start} -> {start+step}")
+        r = requests.get(base_url, params=params)
+        if r.status_code != 200:
+            print("[ERROR] HTTP status:", r.status_code)
+            break
+
+        feed = feedparser.parse(r.content)
+        got = len(feed.entries)
+        print(f"[DEBUG] got {got} entries this batch.")
+        if got == 0:
+            break
+
+        all_entries.extend(feed.entries)
+        start += step
+
+        if start >= 3000:
+            print("[DEBUG] reached 3000, stop.")
+            break
+
+    print(f"[DEBUG] total in date range: {len(all_entries)}")
+
+    matched = []
+    for e in all_entries:
+        title = getattr(e, 'title', '')
+        summary = getattr(e, 'summary', '')
+        published = getattr(e, 'published', '')
+        link = getattr(e, 'link', '')
+        if hasattr(e, 'tags'):
+            categories = [t.term for t in e.tags]
+        else:
+            categories = []
+
+        # Category check
+        has_cs_stat = any(c.startswith("cs.") or c.startswith("stat.") for c in categories)
+        if not has_cs_stat:
+            continue
+
+        # Keyword check
+        text_lower = (title + " " + summary).lower()
+        if any(kw.lower() in text_lower for kw in keywords):
+            matched.append({
                 "title": title,
                 "published": published,
-                "link": link
+                "link": link,
+                "categories": categories
             })
-    return papers
+    print(f"[DEBUG] matched {len(matched)} papers after local filter (cs./stat.+keywords)")
+    return matched
+
+def update_readme_in_repo(papers, token, repo_name):
+    """
+    Write the matched paper list into the target repo's README.md
+    """
+    if not papers:
+        print("[INFO] No matched papers, skip README update.")
+        return
+
+    g = Github(token)
+    repo = g.get_repo(repo_name)
+
+    # Fetch the README content
+    readme_file = repo.get_contents("README.md", ref="main")
+    readme_content = readme_file.decoded_content.decode("utf-8")
+
+    now_utc_str = datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
+    new_section = f"\n\n### Auto-captured papers on {now_utc_str}\n"
+    for p in papers:
+        cat_str = ", ".join(p["categories"])
+        new_section += f"- **{p['title']}** (Published={p['published']}) \n"
+        new_section += f"  - Categories: {cat_str} \n"
+        new_section += f"  - Link: {p['link']}\n\n"
+
+    updated_content = readme_content + new_section
+    commit_msg = f"Auto update README with {len(papers)} new papers"
+
+    repo.update_file(
+        path="README.md",
+        message=commit_msg,
+        content=updated_content,
+        sha=readme_file.sha,
+        branch="main"
+    )
+    print(f"[INFO] README updated with {len(papers)} papers.")
+
+def main():
+    # 1. Fetch papers from the past 3 days with keywords=["bias", "fairness"]
+    days = 3
+    keywords = ["bias", "fairness"]
+    papers = fetch_papers_wide_then_filter(days=days, keywords=keywords)
+    print(f"[RESULT] matched {len(papers)} papers. Now let's update README in target repo if any.")
+
+    # 2. If any papers matched, update the README
+    #    The token and repo name come from secrets or env vars
+    github_token = os.getenv("TARGET_REPO_TOKEN")
+    target_repo_name = os.getenv("TARGET_REPO_NAME")
+    if not github_token or not target_repo_name:
+        print("[ERROR] Missing environment variables: TARGET_REPO_TOKEN / TARGET_REPO_NAME.")
+        return
+
+    if papers:
+        update_readme_in_repo(papers, github_token, target_repo_name)
+    else:
+        print("[INFO] No matched papers, done without update.")
 
 
 if __name__ == "__main__":
-    # Test the past 3 days
-    results = fetch_arxiv_bias_fairness(days=3)
-    print(f"Found {len(results)} papers:")
-    for i, p in enumerate(results, 1):
-        print(f"{i}. {p['title']} - {p['published']} - {p['link']}")
+    main()
diff --git a/scripts/single_run_test.py b/scripts/single_run_test.py
new file mode 100644
index 0000000..fe52bb4
--- /dev/null
+++ b/scripts/single_run_test.py
@@ -0,0 +1,36 @@
+import requests
+import feedparser
+
+def test_arxiv():
+    base_url = "http://export.arxiv.org/api/query"
+    # Time window: 2025-03-27 00:00 to 2025-03-29 00:00
+    # Note: the paper was submitted on March 27 at 07:54Z, so it should fall within this range
+    search_query = (
+        "(all:bias+OR+all:fairness)"
+        "+AND+cat:cs.IR"
+        "+AND+submittedDate:[202503270000+TO+202503290000]"
+    )
+
+    params = {
+        "search_query": search_query,
+        "sortBy": "submittedDate",
+        "sortOrder": "descending",
+        "max_results": 100
+    }
+    print("[DEBUG] search_query =", search_query)
+
+    r = requests.get(base_url, params=params)
+    print("[DEBUG] Full URL =", r.url)
+    if r.status_code != 200:
+        print("[ERROR] HTTP Status:", r.status_code)
+        return
+
+    feed = feedparser.parse(r.content)
+    print("[DEBUG] Returned entries:", len(feed.entries))
+
+    # Print titles and publish times for inspection
+    for i, entry in enumerate(feed.entries, start=1):
+        print(f"{i}. Title: {entry.title} | updated: {entry.updated} | published: {entry.published}")
+
+if __name__ == "__main__":
+    test_arxiv()
diff --git a/scripts/test_arxiv_widest.py b/scripts/test_arxiv_widest.py
new file mode 100644
index 0000000..466c62d
--- /dev/null
+++ b/scripts/test_arxiv_widest.py
@@ -0,0 +1,78 @@
+import requests
+import feedparser
+import datetime
+
+def fetch_arxiv_full_range():
+    """
+    No category or keyword restriction; only a wide submittedDate range.
+    Fetch in batches of 100 until no new entries come back or our safety cap is hit.
+    Also shows how the loop could detect a published time past the upper bound and exit early.
+    """
+
+    base_url = "http://export.arxiv.org/api/query"
+
+    # Loose date range [202503250000 TO 202504020000]
+    # Change it to something wider or more precise as needed
+    start_date_str = "202503250000"
+    end_date_str = "202504020000"
+
+    search_query = f"submittedDate:[{start_date_str} TO {end_date_str}]"
+
+    # Fetch in batches
+    step = 100
+    start = 0
+    all_entries = []
+
+    while True:
+        params = {
+            "search_query": search_query,
+            "sortBy": "submittedDate",
+            "sortOrder": "descending",
+            "start": start,
+            "max_results": step
+        }
+        print(f"[DEBUG] Fetching from index={start} to {start+step}, date range = {start_date_str} ~ {end_date_str}")
+        resp = requests.get(base_url, params=params)
+        if resp.status_code != 200:
+            print("[ERROR] HTTP status:", resp.status_code)
+            break
+
+        feed = feedparser.parse(resp.content)
+        entries = feed.entries
+        got_count = len(entries)
+        print(f"[DEBUG] Got {got_count} entries this batch.")
+
+        if got_count == 0:
+            # No more data
+            break
+
+        # Add this batch to the overall list
+        all_entries.extend(entries)
+        # Next batch
+        start += step
+
+        # A custom safety cap to avoid an infinite loop or a huge dataset
+        if start >= 3000:
+            # 3k is just an example
+            print("[DEBUG] Over 3000 entries, stopping to avoid extremely large dataset.")
+            break
+
+    print("[DEBUG] total retrieved:", len(all_entries))
+
+    # Now all_entries holds everything we fetched.
+    # Check whether it contains "Bias-Aware Agent..." or do further processing
+
+    found = False
+    for idx, e in enumerate(all_entries, 1):
+        title_lower = e.title.lower()
+        if "bias-aware agent" in title_lower:
+            found = True
+            print(f"\n[FOUND] Index={idx}, Title={e.title}, published={e.published}, updated={e.updated}")
+            break
+
+    if not found:
+        print("\n[INFO] 'Bias-Aware Agent...' not found in the entire set.")
+
+
+if __name__ == "__main__":
+    fetch_arxiv_full_range()
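
For a quick local check of this change, a minimal driver like the one below could be used. It is only a sketch: it assumes it is run from inside `scripts/` so that `fetch_papers.py` imports directly, and that `TARGET_REPO_TOKEN` / `TARGET_REPO_NAME` are exported in the environment the way `main()` expects; only the two function names and the env-var names are taken from the diff above, everything else is illustrative.

```python
# Minimal local-run sketch (assumes scripts/fetch_papers.py is importable from the
# current directory; the env-var names mirror those used by main() in the diff).
import os

from fetch_papers import fetch_papers_wide_then_filter, update_readme_in_repo

# Wide date-range fetch, then local cs./stat. + keyword filtering.
papers = fetch_papers_wide_then_filter(days=3, keywords=["bias", "fairness"])
for p in papers:
    print(p["published"], "-", p["title"])

token = os.getenv("TARGET_REPO_TOKEN")      # token with write access to the target repo
repo_name = os.getenv("TARGET_REPO_NAME")   # e.g. "owner/repo"
if papers and token and repo_name:
    update_readme_in_repo(papers, token, repo_name)
```

Note that arXiv's API terms ask for roughly one request every three seconds, so if the paged loop is run frequently, adding a `time.sleep(3)` between batches may help avoid throttled or empty responses.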
