author    blackhao <13851610112@163.com>  2025-03-30 04:15:12 -0500
committer blackhao <13851610112@163.com>  2025-03-30 04:15:12 -0500
commit    d5b56b1a018c13f3f55e5808f671210e40d8d5e0
tree      8cb0b6d6557ce44819a21d6dad3b051d6d4a5bdf
parent    2e73e07df97231838e94addac37ac8484fb4d08e
fuck arxiv api
-rw-r--r--  scripts/fetch_papers.py       187
-rw-r--r--  scripts/single_run_test.py     36
-rw-r--r--  scripts/test_arxiv_widest.py   78
3 files changed, 246 insertions, 55 deletions
diff --git a/scripts/fetch_papers.py b/scripts/fetch_papers.py
index 70b7b6b..6666d72 100644
--- a/scripts/fetch_papers.py
+++ b/scripts/fetch_papers.py
@@ -1,69 +1,146 @@
+import os
import requests
import feedparser
import datetime
+from github import Github
-def fetch_arxiv_bias_fairness(days=3):
+def fetch_papers_wide_then_filter(days=1, keywords=None):
"""
-    Search arXiv for papers from the past N days containing keywords such as 'bias' OR 'fairness'.
-    Category limited to cs.IR (change as needed); uses the all: field + a submittedDate range + local filtering.
+    Fetch papers from the past N days (restricted only by submittedDate), then check locally:
+      - whether the category is cs.* or stat.*
+      - whether the title/abstract contains any of the keywords
+    Returns a list; each element is a dict:
+      { 'title':..., 'published':..., 'link':..., 'categories':[...] }
"""
- now = datetime.datetime.utcnow()
- start_day = now - datetime.timedelta(days=days)
-    # Build the date range (day-level precision is enough)
-    # Format: [YYYYMMDD0000 TO YYYYMMDD2359]
- start_str = start_day.strftime("%Y%m%d0000")
- end_str = now.strftime("%Y%m%d2359")
-
-    # arXiv boolean search expression
-    # Two keywords are shown here: bias, fairness
-    # via (all:bias OR all:fairness)
-    # also restricting the category with cat:cs.IR
-    # and the date with submittedDate:[start_str TO end_str]
-    # while specifying sortBy=submittedDate
- search_query = f"(all:bias+OR+all:fairness)+AND+cat:cs.IR+AND+submittedDate:[{start_str}+TO+{end_str}]"
+ if keywords is None:
+ keywords = ["bias", "fairness"]
+ now_utc = datetime.datetime.now(datetime.timezone.utc)
+ start_utc = now_utc - datetime.timedelta(days=days)
+
+ start_str = start_utc.strftime("%Y%m%d%H%M")
+ end_str = now_utc.strftime("%Y%m%d%H%M")
+
+ search_query = f"submittedDate:[{start_str} TO {end_str}]"
base_url = "http://export.arxiv.org/api/query"
- params = {
- "search_query": search_query,
- "sortBy": "submittedDate",
- "sortOrder": "descending",
- "max_results": 100
- }
- print("[DEBUG] search_query=", search_query)
-
- response = requests.get(base_url, params=params)
- print("[DEBUG] Full URL =", response.url)
- if response.status_code != 200:
- print("[ERROR] HTTP Status:", response.status_code)
- return []
-
- feed = feedparser.parse(response.content)
- entries = feed.entries
-    print("[DEBUG] entries returned by arXiv:", len(entries))
-
- papers = []
- for e in entries:
- title = e.title
- summary = e.summary
- published = e.published
- link = e.link
-
-        # Do a stricter match locally:
-        # check whether the title or abstract really contains bias/fairness,
-        # since all:bias can also hit other fields
- text = (title + " " + summary).lower()
- if ("bias" in text) or ("fairness" in text):
- papers.append({
+
+ step = 100
+ start = 0
+ all_entries = []
+
+ print(f"[DEBUG] Time range: {start_str} ~ {end_str}, days={days}")
+ while True:
+ params = {
+ "search_query": search_query,
+ "sortBy": "submittedDate",
+ "sortOrder": "descending",
+ "start": start,
+ "max_results": step
+ }
+ print(f"[DEBUG] fetching: {start} -> {start+step}")
+ r = requests.get(base_url, params=params)
+ if r.status_code != 200:
+ print("[ERROR] HTTP status:", r.status_code)
+ break
+
+ feed = feedparser.parse(r.content)
+ got = len(feed.entries)
+ print(f"[DEBUG] got {got} entries this batch.")
+ if got == 0:
+ break
+
+ all_entries.extend(feed.entries)
+ start += step
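+        # Note: arXiv's API guidelines ask clients to pause a few seconds between
+        # successive requests when paging; a short time.sleep() here would respect that.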
+
+ if start >= 3000:
+ print("[DEBUG] reached 3000, stop.")
+ break
+
+ print(f"[DEBUG] total in date range: {len(all_entries)}")
+
+ matched = []
+ for e in all_entries:
+ title = getattr(e, 'title', '')
+ summary = getattr(e, 'summary', '')
+ published = getattr(e, 'published', '')
+ link = getattr(e, 'link', '')
+ if hasattr(e, 'tags'):
+ categories = [t.term for t in e.tags]
+ else:
+ categories = []
+
+        # Category check
+ has_cs_stat = any(c.startswith("cs.") or c.startswith("stat.") for c in categories)
+ if not has_cs_stat:
+ continue
+
+        # Keyword check
+ text_lower = (title + " " + summary).lower()
+ if any(kw.lower() in text_lower for kw in keywords):
+ matched.append({
"title": title,
"published": published,
- "link": link
+ "link": link,
+ "categories": categories
})
- return papers
+ print(f"[DEBUG] matched {len(matched)} papers after local filter (cs./stat.+keywords)")
+ return matched
+
+def update_readme_in_repo(papers, token, repo_name):
+ """
+    Write the list of matched papers into the target repo's README.md.
+ """
+ if not papers:
+ print("[INFO] No matched papers, skip README update.")
+ return
+
+ g = Github(token)
+ repo = g.get_repo(repo_name)
+
+    # Fetch the current README content
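+    # Note: this assumes README.md exists at the repo root on the "main" branch;
+    # PyGithub's get_contents raises a GithubException if the file or ref is missing.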
+ readme_file = repo.get_contents("README.md", ref="main")
+ readme_content = readme_file.decoded_content.decode("utf-8")
+
+ now_utc_str = datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
+ new_section = f"\n\n### Auto-captured papers on {now_utc_str}\n"
+ for p in papers:
+ cat_str = ", ".join(p["categories"])
+ new_section += f"- **{p['title']}** (Published={p['published']}) \n"
+ new_section += f" - Categories: {cat_str} \n"
+ new_section += f" - Link: {p['link']}\n\n"
+
+ updated_content = readme_content + new_section
+ commit_msg = f"Auto update README with {len(papers)} new papers"
+
+ repo.update_file(
+ path="README.md",
+ message=commit_msg,
+ content=updated_content,
+ sha=readme_file.sha,
+ branch="main"
+ )
+ print(f"[INFO] README updated with {len(papers)} papers.")
+
+def main():
+    # 1. Fetch papers from the past 3 days with keywords=["bias", "fairness"]
+ days = 3
+ keywords = ["bias", "fairness"]
+ papers = fetch_papers_wide_then_filter(days=days, keywords=keywords)
+ print(f"[RESULT] matched {len(papers)} papers. Now let's update README in target repo if any.")
+
+    # 2. If any papers matched, update the README
+    # The token and repo name come from secrets / environment variables
+ github_token = os.getenv("TARGET_REPO_TOKEN")
+ target_repo_name = os.getenv("TARGET_REPO_NAME")
+ if not github_token or not target_repo_name:
+ print("[ERROR] Missing environment variables: TARGET_REPO_TOKEN / TARGET_REPO_NAME.")
+ return
+
+ if papers:
+ update_readme_in_repo(papers, github_token, target_repo_name)
+ else:
+ print("[INFO] No matched papers, done without update.")
if __name__ == "__main__":
-    # Test with the past 3 days
-    results = fetch_arxiv_bias_fairness(days=3)
-    print(f"Found {len(results)} papers:")
- for i, p in enumerate(results, 1):
- print(f"{i}. {p['title']} - {p['published']} - {p['link']}")
+ main()
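
A minimal local driver for the new flow, useful as a sanity check before the workflow runs it; the env-var names match what main() reads, and the placeholder handling is only a sketch:

    # sketch: dry-run the new fetch_papers.py flow locally (illustrative only)
    # run from inside scripts/ so the import resolves
    import os
    from fetch_papers import fetch_papers_wide_then_filter, update_readme_in_repo

    papers = fetch_papers_wide_then_filter(days=3, keywords=["bias", "fairness"])
    for p in papers:
        print(p["title"], "|", p["published"], "|", p["link"])

    token = os.getenv("TARGET_REPO_TOKEN")      # assumed: a token with push access to the target repo
    repo_name = os.getenv("TARGET_REPO_NAME")   # assumed: an "owner/repo" string
    if papers and token and repo_name:
        update_readme_in_repo(papers, token, repo_name)
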
diff --git a/scripts/single_run_test.py b/scripts/single_run_test.py
new file mode 100644
index 0000000..fe52bb4
--- /dev/null
+++ b/scripts/single_run_test.py
@@ -0,0 +1,36 @@
+import requests
+import feedparser
+
+def test_arxiv():
+ base_url = "http://export.arxiv.org/api/query"
+    # Time window: 2025-03-27 00:00 to 2025-03-29 00:00
+    # Note: the paper was submitted at 07:54Z on March 27, so it should fall inside this window
+ search_query = (
+ "(all:bias+OR+all:fairness)"
+ "+AND+cat:cs.IR"
+ "+AND+submittedDate:[202503270000+TO+202503290000]"
+ )
+
+ params = {
+ "search_query": search_query,
+ "sortBy": "submittedDate",
+ "sortOrder": "descending",
+ "max_results": 100
+ }
+ print("[DEBUG] search_query =", search_query)
+
+ r = requests.get(base_url, params=params)
+ print("[DEBUG] Full URL =", r.url)
+ if r.status_code != 200:
+ print("[ERROR] HTTP Status:", r.status_code)
+ return
+
+ feed = feedparser.parse(r.content)
+ print("[DEBUG] Returned entries:", len(feed.entries))
+
+    # Print titles and publication times for inspection
+ for i, entry in enumerate(feed.entries, start=1):
+ print(f"{i}. Title: {entry.title} | updated: {entry.updated} | published: {entry.published}")
+
+if __name__ == "__main__":
+ test_arxiv()
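
One detail worth noting when reading this test: requests percent-encodes the literal '+' characters inside the search_query value, so arXiv receives %2B rather than spaces. A variant that writes the query with plain spaces and lets requests handle the encoding is sketched below; the endpoint and parameters are the same, only the query string changes:

    # sketch: the same query, spelled with spaces so requests' URL encoding
    # produces the separators arXiv expects
    import requests
    import feedparser

    params = {
        "search_query": "(all:bias OR all:fairness) AND cat:cs.IR "
                        "AND submittedDate:[202503270000 TO 202503290000]",
        "sortBy": "submittedDate",
        "sortOrder": "descending",
        "max_results": 100,
    }
    r = requests.get("http://export.arxiv.org/api/query", params=params)
    feed = feedparser.parse(r.content)
    print("[DEBUG] Returned entries:", len(feed.entries))
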
diff --git a/scripts/test_arxiv_widest.py b/scripts/test_arxiv_widest.py
new file mode 100644
index 0000000..466c62d
--- /dev/null
+++ b/scripts/test_arxiv_widest.py
@@ -0,0 +1,78 @@
+import requests
+import feedparser
+import datetime
+
+def fetch_arxiv_full_range():
+ """
+    No category or keyword restriction; just a wide submittedDate window.
+    Fetch in batches of 100 until no new entries come back or our safety cap is reached.
+    Also meant to show how the loop can detect entries past the window boundary and exit early.
+ """
+
+ base_url = "http://export.arxiv.org/api/query"
+
+    # Generous date range [202503250000 TO 202504020000]
+    # Widen or narrow it as needed
+ start_date_str = "202503250000"
+ end_date_str = "202504020000"
+
+ search_query = f"submittedDate:[{start_date_str} TO {end_date_str}]"
+
+    # Fetch in batches
+ step = 100
+ start = 0
+ all_entries = []
+
+ while True:
+ params = {
+ "search_query": search_query,
+ "sortBy": "submittedDate",
+ "sortOrder": "descending",
+ "start": start,
+ "max_results": step
+ }
+ print(f"[DEBUG] Fetching from index={start} to {start+step}, date range = {start_date_str} ~ {end_date_str}")
+ resp = requests.get(base_url, params=params)
+ if resp.status_code != 200:
+ print("[ERROR] HTTP status:", resp.status_code)
+ break
+
+ feed = feedparser.parse(resp.content)
+ entries = feed.entries
+ got_count = len(entries)
+ print(f"[DEBUG] Got {got_count} entries this batch.")
+
+ if got_count == 0:
+            # No more data
+ break
+
+        # Add this batch to the running list
+        all_entries.extend(entries)
+        # Next batch
+        start += step
+
+        # Custom safety cap to avoid an endless loop or an extremely large dataset
+        if start >= 3000:
+            # 3k is just an example
+ print("[DEBUG] Over 3000 entries, stopping to avoid extremely large dataset.")
+ break
+
+ print("[DEBUG] total retrieved:", len(all_entries))
+
+    # all_entries now holds everything we fetched.
+    # Check whether it contains "Bias-Aware Agent..." or do further processing
+
+ found = False
+ for idx, e in enumerate(all_entries, 1):
+ title_lower = e.title.lower()
+ if "bias-aware agent" in title_lower:
+ found = True
+ print(f"\n[FOUND] Index={idx}, Title={e.title}, published={e.published}, updated={e.updated}")
+ break
+
+ if not found:
+ print("\n[INFO] 'Bias-Aware Agent...' not found in the entire set.")
+
+
+if __name__ == "__main__":
+ fetch_arxiv_full_range()
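
The docstring above mentions exiting early once results fall outside the date window, but the loop never actually does that. A hedged sketch of the check, using feedparser's published_parsed field and the window start hard-coded above, could sit inside the while-loop like this:

    # sketch: stop paging once the oldest entry in a batch predates the window start
    # (relies on sortOrder=descending, as requested in the params above)
    import datetime

    window_start = datetime.datetime(2025, 3, 25, tzinfo=datetime.timezone.utc)
    oldest = entries[-1]
    oldest_dt = datetime.datetime(*oldest.published_parsed[:6], tzinfo=datetime.timezone.utc)
    if oldest_dt < window_start:
        print("[DEBUG] Oldest entry predates the window start; stopping early.")
        # break  # would end pagination when placed inside the while-loop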