From 1718045290bddf35463572bf0d1df72ecb372b97 Mon Sep 17 00:00:00 2001
From: blackhao <13851610112@163.com>
Date: Sat, 29 Mar 2025 07:18:32 -0500
Subject: enhance fetch method

---
 scripts/fetch_papers.py | 203 +++++++++++++++++++++++++++++-------------------
 1 file changed, 123 insertions(+), 80 deletions(-)

diff --git a/scripts/fetch_papers.py b/scripts/fetch_papers.py
index 1bc837a..7e54854 100644
--- a/scripts/fetch_papers.py
+++ b/scripts/fetch_papers.py
@@ -4,89 +4,129 @@ import feedparser
 import datetime
 from github import Github
 
-ARXIV_CATEGORIES = "cs.CL OR cs.AI OR stat.ML OR cs.IR"
+# ========== Parameters you may want to adjust ==========
+
+# Multiple categories
+CATEGORIES = ["cs.CL", "cs.AI", "stat.ML", "cs.IR"]
+
+# Multiple keywords, matched as plain text; stick to single words or simple phrases.
+# Note: arXiv search supports neither very complex boolean logic nor true fuzzy matching.
 KEYWORDS = ["bias", "debias", "fairness", "equity", "inclusivity", "diversity", "ethical AI", "responsible AI"]
 
+# When building the query, KEYWORDS are combined as (ti:xxx OR abs:xxx) OR (ti:yyy OR abs:yyy),
+# so arXiv only returns papers whose title or abstract contains one of these keywords.
+
+# Maximum number of results to process per category (pagination cap)
+MAX_PER_CATEGORY = 300
+
+# GitHub settings
 TARGET_REPO_TOKEN = os.getenv("TARGET_REPO_TOKEN")
 TARGET_REPO_NAME = os.getenv("TARGET_REPO_NAME")
 
-def fetch_arxiv_papers():
+def build_search_query(keywords, category):
     """
-    Fetch new papers from the past 24 hours on arXiv and print some debug info
+    Build the search_query string for the arXiv API
+    (example: (ti:bias OR abs:bias) OR (ti:fairness OR abs:fairness) ... AND cat:cs.AI)
+    """
+    # Build one parenthesized expression per keyword:
+    # (ti:bias OR abs:bias) OR (ti:debias OR abs:debias) OR ...
+    kw_expressions = []
+    for kw in keywords:
+        kw_escaped = kw.replace(" ", "+")  # replace spaces so they do not break the query string
+        part = f"(ti:{kw_escaped}+OR+abs:{kw_escaped})"
+        kw_expressions.append(part)
+    # Join the per-keyword expressions with +OR+
+    combined_keywords = "+OR+".join(kw_expressions)
+
+    # Finally, append the category: AND cat:cs.AI
+    query = f"({combined_keywords})+AND+cat:{category}"
+    return query
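+
+# A minimal sketch of the query this produces, assuming just two keywords:
+#
+#   build_search_query(["bias", "fairness"], "cs.AI")
+#   -> "((ti:bias+OR+abs:bias)+OR+(ti:fairness+OR+abs:fairness))+AND+cat:cs.AI"
+#
+# A multi-word keyword such as "ethical AI" is sent as ti:ethical+AI, i.e. as
+# adjacent terms rather than an exact quoted phrase.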
+
+def search_category_for_24h(category, keywords, hours=24):
+    """
+    Search the given category with the keywords, walking the results in
+    descending submission order, and stop as soon as a paper older than
+    24 hours appears. Returns the list of papers that qualify.
     """
-    # Use timezone-aware UTC time
     now_utc = datetime.datetime.now(datetime.timezone.utc)
-    yesterday_utc = now_utc - datetime.timedelta(days=1)
-
-    print("DEBUG: current UTC time:", now_utc)
-    print("DEBUG: 24-hour cutoff:", yesterday_utc)
+    cutoff = now_utc - datetime.timedelta(hours=hours)
 
     base_url = "http://export.arxiv.org/api/query"
-    params = {
-        "search_query": f"cat:{ARXIV_CATEGORIES}",
-        "sortBy": "submittedDate",
-        "sortOrder": "descending",
-        "max_results": 100
-    }
-
-    # Send the request
-    r = requests.get(base_url, params=params)
-    print(f"DEBUG: request URL = {r.url}")
-    print(f"DEBUG: HTTP status code = {r.status_code}")
-
-    feed = feedparser.parse(r.content)
-    print(f"DEBUG: len(feed.entries) = {len(feed.entries)}")
-
-    papers = []
-    for idx, entry in enumerate(feed.entries):
-        published_naive = datetime.datetime.strptime(entry.published, "%Y-%m-%dT%H:%M:%SZ")
-        published_utc = published_naive.replace(tzinfo=datetime.timezone.utc)
-
-        # Debug output
-        print(f"\n=== Paper {idx+1} ===")
-        print("Title: ", entry.title)
-        print("Published (UTC): ", published_utc)
-        print("Link: ", entry.link)
-        print("Abstract (first 200 chars): ", entry.summary[:200], "...")
-
-        if published_utc > yesterday_utc:
-            papers.append({
+    query_str = build_search_query(keywords, category)
+
+    all_papers = []
+    start = 0
+    step = 50  # batch size per request; 50 or 100 both work
+    while True:
+        params = {
+            "search_query": query_str,
+            "sortBy": "submittedDate",
+            "sortOrder": "descending",
+            "start": start,
+            "max_results": step
+        }
+        print(f"\n[DEBUG] Requesting category: {category}, start={start}, query params: {params}")
+
+        resp = requests.get(base_url, params=params)
+        print("[DEBUG] HTTP status code:", resp.status_code)
+        feed = feedparser.parse(resp.content)
+
+        entries = feed.entries
+        print(f"[DEBUG] This batch returned {len(entries)} papers (start={start}).")
+
+        if not entries:
+            break  # no more results; stop
+
+        for i, entry in enumerate(entries):
+            # Parse the publication time
+            published_naive = datetime.datetime.strptime(entry.published, "%Y-%m-%dT%H:%M:%SZ")
+            published_utc = published_naive.replace(tzinfo=datetime.timezone.utc)
+
+            if published_utc < cutoff:
+                # Results are sorted in descending order, so once one paper is
+                # older than 24 hours, none of the rest can qualify either
+                print(f"[DEBUG] Paper {entry.title} published {published_utc} is older than {cutoff}; leaving this category early")
+                break
+
+            # Within the 24-hour window: add it to the list
+            all_papers.append({
                 "title": entry.title,
                 "url": entry.link,
-                "abstract": entry.summary
+                "abstract": entry.summary,
+                "published": published_utc
             })
-            print("--> Within 24 hours; added to the papers list")
-        else:
-            print("--> Older than 24 hours; not added")
 
-    print(f"\nDEBUG: final papers count = {len(papers)}")
-    return papers
-
-def filter_papers(papers):
+        else:
+            # The for loop finished without a break, so every paper in this
+            # batch was within the last 24 hours; fetch the next page,
+            # but guard against endless paging: stop once we reach the cap
+            start += step
+            if start >= MAX_PER_CATEGORY:
+                print(f"[DEBUG] Reached the per-category cap of {MAX_PER_CATEGORY} papers; stopping pagination.")
+                break
+            continue
+
+        # Reached only via the inner break, so exit the while True loop as well
+        break
+
+    print(f"[DEBUG] Category {category} yielded {len(all_papers)} papers (within 24 hours)")
+    return all_papers
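+
+# The for/else above relies on Python's loop-else semantics: the else block
+# runs only when the for loop finished without hitting a break. A minimal
+# sketch of the pattern (names here are illustrative, not part of the script):
+#
+#   while True:
+#       for item in batch:
+#           if too_old(item):
+#               break      # falls through to the trailing break below
+#       else:
+#           continue       # whole batch consumed; fetch the next page
+#       break              # leave the while loop after the inner break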
+
+def fetch_arxiv_papers_24h():
     """
-    Match by keyword: return the papers matching KEYWORDS and print debug info
+    Run the keyword search over every category and merge the results.
+    Simple dedup: use (title, published) to decide whether a paper has
+    already been seen.
     """
-    relevant = []
-    for idx, p in enumerate(papers):
-        abstract_lower = p["abstract"].lower()
-        title_lower = p["title"].lower()
-
-        # Debug printing
-        found_keywords = []
-        for kw in KEYWORDS:
-            if kw.lower() in abstract_lower or kw.lower() in title_lower:
-                found_keywords.append(kw)
-
-        if found_keywords:
-            print(f"\n[MATCH] Paper {idx+1}: {p['title']}")
-            print("Matched keywords: ", found_keywords)
-            relevant.append(p)
-        else:
-            print(f"\n[NO MATCH] Paper {idx+1}: {p['title']}")
-            print("No keywords matched")
+    unique_papers = []
+    seen_set = set()  # for dedup; holds (title, published) tuples
+
+    for cat in CATEGORIES:
+        cat_papers = search_category_for_24h(cat, KEYWORDS, hours=24)
+        for p in cat_papers:
+            key = (p["title"], p["published"])
+            if key not in seen_set:
+                seen_set.add(key)
+                unique_papers.append(p)
 
-    print(f"\nDEBUG: relevant_papers count = {len(relevant)}")
-    return relevant
+    print(f"[DEBUG] All categories combined: {len(unique_papers)} papers.")
+    return unique_papers
 
 def update_readme_in_target(relevant_papers):
     """
@@ -96,25 +136,25 @@ def update_readme_in_target(relevant_papers):
     """
     if not relevant_papers:
         print("No relevant papers found. Skipping README update.")
         return
 
-    # Create the GitHub client
+    # Sort by submission time, newest first, so the latest papers are written first
+    relevant_papers.sort(key=lambda x: x["published"], reverse=True)
+
     g = Github(TARGET_REPO_TOKEN)
     repo = g.get_repo(TARGET_REPO_NAME)
 
-    # Fetch the README
-    print(f"DEBUG: fetching README.md from target repo {TARGET_REPO_NAME}")
     readme_file = repo.get_contents("README.md", ref="main")
     readme_content = readme_file.decoded_content.decode("utf-8")
 
     date_str = datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%d")
-    new_section = f"\n\n### {date_str}\n"
-
+    new_section = f"\n\n### {date_str} (automated keyword search)\n"
     for p in relevant_papers:
-        new_section += f"- **[{p['title']}]({p['url']})**\n"
+        # Show the publication time in the listing
+        pub_str = p["published"].strftime("%Y-%m-%d %H:%M UTC")
+        new_section += f"- **[{p['title']}]({p['url']})** (Published: {pub_str})\n"
 
     updated_content = readme_content + new_section
-    print(f"DEBUG: content about to be appended to README.md:\n{new_section}")
+    print(f"[DEBUG] Content about to be appended to README.md:\n{new_section}")
 
-    # Update the README
     repo.update_file(
         path="README.md",
         message=f"Auto Update README with {len(relevant_papers)} papers ({date_str})",
@@ -122,14 +162,17 @@
         sha=readme_file.sha,
         branch="main"
     )
-    print("DEBUG: README.md update complete")
+    print("[DEBUG] README.md update complete")
 
 def main():
-    print("DEBUG: starting main() ...")
-    papers = fetch_arxiv_papers()
-    relevant_papers = filter_papers(papers)
-    update_readme_in_target(relevant_papers)
-    print("DEBUG: script finished.")
+    print("[DEBUG] Starting main() ...")
+    # 1. Fetch papers from the last 24 hours on arXiv (keyword search, multiple categories, early stop)
+    papers_24h = fetch_arxiv_papers_24h()
+
+    # 2. Write the results into the target repository
+    update_readme_in_target(papers_24h)
+
+    print("[DEBUG] Script finished.")
 
 if __name__ == "__main__":
     main()
--
cgit v1.2.3