-rw-r--r--  scripts/fetch_papers.py | 222
1 file changed, 111 insertions, 111 deletions
diff --git a/scripts/fetch_papers.py b/scripts/fetch_papers.py
index 7e54854..3ab09ec 100644
--- a/scripts/fetch_papers.py
+++ b/scripts/fetch_papers.py
@@ -9,135 +9,138 @@ from github import Github
 # Multiple categories
 CATEGORIES = ["cs.CL", "cs.AI", "stat.ML", "cs.IR"]

-# Multiple keywords; this is a plain-text search, so stick to single words or simple phrases
-# Note: arXiv search supports neither complex boolean expressions nor true fuzzy matching
-KEYWORDS = ["bias", "debias", "fairness", "equity", "inclusivity", "diversity", "ethical AI", "responsible AI"]
-
-# When building the query, KEYWORDS are combined as (ti:xxx OR abs:xxx) OR (ti:yyy OR abs:yyy),
-# so arXiv only returns papers whose title or abstract contains one of the keywords
-
-# Maximum number of results to process (pagination cap)
+# Single-word keyword list (multi-word phrases often fail to match)
+# To search a phrase you can try "ti:\"ethical AI\"+OR+abs:\"ethical AI\"", but results may be poor
+KEYWORDS = [
+    "bias",
+    "debias",
+    "fairness",
+    "equity",
+    "inclusivity",
+    "diversity",
+    "ethical",
+    "responsible",
+    # The word "AI" could be added too, but it would match far too many papers
+    # "AI"
+]
+
+# Time window to fetch (the past 7 days, convenient for testing)
+HOURS_TO_FETCH = 24 * 7  # 7 days
+
+# Maximum number of results to fetch
 MAX_PER_CATEGORY = 300

 # GitHub settings
 TARGET_REPO_TOKEN = os.getenv("TARGET_REPO_TOKEN")
 TARGET_REPO_NAME = os.getenv("TARGET_REPO_NAME")

-def build_search_query(keywords, category):
+
+def build_search_query_single_word(keyword, category):
     """
-    Build the search_query string for the arXiv API (e.g. (ti:bias OR abs:bias) OR (ti:fairness OR abs:fairness) ... AND cat:cs.AI)
+    For a single word, build something like (ti:bias OR abs:bias) AND cat:cs.AI
+    Note: arXiv search expects spaces to be written as "+"
     """
-    # Build one parenthesized expression per keyword:
-    # (ti:bias OR abs:bias) OR (ti:debias OR abs:debias) OR ...
-    kw_expressions = []
-    for kw in keywords:
-        kw_escaped = kw.replace(" ", "+")  # avoid problems with spaces and similar characters
-        part = f"(ti:{kw_escaped}+OR+abs:{kw_escaped})"
-        kw_expressions.append(part)
-    # Join the keyword expressions with +OR+
-    combined_keywords = "+OR+".join(kw_expressions)
-
-    # Finally add the category: AND cat:cs.AI
-    query = f"({combined_keywords})+AND+cat:{category}"
+    kw_escaped = keyword.replace(" ", "+")
+    query = f"(ti:{kw_escaped}+OR+abs:{kw_escaped})+AND+cat:{category}"
     return query

-def search_category_for_24h(category, keywords, hours=24):
+def search_category_7days(category, keywords, hours=HOURS_TO_FETCH):
     """
-    Search the given category by keyword, walking results in descending submission-date order, and stop at the first paper older than 24 hours.
-    Returns the list of matching papers
+    Search the given category with single-word keywords: one request per keyword, then merge and deduplicate.
+    This avoids the problem that a combined (keyword1 OR keyword2) query may not work against the arXiv API.
+    Still walks results in descending submission-date order and stops at the first paper outside the hours window.
     """
     now_utc = datetime.datetime.now(datetime.timezone.utc)
     cutoff = now_utc - datetime.timedelta(hours=hours)
-    base_url = "http://export.arxiv.org/api/query"
-    query_str = build_search_query(keywords, category)
-
     all_papers = []
-    start = 0
-    step = 50  # batch size per request; 50 or 100 both work
-    while True:
-        params = {
-            "search_query": query_str,
-            "sortBy": "submittedDate",
-            "sortOrder": "descending",
-            "start": start,
-            "max_results": step
-        }
-        print(f"\n[DEBUG] Requesting category: {category}, start={start}, query params: {params}")
-
-        resp = requests.get(base_url, params=params)
-        print("[DEBUG] HTTP status code:", resp.status_code)
-        feed = feedparser.parse(resp.content)
-
-        entries = feed.entries
-        print(f"[DEBUG] This batch returned {len(entries)} papers (start={start}).")
-
-        if not entries:
-            break  # no more results; stop
-
-        for i, entry in enumerate(entries):
-            # Parse the publication time
-            published_naive = datetime.datetime.strptime(entry.published, "%Y-%m-%dT%H:%M:%SZ")
-            published_utc = published_naive.replace(tzinfo=datetime.timezone.utc)
-
-            if published_utc < cutoff:
-                # Results are in descending order, so once one paper is older than
-                # 24 hours there is no need to look at the rest
-                print(f"[DEBUG] Paper {entry.title} at {published_utc} is older than {cutoff}; leaving this category early")
+    seen = set()  # for deduplication
+
+    for kw in keywords:
+        base_url = "http://export.arxiv.org/api/query"
+        start = 0
+        step = 50
+
+        while True:
+            query_str = build_search_query_single_word(kw, category)
+            params = {
+                "search_query": query_str,
+                "sortBy": "submittedDate",
+                "sortOrder": "descending",
+                "start": start,
+                "max_results": step
+            }
+
+            print(f"\n[DEBUG] category={category}, keyword={kw}, start={start}, params={params}")
+            resp = requests.get(base_url, params=params)
+            print("[DEBUG] HTTP status code:", resp.status_code)
+            feed = feedparser.parse(resp.content)
+            entries = feed.entries
+            print(f"[DEBUG] This batch returned {len(entries)} papers.")
+
+            if not entries:
+                break  # no more results
+
+            stop_fetch = False  # set to True once a paper older than the window shows up
+
+            for entry in entries:
+                published_naive = datetime.datetime.strptime(entry.published, "%Y-%m-%dT%H:%M:%SZ")
+                published_utc = published_naive.replace(tzinfo=datetime.timezone.utc)
+
+                if published_utc < cutoff:
+                    # Descending submission order: this paper and everything after it can be skipped
+                    print(f"[DEBUG] Paper {entry.title[:60]}... was submitted before {cutoff}")
+                    stop_fetch = True
+                    break
+
+                # Deduplication check
+                unique_key = (entry.id, published_utc)
+                if unique_key in seen:
+                    continue
+
+                seen.add(unique_key)
+                all_papers.append({
+                    "title": entry.title,
+                    "url": entry.link,
+                    "abstract": entry.summary,
+                    "published": published_utc
+                })
+
+            if stop_fetch:
+                # leave the while True loop
                 break

-            # Within 24 hours: add it to the list
-            all_papers.append({
-                "title": entry.title,
-                "url": entry.link,
-                "abstract": entry.summary,
-                "published": published_utc
-            })
-
-        else:
-            # The for loop ended without a break, so this whole batch is within 24 hours
-            # and we need to page on to the next batch.
-            # Guard against unbounded paging: stop once we pass the configured cap.
             start += step
             if start >= MAX_PER_CATEGORY:
                 print(f"[DEBUG] Reached the per-category cap of {MAX_PER_CATEGORY} papers; stopping pagination.")
                 break
-            continue
-
-        # A break above means we should leave the while True loop as well
-        break

-    print(f"[DEBUG] Category {category}: collected {len(all_papers)} papers (within 24 hours)")
+    print(f"[DEBUG] Category {category}: collected {len(all_papers)} papers (past {hours} hours, deduplicated)")
     return all_papers

-def fetch_arxiv_papers_24h():
+def fetch_arxiv_papers():
     """
-    Iterate over the categories, run the keyword search for each, and merge the results.
-    Simple dedup logic: treat (title, published) as the identity of a paper
+    Iterate over the categories, run the keyword search for each, then merge
     """
-    unique_papers = []
-    seen_set = set()  # for dedup; holds (title, published) tuples
-
+    total_papers = []
     for cat in CATEGORIES:
-        cat_papers = search_category_for_24h(cat, KEYWORDS, hours=24)
-        for p in cat_papers:
-            key = (p["title"], p["published"])
-            if key not in seen_set:
-                seen_set.add(key)
-                unique_papers.append(p)
+        cat_papers = search_category_7days(cat, KEYWORDS, HOURS_TO_FETCH)
+        total_papers.extend(cat_papers)

-    print(f"[DEBUG] {len(unique_papers)} papers after merging all categories.")
-    return unique_papers
+    print(f"[DEBUG] Total across all categories: {len(total_papers)}")

-def update_readme_in_target(relevant_papers):
-    """
-    Append the matched papers to README.md in the target repository
-    """
-    if not relevant_papers:
-        print("No relevant papers found. Skipping README update.")
+    # No second dedup pass here: search_category_7days already dedups within a category (per keyword)
+    # To dedup further (different categories do overlap), apply a set() here
+
+    return total_papers
+
+def update_readme_in_target(papers):
+    if not papers:
+        print("[DEBUG] No relevant papers found. Skipping README update.")
         return

-    # Sort newest-first by submission date before writing; you probably want the latest on top
-    relevant_papers.sort(key=lambda x: x["published"], reverse=True)
+    # Newest first by submission date
+    papers.sort(key=lambda x: x["published"], reverse=True)

     g = Github(TARGET_REPO_TOKEN)
     repo = g.get_repo(TARGET_REPO_NAME)
@@ -146,18 +149,18 @@ def update_readme_in_target(relevant_papers):
     readme_content = readme_file.decoded_content.decode("utf-8")

     date_str = datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%d")
-    new_section = f"\n\n### {date_str} (automated keyword search)\n"
-    for p in relevant_papers:
-        # Format the timestamp for display
-        pub_str = p["published"].strftime("%Y-%m-%d %H:%M UTC")
-        new_section += f"- **[{p['title']}]({p['url']})** (Published: {pub_str})\n"
+    new_section = f"\n\n### {date_str} (single-word search, past {HOURS_TO_FETCH} hours)\n"
+    for p in papers:
+        pub_str = p["published"].strftime("%Y-%m-%d %H:%M")
+        new_section += f"- **[{p['title']}]({p['url']})** (Published: {pub_str} UTC)\n"

     updated_content = readme_content + new_section
-    print(f"[DEBUG] Content about to be appended to README.md:\n{new_section}")
+
+    print(f"[DEBUG] Content about to be appended to README.md:\n{new_section[:500]}...")  # only print the first 500 characters as a sample

     repo.update_file(
         path="README.md",
-        message=f"Auto Update README with {len(relevant_papers)} papers ({date_str})",
+        message=f"Auto Update README with {len(papers)} papers ({date_str})",
         content=updated_content,
         sha=readme_file.sha,
         branch="main"
@@ -165,13 +168,10 @@ def update_readme_in_target(relevant_papers):
     print("[DEBUG] README.md update complete")

 def main():
-    print("[DEBUG] Starting main() ...")
-    # 1. Fetch the past 24 hours of papers from arXiv (keyword search, multiple categories, early stop)
-    papers_24h = fetch_arxiv_papers_24h()
-
-    # 2. Write the results to the target repository
-    update_readme_in_target(papers_24h)
-
+    print("[DEBUG] Script starting...")
+    papers = fetch_arxiv_papers()
+    print(f"[DEBUG] fetch_arxiv_papers() returned {len(papers)} papers.")
+    update_readme_in_target(papers)
     print("[DEBUG] Script finished.")

 if __name__ == "__main__":
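
A note on the query encoding. The patch works around combined (keyword1 OR keyword2) queries by issuing one request per keyword, and it pre-inserts "+" for spaces. Be aware that requests.get(base_url, params=...) percent-encodes parameter values, so a literal "+" in search_query reaches arXiv as %2B rather than as a space separator. A minimal sketch of an alternative that keeps plain spaces and lets requests do the encoding (the helper name fetch_arxiv_page is illustrative, not part of the patch):

import feedparser
import requests

ARXIV_API = "http://export.arxiv.org/api/query"

def fetch_arxiv_page(keyword, category, start=0, max_results=50):
    """Fetch one page of arXiv results, letting requests handle URL encoding."""
    # Plain spaces here; requests encodes them as "+"/"%20", which the arXiv
    # API accepts. Pre-inserting "+" ourselves would be double-encoded to %2B.
    query = f"(ti:{keyword} OR abs:{keyword}) AND cat:{category}"
    params = {
        "search_query": query,
        "sortBy": "submittedDate",
        "sortOrder": "descending",
        "start": start,
        "max_results": max_results,
    }
    resp = requests.get(ARXIV_API, params=params, timeout=30)
    resp.raise_for_status()
    return feedparser.parse(resp.content).entries

# e.g. entries = fetch_arxiv_page("fairness", "cs.CL")

If this double-encoding was the reason combined queries appeared broken, the multi-keyword form from the pre-patch code might work again with plain spaces, but that is untested here.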
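The new fetch_arxiv_papers deliberately skips a second dedup pass, and its comment suggests a set() for cross-category overlap. A minimal sketch of that pass, keyed on the entry URL the patch already stores (the helper name dedup_across_categories is illustrative):

def dedup_across_categories(papers):
    """Drop papers collected under more than one category.

    Keyed on the abs-page URL stored by search_category_7days; the
    (title, published) pair used by the pre-patch code would work too.
    """
    seen = set()
    unique = []
    for p in papers:
        if p["url"] in seen:
            continue
        seen.add(p["url"])
        unique.append(p)
    return unique

# In fetch_arxiv_papers, the final line would then become:
#     return dedup_across_categories(total_papers)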
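Since update_readme_in_target commits to a live repository, a dry run that only exercises the fetch path can be useful when testing this change. A sketch, assuming the interpreter is started from the scripts/ directory so the module imports as fetch_papers (that layout is an assumption):

# Dry run: fetch only, skip the README commit.
# TARGET_REPO_TOKEN / TARGET_REPO_NAME are read at import time but are only
# used by update_readme_in_target, so they can stay unset for this test.
import fetch_papers

papers = fetch_papers.fetch_arxiv_papers()
print(f"{len(papers)} papers within the past {fetch_papers.HOURS_TO_FETCH} hours")
for p in papers[:5]:
    print(p["published"], p["title"][:80])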
