# scripts/fetch_papers.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69

import requests
import feedparser
import datetime

def fetch_arxiv_bias_fairness(days=3, *, keywords=("bias", "fairness"),
                              category="cs.IR", max_results=100, timeout=30):
    """Search arXiv for recent papers whose title or abstract mentions a keyword.

    The arXiv query combines an ``all:`` match on every keyword (OR-ed
    together), an optional category restriction and a ``submittedDate``
    range covering the last ``days`` days. Because ``all:`` can match fields
    other than the title and abstract, the returned entries are filtered
    again locally against title + summary.

    Args:
        days: Size of the look-back window, in days, counted from the
            current UTC time. The range is only precise to the day: it runs
            from 00:00 of the start day to 23:59 of today.
        keywords: Search terms; a paper is kept when any of them occurs
            (case-insensitively) in its title or summary. A single string is
            treated as one keyword. Terms containing a space are sent as a
            quoted phrase.
        category: arXiv category to restrict the search to (e.g. ``cs.IR``).
            Pass ``None`` or ``""`` to search every category.
        max_results: Maximum number of entries requested from the API.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A list of dicts with the keys ``title``, ``published`` and ``link``,
        in the order returned by arXiv (newest submission first). An empty
        list is returned when the request fails or answers with a non-200
        status.

    Raises:
        ValueError: If ``keywords`` is empty.
    """
    terms = [keywords] if isinstance(keywords, str) else list(keywords)
    if not terms:
        raise ValueError("keywords must contain at least one search term")

    # Timezone-aware replacement for the deprecated datetime.utcnow().
    now = datetime.datetime.now(datetime.timezone.utc)
    start_day = now - datetime.timedelta(days=days)
    # Date range format: [YYYYMMDD0000 TO YYYYMMDD2359]
    start_str = start_day.strftime("%Y%m%d0000")
    end_str = now.strftime("%Y%m%d2359")

    # Build the boolean expression with plain spaces. requests URL-encodes
    # the parameter values itself (space -> '+'); a literal '+' written here
    # would be sent as '%2B' and reach arXiv as a plus sign, not a separator.
    keyword_clause = " OR ".join(
        f'all:"{term}"' if " " in term else f"all:{term}" for term in terms
    )
    clauses = [f"({keyword_clause})"]
    if category:
        clauses.append(f"cat:{category}")
    clauses.append(f"submittedDate:[{start_str} TO {end_str}]")
    search_query = " AND ".join(clauses)

    base_url = "http://export.arxiv.org/api/query"
    params = {
        "search_query": search_query,
        "sortBy": "submittedDate",
        "sortOrder": "descending",
        "max_results": max_results,
    }
    print("[DEBUG] search_query=", search_query)

    try:
        response = requests.get(base_url, params=params, timeout=timeout)
    except requests.RequestException as exc:
        # Treat network failures like a bad HTTP status: report, return nothing.
        print("[ERROR] Request failed:", exc)
        return []

    print("[DEBUG] Full URL =", response.url)
    if response.status_code != 200:
        print("[ERROR] HTTP Status:", response.status_code)
        return []

    feed = feedparser.parse(response.content)
    entries = feed.entries
    print("[DEBUG] arXiv 返回条数:", len(entries))

    needles = [term.lower() for term in terms]
    papers = []
    for entry in entries:
        # .get() keeps a malformed entry (missing field) from aborting the run.
        title = entry.get("title", "")
        summary = entry.get("summary", "")

        # Strict local re-check: keep the paper only if the title or the
        # abstract really contains a keyword, since all: may have matched
        # some other field.
        text = (title + " " + summary).lower()
        if any(needle in text for needle in needles):
            papers.append({
                "title": title,
                "published": entry.get("published", ""),
                "link": entry.get("link", ""),
            })

    return papers

if __name__ == "__main__":
    # Manual smoke test: query the last 3 days and list every match,
    # numbered from 1, as "<n>. <title> - <published> - <link>".
    recent_papers = fetch_arxiv_bias_fairness(days=3)
    print(f"找到 {len(recent_papers)} 篇论文：")
    for position, paper in enumerate(recent_papers, start=1):
        title = paper["title"]
        published = paper["published"]
        link = paper["link"]
        print(f"{position}. {title} - {published} - {link}")