| author | blackhao <13851610112@163.com> | 2025-03-29 06:30:26 -0500 |
|---|---|---|
| committer | blackhao <13851610112@163.com> | 2025-03-29 06:30:26 -0500 |
| commit | c75c7d42be8aeb998b36f5e444e844b7242bbb90 | |
| tree | 9c06a5daefc9b793b346c09be4a8bcf1a3ad3264 | |
| parent | e87fbb1d701f96ee53a2a7e9e5cc5ca6624ff85f | |
first version for test
| -rw-r--r-- | .github/workflows/daily_papers.yml | 27 |
| -rw-r--r-- | scripts/fetch_papers.py | 89 |
2 files changed, 116 insertions, 0 deletions
diff --git a/.github/workflows/daily_papers.yml b/.github/workflows/daily_papers.yml
index e69de29..d32db15 100644
--- a/.github/workflows/daily_papers.yml
+++ b/.github/workflows/daily_papers.yml
@@ -0,0 +1,27 @@
+name: Daily Paper Fetch
+
+on:
+  schedule:
+    - cron: '0 12 * * *'  # triggers every day at 12:00 UTC
+  workflow_dispatch:      # allows manual triggering
+
+jobs:
+  fetch-and-update-readme:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.10"  # quoted so YAML does not read it as the float 3.1
+
+      - name: Install dependencies
+        run: |
+          pip install requests sentence-transformers feedparser PyGithub
+
+      - name: Run script
+        env:
+          TARGET_REPO_TOKEN: ${{ secrets.TARGET_REPO_TOKEN }}  # configured later as a secret in the main repository
+          TARGET_REPO_NAME: "YourUsername/llm-bias-papers"  # to be replaced with the actual target repository name later
+        run: python scripts/fetch_papers.py
diff --git a/scripts/fetch_papers.py b/scripts/fetch_papers.py
index e69de29..49c3ba8 100644
--- a/scripts/fetch_papers.py
+++ b/scripts/fetch_papers.py
@@ -0,0 +1,89 @@
+import os
+import requests
+import feedparser
+import datetime
+from github import Github
+
+ARXIV_CATEGORIES = "cs.CL OR cs.AI OR stat.ML"
+KEYWORDS = ["LLM bias", "debias", "fairness", "equity", "inclusivity", "diversity", "ethical AI", "responsible AI"]
+
+TARGET_REPO_TOKEN = os.getenv("TARGET_REPO_TOKEN")
+TARGET_REPO_NAME = os.getenv("TARGET_REPO_NAME")
+
+def fetch_arxiv_papers():
+    """
+    Fetch new papers submitted to arXiv in the past 24 hours.
+    """
+    base_url = "http://export.arxiv.org/api/query"
+    # Use timezone-aware UTC times
+    now_utc = datetime.datetime.now(datetime.timezone.utc)
+    yesterday_utc = now_utc - datetime.timedelta(days=1)
+
+    params = {
+        "search_query": f"cat:{ARXIV_CATEGORIES}",
+        "sortBy": "submittedDate",
+        "sortOrder": "descending",
+        "max_results": 100
+    }
+    r = requests.get(base_url, params=params)
+    feed = feedparser.parse(r.content)
+
+    papers = []
+    for entry in feed.entries:
+        # entry.published looks like "2025-03-28T10:05:24Z"
+        # First parse it into a naive datetime
+        published_naive = datetime.datetime.strptime(entry.published, "%Y-%m-%dT%H:%M:%SZ")
+        # Then attach the UTC timezone so it becomes an aware datetime
+        published_utc = published_naive.replace(tzinfo=datetime.timezone.utc)
+
+        if published_utc > yesterday_utc:
+            papers.append({
+                "title": entry.title,
+                "url": entry.link,
+                "abstract": entry.summary
+            })
+    return papers
+
+def filter_papers(papers):
+    relevant = []
+    for p in papers:
+        abstract_lower = p["abstract"].lower()
+        title_lower = p["title"].lower()
+        if any(kw.lower() in abstract_lower or kw.lower() in title_lower for kw in KEYWORDS):
+            relevant.append(p)
+    return relevant
+
+def update_readme_in_target(relevant_papers):
+    if not relevant_papers:
+        print("No relevant papers found. Skipping README update.")
+        return
+
+    g = Github(TARGET_REPO_TOKEN)
+    repo = g.get_repo(TARGET_REPO_NAME)
+
+    readme_file = repo.get_contents("README.md", ref="main")
+    readme_content = readme_file.decoded_content.decode("utf-8")
+
+    # A timezone-aware timestamp works for the date stamp as well (the format string is unaffected)
+    date_str = datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%d")
+    new_section = f"\n\n### {date_str}\n"
+    for p in relevant_papers:
+        new_section += f"- **[{p['title']}]({p['url']})**\n"
+
+    updated_content = readme_content + new_section
+
+    repo.update_file(
+        path="README.md",
+        message=f"Auto Update README with {len(relevant_papers)} papers ({date_str})",
+        content=updated_content,
+        sha=readme_file.sha,
+        branch="main"
+    )
+
+def main():
+    papers = fetch_arxiv_papers()
+    relevant_papers = filter_papers(papers)
+    update_readme_in_target(relevant_papers)
+
+if __name__ == "__main__":
+    main()
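
One detail in `fetch_arxiv_papers` worth double-checking: the query expands to `cat:cs.CL OR cs.AI OR stat.ML`, so only the first term carries the `cat:` field prefix, and the bare `cs.AI` and `stat.ML` terms are likely treated by the arXiv API as free-text matches rather than category filters. A minimal sketch of one way to prefix every category explicitly (the `categories` list and variable names here are illustrative, not part of the commit):

```python
# Sketch only: apply the cat: field to each category before OR-ing them together.
categories = ["cs.CL", "cs.AI", "stat.ML"]
search_query = " OR ".join(f"cat:{c}" for c in categories)
print(search_query)  # cat:cs.CL OR cat:cs.AI OR cat:stat.ML
```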

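Since this commit is described as a first version for test, a quick local dry run of the fetch and filter steps, without touching the target repository, can be a useful check. A minimal sketch, assuming it is run from the repository root with the workflow's dependencies installed and network access (the import path and printed summary are illustrative, not part of the commit):

```python
# Local dry run: fetch and filter only; no README update, so no token is needed.
from scripts.fetch_papers import fetch_arxiv_papers, filter_papers

papers = fetch_arxiv_papers()
relevant = filter_papers(papers)

print(f"Fetched {len(papers)} papers from the last 24 hours; {len(relevant)} matched the keywords.")
for p in relevant:
    print(f"- {p['title']} ({p['url']})")
```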