author     haoyuren <13851610112@163.com>  2025-06-29 16:19:06 -0700
committer  haoyuren <13851610112@163.com>  2025-06-29 16:19:06 -0700
commit     388f0407ef8c9f68866509f722491fcfd44afa11
tree       e90b933967670d08be1134e80bdee0187ec606f3
parent     da629fd626deb304ac8c562bfafbf4922c997b18
fix bugs
-rw-r--r--  .github/workflows/daily_papers.yml  |  83
-rw-r--r--  README.md                           | 184
-rw-r--r--  requirements.txt                    |   5
-rw-r--r--  scripts/fetch_papers.py             | 514
-rw-r--r--  scripts/single_run_test.py          |  36
-rw-r--r--  scripts/test_arxiv_widest.py        |  78
-rw-r--r--  scripts/test_daily_fetch.py         | 122
-rw-r--r--  scripts/test_historical_fetch.py    | 117
-rw-r--r--  scripts/test_uiuc_chat.py           |  71
9 files changed, 819 insertions, 391 deletions
diff --git a/.github/workflows/daily_papers.yml b/.github/workflows/daily_papers.yml
index cae6b6c..510e75c 100644
--- a/.github/workflows/daily_papers.yml
+++ b/.github/workflows/daily_papers.yml
@@ -1,30 +1,71 @@
-name: Daily Paper Fetch
+name: Arxiv LLM Bias Paper Fetcher
 
 on:
   schedule:
+    # Run daily at 12:00 UTC
     - cron: '0 12 * * *'
   workflow_dispatch:
+    inputs:
+      mode:
+        description: 'Fetch mode: daily or historical'
+        required: true
+        default: 'daily'
+        type: choice
+        options:
+          - daily
+          - historical
+      days:
+        description: 'Number of days to fetch (only for daily mode)'
+        required: false
+        default: '1'
+        type: string
 
 jobs:
-  fetch-job:
+  fetch-papers:
     runs-on: ubuntu-latest
     steps:
-      - name: Check out
-        uses: actions/checkout@v4
-
-      - name: Set up Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.10'
-
-      - name: Install dependencies
-        run: |
-          pip install requests feedparser PyGithub openai
-
-      - name: Run fetch script
-        env:
-          TARGET_REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}  # GitHub's automatic token
-          TARGET_REPO_NAME: "YurenHao0426/awesome-llm-bias-papers"
-          UIUC_API_KEY: ${{ secrets.UIUC_API_KEY }}  # read your API key
-          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
-        run: python scripts/fetch_papers.py
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+
+      - name: Cache pip dependencies
+        uses: actions/cache@v4
+        with:
+          path: ~/.cache/pip
+          key: ${{ runner.os }}-pip-${{ hashFiles('requirements.txt') }}
+          restore-keys: |
+            ${{ runner.os }}-pip-
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+
+      - name: Run paper fetcher (Daily Mode)
+        if: github.event_name == 'schedule' || (github.event_name == 'workflow_dispatch' && github.event.inputs.mode == 'daily')
+        env:
+          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+          TARGET_REPO_TOKEN: ${{ secrets.TARGET_REPO_TOKEN }}
+          TARGET_REPO_NAME: "YurenHao0426/awesome-llm-bias-papers"
+          FETCH_MODE: "daily"
+          FETCH_DAYS: ${{ github.event.inputs.days || '1' }}
+        run: python scripts/fetch_papers.py
+
+      - name: Run paper fetcher (Historical Mode)
+        if: github.event_name == 'workflow_dispatch' && github.event.inputs.mode == 'historical'
+        env:
+          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+          TARGET_REPO_TOKEN: ${{ secrets.TARGET_REPO_TOKEN }}
+          TARGET_REPO_NAME: "YurenHao0426/awesome-llm-bias-papers"
+          FETCH_MODE: "historical"
+        run: python scripts/fetch_papers.py
+
+      - name: Log completion
+        run: |
+          echo "Paper fetching completed successfully!"
+          echo "Mode: ${{ github.event.inputs.mode || 'daily' }}"
+          echo "Repository: YurenHao0426/awesome-llm-bias-papers"
diff --git a/README.md b/README.md
index 7913174..8dc5c80 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,183 @@
-# PaperFetcher
\ No newline at end of file
+# ArXiv LLM Bias Paper Fetcher
+
+An automated system for discovering and cataloging research papers related to bias in Large Language Models (LLMs) from arXiv.org. This tool uses GPT-4o to intelligently filter papers and automatically updates a target repository with newly discovered relevant research.
+
+## 🎯 Features
+
+- **Intelligent Paper Detection**: Uses GPT-4o to analyze paper titles and abstracts for LLM bias relevance
+- **Automated Daily Updates**: Runs daily via GitHub Actions to fetch the latest papers
+- **Historical Paper Collection**: Can fetch and process papers from the past 2 years
+- **GitHub Integration**: Automatically updates target repository README with new findings
+- **Comprehensive Filtering**: Focuses on AI/ML categories most likely to contain relevant research
+
+## 🔧 Setup & Configuration
+
+### Prerequisites
+
+- Python 3.11+
+- OpenAI API key with GPT-4o access
+- GitHub Personal Access Token with repository write permissions
+
+### Environment Variables
+
+Configure the following environment variables:
+
+```bash
+OPENAI_API_KEY=your_openai_api_key_here
+TARGET_REPO_TOKEN=your_github_token_here
+TARGET_REPO_NAME=username/repository-name
+```
+
+For GitHub Actions, these should be configured as repository secrets:
+- `OPENAI_API_KEY`
+- `TARGET_REPO_TOKEN`
+
+### Installation
+
+1. Clone this repository:
+```bash
+git clone https://github.com/YurenHao0426/PaperFetcher.git
+cd PaperFetcher
+```
+
+2. Install dependencies:
+```bash
+pip install -r requirements.txt
+```
+
+## 🚀 Usage
+
+### Daily Paper Fetching
+
+Fetch papers from the last 24 hours:
+```bash
+python scripts/fetch_papers.py
+```
+
+Fetch papers from the last N days:
+```bash
+FETCH_DAYS=7 python scripts/fetch_papers.py
+```
+
+### Historical Paper Fetching
+
+Fetch papers from the past 2 years:
+```bash
+FETCH_MODE=historical python scripts/fetch_papers.py
+```
+
+### Testing
+
+Test the historical fetching functionality:
+```bash
+python scripts/test_historical_fetch.py
+```
+
+## 🤖 GitHub Actions
+
+The project includes automated GitHub Actions workflows:
+
+### Daily Schedule
+- Runs daily at 12:00 UTC
+- Fetches papers from the last 24 hours
+- Updates target repository automatically
+
+### Manual Trigger
+- Can be triggered manually from GitHub Actions tab
+- Supports both `daily` and `historical` modes
+- Configurable number of days for daily mode
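+
+If you use the GitHub CLI, a manual run can also be started from the terminal (a hedged example; adjust the flags if your setup differs):
+
+```bash
+gh workflow run daily_papers.yml -f mode=daily -f days=7
+gh workflow run daily_papers.yml -f mode=historical
+```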
+
+## 📋 Paper Categories
+
+The system searches these arXiv categories for relevant papers:
+
+- `cs.AI` - Artificial Intelligence
+- `cs.CL` - Computation and Language
+- `cs.CV` - Computer Vision and Pattern Recognition
+- `cs.LG` - Machine Learning
+- `cs.NE` - Neural and Evolutionary Computing
+- `cs.RO` - Robotics
+- `cs.IR` - Information Retrieval
+- `cs.HC` - Human-Computer Interaction
+- `stat.ML` - Machine Learning (Statistics)
+
+## 🎯 Relevance Criteria
+
+Papers are considered relevant if they discuss:
+
+- Bias in large language models, generative AI, or foundation models
+- Fairness issues in NLP models or text generation
+- Ethical concerns with language models
+- Demographic bias in AI systems
+- Alignment and safety of language models
+- Bias evaluation or mitigation in NLP
+
+## 📁 Project Structure
+
+```
+PaperFetcher/
+├── scripts/
+│   ├── fetch_papers.py              # Main fetching script
+│   ├── test_historical_fetch.py     # Historical fetching test
+│   └── [other test scripts]         # Legacy test scripts
+├── .github/
+│   └── workflows/
+│       └── daily_papers.yml         # GitHub Actions workflow
+├── requirements.txt                 # Python dependencies
+└── README.md                        # This file
+```
+
+## 🔍 How It Works
+
+1. **Paper Retrieval**: Queries arXiv API for papers in relevant CS categories
+2. **Date Filtering**: Filters papers based on submission/update dates
+3. **AI Analysis**: Uses GPT-4o to analyze each paper's title and abstract
+4. **Repository Update**: Adds relevant papers to target repository's README
+5. **Version Control**: Commits changes with descriptive commit messages
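+
+In code, the same pipeline can be driven directly with the classes defined in `scripts/fetch_papers.py` (a minimal sketch; the placeholder credentials are assumptions to replace with your own):
+
+```python
+from datetime import datetime, timezone, timedelta
+from scripts.fetch_papers import ArxivPaperFetcher, GitHubUpdater
+
+# Steps 1-3: retrieve recent papers and keep only the ones GPT-4o flags as relevant
+fetcher = ArxivPaperFetcher(openai_api_key="YOUR_OPENAI_API_KEY")
+end = datetime.now(timezone.utc)
+papers = fetcher.fetch_papers_by_date_range(end - timedelta(days=1), end, max_papers=200)
+relevant = fetcher.filter_papers_with_gpt(papers)
+
+# Steps 4-5: append the findings to the target repository's README and commit
+updater = GitHubUpdater("YOUR_GITHUB_TOKEN", "YurenHao0426/awesome-llm-bias-papers")
+updater.update_readme_with_papers(relevant)
+```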
+
+## ⚙️ Configuration Options
+
+### Environment Variables
+
+| Variable | Description | Default | Required |
+|----------|-------------|---------|----------|
+| `OPENAI_API_KEY` | OpenAI API key for GPT-4o | - | Yes |
+| `TARGET_REPO_TOKEN` | GitHub token for repository access | - | Yes |
+| `TARGET_REPO_NAME` | Target repository (owner/repo format) | `YurenHao0426/awesome-llm-bias-papers` | No |
+| `FETCH_MODE` | Mode: `daily` or `historical` | `daily` | No |
+| `FETCH_DAYS` | Number of days to fetch (daily mode) | `1` | No |
+
+## 🐛 Troubleshooting
+
+### Common Issues
+
+1. **API Rate Limits**: The system includes retry logic and respects API limits
+2. **Large Historical Fetches**: Historical mode processes up to 5000 papers and may take time
+3. **Token Permissions**: Ensure GitHub token has write access to target repository
+
+### Debugging
+
+Enable debug logging by modifying the logging level in the script:
+```python
+logging.basicConfig(level=logging.DEBUG)
+```
+
+## 🤝 Contributing
+
+1. Fork the repository
+2. Create a feature branch
+3. Make your changes
+4. Test thoroughly
+5. Submit a pull request
+
+## 📄 License
+
+This project is open source and available under the MIT License.
+
+## 📧 Contact
+
+For questions or issues, please open a GitHub issue or contact the maintainer.
+
+---
+
+**Note**: This tool is designed for academic research purposes. Please respect arXiv's usage policies and OpenAI's API terms of service.
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..4618867
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,5 @@
+requests>=2.31.0
+feedparser>=6.0.10
+PyGithub>=1.58.0
+openai>=1.0.0
+python-dateutil>=2.8.2
\ No newline at end of file
diff --git a/scripts/fetch_papers.py b/scripts/fetch_papers.py
index a0d98f3..4fdfc87 100644
--- a/scripts/fetch_papers.py
+++ b/scripts/fetch_papers.py
@@ -1,203 +1,349 @@
+#!/usr/bin/env python3
+"""
+Arxiv Paper Fetcher for LLM Bias Research
+==========================================
+
+This script fetches computer science papers from arxiv.org, filters them using
+GPT-4o to identify papers related to LLM bias and fairness, and updates a
+target GitHub repository's README with the results.
+
+Features:
+- Fetches papers from the last 24 hours (or specified days)
+- Can also fetch historical papers from the past 2 years
+- Uses GPT-4o for intelligent filtering
+- Updates target repository via GitHub API
+- Supports GitHub Actions automation
+"""
+
import os
+import sys
+import json
+import logging
import requests
import feedparser
-import datetime
+from datetime import datetime, timezone, timedelta
+from typing import List, Dict, Optional, Tuple
from github import Github
from openai import OpenAI
-
-ALLOWED_CATEGORIES = [
- "cs.AI", "cs.CL", "cs.CV", "cs.LG", "cs.NE", "cs.RO",
- "cs.IR", "stat.ML"
-]
-
-SYSTEM_PROMPT = (
- "You are a helpful assistant. The user will give you a paper title and abstract. "
- "Your task: Decide if this paper is about large language models (or generative text models) AND about bias/fairness. "
- "If yes, respond with just a single character: 1. Otherwise, respond with a single character: 0. "
- "No extra explanation, no punctuation—only the number."
+# Configure logging
+logging.basicConfig(
+ level=logging.INFO,
+ format='%(asctime)s - %(levelname)s - %(message)s',
+ handlers=[
+ logging.StreamHandler(sys.stdout),
+ ]
)
+logger = logging.getLogger(__name__)
+
+# Configuration
+ARXIV_BASE_URL = "http://export.arxiv.org/api/query"
+MAX_RESULTS_PER_BATCH = 100
+MAX_RETRIES = 3
+
+# Computer Science categories related to AI/ML
+CS_CATEGORIES = [
+ "cs.AI", # Artificial Intelligence
+ "cs.CL", # Computation and Language
+ "cs.CV", # Computer Vision and Pattern Recognition
+ "cs.LG", # Machine Learning
+ "cs.NE", # Neural and Evolutionary Computing
+ "cs.RO", # Robotics
+ "cs.IR", # Information Retrieval
+ "cs.HC", # Human-Computer Interaction
+ "stat.ML" # Machine Learning (Statistics)
+]
-def advanced_filter(entry):
- title = getattr(entry, 'title', '').lower()
- summary = getattr(entry, 'summary', '').lower()
- full_text = title + " " + summary
-
- general_terms = ["bias", "fairness"]
- model_terms = ["llm", "language model", "transformer", "gpt", "nlp",
- "pretrained", "embedding", "generation", "alignment", "ai"]
- negative_terms = ["estimation", "variance", "quantum", "physics",
- "sensor", "circuit", "electronics", "hardware"]
-
- has_general = any(term in full_text for term in general_terms)
- has_model = any(term in full_text for term in model_terms)
- has_negative = any(term in full_text for term in negative_terms)
-
- return (has_general and has_model) and (not has_negative)
-
-def is_relevant_by_api(title, summary, client, model="gpt-4-turbo"):
- prompt = f"Title: {title}\nAbstract: {summary}"
- try:
- dialogue = client.chat.completions.create(
- model=model,
- messages=[
- {"role": "system", "content": SYSTEM_PROMPT},
- {"role": "user", "content": prompt}
- ],
- temperature=0.0,
- max_tokens=1
- )
- response_msg = dialogue.choices[0].message.content.strip()
- print(f"[DEBUG][API] OpenAI response='{response_msg}' for paper '{title[:60]}...'")
- return response_msg == "1"
- except Exception as e:
- print("[ERROR][API] calling OpenAI API:", e)
- return False
-
-def fetch_papers_combined(days=1):
- import datetime, requests, feedparser, os
- from openai import OpenAI
-
- # 1) Compute & log the window
- now_utc = datetime.datetime.now(datetime.timezone.utc)
- cutoff_utc = now_utc - datetime.timedelta(days=days)
- print(f"[DEBUG] now_utc = {now_utc.isoformat()}")
- print(f"[DEBUG] cutoff_utc = {cutoff_utc.isoformat()}")
-
- # 2) Build (or disable) category filtering
- cat_query = " OR ".join(f"cat:{c}" for c in ALLOWED_CATEGORIES)
- # To disable completely, you could instead do:
- # cat_query = "all:*"
-
- base_url = "http://export.arxiv.org/api/query"
- step, start = 100, 0
- all_entries = []
-
- while True:
- params = {
- "search_query": cat_query,
- "sortBy": "submittedDate",
- "sortOrder": "descending",
- "start": start,
- "max_results": step
+GPT_SYSTEM_PROMPT = """You are an expert researcher in AI/ML bias and fairness.
+
+Your task is to analyze a paper's title and abstract to determine if it's relevant to LLM (Large Language Model) bias and fairness research.
+
+A paper is relevant if it discusses:
+- Bias in large language models, generative AI, or foundation models
+- Fairness issues in NLP models or text generation
+- Ethical concerns with language models
+- Demographic bias in AI systems
+- Alignment and safety of language models
+- Bias evaluation or mitigation in NLP
+
+Respond with exactly "1" if the paper is relevant, or "0" if it's not relevant.
+Do not include any other text in your response."""
+
+
+class ArxivPaperFetcher:
+ """Main class for fetching and filtering arxiv papers."""
+
+ def __init__(self, openai_api_key: str):
+ """Initialize the fetcher with OpenAI API key."""
+ self.openai_client = OpenAI(api_key=openai_api_key)
+ self.session = requests.Session()
+ self.session.headers.update({
+ 'User-Agent': 'PaperFetcher/1.0 (https://github.com/YurenHao0426/PaperFetcher)'
+ })
+
+ def fetch_papers_by_date_range(self, start_date: datetime, end_date: datetime,
+ max_papers: int = 1000) -> List[Dict]:
+ """
+ Fetch papers from arxiv within a specific date range.
+
+ Args:
+ start_date: Start date for paper search
+ end_date: End date for paper search
+ max_papers: Maximum number of papers to fetch
+
+ Returns:
+ List of paper dictionaries
+ """
+ logger.info(f"Fetching papers from {start_date.date()} to {end_date.date()}")
+
+ # Build category query
+ category_query = " OR ".join(f"cat:{cat}" for cat in CS_CATEGORIES)
+
+ all_papers = []
+ start_index = 0
+
+ while len(all_papers) < max_papers:
+ try:
+ # Build search query
+ search_query = f"({category_query})"
+
+ params = {
+ "search_query": search_query,
+ "sortBy": "submittedDate",
+ "sortOrder": "descending",
+ "start": start_index,
+ "max_results": min(MAX_RESULTS_PER_BATCH, max_papers - len(all_papers))
+ }
+
+ logger.debug(f"Fetching batch starting at index {start_index}")
+ response = self.session.get(ARXIV_BASE_URL, params=params, timeout=30)
+ response.raise_for_status()
+
+ feed = feedparser.parse(response.content)
+ entries = feed.entries
+
+ if not entries:
+ logger.info("No more papers available")
+ break
+
+ # Filter papers by date
+ batch_papers = []
+ for entry in entries:
+ paper_date = datetime(*entry.updated_parsed[:6], tzinfo=timezone.utc)
+
+ if paper_date < start_date:
+ # Papers are sorted by date, so we can stop here
+ logger.info(f"Reached papers older than start date: {paper_date.date()}")
+ return all_papers
+
+ if start_date <= paper_date <= end_date:
+ paper_data = self._parse_paper_entry(entry)
+ batch_papers.append(paper_data)
+
+ all_papers.extend(batch_papers)
+ logger.info(f"Fetched {len(batch_papers)} papers in date range from this batch. Total: {len(all_papers)}")
+
+ # If we got fewer papers than requested, we've reached the end
+ if len(entries) < MAX_RESULTS_PER_BATCH:
+ break
+
+ start_index += MAX_RESULTS_PER_BATCH
+
+ except Exception as e:
+ logger.error(f"Error fetching papers: {e}")
+ break
+
+ logger.info(f"Total papers fetched: {len(all_papers)}")
+ return all_papers
+
+ def _parse_paper_entry(self, entry) -> Dict:
+ """Parse a feedparser entry into a paper dictionary."""
+ return {
+ "title": entry.title.replace('\n', ' ').strip(),
+ "abstract": entry.summary.replace('\n', ' ').strip(),
+ "authors": [author.name for author in entry.authors] if hasattr(entry, 'authors') else [],
+ "published": entry.published,
+ "updated": entry.updated,
+ "link": entry.link,
+ "arxiv_id": entry.id.split('/')[-1],
+ "categories": [tag.term for tag in entry.tags] if hasattr(entry, 'tags') else []
}
- resp = requests.get(base_url, params=params, timeout=30)
- resp.raise_for_status()
- print(f"[DEBUG] arXiv query URL: {resp.url}")
-
- feed = feedparser.parse(resp.content)
- batch = feed.entries
- print(f"[DEBUG] fetched batch size: {len(batch)}")
- if not batch:
- break
-
- # 3) Use the *updated* time (announcement) for your 24h filter
- kept = []
- for e in batch:
- updated = datetime.datetime(
- *e.updated_parsed[:6],
- tzinfo=datetime.timezone.utc
+
+ def filter_papers_with_gpt(self, papers: List[Dict]) -> List[Dict]:
+ """
+ Filter papers using GPT-4o to identify bias-related research.
+
+ Args:
+ papers: List of paper dictionaries
+
+ Returns:
+ List of relevant papers
+ """
+ logger.info(f"Filtering {len(papers)} papers using GPT-4o")
+ relevant_papers = []
+
+ for i, paper in enumerate(papers, 1):
+ try:
+ is_relevant = self._check_paper_relevance(paper)
+ if is_relevant:
+ relevant_papers.append(paper)
+ logger.info(f"✓ Paper {i}/{len(papers)}: {paper['title'][:80]}...")
+ else:
+ logger.debug(f"✗ Paper {i}/{len(papers)}: {paper['title'][:80]}...")
+
+ except Exception as e:
+ logger.error(f"Error filtering paper {i}: {e}")
+ continue
+
+ logger.info(f"Found {len(relevant_papers)} relevant papers out of {len(papers)}")
+ return relevant_papers
+
+ def _check_paper_relevance(self, paper: Dict) -> bool:
+ """Check if a paper is relevant using GPT-4o."""
+ prompt = f"Title: {paper['title']}\n\nAbstract: {paper['abstract']}"
+
+ try:
+ response = self.openai_client.chat.completions.create(
+ model="gpt-4o",
+ messages=[
+ {"role": "system", "content": GPT_SYSTEM_PROMPT},
+ {"role": "user", "content": prompt}
+ ],
+ temperature=0,
+ max_tokens=1
)
- print(f"[DEBUG] entry.updated → {updated.isoformat()}")
- if updated >= cutoff_utc:
- kept.append(e)
-
- print(f"[DEBUG] kept {len(kept)} of {len(batch)} in this batch")
- if not kept:
- print("[DEBUG] no recent entries → stopping fetch loop")
- break
-
- all_entries.extend(kept)
- if len(batch) < step:
- break
- start += step
+
+ result = response.choices[0].message.content.strip()
+ return result == "1"
+
+ except Exception as e:
+ logger.error(f"Error calling GPT-4o: {e}")
+ return False
+
+ def fetch_recent_papers(self, days: int = 1) -> List[Dict]:
+ """Fetch papers from the last N days."""
+ end_date = datetime.now(timezone.utc)
+ start_date = end_date - timedelta(days=days)
+
+ papers = self.fetch_papers_by_date_range(start_date, end_date)
+ return self.filter_papers_with_gpt(papers)
+
+ def fetch_historical_papers(self, years: int = 2) -> List[Dict]:
+ """Fetch papers from the past N years."""
+ end_date = datetime.now(timezone.utc)
+ start_date = end_date - timedelta(days=years * 365)
+
+ logger.info(f"Fetching historical papers from the past {years} years")
+ papers = self.fetch_papers_by_date_range(start_date, end_date, max_papers=5000)
+ return self.filter_papers_with_gpt(papers)
+
+
+class GitHubUpdater:
+ """Handle GitHub repository updates."""
+
+ def __init__(self, token: str, repo_name: str):
+ """Initialize GitHub updater."""
+ self.github = Github(token)
+ self.repo_name = repo_name
+ self.repo = self.github.get_repo(repo_name)
+
+ def update_readme_with_papers(self, papers: List[Dict], section_title: str = None):
+ """Update README with new papers."""
+ if not papers:
+ logger.info("No papers to add to README")
+ return
+
+ if section_title is None:
+ section_title = f"Papers Updated on {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M UTC')}"
+
+ try:
+ # Get current README
+ readme_file = self.repo.get_contents("README.md", ref="main")
+ current_content = readme_file.decoded_content.decode("utf-8")
+
+ # Create new section
+ new_section = f"\n\n## {section_title}\n\n"
+
+ for paper in papers:
+ # Format paper entry
+ authors_str = ", ".join(paper['authors'][:3]) # First 3 authors
+ if len(paper['authors']) > 3:
+ authors_str += " et al."
+
+ categories_str = ", ".join(paper['categories'])
+
+ new_section += f"### {paper['title']}\n\n"
+ new_section += f"**Authors:** {authors_str}\n\n"
+ new_section += f"**Categories:** {categories_str}\n\n"
+ new_section += f"**Published:** {paper['published']}\n\n"
+ new_section += f"**Abstract:** {paper['abstract']}\n\n"
+ new_section += f"**Link:** [arXiv:{paper['arxiv_id']}]({paper['link']})\n\n"
+ new_section += "---\n\n"
+
+ # Update README
+ updated_content = current_content + new_section
+ commit_message = f"Auto-update: Added {len(papers)} new papers on {datetime.now(timezone.utc).strftime('%Y-%m-%d')}"
+
+ self.repo.update_file(
+ path="README.md",
+ message=commit_message,
+ content=updated_content,
+ sha=readme_file.sha,
+ branch="main"
+ )
+
+ logger.info(f"Successfully updated README with {len(papers)} papers")
+
+ except Exception as e:
+ logger.error(f"Error updating README: {e}")
+ raise
- print(f"[DEBUG] total fetched papers in last {days} day(s): {len(all_entries)}")
- # 4) Now run your OpenAI filter and category check
+def main():
+ """Main function to run the paper fetcher."""
+ # Get environment variables
openai_api_key = os.getenv("OPENAI_API_KEY")
+ github_token = os.getenv("TARGET_REPO_TOKEN")
+ target_repo = os.getenv("TARGET_REPO_NAME", "YurenHao0426/awesome-llm-bias-papers")
+
+ # Check for required environment variables
if not openai_api_key:
- print("[ERROR] OPENAI_API_KEY missing, aborting.")
- return []
-
- client = OpenAI(api_key=openai_api_key)
- final_matched = []
-
- for idx, entry in enumerate(all_entries, start=1):
- title = entry.title
- summary = entry.summary
- cats = [t.term for t in getattr(entry, 'tags', [])]
-
- # (optional) re‑enable or disable category filtering here
- if not any(cat in ALLOWED_CATEGORIES for cat in cats):
- continue
-
- if is_relevant_by_api(title, summary, client):
- final_matched.append({
- "title": title,
- "summary": summary,
- "published": entry.published,
- "link": entry.link,
- "categories": cats
- })
- print(f"[DEBUG][API] Included #{idx}: {title[:60]}...")
+ logger.error("OPENAI_API_KEY environment variable is required")
+ sys.exit(1)
+
+ if not github_token:
+ logger.error("TARGET_REPO_TOKEN environment variable is required")
+ sys.exit(1)
+
+ # Get command line arguments
+ mode = os.getenv("FETCH_MODE", "daily") # daily or historical
+ days = int(os.getenv("FETCH_DAYS", "1"))
+
+ try:
+ # Initialize fetcher
+ fetcher = ArxivPaperFetcher(openai_api_key)
+
+ if mode == "historical":
+ logger.info("Running in historical mode - fetching papers from past 2 years")
+ papers = fetcher.fetch_historical_papers(years=2)
+ section_title = "Historical LLM Bias Papers (Past 2 Years)"
else:
- print(f"[DEBUG][API] Excluded #{idx}: {title[:60]}...")
-
- print(f"[DEBUG] final matched papers: {len(final_matched)}")
- return final_matched
-
-
-
-
-
-def update_readme_in_repo(papers, token, repo_name):
- if not papers:
- print("[INFO] No matched papers, skip README update.")
- return
-
- g = Github(token)
- repo = g.get_repo(repo_name)
-
- readme_file = repo.get_contents("README.md", ref="main")
- old_content = readme_file.decoded_content.decode("utf-8")
-
- now_utc_str = datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
- new_section = f"\n\n### Auto-captured papers on {now_utc_str}\n"
- for p in papers:
- cat_str = ", ".join(p["categories"])
- new_section += f"- **{p['title']}** (Published={p['published']}) \n"
- new_section += f" - Categories: {cat_str} \n"
- new_section += f" - Link: {p['link']}\n\n"
-
- updated_content = old_content + new_section
- commit_msg = f"Auto update README with {len(papers)} new papers"
-
- repo.update_file(
- path="README.md",
- message=commit_msg,
- content=updated_content,
- sha=readme_file.sha,
- branch="main"
- )
- print(f"[INFO] README updated with {len(papers)} papers.")
-
-def main():
- days = 1
- print(f"[DEBUG] Starting fetch_papers_combined with days={days}")
- papers = fetch_papers_combined(days=days)
-
- print(f"[DEBUG] After fetch_papers_combined: {len(papers)} papers matched.")
+ logger.info(f"Running in daily mode - fetching papers from last {days} day(s)")
+ papers = fetcher.fetch_recent_papers(days=days)
+ section_title = None # Use default timestamp
+
+ # Update GitHub repository
+ if papers:
+ updater = GitHubUpdater(github_token, target_repo)
+ updater.update_readme_with_papers(papers, section_title)
+ logger.info(f"Successfully processed {len(papers)} papers")
+ else:
+ logger.info("No relevant papers found")
+
+ except Exception as e:
+ logger.error(f"Error in main execution: {e}")
+ sys.exit(1)
- github_token = os.getenv("TARGET_REPO_TOKEN")
- target_repo_name = os.getenv("TARGET_REPO_NAME")
- print(f"[DEBUG] Github Token Set: {'Yes' if github_token else 'No'}")
- print(f"[DEBUG] Target Repo Name: {target_repo_name}")
-
- if github_token and target_repo_name and papers:
- update_readme_in_repo(papers, github_token, target_repo_name)
- else:
- print("[INFO] Skipped README update due to missing credentials or no papers matched.")
if __name__ == "__main__":
main()
diff --git a/scripts/single_run_test.py b/scripts/single_run_test.py
deleted file mode 100644
index fe52bb4..0000000
--- a/scripts/single_run_test.py
+++ /dev/null
@@ -1,36 +0,0 @@
-import requests
-import feedparser
-
-def test_arxiv():
- base_url = "http://export.arxiv.org/api/query"
- # Time window set to 2025-03-27 00:00 through 2025-03-29 00:00
- # Note: the paper was submitted at 07:54Z on March 27, so it should fall inside this window
- search_query = (
- "(all:bias+OR+all:fairness)"
- "+AND+cat:cs.IR"
- "+AND+submittedDate:[202503270000+TO+202503290000]"
- )
-
- params = {
- "search_query": search_query,
- "sortBy": "submittedDate",
- "sortOrder": "descending",
- "max_results": 100
- }
- print("[DEBUG] search_query =", search_query)
-
- r = requests.get(base_url, params=params)
- print("[DEBUG] Full URL =", r.url)
- if r.status_code != 200:
- print("[ERROR] HTTP Status:", r.status_code)
- return
-
- feed = feedparser.parse(r.content)
- print("[DEBUG] Returned entries:", len(feed.entries))
-
- # Print each title and publication time for inspection
- for i, entry in enumerate(feed.entries, start=1):
- print(f"{i}. Title: {entry.title} | updated: {entry.updated} | published: {entry.published}")
-
-if __name__ == "__main__":
- test_arxiv()
diff --git a/scripts/test_arxiv_widest.py b/scripts/test_arxiv_widest.py
deleted file mode 100644
index 466c62d..0000000
--- a/scripts/test_arxiv_widest.py
+++ /dev/null
@@ -1,78 +0,0 @@
-import requests
-import feedparser
-import datetime
-
-def fetch_arxiv_full_range():
- """
- No category or keyword restrictions; only a wide submittedDate window is applied.
- Fetch in batches of 100 entries until no new entries come back or our safety cap is reached.
- Also demonstrates how to exit the loop early once publication dates fall past the limit.
- """
-
- base_url = "http://export.arxiv.org/api/query"
-
- # Loose date range [202503250000 TO 202504020000]
- # You can make it wider or more precise
- start_date_str = "202503250000"
- end_date_str = "202504020000"
-
- search_query = f"submittedDate:[{start_date_str} TO {end_date_str}]"
-
- # Fetch in batches
- step = 100
- start = 0
- all_entries = []
-
- while True:
- params = {
- "search_query": search_query,
- "sortBy": "submittedDate",
- "sortOrder": "descending",
- "start": start,
- "max_results": step
- }
- print(f"[DEBUG] Fetching from index={start} to {start+step}, date range = {start_date_str} ~ {end_date_str}")
- resp = requests.get(base_url, params=params)
- if resp.status_code != 200:
- print("[ERROR] HTTP status:", resp.status_code)
- break
-
- feed = feedparser.parse(resp.content)
- entries = feed.entries
- got_count = len(entries)
- print(f"[DEBUG] Got {got_count} entries this batch.")
-
- if got_count == 0:
- # No more data available
- break
-
- # Add this batch to the overall list
- all_entries.extend(entries)
- # Next batch
- start += step
-
- # Custom safety cap to avoid infinite loops or an extremely large dataset
- if start >= 3000:
- # 3k is just an example
- print("[DEBUG] Over 3000 entries, stopping to avoid extremely large dataset.")
- break
-
- print("[DEBUG] total retrieved:", len(all_entries))
-
- # Now all_entries holds everything we fetched.
- # We can check whether it contains "Bias-Aware Agent..." or do further processing
-
- found = False
- for idx, e in enumerate(all_entries, 1):
- title_lower = e.title.lower()
- if "bias-aware agent" in title_lower:
- found = True
- print(f"\n[FOUND] Index={idx}, Title={e.title}, published={e.published}, updated={e.updated}")
- break
-
- if not found:
- print("\n[INFO] 'Bias-Aware Agent...' not found in the entire set.")
-
-
-if __name__ == "__main__":
- fetch_arxiv_full_range()
diff --git a/scripts/test_daily_fetch.py b/scripts/test_daily_fetch.py
new file mode 100644
index 0000000..84ccd9e
--- /dev/null
+++ b/scripts/test_daily_fetch.py
@@ -0,0 +1,122 @@
+#!/usr/bin/env python3
+"""
+Test script for daily paper fetching functionality.
+
+This script tests the daily paper fetching with a small sample to verify
+the system works correctly before running in production.
+"""
+
+import os
+import sys
+from datetime import datetime, timezone, timedelta
+
+# Add the parent directory to the path so we can import the main module
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from scripts.fetch_papers import ArxivPaperFetcher
+
+
+def test_daily_fetch():
+ """Test fetching papers from the last 3 days (to ensure we get some results)."""
+
+ # Check for OpenAI API key
+ openai_api_key = os.getenv("OPENAI_API_KEY")
+ if not openai_api_key:
+ print("ERROR: OPENAI_API_KEY environment variable is required")
+ print("Please set your OpenAI API key in the environment variable")
+ sys.exit(1)
+
+ print("Testing daily paper fetching (last 3 days)...")
+
+ # Initialize fetcher
+ fetcher = ArxivPaperFetcher(openai_api_key)
+
+ # Test with last 3 days to ensure we get some results
+ papers = fetcher.fetch_recent_papers(days=3)
+
+ print(f"\nFetch completed!")
+ print(f"Found {len(papers)} relevant LLM bias papers in the last 3 days")
+
+ if papers:
+ print("\nRelevant papers found:")
+ for i, paper in enumerate(papers, 1):
+ print(f"\n{i}. {paper['title']}")
+ print(f" Authors: {', '.join(paper['authors'][:3])}")
+ if len(paper['authors']) > 3:
+ print(" et al.")
+ print(f" Categories: {', '.join(paper['categories'])}")
+ print(f" Published: {paper['published']}")
+ print(f" arXiv ID: {paper['arxiv_id']}")
+ print(f" Link: {paper['link']}")
+ print(f" Abstract: {paper['abstract'][:200]}...")
+ print("-" * 50)
+ else:
+ print("\nNo relevant papers found in the last 3 days.")
+ print("This could be normal - LLM bias papers are not published every day.")
+
+
+def test_system_components():
+ """Test individual system components."""
+
+ openai_api_key = os.getenv("OPENAI_API_KEY")
+ if not openai_api_key:
+ print("ERROR: OPENAI_API_KEY environment variable is required")
+ sys.exit(1)
+
+ print("\nTesting system components...")
+
+ # Test fetcher initialization
+ try:
+ fetcher = ArxivPaperFetcher(openai_api_key)
+ print("✓ ArxivPaperFetcher initialized successfully")
+ except Exception as e:
+ print(f"✗ Failed to initialize ArxivPaperFetcher: {e}")
+ return False
+
+ # Test arXiv API connectivity
+ try:
+ end_date = datetime.now(timezone.utc)
+ start_date = end_date - timedelta(days=1)
+ papers = fetcher.fetch_papers_by_date_range(start_date, end_date, max_papers=5)
+ print(f"✓ arXiv API connectivity works (fetched {len(papers)} papers)")
+ except Exception as e:
+ print(f"✗ Failed to connect to arXiv API: {e}")
+ return False
+
+ # Test OpenAI API connectivity (if we have papers to test)
+ if papers:
+ try:
+ sample_paper = papers[0]
+ is_relevant = fetcher._check_paper_relevance(sample_paper)
+ print(f"✓ OpenAI API connectivity works (test result: {is_relevant})")
+ except Exception as e:
+ print(f"✗ Failed to connect to OpenAI API: {e}")
+ return False
+
+ return True
+
+
+if __name__ == "__main__":
+ print("ArXiv Daily Paper Fetcher Test")
+ print("=" * 40)
+
+ try:
+ # Test system components first
+ if test_system_components():
+ print("\nAll system components working correctly!")
+
+ # Run main test
+ test_daily_fetch()
+
+ print("\n" + "=" * 40)
+ print("Test completed successfully!")
+ print("\nTo run the actual daily fetch:")
+ print("python scripts/fetch_papers.py")
+
+ else:
+ print("\nSystem component test failed!")
+ sys.exit(1)
+
+ except Exception as e:
+ print(f"\nError during testing: {e}")
+ sys.exit(1)
\ No newline at end of file
diff --git a/scripts/test_historical_fetch.py b/scripts/test_historical_fetch.py
new file mode 100644
index 0000000..d4e8a6b
--- /dev/null
+++ b/scripts/test_historical_fetch.py
@@ -0,0 +1,117 @@
+#!/usr/bin/env python3
+"""
+Test script for historical paper fetching functionality.
+
+This script tests the ArxivPaperFetcher class with a smaller date range
+to verify the historical fetching works correctly before running on 2 years of data.
+"""
+
+import os
+import sys
+from datetime import datetime, timezone, timedelta
+
+# Add the parent directory to the path so we can import the main module
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from scripts.fetch_papers import ArxivPaperFetcher
+
+
+def test_recent_historical_fetch():
+ """Test fetching papers from the last 30 days as a historical test."""
+
+ # Check for OpenAI API key
+ openai_api_key = os.getenv("OPENAI_API_KEY")
+ if not openai_api_key:
+ print("ERROR: OPENAI_API_KEY environment variable is required")
+ sys.exit(1)
+
+ print("Testing historical paper fetching (last 30 days)...")
+
+ # Initialize fetcher
+ fetcher = ArxivPaperFetcher(openai_api_key)
+
+ # Test with last 30 days
+ end_date = datetime.now(timezone.utc)
+ start_date = end_date - timedelta(days=30)
+
+ print(f"Fetching papers from {start_date.date()} to {end_date.date()}")
+
+ # Fetch papers (limit to 200 for testing)
+ papers = fetcher.fetch_papers_by_date_range(start_date, end_date, max_papers=200)
+
+ print(f"\nFetched {len(papers)} papers total")
+
+ if papers:
+ print("\nSample papers:")
+ for i, paper in enumerate(papers[:3], 1):
+ print(f"\n{i}. {paper['title']}")
+ print(f" Authors: {', '.join(paper['authors'][:2])}")
+ print(f" Categories: {', '.join(paper['categories'])}")
+ print(f" Published: {paper['published']}")
+ print(f" Abstract: {paper['abstract'][:150]}...")
+
+ # Test GPT filtering on a smaller subset
+ print(f"\nTesting GPT-4o filtering on first 10 papers...")
+ sample_papers = papers[:10]
+ filtered_papers = fetcher.filter_papers_with_gpt(sample_papers)
+
+ print(f"\nFiltering results: {len(filtered_papers)}/{len(sample_papers)} papers are relevant")
+
+ if filtered_papers:
+ print("\nRelevant papers found:")
+ for i, paper in enumerate(filtered_papers, 1):
+ print(f"\n{i}. {paper['title']}")
+ print(f" Abstract: {paper['abstract'][:200]}...")
+ else:
+ print("No relevant papers found in the sample.")
+
+ else:
+ print("No papers found in the date range.")
+
+
+def test_specific_date_range():
+ """Test fetching papers from a specific date range known to have bias papers."""
+
+ openai_api_key = os.getenv("OPENAI_API_KEY")
+ if not openai_api_key:
+ print("ERROR: OPENAI_API_KEY environment variable is required")
+ sys.exit(1)
+
+ print("\nTesting specific date range (January 2024)...")
+
+ fetcher = ArxivPaperFetcher(openai_api_key)
+
+ # Test January 2024 (likely to have some relevant papers)
+ start_date = datetime(2024, 1, 1, tzinfo=timezone.utc)
+ end_date = datetime(2024, 1, 31, tzinfo=timezone.utc)
+
+ print(f"Fetching papers from {start_date.date()} to {end_date.date()}")
+
+ papers = fetcher.fetch_papers_by_date_range(start_date, end_date, max_papers=500)
+ print(f"Fetched {len(papers)} papers from January 2024")
+
+ if papers:
+ # Filter for bias-related papers
+ filtered_papers = fetcher.filter_papers_with_gpt(papers)
+
+ print(f"\nFound {len(filtered_papers)} bias-related papers in January 2024")
+
+ for i, paper in enumerate(filtered_papers[:5], 1):
+ print(f"\n{i}. {paper['title']}")
+ print(f" arXiv ID: {paper['arxiv_id']}")
+ print(f" Link: {paper['link']}")
+
+
+if __name__ == "__main__":
+ print("ArXiv Historical Paper Fetcher Test")
+ print("=" * 40)
+
+ try:
+ test_recent_historical_fetch()
+ test_specific_date_range()
+ print("\n" + "=" * 40)
+ print("Test completed successfully!")
+
+ except Exception as e:
+ print(f"\nError during testing: {e}")
+ sys.exit(1)
\ No newline at end of file
diff --git a/scripts/test_uiuc_chat.py b/scripts/test_uiuc_chat.py
deleted file mode 100644
index c8c4e64..0000000
--- a/scripts/test_uiuc_chat.py
+++ /dev/null
@@ -1,71 +0,0 @@
-import requests
-
-API_URL = "https://uiuc.chat/api/chat-api/chat"
-API_KEY = "uc_0a6c6e31ac654737a3cd4d5c1ad4e4cd" # Replace with your actual key
-
-title = "Bias-Aware Agent: Enhancing Fairness in AI-Driven Knowledge Retrieval"
-abstract = "Advancements in retrieving accessible information have evolved more rapidly over the last few years... (omitting for brevity) ... by empowering users with transparency and awareness, this approach aims to foster more equitable information systems."
-'''
-headers = {
- 'Content-Type': 'application/json'
-}
-messages = [
- {
- "role": "system",
- # Can be left empty, or kept very brief
- "content": "You are a helpful assistant."
- },
- {
- "role": "user",
- "content": (
- "Here is a paper's title and abstract:\n\n"
- f"Title: {title}\n\n"
- f"Abstract: {abstract}\n\n"
- "Respond with '1' (just the digit) if this paper is clearly about both "
- "large language models (or generative text models) AND about bias/fairness. "
- "Otherwise respond '0'. No explanation, no punctuation, only the digit."
- )
- }
-]
-
-data = {
- "model": "llama3.1:8b-instruct-fp16",
- "messages": messages,
- "api_key": API_KEY,
- "course_name": "llm-bias-papers",
- "stream": False,
- "temperature": 0.1
-}
-'''
-url = "https://uiuc.chat/api/chat-api/chat"
-headers = {
- 'Content-Type': 'application/json'
-}
-data = {
- "model": "llama3.1:8b-instruct-fp16",
- "messages": [
- {
- "role": "system",
- "content": "You are a helpful AI assistant. Follow instructions carefully."
- },
- {
- "role": "user",
- "content": "Here is a paper's title and abstract:\n\nTitle:" + title + "\n\nAbstract: " + abstract + "\n\nRespond with 'Yes' (just the word) if this paper is clearly about both large language models (or generative text models) AND about bias/fairness. Otherwise respond 'No'. No explanation, no punctuation, only the digit."
- }
- ],
- "api_key": "uc_0a6c6e31ac654737a3cd4d5c1ad4e4cd",
- "course_name": "llm-bias-papers",
- "stream": False,
- "temperature": 0.0,
- "retrieval_only": False
-}
-
-response = requests.post(url, headers=headers, json=data)
-for chunk in response.iter_lines():
- if chunk:
- print(chunk.decode())
-resp = requests.post(API_URL, headers=headers, json=data)
-print("Status:", resp.status_code)
-print("Response:", resp.text)
-print("[DEBUG] resp.text =", resp.text)
-print("[DEBUG] resp.json() =", resp.json()) # if not raising error