Diffstat (limited to 'scripts')
-rw-r--r--  scripts/__pycache__/fetch_papers.cpython-312.pyc   bin 26121 -> 36787 bytes
-rw-r--r--  scripts/fetch_papers.py                             230
-rw-r--r--  scripts/test_parallel_processing.py                 228
-rw-r--r--  scripts/test_reverse_chronological.py               241
-rw-r--r--  scripts/test_social_good_prompt.py                  186
5 files changed, 869 insertions(+), 16 deletions(-)
diff --git a/scripts/__pycache__/fetch_papers.cpython-312.pyc b/scripts/__pycache__/fetch_papers.cpython-312.pyc
index afe99e8..47d73c0 100644
--- a/scripts/__pycache__/fetch_papers.cpython-312.pyc
+++ b/scripts/__pycache__/fetch_papers.cpython-312.pyc
Binary files differ
diff --git a/scripts/fetch_papers.py b/scripts/fetch_papers.py
index 3db80c7..fd3e628 100644
--- a/scripts/fetch_papers.py
+++ b/scripts/fetch_papers.py
@@ -24,7 +24,11 @@ import feedparser
from datetime import datetime, timezone, timedelta
from typing import List, Dict, Optional, Tuple
from github import Github
-from openai import OpenAI
+from openai import OpenAI, AsyncOpenAI
+import asyncio
+import aiohttp
+from concurrent.futures import ThreadPoolExecutor
+import time
# Configure logging
logging.basicConfig(
@@ -54,17 +58,24 @@ CS_CATEGORIES = [
"stat.ML" # Machine Learning (Statistics)
]
-GPT_SYSTEM_PROMPT = """You are an expert researcher in AI/ML bias and fairness.
+GPT_SYSTEM_PROMPT = """You are an expert researcher in AI/ML bias, fairness, and social good applications.
-Your task is to analyze a paper's title and abstract to determine if it's relevant to LLM (Large Language Model) bias and fairness research.
+Your task is to analyze a paper's title and abstract to determine if it's relevant to bias and fairness research with social good implications.
A paper is relevant if it discusses:
-- Bias in large language models, generative AI, or foundation models
-- Fairness issues in NLP models or text generation
-- Ethical concerns with language models
-- Demographic bias in AI systems
-- Alignment and safety of language models
-- Bias evaluation or mitigation in NLP
+- Bias, fairness, or discrimination in AI/ML systems with societal impact
+- Algorithmic fairness in healthcare, education, criminal justice, hiring, or finance
+- Demographic bias affecting marginalized or underrepresented groups
+- Data bias and its social consequences
+- Ethical AI and responsible AI deployment in society
+- AI safety and alignment with human values and social welfare
+- Bias evaluation, auditing, or mitigation in real-world applications
+- Representation and inclusion in AI systems and datasets
+- Social implications of AI bias (e.g., perpetuating inequality)
+- Fairness in recommendation systems, search engines, or content moderation
+- Bias in computer vision, NLP, or other AI domains affecting people
+
+The focus is on research that addresses how AI bias impacts society, vulnerable populations, or social justice, rather than purely technical ML advances without clear social relevance.
Respond with exactly "1" if the paper is relevant, or "0" if it's not relevant.
Do not include any other text in your response."""
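(For reference, a minimal sketch, not part of the diff, of how a single-token "1"/"0" prompt like this is typically consumed; it mirrors the _check_paper_relevance / _check_paper_relevance_async calls later in this diff, and the standalone is_relevant helper is illustrative only.)

# Minimal sketch, assuming the GPT_SYSTEM_PROMPT constant defined above is in scope.
from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment

def is_relevant(title: str, abstract: str) -> bool:
    """Ask GPT-4o for a single-token 1/0 relevance verdict."""
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": GPT_SYSTEM_PROMPT},
            {"role": "user", "content": f"Title: {title}\n\nAbstract: {abstract}"},
        ],
        temperature=0,  # deterministic verdicts
        max_tokens=1,   # the prompt asks for exactly "1" or "0"
    )
    return response.choices[0].message.content.strip() == "1"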
@@ -76,6 +87,7 @@ class ArxivPaperFetcher:
def __init__(self, openai_api_key: str):
"""Initialize the fetcher with OpenAI API key."""
self.openai_client = OpenAI(api_key=openai_api_key)
+ self.async_openai_client = AsyncOpenAI(api_key=openai_api_key)
self.session = requests.Session()
self.session.headers.update({
'User-Agent': 'PaperFetcher/1.0 (https://github.com/YurenHao0426/PaperFetcher)'
@@ -257,12 +269,15 @@ class ArxivPaperFetcher:
"categories": [tag.term for tag in entry.tags] if hasattr(entry, 'tags') else []
}
- def filter_papers_with_gpt(self, papers: List[Dict]) -> List[Dict]:
+ def filter_papers_with_gpt(self, papers: List[Dict], use_parallel: bool = True,
+ max_concurrent: int = 16) -> List[Dict]:
"""
Filter papers using GPT-4o to identify bias-related research.
Args:
papers: List of paper dictionaries
+ use_parallel: Whether to use parallel processing (default: True)
+ max_concurrent: Maximum concurrent requests (default: 16)
Returns:
List of relevant papers
@@ -271,6 +286,15 @@ class ArxivPaperFetcher:
logger.warning("⚠️ No papers to filter!")
return []
+ if use_parallel and len(papers) > 5:
+ logger.info(f"🚀 Processing {len(papers)} papers in parallel mode (max concurrency: {max_concurrent})")
+ return self._filter_papers_parallel(papers, max_concurrent)
+ else:
+ logger.info(f"🔄 Processing {len(papers)} papers in sequential mode")
+ return self._filter_papers_sequential(papers)
+
+ def _filter_papers_sequential(self, papers: List[Dict]) -> List[Dict]:
+ """Serial processing of papers (original method)."""
logger.info(f"🤖 Starting GPT-4o filtering of papers...")
logger.info(f"📝 Papers to process: {len(papers)}")
@@ -304,6 +328,111 @@ class ArxivPaperFetcher:
return relevant_papers
+ def _filter_papers_parallel(self, papers: List[Dict], max_concurrent: int = 16) -> List[Dict]:
+ """Parallel processing of papers using asyncio."""
+ try:
+ # Check whether an event loop is already running
+ loop = asyncio.get_event_loop()
+ if loop.is_running():
+ # Run inside the existing event loop
+ import nest_asyncio
+ nest_asyncio.apply()
+ return loop.run_until_complete(self._async_filter_papers(papers, max_concurrent))
+ else:
+ # Create a new event loop
+ return asyncio.run(self._async_filter_papers(papers, max_concurrent))
+ except Exception as e:
+ logger.error(f"❌ Parallel processing failed: {e}")
+ logger.info("🔄 Falling back to sequential processing...")
+ return self._filter_papers_sequential(papers)
+
+ async def _async_filter_papers(self, papers: List[Dict], max_concurrent: int) -> List[Dict]:
+ """Async implementation of paper filtering."""
+ logger.info(f"🤖 Starting async GPT-4o filtering...")
+ logger.info(f"📝 Papers to process: {len(papers)}")
+
+ # Create a semaphore to limit concurrency
+ semaphore = asyncio.Semaphore(max_concurrent)
+
+ # Create all tasks
+ tasks = []
+ for i, paper in enumerate(papers):
+ task = self._check_paper_relevance_async(paper, semaphore, i + 1, len(papers))
+ tasks.append(task)
+
+ # Run all tasks concurrently
+ start_time = time.time()
+ results = await asyncio.gather(*tasks, return_exceptions=True)
+ total_time = time.time() - start_time
+
+ # Process the results
+ relevant_papers = []
+ successful_count = 0
+ error_count = 0
+
+ for i, result in enumerate(results):
+ if isinstance(result, Exception):
+ logger.error(f"❌ Error processing paper {i+1}: {result}")
+ error_count += 1
+ elif isinstance(result, tuple):
+ is_relevant, paper = result
+ successful_count += 1
+ if is_relevant:
+ relevant_papers.append(paper)
+ logger.debug(f"✅ Paper {i+1} [relevant]: {paper['title'][:60]}...")
+ else:
+ logger.debug(f"❌ Paper {i+1} [not relevant]: {paper['title'][:60]}...")
+
+ # Show final statistics
+ logger.info(f"🎯 Parallel GPT-4o filtering complete!")
+ logger.info(f" - Total processing time: {total_time:.1f} s")
+ logger.info(f" - Average per paper: {total_time/len(papers):.2f} s")
+ logger.info(f" - Successfully processed: {successful_count} papers")
+ logger.info(f" - Processing errors: {error_count} papers")
+ logger.info(f" - Relevant papers found: {len(relevant_papers)}")
+
+ if successful_count > 0:
+ logger.info(f" - Relevance rate: {len(relevant_papers)/successful_count*100:.1f}%")
+
+ # Estimate the speedup over sequential processing
+ estimated_serial_time = len(papers) * 2.0 # assume ~2 seconds per paper when run sequentially
+ speedup = estimated_serial_time / total_time if total_time > 0 else 1
+ logger.info(f" - Estimated speedup: {speedup:.1f}x")
+
+ return relevant_papers
+
+ async def _check_paper_relevance_async(self, paper: Dict, semaphore: asyncio.Semaphore,
+ index: int, total: int) -> tuple:
+ """Async version of paper relevance checking."""
+ async with semaphore:
+ try:
+ # Report progress every 10 papers
+ if index % 10 == 0:
+ logger.info(f"📊 Parallel progress: {index}/{total} papers in flight...")
+
+ prompt = f"Title: {paper['title']}\n\nAbstract: {paper['abstract']}"
+
+ response = await self.async_openai_client.chat.completions.create(
+ model="gpt-4o",
+ messages=[
+ {"role": "system", "content": GPT_SYSTEM_PROMPT},
+ {"role": "user", "content": prompt}
+ ],
+ temperature=0,
+ max_tokens=1
+ )
+
+ result = response.choices[0].message.content.strip()
+ is_relevant = result == "1"
+
+ logger.debug(f"GPT-4o response #{index}: '{result}' -> {'relevant' if is_relevant else 'not relevant'}")
+ return (is_relevant, paper)
+
+ except Exception as e:
+ logger.error(f"❌ Async error while processing paper {index}: {e}")
+ # Re-raise so the caller can handle it
+ raise e
+
def _check_paper_relevance(self, paper: Dict) -> bool:
"""Check if a paper is relevant using GPT-4o."""
prompt = f"Title: {paper['title']}\n\nAbstract: {paper['abstract']}"
@@ -341,7 +470,13 @@ class ArxivPaperFetcher:
if papers:
logger.info(f"📋 开始GPT-4o智能过滤阶段...")
- return self.filter_papers_with_gpt(papers)
+
+ # Read parallel-processing settings from environment variables
+ use_parallel = os.getenv("USE_PARALLEL", "true").lower() == "true"
+ max_concurrent = int(os.getenv("MAX_CONCURRENT", "16"))
+
+ return self.filter_papers_with_gpt(papers, use_parallel=use_parallel,
+ max_concurrent=max_concurrent)
else:
logger.warning("⚠️ 未获取到任何论文,跳过GPT过滤步骤")
return []
@@ -359,7 +494,13 @@ class ArxivPaperFetcher:
if papers:
logger.info(f"📋 开始GPT-4o智能过滤阶段...")
- return self.filter_papers_with_gpt(papers)
+
+ # Historical mode defaults to higher concurrency (there are many more papers to process)
+ use_parallel = os.getenv("USE_PARALLEL", "true").lower() == "true"
+ max_concurrent = int(os.getenv("MAX_CONCURRENT", "25")) # higher default concurrency for historical mode
+
+ return self.filter_papers_with_gpt(papers, use_parallel=use_parallel,
+ max_concurrent=max_concurrent)
else:
logger.warning("⚠️ 未获取到任何论文,跳过GPT过滤步骤")
return []
@@ -375,7 +516,7 @@ class GitHubUpdater:
self.repo = self.github.get_repo(repo_name)
def update_readme_with_papers(self, papers: List[Dict], section_title: str = None):
- """Update README with new papers."""
+ """Update README with new papers in reverse chronological order (newest first)."""
if not papers:
logger.info("No papers to add to README")
return
@@ -407,8 +548,21 @@ class GitHubUpdater:
new_section += f"**Link:** [arXiv:{paper['arxiv_id']}]({paper['link']})\n\n"
new_section += "---\n\n"
- # Update README
- updated_content = current_content + new_section
+ # Insert new papers at the beginning to maintain reverse chronological order
+ # Find the end of the main documentation (after the project description and setup)
+ insert_position = self._find_papers_insert_position(current_content)
+
+ if insert_position > 0:
+ # Insert new section after the main documentation but before existing papers
+ updated_content = (current_content[:insert_position] +
+ new_section +
+ current_content[insert_position:])
+ logger.info(f"📝 New papers section inserted near the top of the README to keep reverse chronological order")
+ else:
+ # Fallback: append to end if can't find proper insertion point
+ updated_content = current_content + new_section
+ logger.info(f"📝 New papers section appended to the end of the README (no suitable insertion point found)")
+
commit_message = f"Auto-update: Added {len(papers)} new papers on {datetime.now(timezone.utc).strftime('%Y-%m-%d')}"
self.repo.update_file(
@@ -419,11 +573,55 @@ class GitHubUpdater:
branch="main"
)
- logger.info(f"Successfully updated README with {len(papers)} papers")
+ logger.info(f"✅ Successfully updated README with {len(papers)} papers (reverse chronological order)")
except Exception as e:
logger.error(f"Error updating README: {e}")
raise
+
+ def _find_papers_insert_position(self, content: str) -> int:
+ """Find the best position to insert new papers (after main doc, before existing papers)."""
+ lines = content.split('\n')
+
+ # Look for patterns that indicate the end of documentation and start of papers
+ # Search in order of priority
+ insert_patterns = [
+ "**Note**: This tool is designed for academic research purposes", # End of README
+ "## Papers Updated on", # Existing paper sections
+ "## Historical", # Historical paper sections
+ "### ", # Any section that might be a paper title
+ "---", # Common separator before papers
+ ]
+
+ for pattern in insert_patterns:
+ for i, line in enumerate(lines):
+ if pattern in line:
+ # Found a good insertion point - insert before this line
+ # Convert line index to character position
+ char_position = sum(len(lines[j]) + 1 for j in range(i)) # +1 for newline
+ return char_position
+
+ # If no patterns found, try to find end of main documentation
+ # Look for the end of the last documentation section
+ last_doc_section = -1
+ for i, line in enumerate(lines):
+ if line.startswith('## ') and not line.startswith('## Papers') and not line.startswith('## Historical'):
+ last_doc_section = i
+
+ if last_doc_section >= 0:
+ # Find the end of this documentation section
+ section_end = len(lines)
+ for i in range(last_doc_section + 1, len(lines)):
+ if lines[i].startswith('## '):
+ section_end = i
+ break
+
+ # Insert after this section
+ char_position = sum(len(lines[j]) + 1 for j in range(section_end))
+ return char_position
+
+ # Final fallback: return 0 to trigger append behavior
+ return 0
def main():
diff --git a/scripts/test_parallel_processing.py b/scripts/test_parallel_processing.py
new file mode 100644
index 0000000..891f618
--- /dev/null
+++ b/scripts/test_parallel_processing.py
@@ -0,0 +1,228 @@
+#!/usr/bin/env python3
+"""
+Test parallelized OpenAI request processing.
+
+Compares the performance of sequential and parallel processing to show the speedup.
+"""
+
+import os
+import sys
+import time
+import logging
+from datetime import datetime, timezone, timedelta
+
+# Configure logging
+logging.basicConfig(
+ level=logging.INFO,
+ format='%(asctime)s - %(levelname)s - %(message)s',
+ handlers=[logging.StreamHandler(sys.stdout)]
+)
+logger = logging.getLogger(__name__)
+
+# Add the parent directory to the path so we can import the main module
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from scripts.fetch_papers import ArxivPaperFetcher
+
+
+def test_parallel_performance():
+ """Test parallel-processing performance."""
+
+ print("🚀 Testing OpenAI request parallelization performance")
+ print("=" * 60)
+
+ # Check the API key
+ openai_api_key = os.getenv("OPENAI_API_KEY")
+ if not openai_api_key:
+ print("❌ Please set the OPENAI_API_KEY environment variable")
+ print(" export OPENAI_API_KEY='your-api-key-here'")
+ return
+
+ print("✅ OpenAI API key is set")
+
+ try:
+ # Initialize the fetcher
+ fetcher = ArxivPaperFetcher(openai_api_key)
+
+ # Fetch some papers to use as test data
+ print("\n📋 Fetching test data...")
+ end_date = datetime.now(timezone.utc)
+ start_date = end_date - timedelta(days=7)
+
+ all_papers = fetcher.fetch_papers_by_date_range(start_date, end_date, max_papers=100)
+
+ if len(all_papers) < 10:
+ print(f"⚠️ Only fetched {len(all_papers)} papers, which may not be enough to show the parallel speedup")
+ if len(all_papers) < 5:
+ print("❌ Too few papers for a meaningful test")
+ return
+
+ # Select a test subset (to keep API costs down)
+ test_papers = all_papers[:min(20, len(all_papers))] # test at most 20 papers
+ print(f"📝 Will test {len(test_papers)} papers")
+
+ print(f"\n📋 Sample of test papers:")
+ for i, paper in enumerate(test_papers[:3], 1):
+ print(f" {i}. {paper['title'][:60]}...")
+
+ if len(test_papers) > 3:
+ print(f" ... and {len(test_papers) - 3} more papers")
+
+ # Test 1: sequential processing
+ print(f"\n" + "="*60)
+ print("🔄 Test 1: Sequential processing")
+ print("="*60)
+
+ start_time = time.time()
+ serial_results = fetcher.filter_papers_with_gpt(
+ test_papers.copy(),
+ use_parallel=False
+ )
+ serial_time = time.time() - start_time
+
+ print(f"🔄 Sequential processing results:")
+ print(f" - Processing time: {serial_time:.1f} s")
+ print(f" - Average per paper: {serial_time/len(test_papers):.2f} s")
+ print(f" - Relevant papers: {len(serial_results)}")
+
+ # Test 2: parallel processing (low concurrency)
+ print(f"\n" + "="*60)
+ print("🚀 Test 2: Parallel processing (concurrency=5)")
+ print("="*60)
+
+ start_time = time.time()
+ parallel_results_5 = fetcher.filter_papers_with_gpt(
+ test_papers.copy(),
+ use_parallel=True,
+ max_concurrent=5
+ )
+ parallel_time_5 = time.time() - start_time
+
+ print(f"🚀 Parallel processing results (concurrency=5):")
+ print(f" - Processing time: {parallel_time_5:.1f} s")
+ print(f" - Average per paper: {parallel_time_5/len(test_papers):.2f} s")
+ print(f" - Relevant papers: {len(parallel_results_5)}")
+ print(f" - Speedup: {serial_time/parallel_time_5:.1f}x")
+
+ # Test 3: parallel processing (high concurrency)
+ print(f"\n" + "="*60)
+ print("🚀 Test 3: Parallel processing (concurrency=10)")
+ print("="*60)
+
+ start_time = time.time()
+ parallel_results_10 = fetcher.filter_papers_with_gpt(
+ test_papers.copy(),
+ use_parallel=True,
+ max_concurrent=10
+ )
+ parallel_time_10 = time.time() - start_time
+
+ print(f"🚀 Parallel processing results (concurrency=10):")
+ print(f" - Processing time: {parallel_time_10:.1f} s")
+ print(f" - Average per paper: {parallel_time_10/len(test_papers):.2f} s")
+ print(f" - Relevant papers: {len(parallel_results_10)}")
+ print(f" - Speedup: {serial_time/parallel_time_10:.1f}x")
+
+ # Verify result consistency
+ print(f"\n" + "="*60)
+ print("🔍 Result consistency check")
+ print("="*60)
+
+ # Collect the arXiv IDs of the relevant papers
+ serial_ids = set(paper['arxiv_id'] for paper in serial_results)
+ parallel_ids_5 = set(paper['arxiv_id'] for paper in parallel_results_5)
+ parallel_ids_10 = set(paper['arxiv_id'] for paper in parallel_results_10)
+
+ print(f"📊 Result comparison:")
+ print(f" - Sequential: {len(serial_ids)} relevant papers")
+ print(f" - Parallel (5): {len(parallel_ids_5)} relevant papers")
+ print(f" - Parallel (10): {len(parallel_ids_10)} relevant papers")
+
+ # Check consistency
+ consistency_5 = len(serial_ids.symmetric_difference(parallel_ids_5))
+ consistency_10 = len(serial_ids.symmetric_difference(parallel_ids_10))
+
+ print(f"📋 Consistency check:")
+ if consistency_5 == 0:
+ print(f" ✅ Sequential vs parallel (5): results are identical")
+ else:
+ print(f" ⚠️ Sequential vs parallel (5): {consistency_5} papers differ")
+
+ if consistency_10 == 0:
+ print(f" ✅ Sequential vs parallel (10): results are identical")
+ else:
+ print(f" ⚠️ Sequential vs parallel (10): {consistency_10} papers differ")
+
+ # Final summary
+ print(f"\n" + "="*60)
+ print("📊 Performance test summary")
+ print("="*60)
+
+ print(f"📈 Processing time comparison:")
+ print(f" - Sequential: {serial_time:6.1f} s")
+ print(f" - Parallel (5): {parallel_time_5:6.1f} s ({serial_time/parallel_time_5:.1f}x speedup)")
+ print(f" - Parallel (10): {parallel_time_10:6.1f} s ({serial_time/parallel_time_10:.1f}x speedup)")
+
+ # Compute the theoretical maximum speedup
+ theoretical_speedup = min(len(test_papers), 10) # at best, speedup equals the concurrency level (or the paper count, if smaller)
+ actual_speedup = serial_time / parallel_time_10
+ efficiency = (actual_speedup / theoretical_speedup) * 100
+
+ print(f"\n💡 Performance analysis:")
+ print(f" - Theoretical maximum speedup: {theoretical_speedup}x")
+ print(f" - Actual maximum speedup: {actual_speedup:.1f}x")
+ print(f" - Parallel efficiency: {efficiency:.1f}%")
+
+ if actual_speedup > 3:
+ print(f" 🎉 Excellent parallelization!")
+ elif actual_speedup > 2:
+ print(f" ✅ Good parallelization!")
+ else:
+ print(f" ⚠️ Modest parallelization; network latency may be the bottleneck")
+
+ print(f"\n💰 Cost estimate:")
+ total_requests = len(test_papers) * 3 # three test runs
+ estimated_cost = total_requests * 0.0001 # rough estimate of cost per request
+ print(f" - Total API calls: {total_requests}")
+ print(f" - Estimated cost: ${estimated_cost:.4f}")
+
+ except Exception as e:
+ print(f"❌ Error during testing: {e}")
+ import traceback
+ print(f"Traceback: {traceback.format_exc()}")
+
+
+def demo_usage():
+ """Demonstrate how to use the parallel processing feature."""
+
+ print(f"\n" + "="*60)
+ print("📖 Usage notes")
+ print("="*60)
+
+ print("🔧 Environment variable controls:")
+ print(" USE_PARALLEL=true/false # enable or disable parallel processing")
+ print(" MAX_CONCURRENT=16 # maximum number of concurrent requests")
+
+ print("\n💡 Examples:")
+ print(" # Default: parallel processing")
+ print(" python scripts/fetch_papers.py")
+ print("")
+ print(" # Disable parallel processing")
+ print(" USE_PARALLEL=false python scripts/fetch_papers.py")
+ print("")
+ print(" # Custom concurrency")
+ print(" MAX_CONCURRENT=25 python scripts/fetch_papers.py")
+ print("")
+ print(" # Historical mode with high concurrency")
+ print(" FETCH_MODE=historical MAX_CONCURRENT=40 python scripts/fetch_papers.py")
+
+ print("\n⚠️ Notes:")
+ print(" - Setting the concurrency too high may trigger OpenAI rate limits")
+ print(" - Suggested: concurrency ≤ 20 in daily mode, ≤ 30 in historical mode")
+ print(" - Lower the concurrency when the network is unstable")
+ print(" - Parallel processing issues more requests per unit time, so costs accrue faster")
+
+
+if __name__ == "__main__":
+ test_parallel_performance()
+ demo_usage()
\ No newline at end of file
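(Editor's aside, not part of the diff: a minimal sketch of driving the same filter programmatically with the environment variables demo_usage() documents above; it assumes the repository root is on sys.path so the scripts package resolves, and reuses only classes and methods that appear in this diff.)

import os
from datetime import datetime, timezone, timedelta
from scripts.fetch_papers import ArxivPaperFetcher

# Same settings fetch_papers.py reads for daily mode
use_parallel = os.getenv("USE_PARALLEL", "true").lower() == "true"
max_concurrent = int(os.getenv("MAX_CONCURRENT", "16"))

fetcher = ArxivPaperFetcher(os.environ["OPENAI_API_KEY"])

# Fetch one week of candidate papers, then filter them with GPT-4o
end_date = datetime.now(timezone.utc)
start_date = end_date - timedelta(days=7)
papers = fetcher.fetch_papers_by_date_range(start_date, end_date, max_papers=100)

relevant = fetcher.filter_papers_with_gpt(
    papers, use_parallel=use_parallel, max_concurrent=max_concurrent
)
print(f"{len(relevant)} relevant papers found")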
diff --git a/scripts/test_reverse_chronological.py b/scripts/test_reverse_chronological.py
new file mode 100644
index 0000000..fb71933
--- /dev/null
+++ b/scripts/test_reverse_chronological.py
@@ -0,0 +1,241 @@
+#!/usr/bin/env python3
+"""
+Test the reverse chronological ordering feature.
+
+Verifies that the README update logic correctly places the newest papers first,
+ensuring the papers list always stays in reverse chronological order.
+"""
+
+import os
+import sys
+import tempfile
+from datetime import datetime, timezone
+
+# Add the parent directory to the path so we can import the main module
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from scripts.fetch_papers import GitHubUpdater
+
+
+def test_reverse_chronological_order():
+ """Test the reverse chronological insertion logic."""
+
+ print("🔍 Testing README reverse chronological ordering")
+ print("=" * 60)
+
+ # Create a mock README content
+ mock_readme_content = """# ArXiv Social Good AI Paper Fetcher
+
+An automated system for discovering and cataloging research papers related to AI bias, fairness, and social good from arXiv.org.
+
+## 🎯 Features
+
+- **Intelligent Paper Detection**: Uses GPT-4o to analyze papers
+- **Automated Daily Updates**: Runs daily via GitHub Actions
+
+## 🔧 Setup & Configuration
+
+Setup instructions here...
+
+## 🚀 Usage
+
+Usage instructions here...
+
+**Note**: This tool is designed for academic research purposes. Please respect arXiv's usage policies.
+
+## Papers Updated on 2024-01-15 08:00 UTC
+
+### Old Paper 1
+
+**Authors:** Author A, Author B
+
+**Categories:** cs.AI, cs.LG
+
+**Published:** 2024-01-14T10:00:00Z
+
+**Abstract:** This is an old paper abstract...
+
+**Link:** [arXiv:2401.12345](https://arxiv.org/abs/2401.12345)
+
+---
+
+### Old Paper 2
+
+**Authors:** Author C, Author D
+
+**Categories:** cs.CL
+
+**Published:** 2024-01-13T15:30:00Z
+
+**Abstract:** This is another old paper abstract...
+
+**Link:** [arXiv:2401.12346](https://arxiv.org/abs/2401.12346)
+
+---
+"""
+
+ print("📄 Mock existing README content:")
+ print(" - Contains the project description and setup instructions")
+ print(" - Already has 2 old papers (2024-01-15 and 2024-01-13)")
+ print(" - Tests whether new papers are inserted at the correct position")
+
+ # Create mock new papers (should be inserted at the top)
+ new_papers = [
+ {
+ 'title': 'Brand New Paper on AI Fairness',
+ 'authors': ['New Author A', 'New Author B', 'New Author C', 'New Author D'],
+ 'categories': ['cs.AI', 'cs.LG', 'cs.CL'],
+ 'published': '2024-01-16T12:00:00Z',
+ 'abstract': 'This is a brand new paper about AI fairness that should appear at the top of the README.',
+ 'link': 'https://arxiv.org/abs/2401.99999',
+ 'arxiv_id': '2401.99999'
+ },
+ {
+ 'title': 'Another New Paper on Social Good AI',
+ 'authors': ['New Author E', 'New Author F'],
+ 'categories': ['cs.AI', 'cs.HC'],
+ 'published': '2024-01-16T09:30:00Z',
+ 'abstract': 'This is another new paper about social good AI applications.',
+ 'link': 'https://arxiv.org/abs/2401.99998',
+ 'arxiv_id': '2401.99998'
+ }
+ ]
+
+ print(f"\n📝 Simulating the addition of {len(new_papers)} new papers:")
+ for i, paper in enumerate(new_papers, 1):
+ print(f" {i}. {paper['title'][:50]}... ({paper['published'][:10]})")
+
+ # Test the insertion logic
+ print(f"\n🧪 Testing the insertion-position lookup logic...")
+
+ class MockGitHubUpdater(GitHubUpdater):
+ def __init__(self):
+ # Skip the parent __init__ to avoid GitHub API calls
+ pass
+
+ def test_insert_position(self, content):
+ return self._find_papers_insert_position(content)
+
+ def test_format_new_section(self, papers, section_title):
+ new_section = f"\n\n## {section_title}\n\n"
+
+ for paper in papers:
+ # Format paper entry
+ authors_str = ", ".join(paper['authors'][:3]) # First 3 authors
+ if len(paper['authors']) > 3:
+ authors_str += " et al."
+
+ categories_str = ", ".join(paper['categories'])
+
+ new_section += f"### {paper['title']}\n\n"
+ new_section += f"**Authors:** {authors_str}\n\n"
+ new_section += f"**Categories:** {categories_str}\n\n"
+ new_section += f"**Published:** {paper['published']}\n\n"
+ new_section += f"**Abstract:** {paper['abstract']}\n\n"
+ new_section += f"**Link:** [arXiv:{paper['arxiv_id']}]({paper['link']})\n\n"
+ new_section += "---\n\n"
+
+ return new_section
+
+ # Test insertion position finding
+ updater = MockGitHubUpdater()
+ insert_pos = updater.test_insert_position(mock_readme_content)
+
+ if insert_pos > 0:
+ lines_before = mock_readme_content[:insert_pos].count('\n')
+ print(f" ✅ Found insertion point: after line {lines_before}")
+
+ # Show the context around insertion point
+ lines = mock_readme_content.split('\n')
+ context_start = max(0, lines_before - 2)
+ context_end = min(len(lines), lines_before + 3)
+
+ print(f" 📍 Context around the insertion point:")
+ for i in range(context_start, context_end):
+ if i < len(lines):
+ marker = " >>> insertion point <<<" if i == lines_before else ""
+ print(f" {i+1:2d}: {lines[i][:50]}{marker}")
+ else:
+ print(f" ⚠️ No suitable insertion point found; content will be appended at the end")
+
+ # Test the complete update logic
+ print(f"\n🔄 Testing the complete update logic...")
+
+ section_title = f"Papers Updated on {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M UTC')}"
+ new_section = updater.test_format_new_section(new_papers, section_title)
+
+ if insert_pos > 0:
+ updated_content = (mock_readme_content[:insert_pos] +
+ new_section +
+ mock_readme_content[insert_pos:])
+ print(f" ✅ New content inserted at the correct position")
+ else:
+ updated_content = mock_readme_content + new_section
+ print(f" ⚠️ New content appended to the end")
+
+ # Analyze the result
+ print(f"\n📊 Result analysis:")
+
+ # Find all paper sections in the updated content
+ lines = updated_content.split('\n')
+ paper_sections = []
+
+ for i, line in enumerate(lines):
+ if line.startswith('## Papers Updated on') or line.startswith('## Historical'):
+ # Found a paper section header
+ section_info = {
+ 'line': i + 1,
+ 'title': line,
+ 'date_str': None
+ }
+
+ # Extract date from title
+ if 'Updated on' in line:
+ try:
+ date_part = line.split('Updated on ')[1].split(' UTC')[0]
+ section_info['date_str'] = date_part
+ except:
+ pass
+
+ paper_sections.append(section_info)
+
+ print(f" - Found {len(paper_sections)} paper sections:")
+ for i, section in enumerate(paper_sections, 1):
+ print(f" {i}. {section['title'][:60]}... (line {section['line']})")
+
+ # Check if chronological order is correct
+ if len(paper_sections) >= 2:
+ first_section = paper_sections[0]
+ second_section = paper_sections[1]
+
+ print(f"\n🎯 Reverse chronological order check:")
+ print(f" - First section: {first_section['title'][:40]}...")
+ print(f" - Second section: {second_section['title'][:40]}...")
+
+ if first_section['date_str'] and second_section['date_str']:
+ first_is_newer = first_section['date_str'] > second_section['date_str']
+ if first_is_newer:
+ print(f" ✅ Reverse chronological order is correct; the newest papers are at the top")
+ else:
+ print(f" ❌ Reverse chronological order is wrong; the insertion logic needs adjustment")
+ else:
+ print(f" ℹ️ Unable to compare dates; please check manually")
+
+ # Save result to temporary file for inspection
+ with tempfile.NamedTemporaryFile(mode='w', suffix='.md', delete=False) as f:
+ f.write(updated_content)
+ temp_file = f.name
+
+ print(f"\n📄 Full result saved to temporary file: {temp_file}")
+ print(f" Inspect it to verify the README update result")
+
+ print(f"\n✅ Test complete!")
+ print(f" Key improvements:")
+ print(f" - ✅ New papers are inserted near the top of the README")
+ print(f" - ✅ Reverse chronological order is maintained (newest first)")
+ print(f" - ✅ No longer appends papers to the end of the document")
+ print(f" - ✅ Insertion point is detected automatically")
+
+
+if __name__ == "__main__":
+ test_reverse_chronological_order()
\ No newline at end of file
diff --git a/scripts/test_social_good_prompt.py b/scripts/test_social_good_prompt.py
new file mode 100644
index 0000000..c4ce5af
--- /dev/null
+++ b/scripts/test_social_good_prompt.py
@@ -0,0 +1,186 @@
+#!/usr/bin/env python3
+"""
+Test the social-good-oriented prompt.
+
+Verifies that the new prompt correctly identifies bias research with societal impact,
+covering application domains such as healthcare, education, and criminal justice.
+"""
+
+import os
+import sys
+import logging
+
+# Configure logging
+logging.basicConfig(
+ level=logging.INFO,
+ format='%(asctime)s - %(levelname)s - %(message)s',
+ handlers=[logging.StreamHandler(sys.stdout)]
+)
+logger = logging.getLogger(__name__)
+
+# Add the parent directory to the path so we can import the main module
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from scripts.fetch_papers import ArxivPaperFetcher, GPT_SYSTEM_PROMPT
+
+
+def test_prompt_with_examples():
+ """Test the new prompt with example papers."""
+
+ print("🔍 Testing the social-good-oriented prompt")
+ print("=" * 60)
+
+ # Check the API key
+ openai_api_key = os.getenv("OPENAI_API_KEY")
+ if not openai_api_key:
+ print("❌ Please set the OPENAI_API_KEY environment variable")
+ print(" export OPENAI_API_KEY='your-api-key-here'")
+ return
+
+ print("✅ OpenAI API key is set")
+ print(f"\n📋 Current prompt summary:")
+ print(" - Focuses on AI bias research with societal impact")
+ print(" - Covers application domains such as healthcare, education, criminal justice, and hiring")
+ print(" - Attends to vulnerable groups and social justice")
+
+ # Initialize the fetcher
+ fetcher = ArxivPaperFetcher(openai_api_key)
+
+ # Test cases: papers that should be identified as relevant
+ positive_examples = [
+ {
+ "title": "Algorithmic Bias in Healthcare AI: Impact on Minority Populations",
+ "abstract": "This study examines how machine learning models used in healthcare decision-making exhibit systematic bias against racial minorities, leading to disparate treatment outcomes. We analyze bias in diagnostic algorithms and propose mitigation strategies to ensure equitable healthcare delivery."
+ },
+ {
+ "title": "Fairness in Hiring: Bias Detection in Resume Screening AI",
+ "abstract": "We investigate gender and racial bias in AI-powered resume screening systems used by major corporations. Our analysis reveals significant discrimination patterns and proposes fairness-aware algorithms to promote equal employment opportunities."
+ },
+ {
+ "title": "Criminal Justice AI: Bias in Recidivism Prediction Systems",
+ "abstract": "This paper analyzes bias in algorithmic risk assessment tools used in criminal justice, showing how these systems perpetuate racial disparities in sentencing and parole decisions. We propose bias auditing frameworks for judicial AI systems."
+ },
+ {
+ "title": "Educational Equity: Bias in AI-Powered Learning Platforms",
+ "abstract": "We examine how bias in educational AI systems affects learning outcomes for students from different socioeconomic backgrounds, identifying disparities in recommendation algorithms and assessment tools that impact educational equity."
+ },
+ {
+ "title": "Social Media Content Moderation: Bias Against Marginalized Communities",
+ "abstract": "This study reveals how AI content moderation systems disproportionately flag and remove content from LGBTQ+ and minority communities, examining the social impact of biased algorithmic enforcement on free expression and community safety."
+ }
+ ]
+
+ # Test cases: papers that should be identified as not relevant
+ negative_examples = [
+ {
+ "title": "Optimizing Deep Neural Network Architecture for Image Classification",
+ "abstract": "We propose a novel neural network architecture that achieves state-of-the-art performance on ImageNet classification tasks. Our method introduces efficient attention mechanisms and demonstrates superior accuracy on standard benchmarks."
+ },
+ {
+ "title": "Quantum Computing Algorithms for Optimization Problems",
+ "abstract": "This paper presents quantum algorithms for solving complex optimization problems, demonstrating computational advantages over classical approaches. We analyze quantum circuit design and error correction techniques."
+ },
+ {
+ "title": "Blockchain Technology for Supply Chain Management",
+ "abstract": "We develop a blockchain-based framework for transparent supply chain tracking, improving traceability and reducing fraud in manufacturing and logistics. The system demonstrates scalability and security benefits."
+ },
+ {
+ "title": "5G Network Performance Optimization Using Machine Learning",
+ "abstract": "This study applies machine learning techniques to optimize 5G network performance, improving bandwidth allocation and reducing latency. We propose adaptive algorithms for network resource management."
+ }
+ ]
+
+ print(f"\n🧪 Starting tests...")
+
+ # Test the positive examples
+ print(f"\n✅ Testing papers that should be identified as relevant:")
+ positive_results = []
+ for i, example in enumerate(positive_examples, 1):
+ try:
+ is_relevant = fetcher._check_paper_relevance(example)
+ positive_results.append(is_relevant)
+ status = "✅ correct" if is_relevant else "❌ wrong"
+ print(f" {i}. {status}: {example['title'][:50]}...")
+ except Exception as e:
+ print(f" {i}. ⚠️ Error: {e}")
+ positive_results.append(False)
+
+ # Test the negative examples
+ print(f"\n❌ Testing papers that should be identified as not relevant:")
+ negative_results = []
+ for i, example in enumerate(negative_examples, 1):
+ try:
+ is_relevant = fetcher._check_paper_relevance(example)
+ negative_results.append(not is_relevant) # expected "not relevant", so negate
+ status = "✅ correct" if not is_relevant else "❌ wrong"
+ print(f" {i}. {status}: {example['title'][:50]}...")
+ except Exception as e:
+ print(f" {i}. ⚠️ Error: {e}")
+ negative_results.append(False)
+
+ # Compute accuracy
+ print(f"\n📊 Test result statistics:")
+ positive_accuracy = sum(positive_results) / len(positive_results) * 100
+ negative_accuracy = sum(negative_results) / len(negative_results) * 100
+ overall_accuracy = (sum(positive_results) + sum(negative_results)) / (len(positive_results) + len(negative_results)) * 100
+
+ print(f" - Positive example accuracy: {positive_accuracy:.1f}% ({sum(positive_results)}/{len(positive_results)})")
+ print(f" - Negative example accuracy: {negative_accuracy:.1f}% ({sum(negative_results)}/{len(negative_results)})")
+ print(f" - Overall accuracy: {overall_accuracy:.1f}%")
+
+ # Evaluate the results
+ print(f"\n🎯 Prompt evaluation:")
+ if overall_accuracy >= 80:
+ print(f" 🎉 Excellent! The prompt performs very well")
+ elif overall_accuracy >= 60:
+ print(f" ✅ Good; the prompt performs reasonably well")
+ else:
+ print(f" ⚠️ Needs improvement; the prompt may require adjustment")
+
+ # Point out specific areas for improvement
+ if positive_accuracy < 80:
+ print(f" 💡 Suggestion: strengthen recognition of social-good application scenarios")
+ if negative_accuracy < 80:
+ print(f" 💡 Suggestion: exclude purely technical research more explicitly")
+
+
+def display_new_prompt():
+ """Display the new prompt content."""
+
+ print(f"\n" + "="*60)
+ print("📖 The new social-good-oriented prompt")
+ print("="*60)
+
+ print(f"\n🎯 Main changes:")
+ print(" - From a focus on LLM bias → to the societal impact of AI more broadly")
+ print(" - Added specific application domains (healthcare, education, criminal justice, etc.)")
+ print(" - Emphasizes vulnerable groups and social justice")
+ print(" - Explicitly excludes purely technical research")
+
+ print(f"\n📋 New relevance criteria:")
+ domains = [
+ "Bias in healthcare AI",
+ "Fairness in educational technology",
+ "Discrimination in judicial algorithms",
+ "Bias in hiring AI",
+ "Fairness in financial AI",
+ "Bias in recommendation systems",
+ "Bias in content moderation",
+ "Representativeness of datasets",
+ "Inclusiveness of AI systems",
+ "Algorithm auditing and evaluation"
+ ]
+
+ for domain in domains:
+ print(f" ✅ {domain}")
+
+ print(f"\n🎯 Key focus areas:")
+ print(" - Social impact and social justice")
+ print(" - Vulnerable groups and marginalized communities")
+ print(" - Fairness in real-world applications")
+ print(" - Societal consequences of systemic bias")
+
+
+if __name__ == "__main__":
+ display_new_prompt()
+ test_prompt_with_examples()
\ No newline at end of file