diff options
Diffstat (limited to 'scripts/test_reverse_chronological.py')
| -rw-r--r-- | scripts/test_reverse_chronological.py | 241 |
1 files changed, 241 insertions, 0 deletions
#!/usr/bin/env python3
"""
Test the reverse-chronological ordering of README updates.

Verifies that the README update logic places the newest paper section ahead of
the older ones, so that paper sections stay sorted from newest to oldest.
"""

import os
import sys
import tempfile
from datetime import datetime, timezone

# Add the parent directory to the path so we can import the main module
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from scripts.fetch_papers import GitHubUpdater


def test_reverse_chronological_order():
    """Check that a new paper section lands above the existing ones.

    Builds a mock README that already holds one dated paper section, asks
    ``GitHubUpdater._find_papers_insert_position`` where new content belongs,
    splices a freshly dated section in at that offset (or appends it when no
    position is found), and then compares the dates of the first two section
    headers in the result.

    The updated README is written to a temporary ``.md`` file that is kept on
    disk for manual inspection.

    Raises:
        AssertionError: if both dates could be extracted and the first section
            is not newer than the second one.
    """

    print("🔍 测试README时间倒序排列功能")
    print("=" * 60)

    # Create a mock README content
    mock_readme_content = """# ArXiv Social Good AI Paper Fetcher

An automated system for discovering and cataloging research papers related to AI bias, fairness, and social good from arXiv.org.

## 🎯 Features

- **Intelligent Paper Detection**: Uses GPT-4o to analyze papers
- **Automated Daily Updates**: Runs daily via GitHub Actions

## 🔧 Setup & Configuration

Setup instructions here...

## 🚀 Usage

Usage instructions here...

**Note**: This tool is designed for academic research purposes. Please respect arXiv's usage policies.

## Papers Updated on 2024-01-15 08:00 UTC

### Old Paper 1

**Authors:** Author A, Author B

**Categories:** cs.AI, cs.LG

**Published:** 2024-01-14T10:00:00Z

**Abstract:** This is an old paper abstract...

**Link:** [arXiv:2401.12345](https://arxiv.org/abs/2401.12345)

---

### Old Paper 2

**Authors:** Author C, Author D

**Categories:** cs.CL

**Published:** 2024-01-13T15:30:00Z

**Abstract:** This is another old paper abstract...

**Link:** [arXiv:2401.12346](https://arxiv.org/abs/2401.12346)

---
"""

    print("📄 模拟的现有README内容:")
    print(" - 包含项目描述和设置说明")
    print(" - 已有2篇旧论文 (2024-01-15 和 2024-01-13)")
    print(" - 测试新论文是否会插入到正确位置")

    # Create mock new papers (should be inserted at the top)
    new_papers = [
        {
            'title': 'Brand New Paper on AI Fairness',
            'authors': ['New Author A', 'New Author B', 'New Author C', 'New Author D'],
            'categories': ['cs.AI', 'cs.LG', 'cs.CL'],
            'published': '2024-01-16T12:00:00Z',
            'abstract': 'This is a brand new paper about AI fairness that should appear at the top of the README.',
            'link': 'https://arxiv.org/abs/2401.99999',
            'arxiv_id': '2401.99999',
        },
        {
            'title': 'Another New Paper on Social Good AI',
            'authors': ['New Author E', 'New Author F'],
            'categories': ['cs.AI', 'cs.HC'],
            'published': '2024-01-16T09:30:00Z',
            'abstract': 'This is another new paper about social good AI applications.',
            'link': 'https://arxiv.org/abs/2401.99998',
            'arxiv_id': '2401.99998',
        },
    ]

    print(f"\n📝 模拟添加 {len(new_papers)} 篇新论文:")
    for i, paper in enumerate(new_papers, 1):
        print(f" {i}. {paper['title'][:50]}... ({paper['published'][:10]})")

    # Test the insertion logic
    print("\n🧪 测试插入位置查找逻辑...")

    class MockGitHubUpdater(GitHubUpdater):
        """GitHubUpdater stand-in that can be built without any setup."""

        def __init__(self):
            # Skip the parent __init__ to avoid GitHub API calls
            pass

        def test_insert_position(self, content):
            """Expose the updater's private insert-position lookup."""
            return self._find_papers_insert_position(content)

        def test_format_new_section(self, papers, section_title):
            """Render ``papers`` as a markdown section headed by ``section_title``.

            NOTE(review): presumably mirrors the formatting used by the real
            updater -- confirm against scripts/fetch_papers.py.
            """
            parts = [f"\n\n## {section_title}\n\n"]

            for paper in papers:
                # Only the first three authors are listed by name.
                authors_str = ", ".join(paper['authors'][:3])
                if len(paper['authors']) > 3:
                    authors_str += " et al."

                categories_str = ", ".join(paper['categories'])

                parts.append(
                    f"### {paper['title']}\n\n"
                    f"**Authors:** {authors_str}\n\n"
                    f"**Categories:** {categories_str}\n\n"
                    f"**Published:** {paper['published']}\n\n"
                    f"**Abstract:** {paper['abstract']}\n\n"
                    f"**Link:** [arXiv:{paper['arxiv_id']}]({paper['link']})\n\n"
                    "---\n\n"
                )

            return "".join(parts)

    # Test insertion position finding
    updater = MockGitHubUpdater()
    insert_pos = updater.test_insert_position(mock_readme_content)

    if insert_pos > 0:
        # Number of newlines before the offset == 0-based index of the line
        # that contains the insertion point.
        lines_before = mock_readme_content[:insert_pos].count('\n')
        print(f" ✅ 找到插入位置: 第 {lines_before} 行之后")

        # Show the context around insertion point
        lines = mock_readme_content.split('\n')
        context_start = max(0, lines_before - 2)
        context_end = min(len(lines), lines_before + 3)

        print(" 📍 插入位置上下文:")
        for i in range(context_start, context_end):
            marker = " >>> 插入点 <<<" if i == lines_before else ""
            print(f" {i+1:2d}: {lines[i][:50]}{marker}")
    else:
        print(" ⚠️ 未找到合适插入位置,将使用末尾追加")

    # Test the complete update logic
    print("\n🔄 测试完整的更新逻辑...")

    section_title = f"Papers Updated on {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M UTC')}"
    new_section = updater.test_format_new_section(new_papers, section_title)

    if insert_pos > 0:
        updated_content = (
            mock_readme_content[:insert_pos]
            + new_section
            + mock_readme_content[insert_pos:]
        )
        print(" ✅ 新内容插入到正确位置")
    else:
        updated_content = mock_readme_content + new_section
        print(" ⚠️ 新内容追加到末尾")

    # Analyze the result
    print("\n📊 结果分析:")

    # Find all paper sections in the updated content
    paper_sections = []
    for line_no, line in enumerate(updated_content.split('\n'), 1):
        if not line.startswith(('## Papers Updated on', '## Historical')):
            continue

        section_info = {'line': line_no, 'title': line, 'date_str': None}

        # Extract "YYYY-MM-DD HH:MM" from "... Updated on <date> UTC".
        # partition() never raises, so no exception handling is needed.
        _, found, remainder = line.partition('Updated on ')
        if found:
            section_info['date_str'] = remainder.split(' UTC')[0]

        paper_sections.append(section_info)

    print(f" - 找到 {len(paper_sections)} 个论文段落:")
    for i, section in enumerate(paper_sections, 1):
        print(f" {i}. {section['title'][:60]}... (第{section['line']}行)")

    # Check if chronological order is correct.
    # order_ok stays None when the two dates cannot be compared.
    order_ok = None
    if len(paper_sections) >= 2:
        first_section, second_section = paper_sections[:2]

        print("\n🎯 时间倒序验证:")
        print(f" - 第一个段落: {first_section['title'][:40]}...")
        print(f" - 第二个段落: {second_section['title'][:40]}...")

        if first_section['date_str'] and second_section['date_str']:
            # Fixed-width "YYYY-MM-DD HH:MM" strings sort chronologically.
            order_ok = first_section['date_str'] > second_section['date_str']
            if order_ok:
                print(" ✅ 时间倒序正确!最新论文在最上面")
            else:
                print(" ❌ 时间倒序错误!需要调整插入逻辑")
        else:
            print(" ℹ️ 无法比较日期,请手动检查")

    # Save result to temporary file for inspection. The content contains
    # emoji, so the encoding is pinned instead of relying on the locale.
    with tempfile.NamedTemporaryFile(
        mode='w', suffix='.md', delete=False, encoding='utf-8'
    ) as f:
        f.write(updated_content)
        temp_file = f.name

    print(f"\n📄 完整结果已保存到临时文件: {temp_file}")
    print(" 可以手动检查README更新结果")

    # Fail only after the inspection file has been written, so a broken
    # ordering can still be examined by hand.
    assert order_ok is not False, (
        "newest paper section is not first; inspect " + temp_file
    )

    print("\n✅ 测试完成!")
    print(" 关键改进:")
    print(" - ✅ 新论文会插入到README开头部分")
    print(" - ✅ 保持时间倒序排列(最新在上)")
    print(" - ✅ 避免在文档末尾追加")
    print(" - ✅ 智能识别插入位置")


if __name__ == "__main__":
    test_reverse_chronological_order()
\ No newline at end of file |
