# scripts/test_reverse_chronological.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241

#!/usr/bin/env python3
"""
测试时间倒序排列功能

验证README更新逻辑是否正确地将最新论文放在最前面，
确保论文始终按时间倒序排列。
"""

import os
import sys
import tempfile
from datetime import datetime, timezone

# Add the parent directory to the path so we can import the main module
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from scripts.fetch_papers import GitHubUpdater


def _format_new_section(papers, section_title):
    """Render papers as a markdown section headed by ``section_title``.

    Args:
        papers: Iterable of dicts with the keys ``title``, ``authors``,
            ``categories``, ``published``, ``abstract``, ``link`` and
            ``arxiv_id``.
        section_title: Text placed after ``## `` in the section heading.

    Returns:
        The markdown text. It starts with two newlines so it can be spliced
        directly into existing content.
    """
    parts = [f"\n\n## {section_title}\n\n"]

    for paper in papers:
        # Only the first three authors are listed; longer lists get "et al."
        authors_str = ", ".join(paper['authors'][:3])
        if len(paper['authors']) > 3:
            authors_str += " et al."

        categories_str = ", ".join(paper['categories'])

        parts.append(
            f"### {paper['title']}\n\n"
            f"**Authors:** {authors_str}\n\n"
            f"**Categories:** {categories_str}\n\n"
            f"**Published:** {paper['published']}\n\n"
            f"**Abstract:** {paper['abstract']}\n\n"
            f"**Link:** [arXiv:{paper['arxiv_id']}]({paper['link']})\n\n"
            "---\n\n"
        )

    return "".join(parts)


def _collect_paper_sections(content):
    """Find the paper section headings in ``content``.

    A heading is any line starting with ``## Papers Updated on`` or
    ``## Historical``.

    Returns:
        A list of dicts, in document order, with the keys ``line`` (1-based
        line number), ``title`` (the heading line) and ``date_str`` (the text
        between ``Updated on `` and `` UTC``, or ``None`` when the heading
        carries no such date).
    """
    sections = []

    for line_no, line in enumerate(content.split('\n'), 1):
        if not line.startswith(('## Papers Updated on', '## Historical')):
            continue

        date_str = None
        # Checking for the separator (with its trailing space) up front
        # guarantees the split yields a second element, so no exception
        # handling is needed here.
        if 'Updated on ' in line:
            date_str = line.split('Updated on ')[1].split(' UTC')[0]

        sections.append({'line': line_no, 'title': line, 'date_str': date_str})

    return sections


def test_reverse_chronological_order():
    """Check that new papers are inserted above older ones in the README.

    Builds a mock README that already contains one dated paper section, asks
    ``GitHubUpdater._find_papers_insert_position`` where new content belongs,
    splices a freshly formatted section in at that position, and then compares
    the dates of the first two section headings. Progress is printed along the
    way and the resulting document is saved to a temporary file for manual
    inspection.

    Raises:
        AssertionError: If the first two section headings both carry a date
            and the first one is not later than the second.
    """

    print("🔍 测试README时间倒序排列功能")
    print("=" * 60)

    # Mock README: project description followed by one existing paper section.
    mock_readme_content = """# ArXiv Social Good AI Paper Fetcher

An automated system for discovering and cataloging research papers related to AI bias, fairness, and social good from arXiv.org.

## 🎯 Features

- **Intelligent Paper Detection**: Uses GPT-4o to analyze papers
- **Automated Daily Updates**: Runs daily via GitHub Actions

## 🔧 Setup & Configuration

Setup instructions here...

## 🚀 Usage

Usage instructions here...

**Note**: This tool is designed for academic research purposes. Please respect arXiv's usage policies.

## Papers Updated on 2024-01-15 08:00 UTC

### Old Paper 1

**Authors:** Author A, Author B

**Categories:** cs.AI, cs.LG

**Published:** 2024-01-14T10:00:00Z

**Abstract:** This is an old paper abstract...

**Link:** [arXiv:2401.12345](https://arxiv.org/abs/2401.12345)

---

### Old Paper 2

**Authors:** Author C, Author D

**Categories:** cs.CL

**Published:** 2024-01-13T15:30:00Z

**Abstract:** This is another old paper abstract...

**Link:** [arXiv:2401.12346](https://arxiv.org/abs/2401.12346)

---
"""

    print("📄 模拟的现有README内容:")
    print("   - 包含项目描述和设置说明")
    print("   - 已有2篇旧论文 (2024-01-15 和 2024-01-13)")
    print("   - 测试新论文是否会插入到正确位置")

    # Mock new papers; their section is expected to land above the old one.
    new_papers = [
        {
            'title': 'Brand New Paper on AI Fairness',
            'authors': ['New Author A', 'New Author B', 'New Author C', 'New Author D'],
            'categories': ['cs.AI', 'cs.LG', 'cs.CL'],
            'published': '2024-01-16T12:00:00Z',
            'abstract': 'This is a brand new paper about AI fairness that should appear at the top of the README.',
            'link': 'https://arxiv.org/abs/2401.99999',
            'arxiv_id': '2401.99999'
        },
        {
            'title': 'Another New Paper on Social Good AI',
            'authors': ['New Author E', 'New Author F'],
            'categories': ['cs.AI', 'cs.HC'],
            'published': '2024-01-16T09:30:00Z',
            'abstract': 'This is another new paper about social good AI applications.',
            'link': 'https://arxiv.org/abs/2401.99998',
            'arxiv_id': '2401.99998'
        }
    ]

    print(f"\n📝 模拟添加 {len(new_papers)} 篇新论文:")
    for i, paper in enumerate(new_papers, 1):
        print(f"   {i}. {paper['title'][:50]}... ({paper['published'][:10]})")

    print("\n🧪 测试插入位置查找逻辑...")

    class MockGitHubUpdater(GitHubUpdater):
        """GitHubUpdater stand-in exposing the insert-position lookup."""

        def __init__(self):
            # Skip the parent __init__ to avoid GitHub API calls
            pass

        def test_insert_position(self, content):
            return self._find_papers_insert_position(content)

    # Ask the real implementation where new paper sections should go.
    updater = MockGitHubUpdater()
    insert_pos = updater.test_insert_position(mock_readme_content)

    if insert_pos > 0:
        # Number of newlines before the offset == index of the line that
        # contains the insertion point.
        lines_before = mock_readme_content[:insert_pos].count('\n')
        print(f"   ✅ 找到插入位置: 第 {lines_before} 行之后")

        # Show a few lines of context around the insertion point. The upper
        # bound is clamped to len(lines), so indexing below is always valid.
        lines = mock_readme_content.split('\n')
        context_start = max(0, lines_before - 2)
        context_end = min(len(lines), lines_before + 3)

        print("   📍 插入位置上下文:")
        for i in range(context_start, context_end):
            marker = " >>> 插入点 <<<" if i == lines_before else ""
            print(f"     {i+1:2d}: {lines[i][:50]}{marker}")
    else:
        print("   ⚠️ 未找到合适插入位置，将使用末尾追加")

    print("\n🔄 测试完整的更新逻辑...")

    section_title = f"Papers Updated on {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M UTC')}"
    new_section = _format_new_section(new_papers, section_title)

    if insert_pos > 0:
        updated_content = (mock_readme_content[:insert_pos] +
                           new_section +
                           mock_readme_content[insert_pos:])
        print("   ✅ 新内容插入到正确位置")
    else:
        # No insert position reported: fall back to appending at the end.
        updated_content = mock_readme_content + new_section
        print("   ⚠️ 新内容追加到末尾")

    print("\n📊 结果分析:")

    paper_sections = _collect_paper_sections(updated_content)

    print(f"   - 找到 {len(paper_sections)} 个论文段落:")
    for i, section in enumerate(paper_sections, 1):
        print(f"     {i}. {section['title'][:60]}... (第{section['line']}行)")

    # None means "could not be determined"; only a definite False fails.
    order_ok = None
    if len(paper_sections) >= 2:
        first_section = paper_sections[0]
        second_section = paper_sections[1]

        print("\n🎯 时间倒序验证:")
        print(f"   - 第一个段落: {first_section['title'][:40]}...")
        print(f"   - 第二个段落: {second_section['title'][:40]}...")

        if first_section['date_str'] and second_section['date_str']:
            # Both dates are 'YYYY-MM-DD HH:MM' strings, so lexicographic
            # order matches chronological order.
            order_ok = first_section['date_str'] > second_section['date_str']
            if order_ok:
                print("   ✅ 时间倒序正确！最新论文在最上面")
            else:
                print("   ❌ 时间倒序错误！需要调整插入逻辑")
        else:
            print("   ℹ️ 无法比较日期，请手动检查")

    # Save the result for manual inspection. The content contains emoji, so
    # the encoding is pinned to UTF-8 instead of the locale default, which
    # may be unable to encode them.
    with tempfile.NamedTemporaryFile(mode='w', suffix='.md', delete=False,
                                     encoding='utf-8') as f:
        f.write(updated_content)
        temp_file = f.name

    print(f"\n📄 完整结果已保存到临时文件: {temp_file}")
    print("   可以手动检查README更新结果")

    # Fail only after the file is written, so it is still available for
    # debugging when the order is wrong.
    assert order_ok is not False, (
        "README paper sections are not in reverse chronological order; "
        f"see {temp_file}"
    )

    print("\n✅ 测试完成！")
    print("   关键改进:")
    print("   - ✅ 新论文会插入到README开头部分")
    print("   - ✅ 保持时间倒序排列（最新在上）")
    print("   - ✅ 避免在文档末尾追加")
    print("   - ✅ 智能识别插入位置")

# Run the check directly when this file is executed as a script.
if __name__ == "__main__":
    test_reverse_chronological_order()