Diffstat (limited to 'scripts/test_improved_fetch.py')
| -rw-r--r-- | scripts/test_improved_fetch.py | 168 |
1 files changed, 168 insertions, 0 deletions
diff --git a/scripts/test_improved_fetch.py b/scripts/test_improved_fetch.py
new file mode 100644
index 0000000..14490f0
--- /dev/null
+++ b/scripts/test_improved_fetch.py
@@ -0,0 +1,168 @@
+#!/usr/bin/env python3
+"""
+Test the improved paper-fetching logic.
+
+Verifies that querying each category separately and deduplicating
+the results both work correctly.
+"""
+
+import os
+import sys
+import logging
+from datetime import datetime, timezone, timedelta
+
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s',
+    handlers=[logging.StreamHandler(sys.stdout)]
+)
+logger = logging.getLogger(__name__)
+
+# Add the parent directory to the path so we can import the main module
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from scripts.fetch_papers import ArxivPaperFetcher
+
+
+def test_improved_fetching():
+    """Test the improved fetching logic."""
+
+    print("🚀 Testing the improved paper-fetching logic")
+    print("=" * 60)
+
+    # Create a mock fetcher (no OpenAI API required)
+    class MockArxivFetcher(ArxivPaperFetcher):
+        def __init__(self):
+            import requests
+            self.session = requests.Session()
+            self.session.headers.update({
+                'User-Agent': 'PaperFetcher/1.0 (Test)'
+            })
+
+    # Exercise different time ranges
+    fetcher = MockArxivFetcher()
+
+    print("\n🕐 Test 1: past 1 day (should show 0 papers)")
+    end_date = datetime.now(timezone.utc)
+    start_date = end_date - timedelta(days=1)
+    papers_1day = fetcher.fetch_papers_by_date_range(start_date, end_date, max_papers=100)
+
+    print("\n🕐 Test 2: past 7 days (should show more papers and a detailed distribution)")
+    start_date = end_date - timedelta(days=7)
+    papers_7days = fetcher.fetch_papers_by_date_range(start_date, end_date, max_papers=300)
+
+    print("\n📊 Comparison of results:")
+    print(f"  - Past 1 day: {len(papers_1day)} papers")
+    print(f"  - Past 7 days: {len(papers_7days)} papers")
+
+    if papers_7days:
+        print("\n📋 Paper sample (first 3):")
+        for i, paper in enumerate(papers_7days[:3], 1):
+            print(f"\n{i}. {paper['title'][:80]}...")
+            print(f"   arXiv ID: {paper['arxiv_id']}")
+            print(f"   Updated: {paper['updated']}")
+            print(f"   Categories: {', '.join(paper['categories'][:3])}")
+            print(f"   Authors: {', '.join(paper['authors'][:2])}")
+            if len(paper['authors']) > 2:
+                print("   et al.")
+
+    print("\n✅ Advantages of the improved approach:")
+    print("  - ✅ Queries each category separately, avoiding OR-query limits")
+    print("  - ✅ Deduplicates automatically, avoiding repeated papers")
+    print("  - ✅ Detailed per-category distribution statistics")
+    print("  - ✅ More accurate date-distribution analysis")
+    print("  - ✅ More transparent logging")
+
+
+def test_category_overlap():
+    """Test category overlap and deduplication."""
+
+    print("\n" + "=" * 60)
+    print("🔍 Testing category overlap and deduplication")
+    print("=" * 60)
+
+    # Simple test: fetch a few categories by hand and inspect the overlap
+    import requests
+    import feedparser
+    from collections import defaultdict
+
+    categories = ['cs.AI', 'cs.LG', 'cs.CL']
+    papers_by_category = {}
+    arxiv_ids_seen = set()
+    overlaps = defaultdict(list)
+
+    for cat in categories:
+        print(f"\n📂 Fetching papers for category {cat}...")
+
+        params = {
+            'search_query': f'cat:{cat}',
+            'sortBy': 'submittedDate',
+            'sortOrder': 'descending',
+            'max_results': 50
+        }
+
+        try:
+            response = requests.get('http://export.arxiv.org/api/query', params=params, timeout=10)
+            feed = feedparser.parse(response.content)
+            entries = feed.entries
+
+            papers_by_category[cat] = []
+
+            for entry in entries:
+                arxiv_id = entry.id.split('/')[-1]
+                title = entry.title.replace('\n', ' ').strip()
+                categories_list = [tag.term for tag in entry.tags] if hasattr(entry, 'tags') else []
+
+                papers_by_category[cat].append({
+                    'arxiv_id': arxiv_id,
+                    'title': title,
+                    'categories': categories_list
+                })
+
+                # Record which categories each arXiv ID appears in
+                if arxiv_id in arxiv_ids_seen:
+                    overlaps[arxiv_id].append(cat)
+                else:
+                    arxiv_ids_seen.add(arxiv_id)
+                    overlaps[arxiv_id] = [cat]
+
+            print(f"  Got {len(entries)} papers")
+
+        except Exception as e:
+            print(f"  Error: {e}")
+
+    # Analyze the overlap
+    print("\n📊 Overlap analysis:")
+    total_papers = sum(len(papers) for papers in papers_by_category.values())
+    unique_papers = len(arxiv_ids_seen)
+    duplicate_papers = total_papers - unique_papers
+
+    print(f"  - Total papers fetched: {total_papers}")
+    print(f"  - Unique papers: {unique_papers}")
+    print(f"  - Duplicate papers: {duplicate_papers}")
+    if total_papers:  # guard against division by zero when all requests fail
+        print(f"  - Duplication rate: {duplicate_papers / total_papers * 100:.1f}%")
+
+    # Show a few examples of overlapping papers
+    overlap_examples = [(arxiv_id, cats) for arxiv_id, cats in overlaps.items() if len(cats) > 1][:5]
+
+    if overlap_examples:
+        print("\n📋 Examples of overlapping papers:")
+        for arxiv_id, cats in overlap_examples:
+            # Look up this paper's title
+            title = "Title not found"
+            for cat, papers in papers_by_category.items():
+                for paper in papers:
+                    if paper['arxiv_id'] == arxiv_id:
+                        title = paper['title'][:60] + "..." if len(paper['title']) > 60 else paper['title']
+                        break
+                if title != "Title not found":
+                    break
+
+            print(f"  - {arxiv_id}: {title}")
+            print(f"    Categories: {', '.join(cats)}")
+
+    print("\n✅ This demonstrates why deduplication matters!")
+
+
+if __name__ == "__main__":
+    test_improved_fetching()
+    test_category_overlap()
\ No newline at end of file
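
The method this test exercises, ArxivPaperFetcher.fetch_papers_by_date_range, lives in scripts/fetch_papers.py and is not part of this diff. As a rough guide to what the test expects, here is a minimal sketch of a per-category query with deduplication by arXiv ID. The function name fetch_by_date_range_sketch and the ARXIV_API constant are hypothetical, not taken from the repository; the submittedDate range filter is standard arXiv API query syntax, but the real method body may differ.

    # Sketch only: per-category arXiv queries merged with dedup by arXiv ID.
    import requests
    import feedparser

    ARXIV_API = 'http://export.arxiv.org/api/query'  # hypothetical constant

    def fetch_by_date_range_sketch(categories, start_date, end_date, max_papers=100):
        """Query each category separately, then merge results on arXiv ID."""
        date_filter = 'submittedDate:[{} TO {}]'.format(
            start_date.strftime('%Y%m%d%H%M'), end_date.strftime('%Y%m%d%H%M'))
        seen, merged = set(), []
        for cat in categories:
            params = {
                'search_query': f'cat:{cat} AND {date_filter}',
                'sortBy': 'submittedDate',
                'sortOrder': 'descending',
                'max_results': max_papers,
            }
            response = requests.get(ARXIV_API, params=params, timeout=10)
            feed = feedparser.parse(response.content)
            for entry in feed.entries:
                arxiv_id = entry.id.split('/')[-1]
                if arxiv_id in seen:  # same paper listed under several categories
                    continue
                seen.add(arxiv_id)
                merged.append({
                    'arxiv_id': arxiv_id,
                    'title': entry.title.replace('\n', ' ').strip(),
                    'updated': entry.updated,
                    'categories': [t.term for t in entry.tags] if hasattr(entry, 'tags') else [],
                    'authors': [a.name for a in entry.authors] if hasattr(entry, 'authors') else [],
                })
        return merged[:max_papers]

Keeping the first copy of each cross-listed paper is what makes the duplicate count in test_category_overlap meaningful: a paper cross-listed under cs.AI and cs.LG is fetched twice but reported once. Given the sys.path adjustment at the top of the file, the test should be runnable directly with python scripts/test_improved_fetch.py from the repository root.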
