author     haoyuren <13851610112@163.com>  2025-06-29 16:43:03 -0700
committer  haoyuren <13851610112@163.com>  2025-06-29 16:43:03 -0700
commit     88ab5e0d4f9a62018a428a833f3fa9cb6addba15 (patch)
tree       43498d466b9c5af07b7155130067dc777c5af655 /scripts/test_fetch_only.py
parent     388f0407ef8c9f68866509f722491fcfd44afa11 (diff)
fix fetch
Diffstat (limited to 'scripts/test_fetch_only.py')
-rw-r--r--  scripts/test_fetch_only.py  169
1 file changed, 169 insertions, 0 deletions
diff --git a/scripts/test_fetch_only.py b/scripts/test_fetch_only.py
new file mode 100644
index 0000000..db65231
--- /dev/null
+++ b/scripts/test_fetch_only.py
@@ -0,0 +1,169 @@
+#!/usr/bin/env python3
+"""
+Test the paper-fetching functionality - show the improved logging.
+
+This script only exercises the paper-fetching step, demonstrating the pagination process and the date distribution of results; it does not require the OpenAI API.
+"""
+
+import os
+import sys
+import logging
+from datetime import datetime, timezone, timedelta
+from collections import Counter
+
+# Set up logging
+logging.basicConfig(
+ level=logging.INFO,
+ format='%(asctime)s - %(levelname)s - %(message)s',
+ handlers=[logging.StreamHandler(sys.stdout)]
+)
+logger = logging.getLogger(__name__)
+
+# Add the parent directory to the path so we can import the main module
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from scripts.fetch_papers import ArxivPaperFetcher
+
+
+def test_paper_fetching_with_detailed_logs():
+ """测试论文抓取,显示详细的分页和日期信息"""
+
+ print("🚀 测试改进后的论文抓取日志显示")
+ print("=" * 60)
+
+    # Create a mock fetcher (no OpenAI API key required)
+ class MockArxivFetcher:
+ def __init__(self):
+ import requests
+ self.session = requests.Session()
+ self.session.headers.update({
+ 'User-Agent': 'PaperFetcher/1.0 (Test)'
+ })
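+            # The custom User-Agent above identifies this test client when talking to the arXiv API.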
+
+ def fetch_papers_by_date_range(self, start_date, end_date, max_papers=300):
+ """模拟我们改进后的抓取函数"""
+ logger.info(f"🔍 开始从arXiv抓取论文: {start_date.date()} 到 {end_date.date()}")
+ logger.info(f"📋 目标类别: cs.AI, cs.CL, cs.CV, cs.LG, cs.NE, cs.RO, cs.IR, cs.HC, stat.ML")
+
+ from scripts.fetch_papers import ARXIV_BASE_URL, CS_CATEGORIES, MAX_RESULTS_PER_BATCH
+ import requests
+ import feedparser
+
+ # Build category query
+ category_query = " OR ".join(f"cat:{cat}" for cat in CS_CATEGORIES)
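+        # e.g. "cat:cs.AI OR cat:cs.CL OR ... OR cat:stat.ML" -- one cat: clause per entry in CS_CATEGORIES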
+
+ all_papers = []
+ start_index = 0
+ batch_count = 0
+ total_raw_papers = 0
+
+ while len(all_papers) < max_papers:
+ try:
+ batch_count += 1
+ search_query = f"({category_query})"
+
+ params = {
+ "search_query": search_query,
+ "sortBy": "submittedDate",
+ "sortOrder": "descending",
+ "start": start_index,
+ "max_results": min(MAX_RESULTS_PER_BATCH, max_papers - len(all_papers))
+ }
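+                # search_query / start / max_results plus the sort options follow the arXiv
+                # Atom API's query parameter names; `start` is the offset that drives pagination.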
+
+ logger.info(f"📦 第{batch_count}批次: 从索引{start_index}开始抓取...")
+
+ response = self.session.get(ARXIV_BASE_URL, params=params, timeout=30)
+ response.raise_for_status()
+
+ feed = feedparser.parse(response.content)
+ entries = feed.entries
+ total_raw_papers += len(entries)
+
+ logger.info(f"✅ 第{batch_count}批次获取了 {len(entries)} 篇论文")
+
+ if not entries:
+ logger.info("📭 没有更多论文可用")
+ break
+
+ # Filter papers by date and parse them
+ batch_papers = []
+ older_papers = 0
+ for entry in entries:
+ paper_date = datetime(*entry.updated_parsed[:6], tzinfo=timezone.utc)
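+                    # feedparser normalizes updated_parsed to a UTC time.struct_time; its first
+                    # six fields (year through second) build the timezone-aware datetime above.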
+
+ if paper_date < start_date:
+ older_papers += 1
+ continue
+
+ if start_date <= paper_date <= end_date:
+ paper_data = {
+ "title": entry.title.replace('\n', ' ').strip(),
+ "abstract": entry.summary.replace('\n', ' ').strip(),
+ "authors": [author.name for author in entry.authors] if hasattr(entry, 'authors') else [],
+ "published": entry.published,
+ "updated": entry.updated,
+ "link": entry.link,
+ "arxiv_id": entry.id.split('/')[-1],
+ "categories": [tag.term for tag in entry.tags] if hasattr(entry, 'tags') else []
+ }
+ batch_papers.append(paper_data)
+
+ all_papers.extend(batch_papers)
+ logger.info(f"📊 第{batch_count}批次筛选结果: {len(batch_papers)}篇在日期范围内, {older_papers}篇过旧")
+ logger.info(f"📈 累计获取论文: {len(all_papers)}篇")
+
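+                # Results arrive sorted by submittedDate in descending order, so once papers older
+                # than start_date show up, the remaining pages are assumed to be out of range as well.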
+ if older_papers > 0:
+ logger.info(f"🔚 发现{older_papers}篇超出日期范围的论文,停止抓取")
+ break
+
+ if len(entries) < MAX_RESULTS_PER_BATCH:
+ logger.info("🔚 已达到arXiv数据末尾")
+ break
+
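+                # Advance the offset by a full batch so the next request fetches the following page.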
+ start_index += MAX_RESULTS_PER_BATCH
+
+ except Exception as e:
+                logger.error(f"❌ Error while fetching papers: {e}")
+ break
+
+        # Print a summary
+        logger.info("📊 Fetch summary:")
+        logger.info(f"  - Processed {batch_count} batches in total")
+        logger.info(f"  - Retrieved {total_raw_papers} raw papers from arXiv")
+        logger.info(f"  - Kept {len(all_papers)} papers within the date range")
+
+        # Show the date distribution
+ if all_papers:
+ dates = []
+ for paper in all_papers:
+ paper_date = datetime.strptime(paper['updated'][:10], '%Y-%m-%d')
+ dates.append(paper_date.strftime('%Y-%m-%d'))
+
+ date_counts = Counter(dates)
+ logger.info(f"📅 论文日期分布 (前5天):")
+ for date, count in date_counts.most_common(5):
+ days_ago = (datetime.now(timezone.utc).date() - datetime.strptime(date, '%Y-%m-%d').date()).days
+ logger.info(f" - {date}: {count}篇 ({days_ago}天前)")
+
+ return all_papers
+
+    # fetch_papers_by_date_range is defined above at function scope (not inside the class),
+    # so bind it onto the mock class so that fetcher.fetch_papers_by_date_range(...) works.
+    MockArxivFetcher.fetch_papers_by_date_range = fetch_papers_by_date_range
+
+    # Test different time ranges
+ fetcher = MockArxivFetcher()
+
+ print("\n🕐 测试1: 过去1天")
+ end_date = datetime.now(timezone.utc)
+ start_date = end_date - timedelta(days=1)
+ papers_1day = fetcher.fetch_papers_by_date_range(start_date, end_date, max_papers=50)
+
+ print(f"\n🕐 测试2: 过去7天")
+ start_date = end_date - timedelta(days=7)
+ papers_7days = fetcher.fetch_papers_by_date_range(start_date, end_date, max_papers=200)
+
+ print(f"\n📊 对比结果:")
+ print(f" - 过去1天: {len(papers_1day)} 篇论文")
+ print(f" - 过去7天: {len(papers_7days)} 篇论文")
+ print(f" - 这解释了为什么日常模式很快完成!")
+
+
+if __name__ == "__main__":
+    test_paper_fetching_with_detailed_logs()
\ No newline at end of file