#!/usr/bin/env python3
"""
Test the arXiv connection - no OpenAI API key required.

This script only exercises the arXiv API connection and the paper-fetching
path; it does not involve GPT filtering.
"""

# Provenance: scripts/test_arxiv_only.py as added by commit
# 88ab5e0d4f9a62018a428a833f3fa9cb6addba15 ("fix fetch").

import sys
from datetime import datetime, timedelta, timezone

import feedparser
import requests

# Shared request settings so both checks hit the same endpoint identically.
ARXIV_API_URL = "http://export.arxiv.org/api/query"
REQUEST_TIMEOUT_SECONDS = 15


def _entry_updated_utc(entry):
    """Return the entry's ``updated`` timestamp as an aware UTC datetime.

    Returns ``None`` when the entry has no parsed ``updated`` date, so that
    one malformed entry cannot abort the whole run.
    """
    parsed = getattr(entry, "updated_parsed", None)
    if parsed is None:
        return None
    # Only the first six fields (year .. second) are datetime arguments.
    return datetime(*parsed[:6], tzinfo=timezone.utc)


def _tag_terms(entry):
    """Return the category terms of an entry, or ``[]`` if it has no tags."""
    return [tag.term for tag in getattr(entry, "tags", [])]


def test_arxiv_connection():
    """Check that a basic arXiv API query succeeds.

    Requests the 10 most recently submitted ``cs.AI`` papers and prints a
    short sample of the first three entries.

    Returns:
        True if the API answered with HTTP 200, False on any other status
        code or on any exception raised while fetching or printing.
    """
    print("🔍 测试arXiv API连接...")

    try:
        # Simplest possible arXiv query.
        url = ARXIV_API_URL
        params = {
            "search_query": "cat:cs.AI",
            "sortBy": "submittedDate",
            "sortOrder": "descending",
            "max_results": 10,
        }

        print(f"📡 发送请求到: {url}")
        print(f"📋 查询参数: {params}")

        response = requests.get(
            url, params=params, timeout=REQUEST_TIMEOUT_SECONDS
        )
        print(f"✅ HTTP状态码: {response.status_code}")

        if response.status_code != 200:
            print(f"❌ HTTP请求失败: {response.status_code}")
            return False

        entries = feedparser.parse(response.content).entries
        print(f"📄 获取到 {len(entries)} 篇论文")

        if entries:
            print("\n📝 论文样本:")
            for i, entry in enumerate(entries[:3], 1):
                categories = (
                    [tag.term for tag in entry.tags]
                    if hasattr(entry, "tags")
                    else "无"
                )
                print(f"\n{i}. 标题: {entry.title}")
                print(f" 发布时间: {entry.published}")
                print(f" 更新时间: {entry.updated}")
                print(f" 类别: {categories}")
                print(f" 摘要长度: {len(entry.summary)} 字符")
                print(f" 摘要预览: {entry.summary[:150]}...")
        return True

    except Exception as e:
        # Top-level boundary of a diagnostic script: report and signal failure.
        print(f"❌ arXiv连接测试失败: {e}")
        return False


def test_date_filtering():
    """Check how recent the newest papers across several categories are.

    Fetches up to 100 of the most recently submitted papers in a fixed set
    of CS/stat categories, then prints how many were updated within the
    last 1, 3 and 7 days plus a sample of the five newest entries. Entries
    without a parsed ``updated`` date are left out of the counts.

    Returns:
        True if the API answered with HTTP 200, False on any other status
        code or on any exception raised while fetching or printing.
    """
    print("\n🕐 测试日期过滤功能...")

    try:
        # Build one query that ORs together several CS categories.
        categories = [
            "cs.AI", "cs.CL", "cs.CV", "cs.LG", "cs.NE",
            "cs.RO", "cs.IR", "cs.HC", "stat.ML",
        ]
        category_query = " OR ".join(f"cat:{cat}" for cat in categories)
        max_results = 100

        params = {
            "search_query": f"({category_query})",
            "sortBy": "submittedDate",
            "sortOrder": "descending",
            "max_results": max_results,
        }

        print(f"📋 搜索类别: {', '.join(categories)}")
        print(f"📦 请求最多{max_results}篇论文...")

        response = requests.get(
            ARXIV_API_URL, params=params, timeout=REQUEST_TIMEOUT_SECONDS
        )

        if response.status_code != 200:
            print(f"❌ 请求失败: {response.status_code}")
            return False

        entries = feedparser.parse(response.content).entries
        print(f"📄 总共获取: {len(entries)} 篇论文")

        # Analyse the date distribution; skip entries with no usable date.
        now = datetime.now(timezone.utc)
        updated_times = [
            updated
            for updated in map(_entry_updated_utc, entries)
            if updated is not None
        ]

        print("\n📊 日期分布统计:")
        for days in (1, 3, 7):
            cutoff = now - timedelta(days=days)
            count = sum(1 for updated in updated_times if updated >= cutoff)
            print(f" - 最近{days}天: {count} 篇")

        # Show the newest few papers.
        if entries:
            print("\n📝 最新论文样本:")
            for i, entry in enumerate(entries[:5], 1):
                updated = _entry_updated_utc(entry)
                if updated is not None:
                    updated_text = f"{updated.strftime('%Y-%m-%d %H:%M')} UTC"
                else:
                    updated_text = "未知"
                terms = _tag_terms(entry)
                # Same "无" placeholder test_arxiv_connection uses for
                # entries without tags.
                category_text = ", ".join(terms[:3]) if terms else "无"
                print(f"\n{i}. {entry.title[:80]}...")
                print(f" 更新时间: {updated_text}")
                print(f" 类别: {category_text}")

        return True

    except Exception as e:
        # Top-level boundary of a diagnostic script: report and signal failure.
        print(f"❌ 日期过滤测试失败: {e}")
        return False


def main():
    """Run both arXiv checks and print a summary.

    Returns:
        True if both checks passed, False otherwise.
    """
    print("🚀 开始ArXiv连接测试...")
    print("=" * 60)

    # Run both checks unconditionally so the second still reports its own
    # diagnostics when the first one fails.
    success1 = test_arxiv_connection()
    success2 = test_date_filtering()
    all_passed = success1 and success2

    print("\n" + "=" * 60)
    if all_passed:
        print("✅ arXiv连接测试通过!")
        print("\n🎯 测试结果:")
        print(" - arXiv API连接正常")
        print(" - 论文抓取功能正常")
        print(" - 日期过滤功能正常")
        print("\n💡 接下来需要:")
        print(" - 设置OPENAI_API_KEY环境变量")
        print(" - 运行完整的调试脚本: python scripts/debug_fetch.py")
    else:
        print("❌ 测试发现问题,请检查网络连接")

    print("=" * 60)
    return all_passed


if __name__ == "__main__":
    # Non-zero exit status lets shells and CI detect a failed check.
    sys.exit(0 if main() else 1)