summary | refs | log | tree | commit | diff
path: root/scripts/test_arxiv_only.py
diff options
context:
space:
mode:
Diffstat (limited to 'scripts/test_arxiv_only.py')
-rw-r--r-- scripts/test_arxiv_only.py 150
1 files changed, 150 insertions, 0 deletions
diff --git a/scripts/test_arxiv_only.py b/scripts/test_arxiv_only.py
new file mode 100644
index 0000000..1c8f653
--- /dev/null
+++ b/scripts/test_arxiv_only.py
@@ -0,0 +1,150 @@
+#!/usr/bin/env python3
+"""
+测试arXiv连接 - 不需要OpenAI API密钥
+
+这个脚本只测试arXiv API连接和论文抓取功能,不涉及GPT过滤。
+"""
+
+import requests
+import feedparser
+from datetime import datetime, timezone, timedelta
+
+def test_arxiv_connection():
+    """Check that the arXiv export API answers a basic query.
+
+    Requests the 10 most recently submitted cs.AI papers, prints the HTTP
+    status and the number of feed entries, and previews up to three of them.
+    Entry fields that are absent fall back to a placeholder instead of
+    raising, so an incomplete entry is not reported as a connection failure.
+
+    Returns:
+        bool: True when the API responded with HTTP 200; False for any other
+        status code or when an exception was raised along the way.
+    """
+    print("🔍 测试arXiv API连接...")
+
+    try:
+        # Most basic arXiv query: newest submissions in a single category.
+        url = "http://export.arxiv.org/api/query"
+        params = {
+            "search_query": "cat:cs.AI",
+            "sortBy": "submittedDate",
+            "sortOrder": "descending",
+            "max_results": 10,
+        }
+
+        print(f"📡 发送请求到: {url}")
+        print(f"📋 查询参数: {params}")
+
+        response = requests.get(url, params=params, timeout=15)
+        print(f"✅ HTTP状态码: {response.status_code}")
+
+        # Guard clause: anything other than 200 is a failed connection test.
+        if response.status_code != 200:
+            print(f"❌ HTTP请求失败: {response.status_code}")
+            return False
+
+        entries = feedparser.parse(response.content).entries
+        print(f"📄 获取到 {len(entries)} 篇论文")
+
+        if entries:
+            print("\n📝 论文样本:")
+            for i, entry in enumerate(entries[:3], 1):
+                # Use .get() so a missing field prints a placeholder rather
+                # than raising and being misreported as a connection error.
+                tags = entry.get("tags")
+                categories = [tag.term for tag in tags] if tags is not None else '无'
+                summary = entry.get("summary", "")
+
+                print(f"\n{i}. 标题: {entry.get('title', '无')}")
+                print(f" 发布时间: {entry.get('published', '无')}")
+                print(f" 更新时间: {entry.get('updated', '无')}")
+                print(f" 类别: {categories}")
+                print(f" 摘要长度: {len(summary)} 字符")
+                print(f" 摘要预览: {summary[:150]}...")
+
+        return True
+
+    except Exception as e:
+        # Deliberately broad: this is a best-effort diagnostic script, so any
+        # failure (network, timeout, parsing) is reported and turned into False.
+        print(f"❌ arXiv连接测试失败: {e}")
+        return False
+
+def _entry_updated_at(entry):
+    """Return the entry's update time as an aware UTC datetime, or None."""
+    parsed = entry.get("updated_parsed")
+    if parsed is None:
+        # Missing or unparsable date: let the caller skip this entry.
+        return None
+    # The parsed time tuple is interpreted as UTC, as the original code did.
+    return datetime(*parsed[:6], tzinfo=timezone.utc)
+
+
+def test_date_filtering():
+    """Fetch recent papers across several categories and report their age.
+
+    Requests up to 100 of the newest submissions in a fixed list of CS/ML
+    categories, counts how many were updated within the last 1, 3 and 7 days,
+    and prints a sample of the first five entries. Entries without a usable
+    update time are left out of the counts instead of failing the whole test.
+
+    Returns:
+        bool: True when the API responded with HTTP 200; False for any other
+        status code or when an exception was raised along the way.
+    """
+    print("\n🕐 测试日期过滤功能...")
+
+    try:
+        url = "http://export.arxiv.org/api/query"
+
+        # Build one query that ORs together several categories.
+        categories = ["cs.AI", "cs.CL", "cs.CV", "cs.LG", "cs.NE", "cs.RO", "cs.IR", "cs.HC", "stat.ML"]
+        category_query = " OR ".join(f"cat:{cat}" for cat in categories)
+
+        params = {
+            "search_query": f"({category_query})",
+            "sortBy": "submittedDate",
+            "sortOrder": "descending",
+            "max_results": 100,
+        }
+
+        print(f"📋 搜索类别: {', '.join(categories)}")
+        print("📦 请求最多100篇论文...")
+
+        response = requests.get(url, params=params, timeout=15)
+
+        if response.status_code != 200:
+            print(f"❌ 请求失败: {response.status_code}")
+            return False
+
+        entries = feedparser.parse(response.content).entries
+        print(f"📄 总共获取: {len(entries)} 篇论文")
+
+        # Convert every update time once; reused by the counts and the sample.
+        now = datetime.now(timezone.utc)
+        updated_times = [_entry_updated_at(entry) for entry in entries]
+        known_times = [t for t in updated_times if t is not None]
+
+        skipped = len(updated_times) - len(known_times)
+        if skipped:
+            print(f"⚠️ {skipped} 篇论文缺少可解析的更新时间,未计入统计")
+
+        # Age distribution: cumulative counts for each look-back window.
+        print("\n📊 日期分布统计:")
+        for days in (1, 3, 7):
+            cutoff = now - timedelta(days=days)
+            recent = sum(t >= cutoff for t in known_times)
+            print(f" - 最近{days}天: {recent} 篇")
+
+        # Show the first few entries (the query sorts newest submissions first).
+        if entries:
+            print("\n📝 最新论文样本:")
+            sample = zip(entries[:5], updated_times)
+            for i, (entry, updated_at) in enumerate(sample, 1):
+                if updated_at is not None:
+                    stamp = f"{updated_at.strftime('%Y-%m-%d %H:%M')} UTC"
+                else:
+                    stamp = '无'
+                terms = [tag.term for tag in entry.get("tags", [])][:3]
+
+                print(f"\n{i}. {entry.get('title', '无')[:80]}...")
+                print(f" 更新时间: {stamp}")
+                print(f" 类别: {', '.join(terms)}")
+
+        return True
+
+    except Exception as e:
+        # Deliberately broad: this is a best-effort diagnostic script, so any
+        # failure (network, timeout, parsing) is reported and turned into False.
+        print(f"❌ 日期过滤测试失败: {e}")
+        return False
+
+def main():
+    """Run both arXiv checks, print a summary, and return an exit code.
+
+    Returns:
+        int: 0 when both test_arxiv_connection() and test_date_filtering()
+        returned a truthy result, 1 otherwise.
+    """
+    print("🚀 开始ArXiv连接测试...")
+    print("=" * 60)
+
+    # Both checks always run, so the second one still prints its own
+    # diagnostics even when the first one fails.
+    success1 = test_arxiv_connection()
+    success2 = test_date_filtering()
+    all_passed = bool(success1 and success2)
+
+    print("\n" + "=" * 60)
+    if all_passed:
+        print("✅ arXiv连接测试通过!")
+        print("\n🎯 测试结果:")
+        print(" - arXiv API连接正常")
+        print(" - 论文抓取功能正常")
+        print(" - 日期过滤功能正常")
+        print("\n💡 接下来需要:")
+        print(" - 设置OPENAI_API_KEY环境变量")
+        print(" - 运行完整的调试脚本: python scripts/debug_fetch.py")
+    else:
+        print("❌ 测试发现问题,请检查网络连接")
+
+    print("=" * 60)
+    return 0 if all_passed else 1
+
+
+if __name__ == "__main__":
+    # Propagate the outcome as the process exit status so that shells and CI
+    # jobs can tell a failed run from a successful one.
+    raise SystemExit(main())
\ No newline at end of file