From 88ab5e0d4f9a62018a428a833f3fa9cb6addba15 Mon Sep 17 00:00:00 2001
From: haoyuren <13851610112@163.com>
Date: Sun, 29 Jun 2025 16:43:03 -0700
Subject: fix fetch

---
 scripts/test_arxiv_only.py | 150 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 150 insertions(+)
 create mode 100644 scripts/test_arxiv_only.py

(limited to 'scripts/test_arxiv_only.py')

diff --git a/scripts/test_arxiv_only.py b/scripts/test_arxiv_only.py
new file mode 100644
index 0000000..1c8f653
--- /dev/null
+++ b/scripts/test_arxiv_only.py
@@ -0,0 +1,150 @@
+#!/usr/bin/env python3
+"""
+测试arXiv连接 - 不需要OpenAI API密钥
+
+这个脚本只测试arXiv API连接和论文抓取功能，不涉及GPT过滤。
+"""
+
+import requests
+import feedparser
+from datetime import datetime, timezone, timedelta
+
+def test_arxiv_connection():
+    """Check that the arXiv API is reachable and returns parseable entries.
+
+    Fetches the 10 newest cs.AI submissions, prints a preview of the first
+    three, and returns True only when at least one entry was received;
+    a non-200 response, an empty feed, or any exception yields False.
+    """
+    print("🔍 测试arXiv API连接...")
+    url = "http://export.arxiv.org/api/query"
+    params = {
+        "search_query": "cat:cs.AI",
+        "sortBy": "submittedDate",
+        "sortOrder": "descending",
+        "max_results": 10,
+    }
+
+    try:
+        print(f"📡 发送请求到: {url}")
+        print(f"📋 查询参数: {params}")
+        response = requests.get(url, params=params, timeout=15)
+        print(f"✅ HTTP状态码: {response.status_code}")
+        if response.status_code != 200:
+            print(f"❌ HTTP请求失败: {response.status_code}")
+            return False
+        entries = feedparser.parse(response.content).entries
+        print(f"📄 获取到 {len(entries)} 篇论文")
+        if not entries:
+            # A 200 response with no entries is still a failed fetch; say so.
+            print("❌ arXiv返回了空结果")
+            return False
+        print("\n📝 论文样本:")
+        for i, entry in enumerate(entries[:3], 1):
+            print(f"\n{i}. 标题: {entry.title}")
+            print(f"   发布时间: {entry.published}")
+            print(f"   更新时间: {entry.updated}")
+            print(f"   类别: {[tag.term for tag in entry.tags] if hasattr(entry, 'tags') else '无'}")
+            print(f"   摘要长度: {len(entry.summary)} 字符")
+            print(f"   摘要预览: {entry.summary[:150]}...")
+        return True
+    except Exception as e:  # broad on purpose: any failure means the check failed
+        print(f"❌ arXiv连接测试失败: {e}")
+        return False
+
+def test_date_filtering():
+    """Fetch recent papers across several categories and report their age.
+
+    Requests the 100 most recently submitted papers in a fixed list of
+    CS/stat categories, counts how many were updated within the last 1, 3
+    and 7 days (UTC), and prints up to five of the newest.
+
+    Returns:
+        True when the request and parsing succeed; False on a non-200
+        response or when any exception is raised along the way.
+    """
+    print("\n🕐 测试日期过滤功能...")
+    url = "http://export.arxiv.org/api/query"
+    categories = [
+        "cs.AI", "cs.CL", "cs.CV", "cs.LG", "cs.NE",
+        "cs.RO", "cs.IR", "cs.HC", "stat.ML",
+    ]
+    max_results = 100
+    category_query = " OR ".join(f"cat:{cat}" for cat in categories)
+    params = {
+        "search_query": f"({category_query})",
+        "sortBy": "submittedDate",
+        "sortOrder": "descending",
+        "max_results": max_results,
+    }
+
+    try:
+        print(f"📋 搜索类别: {', '.join(categories)}")
+        print(f"📦 请求最多{max_results}篇论文...")
+
+        response = requests.get(url, params=params, timeout=15)
+        if response.status_code != 200:
+            print(f"❌ 请求失败: {response.status_code}")
+            return False
+
+        entries = feedparser.parse(response.content).entries
+        print(f"📄 总共获取: {len(entries)} 篇论文")
+
+        # feedparser exposes parsed dates as UTC struct_time, or None when the
+        # date string could not be parsed; such entries are left out of the
+        # statistics instead of failing the whole test.
+        dated = [
+            (entry, datetime(*entry.updated_parsed[:6], tzinfo=timezone.utc))
+            for entry in entries
+            if getattr(entry, "updated_parsed", None)
+        ]
+
+        # Count papers inside each look-back window; the windows are nested,
+        # so a paper from today is counted in all three.
+        now = datetime.now(timezone.utc)
+        print("\n📊 日期分布统计:")
+        for days in (1, 3, 7):
+            cutoff = now - timedelta(days=days)
+            recent = sum(1 for _, paper_date in dated if paper_date >= cutoff)
+            print(f"   - 最近{days}天: {recent} 篇")
+
+        # The query asks for submission date descending (see params), so the
+        # head of the list is the most recent sample.
+        if dated:
+            print("\n📝 最新论文样本:")
+            for i, (entry, paper_date) in enumerate(dated[:5], 1):
+                # Tags are optional on a feed entry; tolerate their absence.
+                terms = [tag.term for tag in getattr(entry, "tags", [])]
+                print(f"\n{i}. {entry.title[:80]}...")
+                print(f"   更新时间: {paper_date.strftime('%Y-%m-%d %H:%M')} UTC")
+                print(f"   类别: {', '.join(terms[:3])}")
+
+        return True
+    except Exception as e:  # broad on purpose: any failure means the check failed
+        print(f"❌ 日期过滤测试失败: {e}")
+        return False
+
+def main():
+    """Run both arXiv checks in order and print a combined verdict."""
+    separator = "=" * 60
+    print("🚀 开始ArXiv连接测试...")
+    print(separator)
+    # Both checks always run, even when the first one fails.
+    connection_ok = test_arxiv_connection()
+    filtering_ok = test_date_filtering()
+    print("\n" + separator)
+    if not (connection_ok and filtering_ok):
+        print("❌ 测试发现问题，请检查网络连接")
+    else:
+        print("✅ arXiv连接测试通过！")
+        print("\n🎯 测试结果:")
+        print("   - arXiv API连接正常")
+        print("   - 论文抓取功能正常")
+        print("   - 日期过滤功能正常")
+        print("\n💡 接下来需要:")
+        print("   - 设置OPENAI_API_KEY环境变量")
+        print("   - 运行完整的调试脚本: python scripts/debug_fetch.py")
+    print(separator)
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
-- 
cgit v1.2.3