diff options
| author | haoyuren <13851610112@163.com> | 2025-06-29 16:43:03 -0700 |
|---|---|---|
| committer | haoyuren <13851610112@163.com> | 2025-06-29 16:43:03 -0700 |
| commit | 88ab5e0d4f9a62018a428a833f3fa9cb6addba15 (patch) | |
| tree | 43498d466b9c5af07b7155130067dc777c5af655 /scripts/debug_fetch.py | |
| parent | 388f0407ef8c9f68866509f722491fcfd44afa11 (diff) | |
fix fetch
Diffstat (limited to 'scripts/debug_fetch.py')
| -rw-r--r-- | scripts/debug_fetch.py | 200 |
1 files changed, 200 insertions, 0 deletions
diff --git a/scripts/debug_fetch.py b/scripts/debug_fetch.py new file mode 100644 index 0000000..100fc94 --- /dev/null +++ b/scripts/debug_fetch.py @@ -0,0 +1,200 @@ +#!/usr/bin/env python3 +""" +调试脚本 - 详细显示论文抓取过程 + +这个脚本专门用于调试和诊断论文抓取系统,会显示每个步骤的详细信息, +帮助用户了解系统是否正常工作,以及在哪个环节可能出现问题。 +""" + +import os +import sys +import logging +from datetime import datetime, timezone, timedelta + +# 设置详细的调试日志 +logging.basicConfig( + level=logging.DEBUG, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', + handlers=[ + logging.StreamHandler(sys.stdout), + ] +) + +# Add the parent directory to the path so we can import the main module +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from scripts.fetch_papers import ArxivPaperFetcher + + +def debug_arxiv_connection(): + """调试arXiv连接""" + print("🔍 测试arXiv API连接...") + + import requests + import feedparser + + try: + # 测试最基本的arXiv查询 + url = "http://export.arxiv.org/api/query" + params = { + "search_query": "cat:cs.AI", + "sortBy": "submittedDate", + "sortOrder": "descending", + "max_results": 5 + } + + print(f"📡 发送请求到: {url}") + print(f"📋 查询参数: {params}") + + response = requests.get(url, params=params, timeout=10) + print(f"✅ HTTP状态码: {response.status_code}") + + if response.status_code == 200: + feed = feedparser.parse(response.content) + entries = feed.entries + print(f"📄 获取到 {len(entries)} 篇论文") + + if entries: + print(f"📝 第一篇论文示例:") + entry = entries[0] + print(f" - 标题: {entry.title}") + print(f" - 发布时间: {entry.published}") + print(f" - 更新时间: {entry.updated}") + print(f" - 类别: {[tag.term for tag in entry.tags] if hasattr(entry, 'tags') else '无'}") + print(f" - 摘要长度: {len(entry.summary)} 字符") + return True + else: + print(f"❌ HTTP请求失败: {response.status_code}") + return False + + except Exception as e: + print(f"❌ arXiv连接测试失败: {e}") + return False + + +def debug_openai_connection(api_key): + """调试OpenAI连接""" + print("\n🤖 测试OpenAI API连接...") + + try: + from openai import OpenAI + client = OpenAI(api_key=api_key) + + # 测试一个简单的请求 + response = client.chat.completions.create( + model="gpt-4o", + messages=[ + {"role": "system", "content": "You are a helpful assistant. Respond with just the number 1."}, + {"role": "user", "content": "Test"} + ], + temperature=0, + max_tokens=1 + ) + + result = response.choices[0].message.content.strip() + print(f"✅ OpenAI API连接成功") + print(f"📤 发送模型: gpt-4o") + print(f"📨 API响应: '{result}'") + return True + + except Exception as e: + print(f"❌ OpenAI连接测试失败: {e}") + return False + + +def debug_paper_fetch(): + """调试论文抓取过程""" + print("\n" + "="*60) + print("🔍 ArXiv论文抓取系统调试") + print("="*60) + + # 检查环境变量 + openai_api_key = os.getenv("OPENAI_API_KEY") + print(f"🔑 OpenAI API Key: {'已设置' if openai_api_key else '❌ 未设置'}") + + if not openai_api_key: + print("❌ 请设置OPENAI_API_KEY环境变量") + print(" export OPENAI_API_KEY='your-api-key-here'") + return False + + # 测试API连接 + if not debug_arxiv_connection(): + return False + + if not debug_openai_connection(openai_api_key): + return False + + # 测试论文抓取器 + print(f"\n📋 开始测试论文抓取器...") + + try: + fetcher = ArxivPaperFetcher(openai_api_key) + print("✅ 论文抓取器初始化成功") + + # 测试获取最近3天的论文(确保有一些结果) + print(f"\n🕐 测试获取最近3天的论文...") + end_date = datetime.now(timezone.utc) + start_date = end_date - timedelta(days=3) + + print(f"📅 时间范围: {start_date.date()} 到 {end_date.date()}") + + # 限制到20篇论文进行测试 + papers = fetcher.fetch_papers_by_date_range(start_date, end_date, max_papers=20) + + print(f"\n📊 抓取结果分析:") + print(f" - 总共获取: {len(papers)} 篇论文") + + if papers: + print(f"\n📄 论文样本 (前3篇):") + for i, paper in enumerate(papers[:3], 1): + print(f"\n {i}. {paper['title']}") + print(f" 发布时间: {paper['published']}") + print(f" 类别: {', '.join(paper['categories'])}") + print(f" 摘要长度: {len(paper['abstract'])} 字符") + + # 测试GPT过滤(只测试前5篇) + print(f"\n🤖 测试GPT-4o过滤 (前5篇论文)...") + sample_papers = papers[:5] + filtered_papers = fetcher.filter_papers_with_gpt(sample_papers) + + print(f"\n🎯 过滤结果:") + print(f" - 输入论文: {len(sample_papers)} 篇") + print(f" - 相关论文: {len(filtered_papers)} 篇") + print(f" - 相关比例: {len(filtered_papers)/len(sample_papers)*100:.1f}%") + + if filtered_papers: + print(f"\n✅ 发现相关论文:") + for i, paper in enumerate(filtered_papers, 1): + print(f" {i}. {paper['title']}") + + return True + else: + print("⚠️ 未获取到任何论文") + print("可能的原因:") + print(" - 最近3天内这些类别没有新论文") + print(" - arXiv API响应延迟") + print(" - 网络连接问题") + return False + + except Exception as e: + print(f"❌ 论文抓取测试失败: {e}") + import traceback + print(f"详细错误信息: {traceback.format_exc()}") + return False + + +if __name__ == "__main__": + print("🚀 开始ArXiv论文抓取系统调试...") + + success = debug_paper_fetch() + + print(f"\n" + "="*60) + if success: + print("✅ 调试完成!系统工作正常") + print("\n🎯 接下来可以:") + print(" - 运行 python scripts/fetch_papers.py 进行实际抓取") + print(" - 运行 python scripts/test_daily_fetch.py 进行完整测试") + else: + print("❌ 调试发现问题,请检查上述错误信息") + + print("="*60)
\ No newline at end of file |
