| author | haoyuren <13851610112@163.com> | 2025-06-29 17:42:40 -0700 |
|---|---|---|
| committer | haoyuren <13851610112@163.com> | 2025-06-29 17:42:40 -0700 |
| commit | 49e1516fdb238b4d9a9181b641e344593aeedefc (patch) | |
| tree | 3eb9c98017adb58791955d2ce20a22a0850ded51 /scripts/test_3day_fetch.py | |
| parent | 5f9e75d58d3f2090f58132128f0b73152741d700 (diff) | |
Diffstat (limited to 'scripts/test_3day_fetch.py')
| -rw-r--r-- | scripts/test_3day_fetch.py | 231 |
1 file changed, 231 insertions, 0 deletions
diff --git a/scripts/test_3day_fetch.py b/scripts/test_3day_fetch.py
new file mode 100644
index 0000000..6a731ff
--- /dev/null
+++ b/scripts/test_3day_fetch.py
@@ -0,0 +1,231 @@
#!/usr/bin/env python3
"""
Test 3-Day Paper Fetch

Detailed analysis of paper availability over the past 3 days
to identify why 0 papers were retrieved.
"""

import os
import sys
import logging
import requests
import feedparser
from datetime import datetime, timezone, timedelta
from collections import Counter

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler(sys.stdout)]
)
logger = logging.getLogger(__name__)

# Add the parent directory to the path so `scripts` is importable
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from scripts.fetch_papers import ArxivPaperFetcher


def analyze_recent_papers():
    """Analyze papers from the past week with a daily breakdown."""

    print("Analyzing Recent Paper Availability")
    print("=" * 60)

    now = datetime.now(timezone.utc)
    print(f"Current time: {now.strftime('%Y-%m-%d %H:%M:%S')} UTC")

    # Time ranges to test, from narrowest to widest
    time_ranges = [
        ("Past 1 day", now - timedelta(days=1)),
        ("Past 2 days", now - timedelta(days=2)),
        ("Past 3 days", now - timedelta(days=3)),
        ("Past 7 days", now - timedelta(days=7))
    ]

    # Standalone fetcher that reimplements the parsing and query logic
    # locally, so this analysis does not depend on ArxivPaperFetcher's setup
    class TestFetcher:
        def __init__(self):
            self.session = requests.Session()

        def _parse_paper_entry(self, entry):
            return {
                "title": entry.title.replace('\n', ' ').strip(),
                "abstract": entry.summary.replace('\n', ' ').strip(),
                "authors": [author.name for author in entry.authors] if hasattr(entry, 'authors') else [],
                "published": entry.published,
                "updated": entry.updated,
                "link": entry.link,
                "arxiv_id": entry.id.split('/')[-1],
                "categories": [tag.term for tag in entry.tags] if hasattr(entry, 'tags') else []
            }

        def fetch_recent_sample(self, start_date, end_date, max_papers=500):
            """Fetch a sample of papers from the date range."""
            all_papers = []

            # Check a few key categories
            test_categories = ["cs.AI", "cs.LG", "cs.CL", "cs.CV"]

            for category in test_categories:
                try:
                    params = {
                        "search_query": f"cat:{category}",
                        "sortBy": "submittedDate",
                        "sortOrder": "descending",
                        "start": 0,
                        "max_results": 100
                    }

                    response = self.session.get("http://export.arxiv.org/api/query",
                                                params=params, timeout=30)
                    response.raise_for_status()

                    feed = feedparser.parse(response.content)

                    for entry in feed.entries:
                        paper_date = datetime(*entry.updated_parsed[:6], tzinfo=timezone.utc)

                        if start_date <= paper_date <= end_date:
                            all_papers.append(self._parse_paper_entry(entry))

                except Exception as e:
                    print(f"    Error fetching {category}: {e}")

            # Remove duplicates (papers cross-listed in several categories)
            unique_papers = {}
            for paper in all_papers:
                unique_papers[paper['arxiv_id']] = paper

            return list(unique_papers.values())

    fetcher = TestFetcher()

    # Test each time range
    for range_name, start_date in time_ranges:
        print(f"\n{range_name} ({start_date.strftime('%Y-%m-%d')} to {now.strftime('%Y-%m-%d')}):")

        papers = fetcher.fetch_recent_sample(start_date, now)
        print(f"  Found: {len(papers)} papers")

        if papers:
            # Daily distribution of the papers' last-updated dates
            dates = []
            for paper in papers:
                paper_date = datetime.strptime(paper['updated'][:10], '%Y-%m-%d')
                dates.append(paper_date.strftime('%Y-%m-%d'))

            date_counts = Counter(dates)
            print("  Daily distribution:")
            for date, count in sorted(date_counts.items(), reverse=True)[:5]:
                days_ago = (now.date() - datetime.strptime(date, '%Y-%m-%d').date()).days
                print(f"    - {date}: {count} papers ({days_ago} days ago)")

            # Show some sample titles
            print("  Sample papers:")
            for i, paper in enumerate(papers[:3], 1):
                paper_date = datetime.strptime(paper['updated'][:10], '%Y-%m-%d')
                days_ago = (now.date() - paper_date.date()).days
                print(f"    {i}. {paper['title'][:60]}... ({days_ago} days ago)")
        else:
            print("  No papers found in this range")


def check_weekend_effect():
    """Check whether the weekend affects paper submission patterns."""

    print("\n" + "=" * 60)
    print("Weekend Effect Analysis")
    print("=" * 60)

    now = datetime.now(timezone.utc)
    current_weekday = now.strftime('%A')

    print(f"Today is: {current_weekday}")
    print("Checking if weekend timing affects paper submissions...")

    # Walk back through the past week day by day
    for i in range(7):
        date = now - timedelta(days=i)
        weekday = date.strftime('%A')
        date_str = date.strftime('%Y-%m-%d')

        if i == 0:
            status = "(Today)"
        elif i == 1:
            status = "(Yesterday)"
        else:
            status = f"({i} days ago)"

        print(f"  {date_str} {weekday} {status}")

    print("\nPossible explanations for a low paper count:")
    if current_weekday in ['Saturday', 'Sunday']:
        print(f"  It's {current_weekday} - researchers typically don't submit on weekends")
    elif current_weekday == 'Monday':
        print("  It's Monday - weekend submissions are rare, so Monday counts may be low")
    else:
        print(f"  It's {current_weekday} - this should be a normal submission day")

    print("  Time zone effects: arXiv updates happen at specific times")
    print(f"  Current UTC time: {now.strftime('%H:%M')} - submissions may not be processed yet")


def test_specific_fetch():
    """Test the actual fetch function with a 3-day window."""

    print("\n" + "=" * 60)
    print("Testing Actual Fetch Function")
    print("=" * 60)

    print("Testing the same logic the main script uses...")

    # Simulate the fetch without the OpenAI API: skip ArxivPaperFetcher's
    # __init__ (so no OpenAI client is created) and stub out GPT filtering
    class MockFetcher(ArxivPaperFetcher):
        def __init__(self):
            self.session = requests.Session()

        def filter_papers_with_gpt(self, papers, use_parallel=True, max_concurrent=16):
            # Skip GPT filtering and return all papers unchanged
            print(f"  Skipping GPT filtering, would have processed {len(papers)} papers")
            return papers

    try:
        fetcher = MockFetcher()

        # Use the same parameters as the actual run
        papers = fetcher.fetch_recent_papers(days=3)

        print(f"Raw papers fetched: {len(papers)} papers")

        if papers:
            print("Papers found! The issue is likely in GPT filtering or the API key")
            print("Sample papers:")
            for i, paper in enumerate(papers[:3], 1):
                print(f"  {i}. {paper['title'][:60]}...")
        else:
            print("No papers found in raw fetch - arXiv issue or date range problem")

    except Exception as e:
        print(f"Error in fetch test: {e}")


if __name__ == "__main__":
    analyze_recent_papers()
    check_weekend_effect()
    test_specific_fetch()

    print("\n" + "=" * 60)
    print("Diagnosis Summary")
    print("=" * 60)
    print("If this analysis shows:")
    print("  Papers exist -> the problem is with GPT filtering or the API key")
    print("  No papers    -> weekend effect or arXiv submission patterns")
    print("  Time zone    -> wait a few hours and try again")
    print("  Date issue   -> check the date range logic in the fetch function")
\ No newline at end of file
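
For a quick sanity check of the raw fetch outside this script, the same arXiv query can be issued standalone. This is a minimal sketch mirroring the script's parameters (category cs.AI, 3-day window, 100 results); it assumes network access and the `requests` and `feedparser` packages, and is illustrative rather than part of the commit:

```python
import feedparser
import requests
from datetime import datetime, timezone, timedelta

# Same query the script sends: newest submissions in one category
cutoff = datetime.now(timezone.utc) - timedelta(days=3)
resp = requests.get(
    "http://export.arxiv.org/api/query",
    params={
        "search_query": "cat:cs.AI",
        "sortBy": "submittedDate",
        "sortOrder": "descending",
        "start": 0,
        "max_results": 100,
    },
    timeout=30,
)
resp.raise_for_status()
feed = feedparser.parse(resp.content)

# Count entries whose last-updated timestamp falls inside the window
recent = [
    e for e in feed.entries
    if datetime(*e.updated_parsed[:6], tzinfo=timezone.utc) >= cutoff
]
print(f"{len(recent)} of {len(feed.entries)} cs.AI entries updated in the past 3 days")
```

If this prints a nonzero count while the main pipeline returns 0 papers, the problem is downstream of the raw fetch (e.g., the GPT filtering step or the API key), which is the same conclusion the script's diagnosis summary draws.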
