#!/usr/bin/env python3
"""
Test 3-Day Paper Fetch

Detailed analysis of paper availability in the past 3 days to identify
why 0 papers were retrieved.
"""

import os
import sys
import logging
import requests
import feedparser
from datetime import datetime, timezone, timedelta
from collections import Counter

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler(sys.stdout)]
)
logger = logging.getLogger(__name__)

# Add the parent directory to the path
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from scripts.fetch_papers import ArxivPaperFetcher, CS_CATEGORIES


def analyze_recent_papers():
    """Analyze papers from the past week with a daily breakdown."""
    print("🔍 Analyzing Recent Paper Availability")
    print("=" * 60)

    now = datetime.now(timezone.utc)
    print(f"📅 Current time: {now.strftime('%Y-%m-%d %H:%M:%S')} UTC")

    # Test different time ranges
    time_ranges = [
        ("Past 1 day", now - timedelta(days=1)),
        ("Past 2 days", now - timedelta(days=2)),
        ("Past 3 days", now - timedelta(days=3)),
        ("Past 7 days", now - timedelta(days=7)),
    ]

    # Minimal stand-in fetcher so the query/parsing logic can be exercised
    # without constructing the full ArxivPaperFetcher (and its API client)
    class TestFetcher:
        def __init__(self):
            self.session = requests.Session()

        def _parse_paper_entry(self, entry):
            """Flatten a feedparser entry into the paper dict the pipeline uses."""
            return {
                "title": entry.title.replace('\n', ' ').strip(),
                "abstract": entry.summary.replace('\n', ' ').strip(),
                "authors": [author.name for author in entry.authors] if hasattr(entry, 'authors') else [],
                "published": entry.published,
                "updated": entry.updated,
                "link": entry.link,
                "arxiv_id": entry.id.split('/')[-1],
                "categories": [tag.term for tag in entry.tags] if hasattr(entry, 'tags') else [],
            }

        def fetch_recent_sample(self, start_date, end_date, max_papers=500):
            """Fetch a sample of papers whose update date falls in [start_date, end_date]."""
            all_papers = []

            # Check a few key categories
            test_categories = ["cs.AI", "cs.LG", "cs.CL", "cs.CV"]

            for category in test_categories:
                try:
                    params = {
                        "search_query": f"cat:{category}",
                        "sortBy": "submittedDate",
                        "sortOrder": "descending",
                        "start": 0,
                        "max_results": 100,
                    }
                    response = self.session.get("http://export.arxiv.org/api/query",
                                                params=params, timeout=30)
                    response.raise_for_status()

                    feed = feedparser.parse(response.content)
                    for entry in feed.entries:
                        paper_date = datetime(*entry.updated_parsed[:6], tzinfo=timezone.utc)
                        if start_date <= paper_date <= end_date:
                            all_papers.append(self._parse_paper_entry(entry))
                except Exception as e:
                    print(f"   ❌ Error fetching {category}: {e}")

            # Deduplicate by arXiv ID and cap the sample size
            unique_papers = {paper['arxiv_id']: paper for paper in all_papers}
            return list(unique_papers.values())[:max_papers]

    fetcher = TestFetcher()

    # Test each time range
    for range_name, start_date in time_ranges:
        print(f"\n📊 {range_name} ({start_date.strftime('%Y-%m-%d')} to {now.strftime('%Y-%m-%d')}):")

        papers = fetcher.fetch_recent_sample(start_date, now)
        print(f"   📄 Found: {len(papers)} papers")

        if papers:
            # Daily distribution of update dates
            date_counts = Counter(paper['updated'][:10] for paper in papers)
            print("   📅 Daily distribution:")
            for date, count in sorted(date_counts.items(), reverse=True)[:5]:
                days_ago = (now.date() - datetime.strptime(date, '%Y-%m-%d').date()).days
                print(f"      - {date}: {count} papers ({days_ago} days ago)")

            # Show some sample titles
            print("   📝 Sample papers:")
            for i, paper in enumerate(papers[:3], 1):
                paper_date = datetime.strptime(paper['updated'][:10], '%Y-%m-%d')
                days_ago = (now.date() - paper_date.date()).days
                print(f"      {i}. {paper['title'][:60]}... ({days_ago} days ago)")
        else:
            print("   ❌ No papers found in this range")
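
# A minimal sketch of an alternative query strategy, assuming the arXiv API's
# Lucene-style `submittedDate:[... TO ...]` range filter (verify the exact
# syntax against the arXiv API user manual). Rather than pulling the newest
# 100 entries per category and filtering dates client-side as
# fetch_recent_sample does above, the date window is pushed into the query
# itself. `fetch_by_date_range` is a hypothetical helper added for
# illustration; nothing in this script calls it.
def fetch_by_date_range(category, days=3, max_results=100):
    """Fetch entries for one category using a server-side date-range filter."""
    end = datetime.now(timezone.utc)
    start = end - timedelta(days=days)
    # arXiv expects timestamps formatted as YYYYMMDDHHMM inside the range filter
    window = f"[{start.strftime('%Y%m%d%H%M')} TO {end.strftime('%Y%m%d%H%M')}]"
    params = {
        "search_query": f"cat:{category} AND submittedDate:{window}",
        "sortBy": "submittedDate",
        "sortOrder": "descending",
        "start": 0,
        "max_results": max_results,
    }
    response = requests.get("http://export.arxiv.org/api/query", params=params, timeout=30)
    response.raise_for_status()
    return feedparser.parse(response.content).entries
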
def check_weekend_effect():
    """Check if the weekend affects paper submission patterns."""
    print("\n" + "=" * 60)
    print("📅 Weekend Effect Analysis")
    print("=" * 60)

    now = datetime.now(timezone.utc)
    current_weekday = now.strftime('%A')

    print(f"🗓️ Today is: {current_weekday}")
    print("📊 Checking if weekend timing affects paper submissions...")

    # Walk the past week day by day
    for i in range(7):
        date = now - timedelta(days=i)
        weekday = date.strftime('%A')
        date_str = date.strftime('%Y-%m-%d')

        if i == 0:
            status = "(Today)"
        elif i == 1:
            status = "(Yesterday)"
        else:
            status = f"({i} days ago)"

        print(f"   {date_str} {weekday} {status}")

    print("\n💡 Possible explanations for a low paper count:")
    if current_weekday in ['Saturday', 'Sunday']:
        print(f"   🏠 It's {current_weekday} - researchers typically don't submit on weekends")
    elif current_weekday == 'Monday':
        print("   📅 It's Monday - weekend submissions are rare, so Monday counts may be low")
    else:
        print(f"   📚 It's {current_weekday} - this should be a normal submission day")

    print("   🕐 Time zone effects: arXiv updates happen at specific times")
    print(f"   ⏰ Current UTC time: {now.strftime('%H:%M')} - today's submissions may not be processed yet")


def test_specific_fetch():
    """Test the actual fetch function with a 3-day window."""
    print("\n" + "=" * 60)
    print("🧪 Testing Actual Fetch Function")
    print("=" * 60)

    print("🔄 Testing the same logic the main script uses...")

    # Simulate the fetch without the OpenAI API
    class MockFetcher(ArxivPaperFetcher):
        def __init__(self):
            # Deliberately skip ArxivPaperFetcher.__init__ so no OpenAI client is needed
            self.session = requests.Session()

        def filter_papers_with_gpt(self, papers, use_parallel=True, max_concurrent=16):
            # Skip GPT filtering and return all papers unchanged
            print(f"   ⏭️ Skipping GPT filtering, would have processed {len(papers)} papers")
            return papers

    try:
        fetcher = MockFetcher()

        # Use the same parameters as the actual run
        papers = fetcher.fetch_recent_papers(days=3)
        print(f"📄 Raw papers fetched: {len(papers)} papers")

        if papers:
            print("✅ Papers found! The issue is likely in GPT filtering or the API key")
            print("📋 Sample papers:")
            for i, paper in enumerate(papers[:3], 1):
                print(f"   {i}. {paper['title'][:60]}...")
        else:
            print("❌ No papers found in the raw fetch - arXiv issue or date range problem")
    except Exception as e:
        print(f"❌ Error in fetch test: {e}")


if __name__ == "__main__":
    analyze_recent_papers()
    check_weekend_effect()
    test_specific_fetch()

    print("\n" + "=" * 60)
    print("🎯 Diagnosis Summary")
    print("=" * 60)
    print("If this analysis shows:")
    print("   📄 Papers exist → the problem is with GPT filtering or the API key")
    print("   ❌ No papers → weekend effect or arXiv submission patterns")
    print("   🕐 Time zone → wait a few hours and try again")
    print("   📅 Date issue → check the date range logic in the fetch function")
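
# Usage note (a sketch; the path below is an assumption based on the sys.path
# tweak and the `scripts.fetch_papers` import at the top of this file): run
# this script from the repository root, e.g.
#
#   python tests/test_3day_fetch.py
#
# If papers show up here but the main pipeline still returns 0, verify that
# the OpenAI credentials (however fetch_papers loads them, e.g. an
# OPENAI_API_KEY environment variable) are available before the GPT-filtering
# stage runs.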