Diffstat (limited to 'scripts/test_3day_fetch.py')
-rw-r--r--  scripts/test_3day_fetch.py  231
1 file changed, 231 insertions, 0 deletions
diff --git a/scripts/test_3day_fetch.py b/scripts/test_3day_fetch.py
new file mode 100644
index 0000000..6a731ff
--- /dev/null
+++ b/scripts/test_3day_fetch.py
@@ -0,0 +1,231 @@
+#!/usr/bin/env python3
+"""
+Test 3-Day Paper Fetch
+
+Detailed analysis of paper availability in the past 3 days
+to identify why 0 papers were retrieved.
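+
+Run directly: python scripts/test_3day_fetch.py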
+"""
+
+import os
+import sys
+import logging
+import requests
+import feedparser
+from datetime import datetime, timezone, timedelta
+from collections import Counter
+
+# Set up logging
+logging.basicConfig(
+ level=logging.INFO,
+ format='%(asctime)s - %(levelname)s - %(message)s',
+ handlers=[logging.StreamHandler(sys.stdout)]
+)
+logger = logging.getLogger(__name__)
+
+# Add the parent directory to the path
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from scripts.fetch_papers import ArxivPaperFetcher, CS_CATEGORIES
+
+def analyze_recent_papers():
+ """Analyze papers from the past week with daily breakdown"""
+
+    print("🔍 Analyzing Recent Paper Availability")
+ print("=" * 60)
+
+ # Calculate time ranges
+ now = datetime.now(timezone.utc)
+    print(f"📅 Current time: {now.strftime('%Y-%m-%d %H:%M:%S')} UTC")
+
+ # Test different time ranges
+ time_ranges = [
+ ("Past 1 day", now - timedelta(days=1)),
+ ("Past 2 days", now - timedelta(days=2)),
+ ("Past 3 days", now - timedelta(days=3)),
+ ("Past 7 days", now - timedelta(days=7))
+ ]
+
+    # Lightweight test fetcher with its own entry-parsing and fetching logic (no GPT filtering)
+ class TestFetcher:
+ def __init__(self):
+            self.session = requests.Session()
+
+ def _parse_paper_entry(self, entry):
+ return {
+ "title": entry.title.replace('\n', ' ').strip(),
+ "abstract": entry.summary.replace('\n', ' ').strip(),
+ "authors": [author.name for author in entry.authors] if hasattr(entry, 'authors') else [],
+ "published": entry.published,
+ "updated": entry.updated,
+ "link": entry.link,
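+                # entry.id is a URL like http://arxiv.org/abs/<id>; keep only the trailing identifier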
+ "arxiv_id": entry.id.split('/')[-1],
+ "categories": [tag.term for tag in entry.tags] if hasattr(entry, 'tags') else []
+ }
+
+ def fetch_recent_sample(self, start_date, end_date, max_papers=500):
+ """Fetch a sample of papers from the date range"""
+ all_papers = []
+
+ # Check a few key categories
+ test_categories = ["cs.AI", "cs.LG", "cs.CL", "cs.CV"]
+
+ for category in test_categories:
+ try:
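+                    # Query the arXiv Atom API for this category, newest submissions first (up to 100 entries)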
+ params = {
+ "search_query": f"cat:{category}",
+ "sortBy": "submittedDate",
+ "sortOrder": "descending",
+ "start": 0,
+ "max_results": 100
+ }
+
+ response = self.session.get("http://export.arxiv.org/api/query",
+ params=params, timeout=30)
+ response.raise_for_status()
+
+ feed = feedparser.parse(response.content)
+
+ for entry in feed.entries:
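+                        # feedparser gives 'updated' as a UTC struct_time; build an aware datetime for the range check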
+ paper_date = datetime(*entry.updated_parsed[:6], tzinfo=timezone.utc)
+
+ if start_date <= paper_date <= end_date:
+ paper_data = self._parse_paper_entry(entry)
+ all_papers.append(paper_data)
+
+ except Exception as e:
+                    print(f"   ❌ Error fetching {category}: {e}")
+
+            # Remove duplicates across categories, keyed by arXiv ID
+ unique_papers = {}
+ for paper in all_papers:
+ unique_papers[paper['arxiv_id']] = paper
+
+ return list(unique_papers.values())
+
+ fetcher = TestFetcher()
+
+ # Test each time range
+ for range_name, start_date in time_ranges:
+        print(f"\n📊 {range_name} ({start_date.strftime('%Y-%m-%d')} to {now.strftime('%Y-%m-%d')}):")
+
+ papers = fetcher.fetch_recent_sample(start_date, now)
+        print(f"   📄 Found: {len(papers)} papers")
+
+ if papers:
+ # Analyze dates
+ dates = []
+ for paper in papers:
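+                # 'updated' is an ISO-style timestamp string; its first 10 characters are the YYYY-MM-DD date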
+ paper_date = datetime.strptime(paper['updated'][:10], '%Y-%m-%d')
+ dates.append(paper_date.strftime('%Y-%m-%d'))
+
+ date_counts = Counter(dates)
+            print(f"   📅 Daily distribution:")
+ for date, count in sorted(date_counts.items(), reverse=True)[:5]:
+ days_ago = (now.date() - datetime.strptime(date, '%Y-%m-%d').date()).days
+ print(f" - {date}: {count} papers ({days_ago} days ago)")
+
+ # Show some sample titles
+            print(f"   📝 Sample papers:")
+ for i, paper in enumerate(papers[:3], 1):
+ paper_date = datetime.strptime(paper['updated'][:10], '%Y-%m-%d')
+ days_ago = (now.date() - paper_date.date()).days
+ print(f" {i}. {paper['title'][:60]}... ({days_ago} days ago)")
+ else:
+            print(f"   ❌ No papers found in this range")
+
+
+def check_weekend_effect():
+ """Check if weekend affects paper submission patterns"""
+
+ print(f"\n" + "="*60)
+    print("📅 Weekend Effect Analysis")
+ print("="*60)
+
+ now = datetime.now(timezone.utc)
+ current_weekday = now.strftime('%A')
+
+    print(f"🗓️ Today is: {current_weekday}")
+    print(f"📊 Checking if weekend timing affects paper submissions...")
+
+ # Analyze the past week day by day
+ for i in range(7):
+ date = now - timedelta(days=i)
+ weekday = date.strftime('%A')
+ date_str = date.strftime('%Y-%m-%d')
+
+ if i == 0:
+ status = "(Today)"
+ elif i == 1:
+ status = "(Yesterday)"
+ else:
+ status = f"({i} days ago)"
+
+ print(f" {date_str} {weekday} {status}")
+
+    print(f"\n💡 Possible explanations for a low paper count:")
+    if current_weekday in ['Saturday', 'Sunday']:
+        print(f"   🏠 It's {current_weekday} - researchers typically don't submit on weekends")
+    elif current_weekday == 'Monday':
+        print(f"   📅 It's Monday - weekend submissions are rare, so Monday counts may be low")
+    else:
+        print(f"   📚 It's {current_weekday} - this should be a normal submission day")
+
+    print(f"   🕐 Time zone effects: arXiv updates happen at specific times")
+    print(f"   ⏰ Current UTC time: {now.strftime('%H:%M')} - submissions may not be processed yet")
+
+
+def test_specific_fetch():
+ """Test the actual fetch function with 3 days"""
+
+ print(f"\n" + "="*60)
+    print("🧪 Testing Actual Fetch Function")
+ print("="*60)
+
+    print(f"🔄 Testing the same logic your main script uses...")
+
+ # Simulate the fetch without OpenAI API
+ class MockFetcher(ArxivPaperFetcher):
+ def __init__(self):
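+            # Intentionally skip ArxivPaperFetcher.__init__ so no OpenAI API key is needed for this test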
+            self.session = requests.Session()
+
+ def filter_papers_with_gpt(self, papers, use_parallel=True, max_concurrent=16):
+ # Skip GPT filtering, return all papers
+            print(f"   ⏭️ Skipping GPT filtering, would have processed {len(papers)} papers")
+ return papers
+
+ try:
+ # Test with mock fetcher
+ fetcher = MockFetcher()
+
+ # Use the same parameters as your actual run
+ papers = fetcher.fetch_recent_papers(days=3)
+
+        print(f"📄 Raw papers fetched: {len(papers)} papers")
+
+ if papers:
+            print(f"✅ Papers found! The issue is likely in GPT filtering or the API key")
+            print(f"📋 Sample papers:")
+ for i, paper in enumerate(papers[:3], 1):
+ print(f" {i}. {paper['title'][:60]}...")
+ else:
+            print(f"❌ No papers found in raw fetch - arXiv issue or date range problem")
+
+ except Exception as e:
+        print(f"❌ Error in fetch test: {e}")
+
+
+if __name__ == "__main__":
+ analyze_recent_papers()
+ check_weekend_effect()
+ test_specific_fetch()
+
+ print(f"\n" + "="*60)
+    print("🎯 Diagnosis Summary")
+ print("="*60)
+ print(f"If this analysis shows:")
+    print(f"   📄 Papers exist → Problem is with GPT filtering or API key")
+    print(f"   ❌ No papers → Weekend effect or arXiv submission patterns")
+    print(f"   🕐 Time zone → Wait a few hours and try again")
+    print(f"   📅 Date issue → Check date range logic in fetch function")
\ No newline at end of file