| author | haoyuren <13851610112@163.com> | 2025-06-29 17:42:40 -0700 |
|---|---|---|
| committer | haoyuren <13851610112@163.com> | 2025-06-29 17:42:40 -0700 |
| commit | 49e1516fdb238b4d9a9181b641e344593aeedefc (patch) | |
| tree | 3eb9c98017adb58791955d2ce20a22a0850ded51 /scripts/test_3day_fetch.py | |
| parent | 5f9e75d58d3f2090f58132128f0b73152741d700 (diff) | |
Diffstat (limited to 'scripts/test_3day_fetch.py')
| -rw-r--r-- | scripts/test_3day_fetch.py | 231 |
1 file changed, 231 insertions, 0 deletions
diff --git a/scripts/test_3day_fetch.py b/scripts/test_3day_fetch.py
new file mode 100644
index 0000000..6a731ff
--- /dev/null
+++ b/scripts/test_3day_fetch.py
@@ -0,0 +1,231 @@
#!/usr/bin/env python3
"""
Test 3-Day Paper Fetch

Detailed analysis of paper availability over the past 3 days
to identify why 0 papers were retrieved.
"""

import os
import sys
import logging
import requests
import feedparser
from datetime import datetime, timezone, timedelta
from collections import Counter

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler(sys.stdout)]
)
logger = logging.getLogger(__name__)

# Add the parent directory to the path so `scripts` is importable
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from scripts.fetch_papers import ArxivPaperFetcher


def analyze_recent_papers():
    """Analyze papers from the past week with a daily breakdown."""

    print("Analyzing Recent Paper Availability")
    print("=" * 60)

    now = datetime.now(timezone.utc)
    print(f"Current time: {now.strftime('%Y-%m-%d %H:%M:%S')} UTC")

    # Time ranges to test, from narrowest to widest
    time_ranges = [
        ("Past 1 day", now - timedelta(days=1)),
        ("Past 2 days", now - timedelta(days=2)),
        ("Past 3 days", now - timedelta(days=3)),
        ("Past 7 days", now - timedelta(days=7))
    ]

    # Standalone fetcher that reimplements the parsing and query logic
    # locally, so this analysis does not depend on ArxivPaperFetcher's setup
    class TestFetcher:
        def __init__(self):
            self.session = requests.Session()

        def _parse_paper_entry(self, entry):
            return {
                "title": entry.title.replace('\n', ' ').strip(),
                "abstract": entry.summary.replace('\n', ' ').strip(),
                "authors": [author.name for author in entry.authors] if hasattr(entry, 'authors') else [],
                "published": entry.published,
                "updated": entry.updated,
                "link": entry.link,
                "arxiv_id": entry.id.split('/')[-1],
                "categories": [tag.term for tag in entry.tags] if hasattr(entry, 'tags') else []
            }

        def fetch_recent_sample(self, start_date, end_date, max_papers=500):
            """Fetch a sample of papers from the date range."""
            all_papers = []

            # Check a few key categories
            test_categories = ["cs.AI", "cs.LG", "cs.CL", "cs.CV"]

            for category in test_categories:
                try:
                    params = {
                        "search_query": f"cat:{category}",
                        "sortBy": "submittedDate",
                        "sortOrder": "descending",
                        "start": 0,
                        "max_results": 100
                    }

                    response = self.session.get("http://export.arxiv.org/api/query",
                                                params=params, timeout=30)
                    response.raise_for_status()

                    feed = feedparser.parse(response.content)

                    for entry in feed.entries:
                        paper_date = datetime(*entry.updated_parsed[:6], tzinfo=timezone.utc)

                        if start_date <= paper_date <= end_date:
                            all_papers.append(self._parse_paper_entry(entry))

                except Exception as e:
                    print(f"    Error fetching {category}: {e}")

            # Remove duplicates (papers cross-listed in several categories)
            unique_papers = {}
            for paper in all_papers:
                unique_papers[paper['arxiv_id']] = paper

            return list(unique_papers.values())

    fetcher = TestFetcher()

    # Test each time range
    for range_name, start_date in time_ranges:
        print(f"\n{range_name} ({start_date.strftime('%Y-%m-%d')} to {now.strftime('%Y-%m-%d')}):")

        papers = fetcher.fetch_recent_sample(start_date, now)
        print(f"  Found: {len(papers)} papers")

        if papers:
            # Daily distribution of the papers' last-updated dates
            dates = []
            for paper in papers:
                paper_date = datetime.strptime(paper['updated'][:10], '%Y-%m-%d')
                dates.append(paper_date.strftime('%Y-%m-%d'))

            date_counts = Counter(dates)
            print("  Daily distribution:")
            for date, count in sorted(date_counts.items(), reverse=True)[:5]:
                days_ago = (now.date() - datetime.strptime(date, '%Y-%m-%d').date()).days
                print(f"    - {date}: {count} papers ({days_ago} days ago)")

            # Show some sample titles
            print("  Sample papers:")
            for i, paper in enumerate(papers[:3], 1):
                paper_date = datetime.strptime(paper['updated'][:10], '%Y-%m-%d')
                days_ago = (now.date() - paper_date.date()).days
                print(f"    {i}. {paper['title'][:60]}... ({days_ago} days ago)")
        else:
            print("  No papers found in this range")


def check_weekend_effect():
    """Check whether the weekend affects paper submission patterns."""

    print("\n" + "=" * 60)
    print("Weekend Effect Analysis")
    print("=" * 60)

    now = datetime.now(timezone.utc)
    current_weekday = now.strftime('%A')

    print(f"Today is: {current_weekday}")
    print("Checking if weekend timing affects paper submissions...")

    # Walk back through the past week day by day
    for i in range(7):
        date = now - timedelta(days=i)
        weekday = date.strftime('%A')
        date_str = date.strftime('%Y-%m-%d')

        if i == 0:
            status = "(Today)"
        elif i == 1:
            status = "(Yesterday)"
        else:
            status = f"({i} days ago)"

        print(f"  {date_str} {weekday} {status}")

    print("\nPossible explanations for a low paper count:")
    if current_weekday in ['Saturday', 'Sunday']:
        print(f"  It's {current_weekday} - researchers typically don't submit on weekends")
    elif current_weekday == 'Monday':
        print("  It's Monday - weekend submissions are rare, so Monday counts may be low")
    else:
        print(f"  It's {current_weekday} - this should be a normal submission day")

    print("  Time zone effects: arXiv updates happen at specific times")
    print(f"  Current UTC time: {now.strftime('%H:%M')} - submissions may not be processed yet")


def test_specific_fetch():
    """Test the actual fetch function with a 3-day window."""

    print("\n" + "=" * 60)
    print("Testing Actual Fetch Function")
    print("=" * 60)

    print("Testing the same logic the main script uses...")

    # Simulate the fetch without the OpenAI API: skip ArxivPaperFetcher's
    # __init__ (so no OpenAI client is created) and stub out GPT filtering
    class MockFetcher(ArxivPaperFetcher):
        def __init__(self):
            self.session = requests.Session()

        def filter_papers_with_gpt(self, papers, use_parallel=True, max_concurrent=16):
            # Skip GPT filtering and return all papers unchanged
            print(f"  Skipping GPT filtering, would have processed {len(papers)} papers")
            return papers

    try:
        fetcher = MockFetcher()

        # Use the same parameters as the actual run
        papers = fetcher.fetch_recent_papers(days=3)

        print(f"Raw papers fetched: {len(papers)} papers")

        if papers:
            print("Papers found! The issue is likely in GPT filtering or the API key")
            print("Sample papers:")
            for i, paper in enumerate(papers[:3], 1):
                print(f"  {i}. {paper['title'][:60]}...")
        else:
            print("No papers found in raw fetch - arXiv issue or date range problem")

    except Exception as e:
        print(f"Error in fetch test: {e}")


if __name__ == "__main__":
    analyze_recent_papers()
    check_weekend_effect()
    test_specific_fetch()

    print("\n" + "=" * 60)
    print("Diagnosis Summary")
    print("=" * 60)
    print("If this analysis shows:")
    print("  Papers exist -> the problem is with GPT filtering or the API key")
    print("  No papers    -> weekend effect or arXiv submission patterns")
    print("  Time zone    -> wait a few hours and try again")
    print("  Date issue   -> check the date range logic in the fetch function")
\ No newline at end of file
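
For a quick sanity check of the raw fetch outside this script, the same arXiv query can be issued standalone. This is a minimal sketch mirroring the script's parameters (category cs.AI, 3-day window, 100 results); it assumes network access and the `requests` and `feedparser` packages, and is illustrative rather than part of the commit:

```python
import feedparser
import requests
from datetime import datetime, timezone, timedelta

# Same query the script sends: newest submissions in one category
cutoff = datetime.now(timezone.utc) - timedelta(days=3)
resp = requests.get(
    "http://export.arxiv.org/api/query",
    params={
        "search_query": "cat:cs.AI",
        "sortBy": "submittedDate",
        "sortOrder": "descending",
        "start": 0,
        "max_results": 100,
    },
    timeout=30,
)
resp.raise_for_status()
feed = feedparser.parse(resp.content)

# Count entries whose last-updated timestamp falls inside the window
recent = [
    e for e in feed.entries
    if datetime(*e.updated_parsed[:6], tzinfo=timezone.utc) >= cutoff
]
print(f"{len(recent)} of {len(feed.entries)} cs.AI entries updated in the past 3 days")
```

If this prints a nonzero count while the main pipeline returns 0 papers, the problem is downstream of the raw fetch (e.g., the GPT filtering step or the API key), which is the same conclusion the script's diagnosis summary draws.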
