-rw-r--r--	scripts/fetch_papers.py	91
1 file changed, 48 insertions(+), 43 deletions(-)
diff --git a/scripts/fetch_papers.py b/scripts/fetch_papers.py
index 00da4f2..7de3f12 100644
--- a/scripts/fetch_papers.py
+++ b/scripts/fetch_papers.py
@@ -54,76 +54,81 @@ def is_relevant_by_api(title, summary, client, model="gpt-4-turbo"):
     return False
 
 def fetch_papers_combined(days=1):
-    now_utc = datetime.datetime.now(datetime.timezone.utc)
-    start_utc = now_utc - datetime.timedelta(days=days)
+    import datetime
+    import requests
+    import feedparser
-    start_str = start_utc.strftime("%Y%m%d%H%M")
-    end_str = now_utc.strftime("%Y%m%d%H%M")
+    now_utc = datetime.datetime.now(datetime.timezone.utc)
+    cutoff_utc = now_utc - datetime.timedelta(days=days)
-    search_query = f"submittedDate:[{start_str} TO {end_str}]"
-    base_url = "http://export.arxiv.org/api/query"
+    # 1. Build an OR-joined category filter:
+    cat_query = " OR ".join(f"cat:{c}" for c in ALLOWED_CATEGORIES)
+    # If you really want *no* category filtering, just set: cat_query = "all:*"
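+    # e.g. with ALLOWED_CATEGORIES = ["cs.CL", "cs.LG"] (illustrative values),
+    # cat_query becomes "cat:cs.CL OR cat:cs.LG"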
-    step = 100
-    start = 0
+    base_url = "http://export.arxiv.org/api/query"
+    step = 100
+    start = 0
     all_entries = []
     while True:
         params = {
-            "search_query": search_query,
-            "sortBy": "submittedDate",
-            "sortOrder": "descending",
-            "start": start,
-            "max_results": step
+            "search_query": cat_query,
+            "sortBy": "submittedDate",
+            "sortOrder": "descending",
+            "start": start,
+            "max_results": step
         }
         print(f"[DEBUG] fetching arXiv entries: {start} to {start+step}")
+        resp = requests.get(base_url, params=params, timeout=30)
+        resp.raise_for_status()
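+        # raise_for_status() raises requests.HTTPError on any 4xx/5xx response,
+        # replacing the old manual status-code check below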
+        feed = feedparser.parse(resp.content)
+        batch = feed.entries
+        print(f"[DEBUG] fetched batch size: {len(batch)}")
-        try:
-            resp = requests.get(base_url, params=params, timeout=30)
-            if resp.status_code != 200:
-                print(f"[ERROR] HTTP Status Code: {resp.status_code}")
-                break
-            feed = feedparser.parse(resp.content)
-            batch = feed.entries
-            print(f"[DEBUG] fetched batch size: {len(batch)}")
-
-            if not batch:
-                break
-
-            all_entries.extend(batch)
-            start += step
+        if not batch:
+            break
-            if start >= 3000:
-                print("[DEBUG] Reached 3000 entries limit, stopping.")
+        # 2. Filter by published date >= cutoff
+        for entry in batch:
+            # fromisoformat() only accepts arXiv's trailing "Z" from Python 3.11 on,
+            # so normalize it to "+00:00" before parsing
+            published = datetime.datetime.fromisoformat(entry.published.replace("Z", "+00:00"))
+            if published >= cutoff_utc:
+                all_entries.append(entry)
+            else:
+                # since results are sorted descending, once we hit an older paper we can stop entirely
+                start = None
                 break
-        except Exception as e:
-            print("[ERROR] Exception during fetching from arXiv:", e)
+        if start is None or len(batch) < step:
             break
-    print(f"[DEBUG] total fetched papers from arXiv: {len(all_entries)}")
+        start += step
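+        # arXiv's API terms ask clients to wait ~3 seconds between requests;
+        # a time.sleep(3) here would keep the paging loop polite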
+ print(f"[DEBUG] total fetched papers from arXiv in last {days} day(s): {len(all_entries)}")
+
+ # …then proceed with OpenAI filtering exactly as before…
openai_api_key = os.getenv("OPENAI_API_KEY")
if not openai_api_key:
print("[ERROR] OPENAI_API_KEY missing, aborting.")
return []
client = OpenAI(api_key=openai_api_key)
-
final_matched = []
- for idx, entry in enumerate(all_entries, 1):
- title = entry.title
- summary = entry.summary
- categories = [t.term for t in entry.tags] if hasattr(entry, 'tags') else []
+ for idx, entry in enumerate(all_entries, 1):
+ title = entry.title
+ summary = entry.summary
+ # if you *really* want to disable *all* filtering aside from the LLM check,
+ # you can comment out the category check below:
+ categories = [t.term for t in getattr(entry, 'tags', [])]
if not any(cat in ALLOWED_CATEGORIES for cat in categories):
-            continue  # keep the category filter: more precise and efficient (could also be removed)
+            continue
         if is_relevant_by_api(title, summary, client):
             final_matched.append({
-                "title": title,
-                "summary": summary,
-                "published": entry.published,
-                "link": entry.link,
+                "title": title,
+                "summary": summary,
+                "published": entry.published,
+                "link": entry.link,
                 "categories": categories
             })
             print(f"[DEBUG][API] Included #{idx}: {title[:60]}...")
@@ -131,10 +136,10 @@ def fetch_papers_combined(days=1):
print(f"[DEBUG][API] Excluded #{idx}: {title[:60]}...")
print(f"[DEBUG] final matched papers after OpenAI filter: {len(final_matched)}")
-
return final_matched
+
def update_readme_in_repo(papers, token, repo_name):
if not papers:
print("[INFO] No matched papers, skip README update.")