summaryrefslogtreecommitdiff
path: root/scripts
diff options
context:
space:
mode:
authorYuren Hao <97327730+YurenHao0426@users.noreply.github.com>2025-04-16 15:47:47 -0500
committerGitHub <noreply@github.com>2025-04-16 15:47:47 -0500
commitc262fa04b647d47f6efa7288ceb553b02fc6d0de (patch)
treeb3ad7d0ad94c6b684b994030611f2f8419117b85 /scripts
parent8869bc82419db628dd8bb0cdceed4b5ae92d669e (diff)
Update fetch_papers.py
Diffstat (limited to 'scripts')
-rw-r--r--scripts/fetch_papers.py89
1 files changed, 33 insertions, 56 deletions
diff --git a/scripts/fetch_papers.py b/scripts/fetch_papers.py
index 3104f43..c22e6b0 100644
--- a/scripts/fetch_papers.py
+++ b/scripts/fetch_papers.py
@@ -55,17 +55,18 @@ def is_relevant_by_api(title, summary, client, model="gpt-4-turbo"):
return False
def fetch_papers_combined(days=1):
+ import datetime, requests, feedparser
- now_utc = datetime.datetime.now(datetime.timezone.utc)
+ # 1. Compute & log your 24 h window
+ now_utc = datetime.datetime.now(datetime.timezone.utc)
cutoff_utc = now_utc - datetime.timedelta(days=days)
+ print(f"[DEBUG] now_utc = {now_utc.isoformat()}")
+ print(f"[DEBUG] cutoff_utc= {cutoff_utc.isoformat()}")
- # 1. Build an OR‑joined category filter:
+ # 2. Build your category query (or replace with "all:*" to disable)
cat_query = " OR ".join(f"cat:{c}" for c in ALLOWED_CATEGORIES)
- # If you really want *no* category filtering, just set: cat_query = "all:*"
-
- base_url = "http://export.arxiv.org/api/query"
- step = 100
- start = 0
+ base_url = "http://export.arxiv.org/api/query"
+ step, start = 100, 0
all_entries = []
while True:
@@ -76,75 +77,51 @@ def fetch_papers_combined(days=1):
"start": start,
"max_results": step
}
- print(f"[DEBUG] fetching arXiv entries: {start} to {start+step}")
resp = requests.get(base_url, params=params, timeout=30)
resp.raise_for_status()
- print("[DEBUG] arXiv query URL:", resp.url)
- feed = feedparser.parse(resp.content)
+ print(f"[DEBUG] arXiv query URL: {resp.url}")
+
+ feed = feedparser.parse(resp.content)
batch = feed.entries
- for i, e in enumerate(batch[:5], 1):
- print(f"[DEBUG] sample #{i} published:", e.published)
print(f"[DEBUG] fetched batch size: {len(batch)}")
-
if not batch:
break
- # 2. Filter by published date >= cutoff
- for entry in batch:
- # — parse the ISO Z‑time correctly —
- published = datetime.datetime.strptime(
- entry.published, "%Y-%m-%dT%H:%M:%SZ"
+ # 3. Parse & filter every entry in this batch
+ kept = []
+ for e in batch:
+ # parse the Z‑timestamp
+ pub = datetime.datetime.strptime(
+ e.published, "%Y-%m-%dT%H:%M:%SZ"
).replace(tzinfo=datetime.timezone.utc)
-
- if published >= cutoff_utc:
- all_entries.append(entry)
- else:
- start = None
- break
+ print(f"[DEBUG] entry.published → {pub.isoformat()}")
+ if pub >= cutoff_utc:
+ kept.append(e)
+ # 4. Collect those in window
+ all_entries.extend(kept)
+ print(f"[DEBUG] kept {len(kept)} of {len(batch)} in this batch")
- if start is None or len(batch) < step:
+ # 5. Stop if *none* in this batch were new enough
+ if not kept:
+ print("[DEBUG] no entries in window → stopping fetch loop")
break
+ # 6. Otherwise page on (or stop if fewer than a full page)
+ if len(batch) < step:
+ break
start += step
print(f"[DEBUG] total fetched papers from arXiv in last {days} day(s): {len(all_entries)}")
- # …then proceed with OpenAI filtering exactly as before…
- openai_api_key = os.getenv("OPENAI_API_KEY")
- if not openai_api_key:
- print("[ERROR] OPENAI_API_KEY missing, aborting.")
- return []
-
- client = OpenAI(api_key=openai_api_key)
- final_matched = []
-
- for idx, entry in enumerate(all_entries, 1):
- title = entry.title
- summary = entry.summary
- # if you *really* want to disable *all* filtering aside from the LLM check,
- # you can comment out the category check below:
- categories = [t.term for t in getattr(entry, 'tags', [])]
- if not any(cat in ALLOWED_CATEGORIES for cat in categories):
- continue
-
- if is_relevant_by_api(title, summary, client):
- final_matched.append({
- "title": title,
- "summary": summary,
- "published": entry.published,
- "link": entry.link,
- "categories": categories
- })
- print(f"[DEBUG][API] Included #{idx}: {title[:60]}...")
- else:
- print(f"[DEBUG][API] Excluded #{idx}: {title[:60]}...")
-
- print(f"[DEBUG] final matched papers after OpenAI filter: {len(final_matched)}")
+ # …then proceed with your OpenAI filtering as before…
+ # (unchanged code for OpenAI calls, category checks, README updates)
+
return final_matched
+
def update_readme_in_repo(papers, token, repo_name):
if not papers:
print("[INFO] No matched papers, skip README update.")