summaryrefslogtreecommitdiff
path: root/scripts/fetch_papers.py
diff options
context:
space:
mode:
authorYuren Hao <97327730+YurenHao0426@users.noreply.github.com>2025-04-13 22:26:00 -0700
committerGitHub <noreply@github.com>2025-04-13 22:26:00 -0700
commit48d0e05af0f040ce6f52b6ced120d5a4ba7f9b7e (patch)
tree461015ee32e88dc906cdf00a35f3666a656d9ebf /scripts/fetch_papers.py
parentafd449584d56e62e2658fc2afba6020edaed83a5 (diff)
Update fetch_papers.py
Diffstat (limited to 'scripts/fetch_papers.py')
-rw-r--r--scripts/fetch_papers.py80
1 files changed, 56 insertions, 24 deletions
diff --git a/scripts/fetch_papers.py b/scripts/fetch_papers.py
index 2c7d177..1bd201c 100644
--- a/scripts/fetch_papers.py
+++ b/scripts/fetch_papers.py
@@ -47,7 +47,7 @@ def is_relevant_by_api(title, summary, client, model="gpt-4-turbo"):
max_tokens=1
)
response_msg = dialogue.choices[0].message.content.strip()
- print(f"[DEBUG][API] return message='{response_msg}' for paper title='{title[:60]}...'")
+ print(f"[DEBUG][API] OpenAI response='{response_msg}' for paper '{title[:60]}...'")
return response_msg == "1"
except Exception as e:
print("[ERROR][API] calling OpenAI API:", e)
@@ -58,7 +58,6 @@ def fetch_papers_combined(days=1):
start_utc = now_utc - datetime.timedelta(days=days)
start_str = start_utc.strftime("%Y%m%d%H%M")
end_str = now_utc.strftime("%Y%m%d%H%M")
-
search_query = f"submittedDate:[{start_str} TO {end_str}]"
base_url = "http://export.arxiv.org/api/query"
@@ -74,30 +73,45 @@ def fetch_papers_combined(days=1):
"start": start,
"max_results": step
}
- resp = requests.get(base_url, params=params, timeout=30)
- if resp.status_code != 200:
- break
- feed = feedparser.parse(resp.content)
- batch = feed.entries
- if not batch:
+ print(f"[DEBUG] fetching arXiv: {start} to {start+step}")
+ try:
+ resp = requests.get(base_url, params=params, timeout=30)
+ if resp.status_code != 200:
+ print("[ERROR] HTTP Status:", resp.status_code)
+ break
+ feed = feedparser.parse(resp.content)
+ batch = feed.entries
+ print(f"[DEBUG] fetched batch size: {len(batch)}")
+ if not batch:
+ break
+
+ all_entries.extend(batch)
+ start += step
+ if start >= 3000:
+ print("[DEBUG] reached fetch limit 3000, stop.")
+ break
+
+ except Exception as e:
+ print("[ERROR] fetching arXiv:", e)
break
- all_entries.extend(batch)
- start += step
- if start >= 3000:
- break
+ print(f"[DEBUG] total papers fetched from arXiv: {len(all_entries)}")
- local_candidates = [
- {
- "title": e.title,
- "summary": e.summary,
- "published": e.published,
- "link": e.link,
- "categories": [t.term for t in e.tags]
- }
- for e in all_entries
- if any(cat in ALLOWED_CATEGORIES for cat in [t.term for t in e.tags]) and advanced_filter(e)
- ]
+ local_candidates = []
+ for e in all_entries:
+ categories = [t.term for t in e.tags] if hasattr(e, 'tags') else []
+ if not any(cat in ALLOWED_CATEGORIES for cat in categories):
+ continue
+ if advanced_filter(e):
+ local_candidates.append({
+ "title": e.title,
+ "summary": e.summary,
+ "published": e.published,
+ "link": e.link,
+ "categories": categories
+ })
+
+ print(f"[DEBUG] candidates after local filter: {len(local_candidates)}")
openai_api_key = os.getenv("OPENAI_API_KEY")
if not openai_api_key:
@@ -105,12 +119,21 @@ def fetch_papers_combined(days=1):
return local_candidates
client = OpenAI(api_key=openai_api_key)
- final_matched = [p for p in local_candidates if is_relevant_by_api(p["title"], p["summary"], client)]
+
+ final_matched = []
+ for idx, paper in enumerate(local_candidates, 1):
+ if is_relevant_by_api(paper["title"], paper["summary"], client):
+ final_matched.append(paper)
+ else:
+ print(f"[DEBUG][API] Excluded paper #{idx}: {paper['title'][:60]}...")
+
+ print(f"[DEBUG] final matched papers after OpenAI filter: {len(final_matched)}")
return final_matched
def update_readme_in_repo(papers, token, repo_name):
if not papers:
+ print("[INFO] No matched papers, skip README update.")
return
g = Github(token)
@@ -137,15 +160,24 @@ def update_readme_in_repo(papers, token, repo_name):
sha=readme_file.sha,
branch="main"
)
+ print(f"[INFO] README updated with {len(papers)} papers.")
def main():
days = 1
+ print(f"[DEBUG] Starting fetch_papers_combined with days={days}")
papers = fetch_papers_combined(days=days)
+ print(f"[DEBUG] After fetch_papers_combined: {len(papers)} papers matched.")
+
github_token = os.getenv("TARGET_REPO_TOKEN")
target_repo_name = os.getenv("TARGET_REPO_NAME")
+ print(f"[DEBUG] Github Token Set: {'Yes' if github_token else 'No'}")
+ print(f"[DEBUG] Target Repo Name: {target_repo_name}")
+
if github_token and target_repo_name and papers:
update_readme_in_repo(papers, github_token, target_repo_name)
+ else:
+ print("[INFO] Skipped README update due to missing credentials or no papers matched.")
if __name__ == "__main__":
main()