Update fetch_papers.py

author: Yuren Hao <97327730+YurenHao0426@users.noreply.github.com> 2025-04-13 22:26:00 -0700
committer: GitHub <noreply@github.com> 2025-04-13 22:26:00 -0700
commit: 48d0e05af0f040ce6f52b6ced120d5a4ba7f9b7e (patch)
tree: 461015ee32e88dc906cdf00a35f3666a656d9ebf /scripts/fetch_papers.py
parent: afd449584d56e62e2658fc2afba6020edaed83a5 (diff)
1 files changed, 56 insertions, 24 deletions
diff --git a/scripts/fetch_papers.py b/scripts/fetch_papers.py
index 2c7d177..1bd201c 100644
--- a/scripts/fetch_papers.py
+++ b/scripts/fetch_papers.py
@@ -47,7 +47,7 @@ def is_relevant_by_api(title, summary, client, model="gpt-4-turbo"):
             max_tokens=1
         )
         response_msg = dialogue.choices[0].message.content.strip()
-        print(f"[DEBUG][API] return message='{response_msg}' for paper title='{title[:60]}...'")
+        print(f"[DEBUG][API] OpenAI response='{response_msg}' for paper '{title[:60]}...'")
         return response_msg == "1"
     except Exception as e:
         print("[ERROR][API] calling OpenAI API:", e)
@@ -58,7 +58,6 @@ def fetch_papers_combined(days=1):
     start_utc = now_utc - datetime.timedelta(days=days)
     start_str = start_utc.strftime("%Y%m%d%H%M")
     end_str = now_utc.strftime("%Y%m%d%H%M")
-
     search_query = f"submittedDate:[{start_str} TO {end_str}]"
 
     base_url = "http://export.arxiv.org/api/query"
@@ -74,30 +73,45 @@ def fetch_papers_combined(days=1):
             "start": start,
             "max_results": step
         }
-        resp = requests.get(base_url, params=params, timeout=30)
-        if resp.status_code != 200:
-            break
-        feed = feedparser.parse(resp.content)
-        batch = feed.entries
-        if not batch:
+        print(f"[DEBUG] fetching arXiv: {start} to {start+step}")
+        try:
+            resp = requests.get(base_url, params=params, timeout=30)
+            if resp.status_code != 200:
+                print("[ERROR] HTTP Status:", resp.status_code)
+                break
+            feed = feedparser.parse(resp.content)
+            batch = feed.entries
+            print(f"[DEBUG] fetched batch size: {len(batch)}")
+            if not batch:
+                break
+
+            all_entries.extend(batch)
+            start += step
+            if start >= 3000:
+                print("[DEBUG] reached fetch limit 3000, stop.")
+                break
+
+        except Exception as e:
+            print("[ERROR] fetching arXiv:", e)
             break
 
-        all_entries.extend(batch)
-        start += step
-        if start >= 3000:
-            break
+    print(f"[DEBUG] total papers fetched from arXiv: {len(all_entries)}")
 
-    local_candidates = [
-        {
-            "title": e.title,
-            "summary": e.summary,
-            "published": e.published,
-            "link": e.link,
-            "categories": [t.term for t in e.tags]
-        }
-        for e in all_entries
-        if any(cat in ALLOWED_CATEGORIES for cat in [t.term for t in e.tags]) and advanced_filter(e)
-    ]
+    local_candidates = []
+    for e in all_entries:
+        categories = [t.term for t in e.tags] if hasattr(e, 'tags') else []
+        if not any(cat in ALLOWED_CATEGORIES for cat in categories):
+            continue
+        if advanced_filter(e):
+            local_candidates.append({
+                "title": e.title,
+                "summary": e.summary,
+                "published": e.published,
+                "link": e.link,
+                "categories": categories
+            })
+
+    print(f"[DEBUG] candidates after local filter: {len(local_candidates)}")
 
     openai_api_key = os.getenv("OPENAI_API_KEY")
     if not openai_api_key:
@@ -105,12 +119,21 @@ def fetch_papers_combined(days=1):
         return local_candidates
 
     client = OpenAI(api_key=openai_api_key)
-    final_matched = [p for p in local_candidates if is_relevant_by_api(p["title"], p["summary"], client)]
+
+    final_matched = []
+    for idx, paper in enumerate(local_candidates, 1):
+        if is_relevant_by_api(paper["title"], paper["summary"], client):
+            final_matched.append(paper)
+        else:
+            print(f"[DEBUG][API] Excluded paper #{idx}: {paper['title'][:60]}...")
+
+    print(f"[DEBUG] final matched papers after OpenAI filter: {len(final_matched)}")
 
     return final_matched
 
 def update_readme_in_repo(papers, token, repo_name):
     if not papers:
+        print("[INFO] No matched papers, skip README update.")
         return
 
     g = Github(token)
@@ -137,15 +160,24 @@ def update_readme_in_repo(papers, token, repo_name):
         sha=readme_file.sha,
         branch="main"
     )
+    print(f"[INFO] README updated with {len(papers)} papers.")
 
 def main():
     days = 1
+    print(f"[DEBUG] Starting fetch_papers_combined with days={days}")
     papers = fetch_papers_combined(days=days)
 
+    print(f"[DEBUG] After fetch_papers_combined: {len(papers)} papers matched.")
+
     github_token = os.getenv("TARGET_REPO_TOKEN")
     target_repo_name = os.getenv("TARGET_REPO_NAME")
+    print(f"[DEBUG] Github Token Set: {'Yes' if github_token else 'No'}")
+    print(f"[DEBUG] Target Repo Name: {target_repo_name}")
+
     if github_token and target_repo_name and papers:
         update_readme_in_repo(papers, github_token, target_repo_name)
+    else:
+        print("[INFO] Skipped README update due to missing credentials or no papers matched.")
 
 if __name__ == "__main__":
     main()
author	Yuren Hao <97327730+YurenHao0426@users.noreply.github.com>	2025-04-13 22:26:00 -0700
committer	GitHub <noreply@github.com>	2025-04-13 22:26:00 -0700
commit	48d0e05af0f040ce6f52b6ced120d5a4ba7f9b7e (patch)
tree	461015ee32e88dc906cdf00a35f3666a656d9ebf /scripts/fetch_papers.py
parent	afd449584d56e62e2658fc2afba6020edaed83a5 (diff)