diff options
| -rw-r--r-- | scripts/fetch_papers.py | 52 |
1 files changed, 26 insertions, 26 deletions
diff --git a/scripts/fetch_papers.py b/scripts/fetch_papers.py index ff2dd21..68a9fe9 100644 --- a/scripts/fetch_papers.py +++ b/scripts/fetch_papers.py @@ -4,13 +4,11 @@ import feedparser import datetime from github import Github -# Step 1: 你的分类限制 ALLOWED_CATEGORIES = [ "cs.AI", "cs.CL", "cs.CV", "cs.LG", "cs.NE", "cs.RO", "cs.IR", "stat.ML" ] -# Step 2: 本地高级匹配 (正负关键词) def advanced_filter(entry): """ 基于标题+摘要,本地进行“正面关键词 + 负面关键词”筛选 @@ -29,15 +27,12 @@ def advanced_filter(entry): "sensor", "circuit", "electronics", "hardware" ] - # 检查正面关键词 has_general = any(term in full_text for term in general_terms) has_model = any(term in full_text for term in model_terms) - # 检查负面关键词 has_negative = any(term in full_text for term in negative_terms) return (has_general and has_model) and (not has_negative) -# Step 3: 外部API判别 API_URL = "https://uiuc.chat/api/chat-api/chat" MODEL_NAME = "qwen2.5:14b-instruct-fp16" SYSTEM_PROMPT = ( @@ -48,7 +43,8 @@ SYSTEM_PROMPT = ( def is_relevant_by_api(title, summary, api_key): """ - 调用外部API,根据title+summary判别是否相关(返回True/False) + 调用外部API,根据title+summary判别是否相关(返回 True/False), + 并打印调试信息。 """ headers = {"Content-Type": "application/json"} data = { @@ -60,7 +56,7 @@ def is_relevant_by_api(title, summary, api_key): }, { "role": "user", - "content": SYSTEM_PROMPT + f"Title: {title}\nAbstract: {summary}" + "content": f"Title: {title}\nAbstract: {summary}" } ], "api_key": api_key, @@ -71,18 +67,20 @@ def is_relevant_by_api(title, summary, api_key): try: resp = requests.post(API_URL, headers=headers, json=data, timeout=30) resp.raise_for_status() - response_msg = resp.json().get("message", "") + full_json = resp.json() + # 获取API返回的message + response_msg = full_json.get("message", "") + print(f"[DEBUG][API] return message='{response_msg.strip()}' for paper title='{title[:60]}...'") return (response_msg.strip() == "1") except Exception as e: - print("[ERROR] calling external API:", e) + print("[ERROR][API] calling external API:", e) return False -# Step 4: 抓arXiv, 先本地筛, 再API筛 def fetch_papers_combined(days=1): """ 1) 抓过去days天 arXiv论文(宽松) 2) 本地先过滤(分类 + advanced_filter) - 3) 对“通过本地筛”的候选,调用API二次判定 + 3) 对“通过本地筛”的候选,调用API二次判定 + debug输出 """ now_utc = datetime.datetime.now(datetime.timezone.utc) start_utc = now_utc - datetime.timedelta(days=days) @@ -139,33 +137,32 @@ def fetch_papers_combined(days=1): link = getattr(e, "link", "") categories = [t.term for t in e.tags] if hasattr(e, 'tags') else [] - # 分类是否允许 if not any(cat in ALLOWED_CATEGORIES for cat in categories): continue - # 是否通过 advanced_filter - if not advanced_filter(e): - continue - - local_candidates.append({ - "title": title, - "summary": summary, - "published": published, - "link": link, - "categories": categories - }) + if advanced_filter(e): + local_candidates.append({ + "title": title, + "summary": summary, + "published": published, + "link": link, + "categories": categories + }) print(f"[DEBUG] local_candidates = {len(local_candidates)} after local filter") + # Debug: 打印所有local_candidates的标题,看看是不是你预期的那几篇 + for idx, paper in enumerate(local_candidates, 1): + print(f"[DEBUG][LOCAL] #{idx}, title='{paper['title']}' cat={paper['categories']}") + # --- 2) 调API二次判定 --- api_key = os.getenv("UIUC_API_KEY") # 你在Secrets中配置 if not api_key: print("[WARNING] No UIUC_API_KEY found. Skip second filter.") - # 如果没api key,就直接return本地候选 return local_candidates final_matched = [] - for paper in local_candidates: + for idx, paper in enumerate(local_candidates, 1): relevant = is_relevant_by_api(paper["title"], paper["summary"], api_key) if relevant: final_matched.append({ @@ -174,11 +171,13 @@ def fetch_papers_combined(days=1): "link": paper["link"], "categories": paper["categories"] }) + else: + # 如果不相关,就打印个提示 + print(f"[DEBUG][API] => '0' => exclude paper #{idx}, title='{paper['title'][:60]}...'") print(f"[DEBUG] final_matched = {len(final_matched)} after API check") return final_matched -# Step 5: 写README def update_readme_in_repo(papers, token, repo_name): if not papers: print("[INFO] No matched papers, skip README update.") @@ -211,6 +210,7 @@ def update_readme_in_repo(papers, token, repo_name): print(f"[INFO] README updated with {len(papers)} papers.") def main(): + # 抓过去5天(你例子里是5) 或根据需要改 days = 5 papers = fetch_papers_combined(days=days) print(f"\n[RESULT] matched {len(papers)} papers total after double filter. Now update README if not empty...") |
