summary | refs | log | tree | commit | diff
path: root/scripts
diff options
context:
space:
mode:
author: blackhao <13851610112@163.com> 2025-04-13 23:40:30 -0500
committer: blackhao <13851610112@163.com> 2025-04-13 23:40:30 -0500
commit: 5f26ab0f361e06e0a792c82436241883a11d80c5 (patch)
tree: bac7fd09a029586d261db989bef4b5c37e679f7e /scripts
parent: 864a62d1e97a7fbda50539e8bb6c92ffe82b3f31 (diff)
use gpt-4o
Diffstat (limited to 'scripts')
-rw-r--r-- scripts/fetch_papers.py 178
-rw-r--r-- scripts/test_uiuc_chat.py 71
2 files changed, 120 insertions, 129 deletions
diff --git a/scripts/fetch_papers.py b/scripts/fetch_papers.py
index 79246c9..2c7d177 100644
--- a/scripts/fetch_papers.py
+++ b/scripts/fetch_papers.py
@@ -3,29 +3,30 @@ import requests
import feedparser
import datetime
from github import Github
+from openai import OpenAI
ALLOWED_CATEGORIES = [
"cs.AI", "cs.CL", "cs.CV", "cs.LG", "cs.NE", "cs.RO",
"cs.IR", "stat.ML"
]
+SYSTEM_PROMPT = (
+ "You are a helpful assistant. The user will give you a paper title and abstract. "
+ "Your task: Decide if this paper is about large language models (or generative text models) AND about bias/fairness. "
+ "If yes, respond with just a single character: 1. Otherwise, respond with a single character: 0. "
+ "No extra explanation, no punctuation—only the number."
+)
+
def advanced_filter(entry):
- """
- 基于标题+摘要,本地进行“正面关键词 + 负面关键词”筛选
- """
title = getattr(entry, 'title', '').lower()
summary = getattr(entry, 'summary', '').lower()
full_text = title + " " + summary
- # 正面关键词
general_terms = ["bias", "fairness"]
model_terms = ["llm", "language model", "transformer", "gpt", "nlp",
"pretrained", "embedding", "generation", "alignment", "ai"]
- # 负面关键词
- negative_terms = [
- "estimation", "variance", "quantum", "physics",
- "sensor", "circuit", "electronics", "hardware"
- ]
+ negative_terms = ["estimation", "variance", "quantum", "physics",
+ "sensor", "circuit", "electronics", "hardware"]
has_general = any(term in full_text for term in general_terms)
has_model = any(term in full_text for term in model_terms)
@@ -33,62 +34,31 @@ def advanced_filter(entry):
return (has_general and has_model) and (not has_negative)
-API_URL = "https://uiuc.chat/api/chat-api/chat"
-MODEL_NAME = "qwen2.5:14b-instruct-fp16"
-SYSTEM_PROMPT = (
- "You are a helpful assistant. The user will give you a paper title and abstract. "
- "Your task: Decide if this paper is about large language models (or generative text models) AND about bias/fairness. "
- "If yes, respond with just a single character: 1. Otherwise, respond with a single character: 0. "
- "No extra explanation, no punctuation—only the number."
-)
-
-def is_relevant_by_api(title, summary, api_key):
- """
- 调用外部API,根据title+summary判别是否相关(返回 True/False),
- 并打印调试信息。
- """
- headers = {"Content-Type": "application/json"}
- data = {
- "model": MODEL_NAME,
- "messages": [
- {
- "role": "system",
- "content": SYSTEM_PROMPT
- },
- {
- "role": "user",
- "content": SYSTEM_PROMPT + f"Title: {title}\nAbstract: {summary}"
- }
- ],
- "api_key": api_key,
- "course_name": "llm-bias-papers",
- "stream": False,
- "temperature": 0.0
- }
+def is_relevant_by_api(title, summary, client, model="gpt-4-turbo"):
+ prompt = f"Title: {title}\nAbstract: {summary}"
try:
- resp = requests.post(API_URL, headers=headers, json=data, timeout=30)
- resp.raise_for_status()
- full_json = resp.json()
- # 获取API返回的message
- response_msg = full_json.get("message", "")
- print(f"[DEBUG][API] return message='{response_msg.strip()}' for paper title='{title[:60]}...'")
- return (response_msg.strip() == "1")
+ dialogue = client.chat.completions.create(
+ model=model,
+ messages=[
+ {"role": "system", "content": SYSTEM_PROMPT},
+ {"role": "user", "content": prompt}
+ ],
+ temperature=0.0,
+ max_tokens=1
+ )
+ response_msg = dialogue.choices[0].message.content.strip()
+ print(f"[DEBUG][API] return message='{response_msg}' for paper title='{title[:60]}...'")
+ return response_msg == "1"
except Exception as e:
- print("[ERROR][API] calling external API:", e)
+ print("[ERROR][API] calling OpenAI API:", e)
return False
def fetch_papers_combined(days=1):
- """
- 1) 抓过去days天 arXiv论文(宽松)
- 2) 本地先过滤(分类 + advanced_filter)
- 3) 对“通过本地筛”的候选,调用API二次判定 + debug输出
- """
now_utc = datetime.datetime.now(datetime.timezone.utc)
start_utc = now_utc - datetime.timedelta(days=days)
start_str = start_utc.strftime("%Y%m%d%H%M")
end_str = now_utc.strftime("%Y%m%d%H%M")
- print(f"[DEBUG] date range (UTC): {start_str} ~ {end_str}, days={days}")
search_query = f"submittedDate:[{start_str} TO {end_str}]"
base_url = "http://export.arxiv.org/api/query"
@@ -104,84 +74,43 @@ def fetch_papers_combined(days=1):
"start": start,
"max_results": step
}
- print(f"[DEBUG] fetching: {start} -> {start+step}")
- try:
- resp = requests.get(base_url, params=params, timeout=30)
- if resp.status_code != 200:
- print("[ERROR] HTTP Status:", resp.status_code)
- break
- feed = feedparser.parse(resp.content)
- except Exception as e:
- print("[ERROR] fetching arXiv:", e)
+ resp = requests.get(base_url, params=params, timeout=30)
+ if resp.status_code != 200:
break
-
+ feed = feedparser.parse(resp.content)
batch = feed.entries
- got_count = len(batch)
- print(f"[DEBUG] got {got_count} entries in this batch.")
- if got_count == 0:
+ if not batch:
break
all_entries.extend(batch)
start += step
if start >= 3000:
- print("[DEBUG] reached 3000, stop.")
break
- print(f"[DEBUG] total retrieved in date range: {len(all_entries)}")
-
- # --- 本地过滤1: 分类 + advanced_filter ---
- local_candidates = []
- for e in all_entries:
- title = getattr(e, "title", "")
- summary = getattr(e, "summary", "")
- published = getattr(e, "published", "")
- link = getattr(e, "link", "")
- categories = [t.term for t in e.tags] if hasattr(e, 'tags') else []
-
- if not any(cat in ALLOWED_CATEGORIES for cat in categories):
- continue
-
- if advanced_filter(e):
- local_candidates.append({
- "title": title,
- "summary": summary,
- "published": published,
- "link": link,
- "categories": categories
- })
-
- print(f"[DEBUG] local_candidates = {len(local_candidates)} after local filter")
-
- # Debug: 打印所有local_candidates的标题,看看是不是你预期的那几篇
- for idx, paper in enumerate(local_candidates, 1):
- print(f"[DEBUG][LOCAL] #{idx}, title='{paper['title']}' cat={paper['categories']}")
-
- # --- 2) 调API二次判定 ---
- api_key = os.getenv("UIUC_API_KEY") # 你在Secrets中配置
- if not api_key:
- print("[WARNING] No UIUC_API_KEY found. Skip second filter.")
+ local_candidates = [
+ {
+ "title": e.title,
+ "summary": e.summary,
+ "published": e.published,
+ "link": e.link,
+ "categories": [t.term for t in e.tags]
+ }
+ for e in all_entries
+ if any(cat in ALLOWED_CATEGORIES for cat in [t.term for t in e.tags]) and advanced_filter(e)
+ ]
+
+ openai_api_key = os.getenv("OPENAI_API_KEY")
+ if not openai_api_key:
+ print("[WARNING] No OPENAI_API_KEY found. Skip second filter.")
return local_candidates
- final_matched = []
- for idx, paper in enumerate(local_candidates, 1):
- relevant = is_relevant_by_api(paper["title"], paper["summary"], api_key)
- if relevant:
- final_matched.append({
- "title": paper["title"],
- "published": paper["published"],
- "link": paper["link"],
- "categories": paper["categories"]
- })
- else:
- # 如果不相关,就打印个提示
- print(f"[DEBUG][API] => '0' => exclude paper #{idx}, title='{paper['title'][:60]}...'")
-
- print(f"[DEBUG] final_matched = {len(final_matched)} after API check")
+ client = OpenAI(api_key=openai_api_key)
+ final_matched = [p for p in local_candidates if is_relevant_by_api(p["title"], p["summary"], client)]
+
return final_matched
def update_readme_in_repo(papers, token, repo_name):
if not papers:
- print("[INFO] No matched papers, skip README update.")
return
g = Github(token)
@@ -208,24 +137,15 @@ def update_readme_in_repo(papers, token, repo_name):
sha=readme_file.sha,
branch="main"
)
- print(f"[INFO] README updated with {len(papers)} papers.")
def main():
- # 抓过去5天(你例子里是5) 或根据需要改
- days = 5
+ days = 1
papers = fetch_papers_combined(days=days)
- print(f"\n[RESULT] matched {len(papers)} papers total after double filter. Now update README if not empty...")
github_token = os.getenv("TARGET_REPO_TOKEN")
target_repo_name = os.getenv("TARGET_REPO_NAME")
- if not github_token or not target_repo_name:
- print("[ERROR] Missing environment variables: TARGET_REPO_TOKEN / TARGET_REPO_NAME.")
- return
-
- if papers:
+ if github_token and target_repo_name and papers:
update_readme_in_repo(papers, github_token, target_repo_name)
- else:
- print("[INFO] No matched papers, done without update.")
if __name__ == "__main__":
main()
diff --git a/scripts/test_uiuc_chat.py b/scripts/test_uiuc_chat.py
new file mode 100644
index 0000000..c8c4e64
--- /dev/null
+++ b/scripts/test_uiuc_chat.py
@@ -0,0 +1,71 @@
+import requests
+
+API_URL = "https://uiuc.chat/api/chat-api/chat"
+API_KEY = "uc_0a6c6e31ac654737a3cd4d5c1ad4e4cd" # Replace with your actual key
+
+title = "Bias-Aware Agent: Enhancing Fairness in AI-Driven Knowledge Retrieval"
+abstract = "Advancements in retrieving accessible information have evolved more rapidly over the last few years... (omitting for brevity) ... by empowering users with transparency and awareness, this approach aims to foster more equitable information systems."
+'''
+headers = {
+ 'Content-Type': 'application/json'
+}
+messages = [
+ {
+ "role": "system",
+ # 可以空着,或写很简短
+ "content": "You are a helpful assistant."
+ },
+ {
+ "role": "user",
+ "content": (
+ "Here is a paper's title and abstract:\n\n"
+ f"Title: {title}\n\n"
+ f"Abstract: {abstract}\n\n"
+ "Respond with '1' (just the digit) if this paper is clearly about both "
+ "large language models (or generative text models) AND about bias/fairness. "
+ "Otherwise respond '0'. No explanation, no punctuation, only the digit."
+ )
+ }
+]
+
+data = {
+ "model": "llama3.1:8b-instruct-fp16",
+ "messages": messages,
+ "api_key": API_KEY,
+ "course_name": "llm-bias-papers",
+ "stream": False,
+ "temperature": 0.1
+}
+'''
+url = "https://uiuc.chat/api/chat-api/chat"
+headers = {
+ 'Content-Type': 'application/json'
+}
+data = {
+ "model": "llama3.1:8b-instruct-fp16",
+ "messages": [
+ {
+ "role": "system",
+ "content": "You are a helpful AI assistant. Follow instructions carefully."
+ },
+ {
+ "role": "user",
+ "content": "Here is a paper's title and abstract:\n\nTitle:" + title + "\n\nAbstract: " + abstract + "\n\nRespond with 'Yes' (just the word) if this paper is clearly about both large language models (or generative text models) AND about bias/fairness. Otherwise respond 'No'. No explanation, no punctuation, only the digit."
+ }
+ ],
+ "api_key": "uc_0a6c6e31ac654737a3cd4d5c1ad4e4cd",
+ "course_name": "llm-bias-papers",
+ "stream": False,
+ "temperature": 0.0,
+ "retrieval_only": False
+}
+
+response = requests.post(url, headers=headers, json=data)
+for chunk in response.iter_lines():
+ if chunk:
+ print(chunk.decode())
+resp = requests.post(API_URL, headers=headers, json=data)
+print("Status:", resp.status_code)
+print("Response:", resp.text)
+print("[DEBUG] resp.text =", resp.text)
+print("[DEBUG] resp.json() =", resp.json()) # if not raising error