summaryrefslogtreecommitdiff
path: root/scripts
diff options
context:
space:
mode:
Diffstat (limited to 'scripts')
-rw-r--r--scripts/fetch_papers.py97
1 files changed, 48 insertions, 49 deletions
diff --git a/scripts/fetch_papers.py b/scripts/fetch_papers.py
index f6fe21b..98e6eb0 100644
--- a/scripts/fetch_papers.py
+++ b/scripts/fetch_papers.py
@@ -16,16 +16,48 @@ ALLOWED_CATEGORIES = [
"stat.ML" # Stat.ML
]
-def fetch_papers_wide_then_filter(days=1, keywords=None):
+def advanced_filter(entry):
"""
- 从 arXiv 中抓取过去 N 天内提交的所有论文(只限制时间 submittedDate),
- 然后在本地过滤:
- 1) 只保留 tags 中包含 ALLOWED_CATEGORIES(若论文有多分类,只要有任意一个符合就OK)
- 2) 标题或摘要里包含指定关键词
+ 判断一篇论文是否含有正面关键词组合(bias/fairness + LLM/transformer/GPT等),
+ 且不包含负面关键词(统计、物理、电路等)。
"""
- if keywords is None:
- keywords = ["bias", "fairness"]
+ import re
+
+ # 减少重复处理,先统一转小写
+ title = getattr(entry, 'title', '').lower()
+ summary = getattr(entry, 'summary', '').lower()
+ full_text = title + " " + summary
+
+ # 1) 正面关键词
+ # - 必须含有 "bias" 或 "fairness"(泛泛概念)
+ # - 且含有至少一个模型相关关键词
+ general_terms = ["bias", "fairness"]
+ model_terms = ["llm", "language model", "transformer", "gpt", "nlp",
+ "pretrained", "embedding", "generation", "alignment", "ai"]
+
+ # 2) 负面关键词(排除统计、物理、电路等无关方向)
+ negative_terms = [
+ "estimation", "variance", "statistical", "sample", "sensor", "circuit",
+ "quantum", "physics", "electronics", "hardware", "transistor", "amplifier"
+ ]
+
+ # 检查正面关键词
+ has_general = any(term in full_text for term in general_terms)
+ has_model = any(term in full_text for term in model_terms)
+
+ # 检查负面关键词(命中则排除)
+ has_negative = any(term in full_text for term in negative_terms)
+
+ # 只有同时满足“general + model”并且“无负面”才返回True
+ return (has_general and has_model) and (not has_negative)
+
+def fetch_papers_wide_then_filter(days=1):
+ """
+ 从 arXiv 中抓取过去 N 天内提交的所有论文(限制时间),然后在本地过滤:
+ 1) 只保留 tags 中包含 ALLOWED_CATEGORIES(若论文有多分类,只要有任意一个符合就OK)
+ 2) 用 advanced_filter() 检查标题或摘要是否满足要求
+ """
now_utc = datetime.datetime.now(datetime.timezone.utc)
start_utc = now_utc - datetime.timedelta(days=days)
@@ -33,7 +65,7 @@ def fetch_papers_wide_then_filter(days=1, keywords=None):
end_str = now_utc.strftime("%Y%m%d%H%M")
print(f"[DEBUG] date range (UTC): {start_str} ~ {end_str} (past {days} days)")
- # 构造 search_query,仅用时间
+ # 构造搜索query,仅用时间
search_query = f"submittedDate:[{start_str} TO {end_str}]"
base_url = "http://export.arxiv.org/api/query"
@@ -76,11 +108,6 @@ def fetch_papers_wide_then_filter(days=1, keywords=None):
# -- 本地过滤 --
matched = []
for e in all_entries:
- title = getattr(e, 'title', '')
- summary = getattr(e, 'summary', '')
- published = getattr(e, 'published', '')
- link = getattr(e, 'link', '')
-
if hasattr(e, 'tags'):
# e.tags: a list of objects with .term
categories = [t.term for t in e.tags]
@@ -88,24 +115,20 @@ def fetch_papers_wide_then_filter(days=1, keywords=None):
categories = []
# 1) 是否属于 ALLOWED_CATEGORIES
- # 有些论文有多分类,只要其中一个在 ALLOWED_CATEGORIES 里就OK
- # 例如 "cs.IR", "cs.AI"
in_allowed_cat = any(cat in ALLOWED_CATEGORIES for cat in categories)
if not in_allowed_cat:
continue
- # 2) 是否含关键词
- text_lower = (title + " " + summary).lower()
- has_keyword = any(kw.lower() in text_lower for kw in keywords)
- if has_keyword:
+ # 2) 更精准的组合式关键词筛选
+ if advanced_filter(e):
matched.append({
- "title": title,
- "published": published,
- "link": link,
+ "title": e.title,
+ "published": e.published,
+ "link": e.link,
"categories": categories
})
- print(f"[DEBUG] matched {len(matched)} papers after local filtering (categories + keywords)")
+ print(f"[DEBUG] matched {len(matched)} papers after local filtering (categories + advanced_filter)")
return matched
def update_readme_in_repo(papers, token, repo_name):
@@ -144,33 +167,9 @@ def update_readme_in_repo(papers, token, repo_name):
print(f"[INFO] README updated with {len(papers)} papers.")
def main():
- # 1) 抓取过去3天, 关键词=["bias","fairness"]
+ # 1) 抓取过去1天
days = 1
- keywords = [
- "bias",
- "LLM bias",
- "language model bias",
- "debiasing",
- "bias mitigation",
- "fairness LLM",
- "bias reduction",
- "algorithmic fairness",
- "model fairness",
- "bias detection",
- "ethical LLM",
- "responsible AI",
- "bias evaluation",
- "fairness evaluation",
- "bias correction",
- "ethical AI",
- "fairness metrics",
- "unbiased LLM",
- "bias measurement",
- "alignment bias",
- "bias assessment"
- ]
-
- papers = fetch_papers_wide_then_filter(days=days, keywords=keywords)
+ papers = fetch_papers_wide_then_filter(days=days)
print(f"\n[RESULT] matched {len(papers)} papers. Will update README if not empty.")
# 2) 更新README