# Source: wiki_crawler/scripts/test_apis.py
# NOTE(review): the original paste carried GitHub UI chrome here (file listing,
# size, "Raw Blame History", and the ambiguous-Unicode banner); it was not part
# of the script and has been reduced to this provenance comment.
import requests
import time
import json
import sys
# ================= Configuration =================
BASE_URL = "http://47.122.127.178/api/v3"
# Crawl target: the Firecrawl official docs (well-structured, good for testing)
TARGET_URL = "https://docs.firecrawl.dev"
# Query used by the search step
TEST_QUERY = "credits pricing"
# ==============================================
class Colors:
    """ANSI SGR escape sequences used to colorize terminal output."""

    HEADER = "\033[95m"   # bright magenta
    OKBLUE = "\033[94m"   # blue
    OKCYAN = "\033[96m"   # cyan
    OKGREEN = "\033[92m"  # green
    WARNING = "\033[93m"  # yellow
    FAIL = "\033[91m"     # red
    ENDC = "\033[0m"      # reset all attributes
    BOLD = "\033[1m"      # bold
def log(step, msg, color=Colors.OKBLUE):
    """Print one colorized status line of the form ``[STEP] message``.

    The color code prefixes the line and a reset sequence closes it, so the
    terminal returns to its default style afterwards.
    """
    line = f"[{step}] {msg}"
    print(color + line + Colors.ENDC)
# Per-request timeout (seconds): without it a hung server blocks the test forever.
REQUEST_TIMEOUT = 30


def _create_task():
    """STEP 1: create a crawl task (POST /tasks); return its task_id or None on failure."""
    log("STEP 1", f"创建任务 (Map): {TARGET_URL}")
    try:
        res = requests.post(
            f"{BASE_URL}/tasks", json={"url": TARGET_URL}, timeout=REQUEST_TIMEOUT
        )
        resp = res.json()
        if resp['code'] != 1:
            log("FAIL", f"任务创建失败: {resp}", Colors.FAIL)
            return None
        data = resp['data']
        task_id = data['task_id']
        count = data.get('count', 0)
        is_new = data.get('is_new', False)
        status_text = "新任务" if is_new else "已有任务"
        log("SUCCESS", f"ID: {task_id} | 状态: {status_text} | 发现链接: {count}", Colors.OKGREEN)
        return task_id
    except Exception as e:
        log("FAIL", f"请求异常: {e}", Colors.FAIL)
        return None


def _run_task(task_id):
    """STEP 2: trigger background crawling (POST /tasks/{id}/run); return True on success."""
    log("STEP 2", f"触发后台多线程爬取 (Task {task_id})")
    try:
        # batch_size=10 means the server starts multiple threads for these 10 links.
        res = requests.post(
            f"{BASE_URL}/tasks/{task_id}/run",
            json={"batch_size": 10},
            timeout=REQUEST_TIMEOUT,
        )
        resp = res.json()
        if resp['code'] == 1:
            log("SUCCESS", "后台任务已接受 (202 Accepted)", Colors.OKGREEN)
            return True
        log("FAIL", f"启动失败: {resp}", Colors.FAIL)
        return False
    except Exception as e:
        log("FAIL", f"请求异常: {e}", Colors.FAIL)
        return False


def _monitor_task(task_id, max_retries=20):
    """STEP 3: poll GET /tasks/{id} until the batch settles or retries run out.

    Returns True when the batch is observed complete, False on timeout or
    monitoring error (the crawler may still be running server-side).
    """
    log("STEP 3", "进入实时监控模式 (轮询状态)...", Colors.OKCYAN)
    for i in range(max_retries):
        try:
            res = requests.get(f"{BASE_URL}/tasks/{task_id}", timeout=REQUEST_TIMEOUT)
            monitor = res.json()['data']
            stats = monitor['stats']
            active_threads = monitor['active_threads']
            active_count = len(active_threads)
            progress_bar = f"Pending: {stats['pending']} | Processing: {stats['processing']}/{active_count} | Completed: {stats['completed']}"
            print(f" [{i+1}/{max_retries}] {progress_bar}")
            # Show every URL currently being crawled.
            if active_threads:
                print("\n ⚡ 当前正在处理的 URL:")
                for url in active_threads:
                    print(f"{url}")
            # Completion criteria:
            #   1. database reports zero 'processing' rows,
            #   2. no live worker threads in memory,
            #   3. at least one 'completed' item (guards against declaring the
            #      batch done before it ever started).
            if stats['processing'] == 0 and active_count == 0 and stats['completed'] > 0:
                print("\n")
                log("SUCCESS", "✅ 当前批次处理完毕!", Colors.OKGREEN)
                return True
            time.sleep(1.5)  # polling interval
        except Exception as e:
            print(f"\n ⚠️ 监控异常: {e}")
            break
    print("\n")
    log("WARN", "监控超时,爬虫可能仍在后台运行,继续测试搜索...", Colors.WARNING)
    return False


def _search_and_verify(task_id):
    """STEP 4: hybrid search (POST /search) and verify metadata / rerank markers."""
    log("STEP 4", f"测试混合检索 + Rerank: '{TEST_QUERY}'")
    try:
        res = requests.post(
            f"{BASE_URL}/search",
            json={
                "query": TEST_QUERY,
                "task_id": task_id,
                "limit": 3
            },
            timeout=REQUEST_TIMEOUT,
        )
        resp = res.json()
        if resp['code'] != 1:
            log("FAIL", f"搜索失败: {resp}", Colors.FAIL)
            return
        results = resp['data']['results']
        if not results:
            log("FAIL", "未搜索到结果 (Result Empty)", Colors.FAIL)
            return
        log("SUCCESS", f"搜索命中 {len(results)} 条结果", Colors.OKGREEN)
        # === Detailed verification of the top hit ===
        first = results[0]
        print(f"\n{Colors.WARNING}--- Top 1 结果详情 ---{Colors.ENDC}")
        print(f"📄 标题: {first.get('title', 'N/A')}")
        print(f"🔗 链接: {first.get('source_url')}")
        print(f"🧭 路径: {first.get('meta_info', {}).get('header_path', 'N/A')}")
        print(f"🎯 分数: {first.get('score')} " + ("(Reranked)" if first.get('reranked') else "(Rough)"))
        # 'or ""' guards a missing/None content field (None[:80] raised TypeError before).
        print(f"📝 内容: {(first.get('content') or '')[:80].replace(chr(10), ' ')}...")
        print(f"{Colors.WARNING}-----------------------{Colors.ENDC}\n")
        # Automated assertions on the payload shape.
        if first.get('meta_info') and 'header_path' in first['meta_info']:
            print(f"✅ [Phase 1.5] 结构化元数据验证通过")
        else:
            print(f"❌ [Phase 1.5] 缺少元数据")
        if first.get('reranked'):
            print(f"✅ [Phase 2.5] Rerank 重排序生效")
        else:
            print(f"⚠️ [Phase 2.5] Rerank 未标记 (可能是降级或代码未更新)")
    except Exception as e:
        log("FAIL", str(e), Colors.FAIL)


def run_v3_test():
    """End-to-end smoke test of the Wiki Crawler V3 API.

    Walks the full pipeline against a live server: create a crawl task,
    trigger the background multi-threaded run, poll status until the batch
    settles, then issue a hybrid search and verify the result payload.
    Each step logs its outcome; a failed step aborts the remaining ones
    (except the monitor, which degrades to a warning and lets search run).
    """
    print(f"{Colors.HEADER}🚀 开始 Wiki Crawler V3 API 全链路测试{Colors.ENDC}\n")
    task_id = _create_task()
    if task_id is None:
        return
    if not _run_task(task_id):
        return
    _monitor_task(task_id)
    _search_and_verify(task_id)
# Script entry point: run the full API smoke test when executed directly.
if __name__ == "__main__":
    run_v3_test()