v3接口restful风格,规范化接口;添加mcp服务器;新增log模块

This commit is contained in:
2026-01-19 23:54:29 +08:00
parent 389c13a2a7
commit 7c99e67a7f
14 changed files with 780 additions and 376 deletions

View File

@@ -3,165 +3,174 @@ import time
import json
import sys
# ================= Configuration =================
# NOTE(review): BASE_URL and TEST_QUERY are assigned twice in this span; the
# later assignments below are the ones in effect at runtime. The first group
# looks like the pre-v3 configuration -- confirm and remove if it is stale.
BASE_URL = "http://127.0.0.1:8000"
# Use the Dify docs as the test subject (clearly structured, good for verifying Markdown splitting)
TEST_URL = "https://docs.dify.ai/en/use-dify/knowledge/create-knowledge/import-text-data/readme"
# Test query (chosen so that it hits the page above)
TEST_QUERY = "upload size limit"
# ===========================================
# ================= ⚙️ Configuration =================
# Base URL of the v3 API; endpoint paths are appended to it.
BASE_URL = "http://127.0.0.1:8000/api/v3"
# Test target: the official Firecrawl docs (clearly structured, good for testing)
TARGET_URL = "https://docs.firecrawl.dev"
# Search query used by the test
TEST_QUERY = "credits pricing"
# ==============================================
class Colors:
    """ANSI escape sequences used to colorize terminal output.

    Wrap text as ``f"{Colors.X}text{Colors.ENDC}"``; ``ENDC`` resets the
    terminal attributes so the color does not leak into later output.
    """

    HEADER = '\033[95m'   # bright magenta
    OKBLUE = '\033[94m'   # bright blue
    OKCYAN = '\033[96m'   # bright cyan
    OKGREEN = '\033[92m'  # bright green
    WARNING = '\033[93m'  # bright yellow
    FAIL = '\033[91m'     # bright red
    ENDC = '\033[0m'      # reset all attributes
    BOLD = '\033[1m'      # bold / increased intensity
def log(step: str, msg: str, color: str = Colors.OKBLUE) -> None:
    """Print one colorized status line of the form ``[step] msg``.

    Args:
        step: Short tag shown in square brackets (e.g. "STEP 1", "FAIL").
        msg: Message text printed after the tag.
        color: ANSI escape prefix applied to the whole line; the line is
            always terminated with ``Colors.ENDC`` to reset the terminal.
    """
    print(f"{color}[{step}] {msg}{Colors.ENDC}")
def run_v3_test(request_timeout: float = 60.0) -> None:
    """Run the end-to-end smoke test against the V3 REST API.

    The test walks through four steps and reports each one via ``log``:

    1. ``POST /tasks`` -- create (or reuse) a task for ``TARGET_URL``.
    2. ``POST /tasks/{id}/run`` -- start processing one batch.
    3. ``GET /tasks/{id}`` -- poll task status until the batch is done or
       the retry budget is exhausted.
    4. ``POST /search`` -- query with ``TEST_QUERY`` and check the top hit
       for structured metadata and the rerank flag.

    A failure in steps 1, 2 or 4 is logged and ends the test by returning;
    a monitoring failure or timeout in step 3 is reported and the test
    still proceeds to the search step.

    Args:
        request_timeout: Per-request timeout in seconds passed to every HTTP
            call, so an unresponsive backend cannot hang the script forever.
    """
    print(f"{Colors.HEADER}🚀 开始 Wiki Crawler V3 API 全链路测试{Colors.ENDC}\n")

    # ---------------------------------------------------------
    # 1. Create the task (POST /tasks)
    # ---------------------------------------------------------
    log("STEP 1", f"创建任务 (Map): {TARGET_URL}")
    try:
        res = requests.post(
            f"{BASE_URL}/tasks",
            json={"url": TARGET_URL},
            timeout=request_timeout,
        )
        resp = res.json()
        if resp['code'] != 1:
            log("FAIL", f"任务创建失败: {resp}", Colors.FAIL)
            return

        data = resp['data']
        task_id = data['task_id']
        count = data.get('count', 0)
        is_new = data.get('is_new', False)
        status_text = "新任务" if is_new else "已有任务"
        log("SUCCESS", f"ID: {task_id} | 状态: {status_text} | 发现链接: {count}", Colors.OKGREEN)
    except Exception as e:
        log("FAIL", f"请求异常: {e}", Colors.FAIL)
        return

    # ---------------------------------------------------------
    # 2. Trigger execution (POST /tasks/{id}/run)
    # ---------------------------------------------------------
    log("STEP 2", f"触发后台多线程爬取 (Task {task_id})")
    try:
        # batch_size=10 asks the backend to process 10 links in this batch.
        res = requests.post(
            f"{BASE_URL}/tasks/{task_id}/run",
            json={"batch_size": 10},
            timeout=request_timeout,
        )
        resp = res.json()
        if resp['code'] != 1:
            log("FAIL", f"启动失败: {resp}", Colors.FAIL)
            return
        log("SUCCESS", "后台任务已接受 (202 Accepted)", Colors.OKGREEN)
    except Exception as e:
        log("FAIL", f"请求异常: {e}", Colors.FAIL)
        return

    # ---------------------------------------------------------
    # 3. Live monitoring (GET /tasks/{id})
    # ---------------------------------------------------------
    log("STEP 3", "进入实时监控模式 (轮询状态)...", Colors.OKCYAN)
    max_retries = 20
    is_completed_batch = False

    for i in range(max_retries):
        try:
            res = requests.get(f"{BASE_URL}/tasks/{task_id}", timeout=request_timeout)
            monitor = res.json()['data']
            stats = monitor['stats']
            active_threads = monitor['active_threads']

            # Format and print the current status line.
            active_count = len(active_threads)
            progress_bar = f"Pending: {stats['pending']} | Processing: {stats['processing']}/{active_count} | Completed: {stats['completed']}"
            print(f" [{i+1}/{max_retries}] {progress_bar}")

            # List every URL that is currently being processed.
            if active_threads:
                print("\n ⚡ 当前正在处理的 URL:")
                for url in active_threads:
                    print(f"{url}")

            # The batch counts as finished only when:
            #   1. nothing is in the 'processing' state,
            #   2. no active threads are reported, and
            #   3. at least one item completed (so a task that has not
            #      started yet is not mistaken for a finished one).
            if stats['processing'] == 0 and active_count == 0 and stats['completed'] > 0:
                is_completed_batch = True
                print("\n")
                log("SUCCESS", "✅ 当前批次处理完毕!", Colors.OKGREEN)
                break

            time.sleep(1.5)  # polling interval
        except Exception as e:
            print(f"\n ⚠️ 监控异常: {e}")
            break

    if not is_completed_batch:
        print("\n")
        log("WARN", "监控超时,爬虫可能仍在后台运行,继续测试搜索...", Colors.WARNING)

    # ---------------------------------------------------------
    # 4. Hybrid search and verification (POST /search)
    # ---------------------------------------------------------
    log("STEP 4", f"测试混合检索 + Rerank: '{TEST_QUERY}'")
    try:
        res = requests.post(
            f"{BASE_URL}/search",
            json={
                "query": TEST_QUERY,
                "task_id": task_id,
                "limit": 3,
            },
            timeout=request_timeout,
        )
        resp = res.json()
        if resp['code'] != 1:
            log("FAIL", f"搜索失败: {resp}", Colors.FAIL)
            return

        results = resp['data']['results']
        if not results:
            log("FAIL", "未搜索到结果 (Result Empty)", Colors.FAIL)
            return

        log("SUCCESS", f"搜索命中 {len(results)} 条结果", Colors.OKGREEN)

        # === Detailed verification of the top hit ===
        first = results[0]
        # Treat a missing or null field as empty so the report below can
        # still be printed instead of failing on None.
        meta_info = first.get('meta_info') or {}
        content = first.get('content') or ''

        print(f"\n{Colors.WARNING}--- Top 1 结果详情 ---{Colors.ENDC}")
        print(f"📄 标题: {first.get('title', 'N/A')}")
        print(f"🔗 链接: {first.get('source_url')}")
        print(f"🧭 路径: {meta_info.get('header_path', 'N/A')}")
        print(f"🎯 分数: {first.get('score')} " + ("(Reranked)" if first.get('reranked') else "(Rough)"))
        # chr(10) is "\n": flatten the preview onto a single line.
        print(f"📝 内容: {content[:80].replace(chr(10), ' ')}...")
        print(f"{Colors.WARNING}-----------------------{Colors.ENDC}\n")

        # Automated checks on the top hit.
        if 'header_path' in meta_info:
            print("✅ [Phase 1.5] 结构化元数据验证通过")
        else:
            print("❌ [Phase 1.5] 缺少元数据")

        if first.get('reranked'):
            print("✅ [Phase 2.5] Rerank 重排序生效")
        else:
            print("⚠️ [Phase 2.5] Rerank 未标记 (可能是降级或代码未更新)")
    except Exception as e:
        log("FAIL", str(e), Colors.FAIL)
# Script entry point: run the V3 end-to-end test when executed directly.
if __name__ == "__main__":
    run_v3_test()