"""End-to-end smoke test for the Wiki Crawler pipeline.

Flow: health-check the backend, (optionally) register a crawl task and
trigger background processing, poll the V2 search endpoint until chunks
for TEST_QUERY land in the DB, then validate the Phase 1.5 structured
metadata (header_path / headers) on the first search hit.

Requires the backend (main.py) to be running at BASE_URL.
"""

import json
import sys
import time

import requests

# ================= Configuration =================
BASE_URL = "http://127.0.0.1:8000"
# Dify docs page used as the test target (clean structure — good for
# verifying Markdown splitting).
TEST_URL = "https://docs.dify.ai/en/use-dify/knowledge/create-knowledge/import-text-data/readme"
# Query expected to hit the page above.
TEST_QUERY = "upload size limit"
# When True, skip the map/process steps (1 & 2) and reuse an existing
# task id. Set to False to exercise the full crawl pipeline from scratch.
SKIP_CRAWL_STEPS = True
# Task id to reuse when SKIP_CRAWL_STEPS is True (must already exist).
EXISTING_TASK_ID = 6
# =================================================


class Colors:
    """ANSI escape sequences for colored terminal output."""
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'


def log(step: str, msg: str, color: str = Colors.OKBLUE) -> None:
    """Print a colored, step-tagged log line."""
    print(f"{color}[{step}] {msg}{Colors.ENDC}")


def _map_task(url: str) -> int:
    """Step 1: register a crawl task and scan links; return the task id.

    Exits the process with status 1 on an API error or request failure.
    """
    log("STEP 1", f"注册任务并扫描链接: {url}")
    try:
        res = requests.post(f"{BASE_URL}/api/v2/crawler/map", json={"url": url})
        res_json = res.json()
        # Backend convention: code == 1 means success.
        if res_json.get('code') != 1:
            log("FAIL", f"Map 接口返回错误: {res_json}", Colors.FAIL)
            sys.exit(1)
        data = res_json['data']
        task_id = data['task_id']
        count = data.get('count', 0)
        log("SUCCESS", f"任务注册成功。Task ID: {task_id}, 待爬取链接数: {count}", Colors.OKGREEN)
        return task_id
    except Exception as e:
        log("FAIL", f"请求异常: {e}", Colors.FAIL)
        sys.exit(1)


def _process_task(task_id: int) -> None:
    """Step 2: trigger background processing for *task_id*.

    Exits the process with status 1 on an API error or request failure.
    """
    log("STEP 2", f"触发后台处理 -> Task ID: {task_id}")
    try:
        res = requests.post(
            f"{BASE_URL}/api/v2/crawler/process",
            json={"task_id": task_id, "batch_size": 5}
        )
        res_json = res.json()
        if res_json.get('code') == 1:
            log("SUCCESS", "后台处理任务已启动...", Colors.OKGREEN)
        else:
            log("FAIL", f"启动失败: {res_json}", Colors.FAIL)
            sys.exit(1)
    except Exception as e:
        log("FAIL", f"请求异常: {e}", Colors.FAIL)
        sys.exit(1)


def run_e2e_test() -> None:
    """Run the full E2E test; exits non-zero on any hard failure."""
    print(f"{Colors.HEADER}=== 开始 Wiki Crawler E2E 完整测试 ==={Colors.ENDC}")

    # Step 0: backend health check — fail fast if the server is down.
    try:
        requests.get(f"{BASE_URL}/docs", timeout=3)
    except Exception:
        log("FATAL", "无法连接后端,请确保 main.py 正在运行 (http://127.0.0.1:8000)", Colors.FAIL)
        sys.exit(1)

    # Steps 1 & 2: run the crawl pipeline, or reuse an existing task.
    # BUG FIX: previously both steps AND the `task_id = 6` fallback were
    # commented out, leaving `task_id` undefined in Step 3; the resulting
    # NameError was silently swallowed by the polling loop's except
    # clause, so the script always "timed out" with no real cause shown.
    if SKIP_CRAWL_STEPS:
        task_id = EXISTING_TASK_ID
    else:
        task_id = _map_task(TEST_URL)
        _process_task(task_id)

    # Step 3: poll the search endpoint until data is indexed.
    log("STEP 3", "轮询搜索接口,等待数据入库...")
    max_retries = 12
    found_data = False
    search_results = []
    for i in range(max_retries):
        print(f" ⏳ 第 {i+1}/{max_retries} 次尝试搜索...", end="\r")
        # Wait 5 s per attempt to give the crawler and embedding time.
        time.sleep(5)
        try:
            # V2 smart-search endpoint, scoped to our task.
            search_res = requests.post(
                f"{BASE_URL}/api/v2/search",
                json={
                    "query": TEST_QUERY,
                    "task_id": task_id,
                    "limit": 3
                }
            )
            resp_json = search_res.json()
            # Response shape: {code: 1, msg: "...", data: {results: [...]}}
            if resp_json['code'] == 1:
                data_body = resp_json['data']
                # Defensive: results may be missing or empty while indexing.
                if data_body and data_body.get('results'):
                    search_results = data_body['results']
                    found_data = True
                    print("")  # newline after the \r progress line
                    log("SUCCESS", f"✅ 成功搜索到 {len(search_results)} 条相关切片!", Colors.OKGREEN)
                    break
        except Exception:
            # Deliberate best-effort retry: tolerate transient network
            # errors between polls instead of aborting the whole test.
            continue

    if not found_data:
        print("")
        log("FAIL", "❌ 超时:未能在规定时间内搜索到数据。请检查后端日志是否有报错。", Colors.FAIL)
        sys.exit(1)

    # Step 4: validate Phase 1.5 structured metadata on the first hit.
    log("STEP 4", "验证结构化数据 (Phase 1.5 Check)")
    first_result = search_results[0]

    # Print the first result for manual inspection.
    print(f"\n{Colors.WARNING}--- 检索结果样本 ---{Colors.ENDC}")
    print(f"Title: {first_result.get('title')}")
    print(f"URL: {first_result.get('source_url')}")
    print(f"Meta: {json.dumps(first_result.get('meta_info', {}), ensure_ascii=False)}")
    # BUG FIX: guard against a missing/None content field, which would
    # previously raise TypeError on the slice.
    print(f"Content Preview: {(first_result.get('content') or '')[:50]}...")
    print(f"{Colors.WARNING}----------------------{Colors.ENDC}\n")

    # Automated assertions on the expected structured fields.
    checks = {
        "Has Content": bool(first_result.get('content')),
        "Has Meta Info": 'meta_info' in first_result,
        "Has Header Path": 'header_path' in first_result.get('meta_info', {}),
        "Headers Dict Exists": 'headers' in first_result.get('meta_info', {})
    }

    all_pass = True
    for name, passed in checks.items():
        status = f"{Colors.OKGREEN}PASS{Colors.ENDC}" if passed else f"{Colors.FAIL}FAIL{Colors.ENDC}"
        print(f"检查项 [{name}]: {status}")
        if not passed:
            all_pass = False

    if all_pass:
        meta = first_result['meta_info']
        print(f"\n{Colors.OKBLUE}🎉 测试通过!系统已具备 Phase 1.5 (结构化 RAG) 能力。{Colors.ENDC}")
        print(f"提取到的上下文路径: {Colors.HEADER}{meta.get('header_path', 'N/A')}{Colors.ENDC}")
    else:
        print(f"\n{Colors.FAIL}❌ 测试未完全通过:缺少必要的元数据字段。请检查 crawler_service.py 或 update_db.py。{Colors.ENDC}")


if __name__ == "__main__":
    run_e2e_test()