"""End-to-end smoke test for the Wiki Crawler V3 HTTP API.

The script walks through the full pipeline against a running server:
create a crawl task, trigger a batch run, poll the task status, then issue
a search request and print/validate the top result.
"""

import json
import sys
import time

import requests

# ================= Configuration =================
BASE_URL = "http://47.122.127.178/api/v3"
# Crawl target: the Firecrawl documentation site (cleanly structured, so it
# makes a good test subject).
TARGET_URL = "https://docs.firecrawl.dev"
# Query used for the search step.
TEST_QUERY = "credits pricing"
# Per-request timeout in seconds. Without one, `requests` waits forever on
# an unresponsive server and the whole test run hangs.
REQUEST_TIMEOUT = 30
# =================================================


class Colors:
    """ANSI escape sequences used to colorize terminal output."""

    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKCYAN = '\033[96m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'
    BOLD = '\033[1m'


def log(step, msg, color=Colors.OKBLUE):
    """Print ``msg`` prefixed with ``[step]``, wrapped in the given ANSI color."""
    print(f"{color}[{step}] {msg}{Colors.ENDC}")


def run_v3_test():
    """Run the four-step API test and print the outcome of each step.

    Steps: (1) POST /tasks, (2) POST /tasks/{id}/run, (3) poll
    GET /tasks/{id}, (4) POST /search. A failure in step 1, 2 or 4 is
    logged and ends the run early; a polling error or timeout in step 3
    only produces a warning and the search step still runs.

    Returns:
        None. All results are reported on stdout.
    """
    print(f"{Colors.HEADER}🚀 开始 Wiki Crawler V3 API 全链路测试{Colors.ENDC}\n")

    # ---------------------------------------------------------
    # 1. Create the task (POST /tasks)
    # ---------------------------------------------------------
    log("STEP 1", f"创建任务 (Map): {TARGET_URL}")
    try:
        res = requests.post(
            f"{BASE_URL}/tasks",
            json={"url": TARGET_URL},
            timeout=REQUEST_TIMEOUT,
        )
        resp = res.json()

        # The API signals success with code == 1 in the JSON envelope.
        if resp['code'] != 1:
            log("FAIL", f"任务创建失败: {resp}", Colors.FAIL)
            return

        data = resp['data']
        task_id = data['task_id']
        count = data.get('count', 0)
        is_new = data.get('is_new', False)

        status_text = "新任务" if is_new else "已有任务"
        log(
            "SUCCESS",
            f"ID: {task_id} | 状态: {status_text} | 发现链接: {count}",
            Colors.OKGREEN,
        )
    except Exception as e:
        # Top-level boundary of a test script: report any failure
        # (network error, invalid JSON, missing key) and stop.
        log("FAIL", f"请求异常: {e}", Colors.FAIL)
        return

    # ---------------------------------------------------------
    # 2. Trigger execution (POST /tasks/{id}/run)
    # ---------------------------------------------------------
    log("STEP 2", f"触发后台多线程爬取 (Task {task_id})")
    try:
        # Request a batch of 10 links for this run.
        res = requests.post(
            f"{BASE_URL}/tasks/{task_id}/run",
            json={"batch_size": 10},
            timeout=REQUEST_TIMEOUT,
        )
        resp = res.json()

        if resp['code'] == 1:
            log("SUCCESS", "后台任务已接受 (202 Accepted)", Colors.OKGREEN)
        else:
            log("FAIL", f"启动失败: {resp}", Colors.FAIL)
            return
    except Exception as e:
        log("FAIL", f"请求异常: {e}", Colors.FAIL)
        return

    # ---------------------------------------------------------
    # 3. Live monitoring (GET /tasks/{id})
    # ---------------------------------------------------------
    log("STEP 3", "进入实时监控模式 (轮询状态)...", Colors.OKCYAN)

    max_retries = 20
    is_completed_batch = False

    for i in range(max_retries):
        try:
            res = requests.get(
                f"{BASE_URL}/tasks/{task_id}",
                timeout=REQUEST_TIMEOUT,
            )
            monitor = res.json()['data']

            stats = monitor['stats']
            active_threads = monitor['active_threads']

            # Format and print the current status line.
            active_count = len(active_threads)
            progress_bar = f"Pending: {stats['pending']} | Processing: {stats['processing']}/{active_count} | Completed: {stats['completed']}"
            print(f" [{i+1}/{max_retries}] {progress_bar}")

            # Print every URL currently being crawled.
            if active_threads:
                print("\n ⚡ 当前正在处理的 URL:")
                for url in active_threads:
                    print(f" ⚡ {url}")

            # The batch counts as finished when:
            #   1. no row is in the "processing" state,
            #   2. no URL is listed in active_threads, and
            #   3. at least one item is completed (so a task that has not
            #      started yet is not mistaken for a finished one).
            if stats['processing'] == 0 and active_count == 0 and stats['completed'] > 0:
                is_completed_batch = True
                print("\n")
                log("SUCCESS", "✅ 当前批次处理完毕!", Colors.OKGREEN)
                break

            time.sleep(1.5)  # polling interval

        except Exception as e:
            # A polling failure is not fatal: stop monitoring and fall
            # through to the search step.
            print(f"\n ⚠️ 监控异常: {e}")
            break

    if not is_completed_batch:
        print("\n")
        log("WARN", "监控超时,爬虫可能仍在后台运行,继续测试搜索...", Colors.WARNING)

    # ---------------------------------------------------------
    # 4. Hybrid search and validation (POST /search)
    # ---------------------------------------------------------
    log("STEP 4", f"测试混合检索 + Rerank: '{TEST_QUERY}'")
    try:
        res = requests.post(
            f"{BASE_URL}/search",
            json={
                "query": TEST_QUERY,
                "task_id": task_id,
                "limit": 3,
            },
            timeout=REQUEST_TIMEOUT,
        )
        resp = res.json()

        if resp['code'] != 1:
            log("FAIL", f"搜索失败: {resp}", Colors.FAIL)
            return

        results = resp['data']['results']
        if not results:
            log("FAIL", "未搜索到结果 (Result Empty)", Colors.FAIL)
            return

        log("SUCCESS", f"搜索命中 {len(results)} 条结果", Colors.OKGREEN)

        # === Detailed look at the top hit ===
        first = results[0]
        # Treat a missing or null field as empty so that printing the
        # details cannot raise before the checks below run.
        meta_info = first.get('meta_info') or {}
        content = str(first.get('content') or '')

        print(f"\n{Colors.WARNING}--- Top 1 结果详情 ---{Colors.ENDC}")
        print(f"📄 标题: {first.get('title', 'N/A')}")
        print(f"🔗 链接: {first.get('source_url')}")
        print(f"🧭 路径: {meta_info.get('header_path', 'N/A')}")
        print(f"🎯 分数: {first.get('score')} " + ("(Reranked)" if first.get('reranked') else "(Rough)"))
        # Show the first 80 characters with newlines flattened to spaces.
        print(f"📝 内容: {content[:80].replace(chr(10), ' ')}...")
        print(f"{Colors.WARNING}-----------------------{Colors.ENDC}\n")

        # Automatic checks on the top hit.
        if 'header_path' in meta_info:
            print("✅ [Phase 1.5] 结构化元数据验证通过")
        else:
            print("❌ [Phase 1.5] 缺少元数据")

        if first.get('reranked'):
            print("✅ [Phase 2.5] Rerank 重排序生效")
        else:
            print("⚠️ [Phase 2.5] Rerank 未标记 (可能是降级或代码未更新)")

    except Exception as e:
        log("FAIL", str(e), Colors.FAIL)


if __name__ == "__main__":
    run_v3_test()