# wiki_crawler/scripts/test_apis.py

import requests
import time
import json
import sys

# ================= Configuration =================
# Base address of the backend that every request in this script is sent to.
BASE_URL = "http://127.0.0.1:8000"

# Use a Dify documentation page as the test target (clearly structured, so it
# is well suited to verifying Markdown splitting).
TEST_URL = (
    "https://docs.dify.ai/en/use-dify/knowledge/create-knowledge/"
    "import-text-data/readme"
)

# Test search query (chosen so that it hits the page above).
TEST_QUERY = "upload size limit"
# =================================================

class Colors:
    """ANSI SGR escape sequences used to colorize terminal output."""

    HEADER = "\033[95m"   # bright magenta
    OKBLUE = "\033[94m"   # bright blue
    OKGREEN = "\033[92m"  # bright green
    WARNING = "\033[93m"  # bright yellow
    FAIL = "\033[91m"     # bright red
    ENDC = "\033[0m"      # reset all attributes

def log(step: str, msg: str, color=Colors.OKBLUE):
    """Print ``[step] msg`` wrapped in *color* and followed by a color reset.

    Args:
        step: Short tag shown in square brackets (e.g. ``"STEP 3"``).
        msg: Message text printed after the tag.
        color: Escape sequence emitted before the tag; defaults to
            ``Colors.OKBLUE``.
    """
    line = f"{color}[{step}] {msg}{Colors.ENDC}"
    print(line)

def run_e2e_test():
    """Run the Wiki Crawler end-to-end check against the backend at ``BASE_URL``.

    Flow:
      0. Health check: GET ``/docs``; exit if the backend cannot be reached.
      1. Map -- currently disabled (kept below as commented-out code).
      2. Process -- currently disabled (kept below as commented-out code).
      3. Poll ``/api/v2/search`` with ``TEST_QUERY`` until results arrive.
      4. Print the first result and check that it has content and that its
         ``meta_info`` carries ``header_path`` and ``headers``.

    Returns:
        None when every check passes.

    Raises:
        SystemExit: with status 1 when the backend is unreachable, when
            polling times out, or when any Step 4 check fails.
    """
    print(f"{Colors.HEADER}=== 开始 Wiki Crawler E2E 完整测试 ==={Colors.ENDC}")

    # 0. Backend health check
    try:
        requests.get(f"{BASE_URL}/docs", timeout=3)
    except requests.RequestException:
        log("FATAL", f"无法连接后端，请确保 main.py 正在运行 ({BASE_URL})", Colors.FAIL)
        sys.exit(1)

    # ---------------------------------------------------------
    # Step 1: Map-style scan (Map) -- currently disabled
    # ---------------------------------------------------------
    # log("STEP 1", f"注册任务并扫描链接: {TEST_URL}")

    # task_id = None
    # try:
    #     res = requests.post(f"{BASE_URL}/api/v2/crawler/map", json={"url": TEST_URL})
    #     res_json = res.json()

    #     # Validate the response status
    #     if res_json.get('code') != 1:
    #         log("FAIL", f"Map 接口返回错误: {res_json}", Colors.FAIL)
    #         sys.exit(1)

    #     data = res_json['data']
    #     task_id = data['task_id']
    #     count = data.get('count', 0)

    #     log("SUCCESS", f"任务注册成功。Task ID: {task_id}, 待爬取链接数: {count}", Colors.OKGREEN)

    # except Exception as e:
    #     log("FAIL", f"请求异常: {e}", Colors.FAIL)
    #     sys.exit(1)

    # ---------------------------------------------------------
    # Step 2: Trigger background processing (Process) -- currently disabled
    # ---------------------------------------------------------
    # task_id = 6
    # log("STEP 2", f"触发后台处理 -> Task ID: {task_id}")

    # try:
    #     res = requests.post(
    #         f"{BASE_URL}/api/v2/crawler/process",
    #         json={"task_id": task_id, "batch_size": 5}
    #     )
    #     res_json = res.json()

    #     if res_json.get('code') == 1:
    #         log("SUCCESS", "后台处理任务已启动...", Colors.OKGREEN)
    #     else:
    #         log("FAIL", f"启动失败: {res_json}", Colors.FAIL)
    #         sys.exit(1)

    # except Exception as e:
    #     log("FAIL", f"请求异常: {e}", Colors.FAIL)
    #     sys.exit(1)

    # ---------------------------------------------------------
    # Step 3: Poll the search endpoint (Polling)
    # ---------------------------------------------------------
    log("STEP 3", "轮询搜索接口，等待数据入库...")
    # NOTE(review): Steps 1-2 are disabled, so the task id is hard-coded;
    # presumably it refers to a task already present in the local backend.
    task_id = 6
    max_retries = 12
    poll_interval = 5      # seconds to wait before each attempt
    request_timeout = 30   # seconds; prevents a stalled backend from hanging the script
    found_data = False
    search_results = []
    last_error = None

    for attempt in range(1, max_retries + 1):
        # flush=True: the line ends with "\r" (no newline), so a line-buffered
        # terminal would otherwise not display the progress message.
        print(f"   ⏳ 第 {attempt}/{max_retries} 次尝试搜索...", end="\r", flush=True)
        # Wait before every attempt to give the crawler and embedding some time.
        time.sleep(poll_interval)

        try:
            # Call the V2 search endpoint
            search_res = requests.post(
                f"{BASE_URL}/api/v2/search",
                json={
                    "query": TEST_QUERY,
                    "task_id": task_id,
                    "limit": 3
                },
                timeout=request_timeout,
            )
            resp_json = search_res.json()
        except (requests.RequestException, ValueError) as exc:
            # Tolerate transient network / decoding failures and retry, but
            # remember the error so it can be reported if polling times out.
            last_error = exc
            continue

        # Expected response shape: {code: 1, msg: "...", data: {results: [...]}}
        if not isinstance(resp_json, dict) or resp_json.get('code') != 1:
            continue
        data_body = resp_json.get('data')
        results = data_body.get('results') if isinstance(data_body, dict) else None
        # Only a non-empty result list counts as success.
        if isinstance(results, list) and results:
            search_results = results
            found_data = True
            print("")  # move past the "\r" progress line
            log("SUCCESS", f"✅ 成功搜索到 {len(search_results)} 条相关切片！", Colors.OKGREEN)
            break

    if not found_data:
        print("")
        if last_error is not None:
            log("WARN", f"最近一次请求错误: {last_error}", Colors.WARNING)
        log("FAIL", "❌ 超时：未能在规定时间内搜索到数据。请检查后端日志是否有报错。", Colors.FAIL)
        sys.exit(1)

    # ---------------------------------------------------------
    # Step 4: Verify the Phase 1.5 outcome (Meta Info)
    # ---------------------------------------------------------
    log("STEP 4", "验证结构化数据 (Phase 1.5 Check)")

    first_result = search_results[0]

    # A missing or null 'content' / 'meta_info' must surface as a failed check
    # below, not as a TypeError, so normalize both before using them.
    content = first_result.get('content')
    raw_meta = first_result.get('meta_info')
    meta = raw_meta if isinstance(raw_meta, dict) else {}

    # Print the first result for manual confirmation
    print(f"\n{Colors.WARNING}--- 检索结果样本 ---{Colors.ENDC}")
    print(f"Title: {first_result.get('title')}")
    print(f"URL:   {first_result.get('source_url')}")
    print(f"Meta:  {json.dumps(first_result.get('meta_info', {}), ensure_ascii=False)}")
    print(f"Content Preview: {str(content or '')[:50]}...")
    print(f"{Colors.WARNING}----------------------{Colors.ENDC}\n")

    # Automated assertions
    checks = {
        "Has Content": bool(content),
        "Has Meta Info": 'meta_info' in first_result,
        "Has Header Path": 'header_path' in meta,
        "Headers Dict Exists": 'headers' in meta
    }

    all_pass = True
    for name, passed in checks.items():
        status = f"{Colors.OKGREEN}PASS{Colors.ENDC}" if passed else f"{Colors.FAIL}FAIL{Colors.ENDC}"
        print(f"检查项 [{name}]: {status}")
        if not passed:
            all_pass = False

    if all_pass:
        print(f"\n{Colors.OKBLUE}🎉 测试通过！系统已具备 Phase 1.5 (结构化 RAG) 能力。{Colors.ENDC}")
        print(f"提取到的上下文路径: {Colors.HEADER}{meta.get('header_path', 'N/A')}{Colors.ENDC}")
    else:
        print(f"\n{Colors.FAIL}❌ 测试未完全通过：缺少必要的元数据字段。请检查 crawler_service.py 或 update_db.py。{Colors.ENDC}")
        # Exit non-zero like every other failure path so callers/CI see the failure.
        sys.exit(1)

# Script entry point: run the end-to-end test only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    run_e2e_test()