变更项目架构,提高扩展性
This commit is contained in:
@@ -1,96 +1,167 @@
|
||||
import requests
|
||||
import time
|
||||
import json
|
||||
import random
|
||||
import sys
|
||||
|
||||
# ================= Configuration =================
# Backend base URL. An earlier revision pointed at the remote host
# http://47.122.127.178; that stale duplicate assignment was removed so
# only the local development endpoint remains.
BASE_URL = "http://127.0.0.1:8000"
|
||||
|
||||
def log_res(name, response):
    """Print a formatted summary of an HTTP API response.

    Args:
        name: Human-readable label of the endpoint being tested.
        response: A ``requests``-style response (``status_code``, ``json()``,
            ``text``).

    Returns:
        The decoded JSON body on HTTP 200, otherwise ``None``.
    """
    print(f"\n=== 测试接口: {name} ===")

    # Failure path first: anything other than 200 is reported and dropped.
    if response.status_code != 200:
        print(f"状态: 失败 (HTTP {response.status_code})")
        print(f"错误信息: {response.text}")
        return None

    body = response.json()
    print("状态: 成功 (HTTP 200)")
    print(f"返回数据: {json.dumps(body, indent=2, ensure_ascii=False)}")
    return body
|
||||
# Crawl target: the Dify documentation. Its Markdown structure is clean,
# which makes it a good fixture for validating header-based chunking.
TEST_URL = "https://docs.dify.ai/en/use-dify/knowledge/create-knowledge/import-text-data/readme"
|
||||
|
||||
def run_tests():
    """Smoke-test the crawler task API end to end with mock data.

    Walks the full flow: register a task, queue sub-URLs, claim pending
    URLs, save crawled chunks with mock embeddings, then run a vector
    search. Returns early if a prerequisite step fails.

    NOTE(review): reconstructed from a diff view that had scattered this
    body across the file; behavior follows the original steps 1-5.
    """
    # Use a random wiki root so repeated runs do not collide on URL dedup.
    test_root_url = f"https://example.com/wiki_{random.randint(1000, 9999)}"

    # 1. /register — create the crawl task.
    print("步骤 1: 注册新任务...")
    res = requests.post(f"{BASE_URL}/register", json={"url": test_root_url})
    data = log_res("注册任务", res)
    if not data or data['code'] != 1:
        return
    task_id = data['data']['task_id']

    # 2. /add_urls — simulate the crawler discovering new links.
    print("\n步骤 2: 模拟爬虫发现了新链接,存入队列...")
    sub_urls = [
        f"{test_root_url}/page1",
        f"{test_root_url}/page2",
        f"{test_root_url}/page1",  # deliberate duplicate to exercise server-side dedup
    ]
    res = requests.post(f"{BASE_URL}/add_urls", json={
        "task_id": task_id,
        "urls": sub_urls
    })
    log_res("存入新链接", res)

    # 3. /pending_urls — a worker node claims URLs to process.
    print("\n步骤 3: 模拟爬虫节点获取待处理任务...")
    res = requests.post(f"{BASE_URL}/pending_urls", json={
        "task_id": task_id,
        "limit": 2
    })
    data = log_res("获取待处理URL", res)
    if not data or not data['data']['urls']:
        print("没有获取到待处理URL,停止后续测试")
        return
    target_url = data['data']['urls'][0]

    # 4. /save_results — store knowledge chunks plus mock vectors.
    print("\n步骤 4: 模拟爬虫抓取完成,存入知识片段和向量...")
    # Mock a 1536-dim embedding, rounded to keep the JSON payload small.
    mock_embedding = [round(random.uniform(-1, 1), 8) for _ in range(1536)]
    payload = {
        "task_id": task_id,
        "results": [
            {
                "source_url": target_url,
                "chunk_index": 0,
                "title": "测试页面标题 - 切片1",
                "content": "这是模拟抓取到的第一段网页内容...",
                "embedding": mock_embedding
            },
            {
                "source_url": target_url,
                "chunk_index": 1,
                "title": "测试页面标题 - 切片2",
                "content": "这是模拟抓取到的第二段网页内容...",
                "embedding": mock_embedding
            }
        ]
    }
    res = requests.post(f"{BASE_URL}/save_results", json=payload)
    log_res("保存结果", res)

    # 5. /search — vector similarity search across all tasks.
    print("\n步骤 5: 测试基于向量的搜索...")
    query = [round(random.uniform(-1, 1), 8) for _ in range(1536)]
    res = requests.post(f"{BASE_URL}/search", json={
        "task_id": None,  # None = do not restrict the search to one task
        "query_embedding": query,
        "limit": 5
    })
    log_res("基于向量的搜索", res)

    print("\n✅ 所有 API 流程测试完成!")
||||
class Colors:
    """ANSI escape sequences used to colorize terminal output."""
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'  # reset back to the default style


def log(step: str, msg: str, color=Colors.OKBLUE):
    """Print *msg* tagged with *step*, wrapped in the given ANSI color."""
    print(color + f"[{step}] {msg}" + Colors.ENDC)
|
||||
|
||||
def run_e2e_test():
    """Poll the v2 search API and verify Phase 1.5 structured metadata.

    Steps:
      0. Backend health check (fail fast if the server is down).
      3. Poll ``/api/v2/search`` until indexed data is returned.
      4. Assert the first hit carries meta_info / header_path / headers.

    Steps 1-2 (map scan + background processing) are currently disabled
    (kept commented out in the original); the test assumes data was
    ingested beforehand under the development task id.
    """
    print(f"{Colors.HEADER}=== 开始 Wiki Crawler E2E 完整测试 ==={Colors.ENDC}")

    # 0. Health check: hit the docs page to confirm the backend is up.
    try:
        requests.get(f"{BASE_URL}/docs", timeout=3)
    except Exception:
        log("FATAL", "无法连接后端,请确保 main.py 正在运行 (http://127.0.0.1:8000)", Colors.FAIL)
        sys.exit(1)

    # BUG FIX: the polling code below uses task_id, but every assignment
    # of it had been commented out, which raised NameError at runtime.
    # Restore the dev default referenced by the backfill note in
    # scripts/update_sql.py ("task_id=6").
    task_id = 6
    test_query = "upload size limit"  # query expected to hit the seeded pages

    # ---------------------------------------------------------
    # Step 3: poll the search endpoint until data is indexed.
    # ---------------------------------------------------------
    log("STEP 3", "轮询搜索接口,等待数据入库...")

    max_retries = 12
    found_data = False
    search_results = []

    for i in range(max_retries):
        print(f" ⏳ 第 {i+1}/{max_retries} 次尝试搜索...", end="\r")
        time.sleep(5)  # give the crawler / embedding pipeline some time

        try:
            # Call the V2 smart-search endpoint.
            search_res = requests.post(
                f"{BASE_URL}/api/v2/search",
                json={
                    "query": test_query,
                    "task_id": task_id,
                    "limit": 3
                }
            )
            resp_json = search_res.json()

            # Response shape: {code: 1, msg: "...", data: {results: [...]}}
            if resp_json['code'] == 1:
                data_body = resp_json['data']
                # Compatibility check: results must exist and be non-empty.
                if data_body and 'results' in data_body and len(data_body['results']) > 0:
                    search_results = data_body['results']
                    found_data = True
                    print("")  # newline after the \r progress line
                    log("SUCCESS", f"✅ 成功搜索到 {len(search_results)} 条相关切片!", Colors.OKGREEN)
                    break
        except Exception:
            # Tolerate transient network errors and keep retrying.
            pass

    if not found_data:
        print("")
        log("FAIL", "❌ 超时:未能在规定时间内搜索到数据。请检查后端日志是否有报错。", Colors.FAIL)
        sys.exit(1)

    # ---------------------------------------------------------
    # Step 4: verify Phase 1.5 results (structured meta info).
    # ---------------------------------------------------------
    log("STEP 4", "验证结构化数据 (Phase 1.5 Check)")

    first_result = search_results[0]

    # Print the first hit for manual inspection.
    print(f"\n{Colors.WARNING}--- 检索结果样本 ---{Colors.ENDC}")
    print(f"Title: {first_result.get('title')}")
    print(f"URL: {first_result.get('source_url')}")
    print(f"Meta: {json.dumps(first_result.get('meta_info', {}), ensure_ascii=False)}")
    print(f"Content Preview: {first_result.get('content')[:50]}...")
    print(f"{Colors.WARNING}----------------------{Colors.ENDC}\n")

    # Automated assertions on the structured fields.
    checks = {
        "Has Content": bool(first_result.get('content')),
        "Has Meta Info": 'meta_info' in first_result,
        "Has Header Path": 'header_path' in first_result.get('meta_info', {}),
        "Headers Dict Exists": 'headers' in first_result.get('meta_info', {})
    }

    all_pass = True
    for name, passed in checks.items():
        status = f"{Colors.OKGREEN}PASS{Colors.ENDC}" if passed else f"{Colors.FAIL}FAIL{Colors.ENDC}"
        print(f"检查项 [{name}]: {status}")
        if not passed:
            all_pass = False

    if all_pass:
        meta = first_result['meta_info']
        print(f"\n{Colors.OKBLUE}🎉 测试通过!系统已具备 Phase 1.5 (结构化 RAG) 能力。{Colors.ENDC}")
        print(f"提取到的上下文路径: {Colors.HEADER}{meta.get('header_path', 'N/A')}{Colors.ENDC}")
    else:
        print(f"\n{Colors.FAIL}❌ 测试未完全通过:缺少必要的元数据字段。请检查 crawler_service.py 或 update_db.py。{Colors.ENDC}")


if __name__ == "__main__":
    run_tests()
    run_e2e_test()
|
||||
82
scripts/update_sql.py
Normal file
82
scripts/update_sql.py
Normal file
@@ -0,0 +1,82 @@
|
||||
import sys
|
||||
import os
|
||||
# Put the repository root on sys.path so `backend.*` imports resolve when
# this file is executed directly as a script from scripts/.
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
from sqlalchemy import create_engine, text
|
||||
from backend.core.config import settings
|
||||
|
||||
def update_database_schema():
    """Upgrade the ``knowledge_chunks`` schema in place, without data loss.

    Adds the ``meta_info`` and ``content_tsvector`` columns, their GIN
    indexes, a tsvector-maintenance trigger, and backfills old rows.
    Every command is idempotent (IF NOT EXISTS / CREATE OR REPLACE), so
    the script can be re-run safely.
    """
    print(f"🔌 连接数据库: {settings.DB_NAME}...")
    engine = create_engine(settings.DATABASE_URL)

    commands = [
        # 1. Safely add the meta_info column (old rows default to {}).
        """
        DO $$
        BEGIN
            IF NOT EXISTS (SELECT 1 FROM information_schema.columns WHERE table_name='knowledge_chunks' AND column_name='meta_info') THEN
                ALTER TABLE knowledge_chunks ADD COLUMN meta_info JSONB DEFAULT '{}';
                RAISE NOTICE '已添加 meta_info 列';
            END IF;
        END $$;
        """,

        # 2. Safely add the content_tsvector column.
        """
        DO $$
        BEGIN
            IF NOT EXISTS (SELECT 1 FROM information_schema.columns WHERE table_name='knowledge_chunks' AND column_name='content_tsvector') THEN
                ALTER TABLE knowledge_chunks ADD COLUMN content_tsvector TSVECTOR;
                RAISE NOTICE '已添加 content_tsvector 列';
            END IF;
        END $$;
        """,

        # 3. Create indexes (no impact on existing rows).
        "CREATE INDEX IF NOT EXISTS idx_chunks_meta ON knowledge_chunks USING GIN (meta_info);",
        "CREATE INDEX IF NOT EXISTS idx_chunks_tsvector ON knowledge_chunks USING GIN (content_tsvector);",

        # 4. Trigger function keeping content_tsvector fresh on writes.
        """
        CREATE OR REPLACE FUNCTION chunks_tsvector_trigger() RETURNS trigger AS $$
        BEGIN
            new.content_tsvector := to_tsvector('english', coalesce(new.title, '') || ' ' || new.content);
            return new;
        END
        $$ LANGUAGE plpgsql;
        """,

        # 5. Bind the trigger (only once).
        """
        DO $$
        BEGIN
            IF NOT EXISTS (SELECT 1 FROM pg_trigger WHERE tgname = 'tsvectorupdate') THEN
                CREATE TRIGGER tsvectorupdate BEFORE INSERT OR UPDATE
                ON knowledge_chunks FOR EACH ROW EXECUTE PROCEDURE chunks_tsvector_trigger();
            END IF;
        END $$;
        """,

        # 6. Backfill: give previously stored rows (e.g. task_id=6) a
        #    keyword index as well.
        """
        UPDATE knowledge_chunks
        SET content_tsvector = to_tsvector('english', coalesce(title, '') || ' ' || content)
        WHERE content_tsvector IS NULL;
        """
    ]

    # BUG FIX: previously all commands shared a single engine.begin()
    # transaction while exceptions were caught and the loop continued. In
    # PostgreSQL a failed statement aborts the whole transaction, so every
    # subsequent command would then fail too. Run each command in its own
    # transaction instead: commit on success, roll back on failure.
    with engine.connect() as conn:
        for cmd in commands:
            try:
                conn.execute(text(cmd))
                conn.commit()
            except Exception as e:
                conn.rollback()
                print(f"⚠️ 执行警告 (通常可忽略): {e}")

    print("✅ 数据库结构升级完成!旧数据已保留并兼容。")


if __name__ == "__main__":
    update_database_schema()
|
||||
Reference in New Issue
Block a user