变更项目架构,提高扩展性
This commit is contained in:
@@ -1,96 +1,167 @@
|
||||
import requests
|
||||
import time
|
||||
import json
|
||||
import random
|
||||
import sys
|
||||
|
||||
# ================= Configuration =================
# Backend base URL. An earlier revision pointed at the remote host
# http://47.122.127.178; that stale duplicate assignment was removed so
# only the local development endpoint remains.
BASE_URL = "http://127.0.0.1:8000"
|
||||
|
||||
def log_res(name, response):
    """Print a formatted summary of an HTTP API response.

    Args:
        name: Human-readable label of the endpoint being tested.
        response: A ``requests``-style response (``status_code``, ``json()``,
            ``text``).

    Returns:
        The decoded JSON body on HTTP 200, otherwise ``None``.
    """
    print(f"\n=== 测试接口: {name} ===")

    # Failure path first: anything other than 200 is reported and dropped.
    if response.status_code != 200:
        print(f"状态: 失败 (HTTP {response.status_code})")
        print(f"错误信息: {response.text}")
        return None

    body = response.json()
    print("状态: 成功 (HTTP 200)")
    print(f"返回数据: {json.dumps(body, indent=2, ensure_ascii=False)}")
    return body
|
||||
# Crawl target: the Dify documentation. Its Markdown structure is clean,
# which makes it a good fixture for validating header-based chunking.
TEST_URL = "https://docs.dify.ai/en/use-dify/knowledge/create-knowledge/import-text-data/readme"
|
||||
|
||||
def run_tests():
    """Smoke-test the crawler task API end to end with mock data.

    Walks the full flow: register a task, queue sub-URLs, claim pending
    URLs, save crawled chunks with mock embeddings, then run a vector
    search. Returns early if a prerequisite step fails.

    NOTE(review): reconstructed from a diff view that had scattered this
    body across the file; behavior follows the original steps 1-5.
    """
    # Use a random wiki root so repeated runs do not collide on URL dedup.
    test_root_url = f"https://example.com/wiki_{random.randint(1000, 9999)}"

    # 1. /register — create the crawl task.
    print("步骤 1: 注册新任务...")
    res = requests.post(f"{BASE_URL}/register", json={"url": test_root_url})
    data = log_res("注册任务", res)
    if not data or data['code'] != 1:
        return
    task_id = data['data']['task_id']

    # 2. /add_urls — simulate the crawler discovering new links.
    print("\n步骤 2: 模拟爬虫发现了新链接,存入队列...")
    sub_urls = [
        f"{test_root_url}/page1",
        f"{test_root_url}/page2",
        f"{test_root_url}/page1",  # deliberate duplicate to exercise server-side dedup
    ]
    res = requests.post(f"{BASE_URL}/add_urls", json={
        "task_id": task_id,
        "urls": sub_urls
    })
    log_res("存入新链接", res)

    # 3. /pending_urls — a worker node claims URLs to process.
    print("\n步骤 3: 模拟爬虫节点获取待处理任务...")
    res = requests.post(f"{BASE_URL}/pending_urls", json={
        "task_id": task_id,
        "limit": 2
    })
    data = log_res("获取待处理URL", res)
    if not data or not data['data']['urls']:
        print("没有获取到待处理URL,停止后续测试")
        return
    target_url = data['data']['urls'][0]

    # 4. /save_results — store knowledge chunks plus mock vectors.
    print("\n步骤 4: 模拟爬虫抓取完成,存入知识片段和向量...")
    # Mock a 1536-dim embedding, rounded to keep the JSON payload small.
    mock_embedding = [round(random.uniform(-1, 1), 8) for _ in range(1536)]
    payload = {
        "task_id": task_id,
        "results": [
            {
                "source_url": target_url,
                "chunk_index": 0,
                "title": "测试页面标题 - 切片1",
                "content": "这是模拟抓取到的第一段网页内容...",
                "embedding": mock_embedding
            },
            {
                "source_url": target_url,
                "chunk_index": 1,
                "title": "测试页面标题 - 切片2",
                "content": "这是模拟抓取到的第二段网页内容...",
                "embedding": mock_embedding
            }
        ]
    }
    res = requests.post(f"{BASE_URL}/save_results", json=payload)
    log_res("保存结果", res)

    # 5. /search — vector similarity search across all tasks.
    print("\n步骤 5: 测试基于向量的搜索...")
    query = [round(random.uniform(-1, 1), 8) for _ in range(1536)]
    res = requests.post(f"{BASE_URL}/search", json={
        "task_id": None,  # None = do not restrict the search to one task
        "query_embedding": query,
        "limit": 5
    })
    log_res("基于向量的搜索", res)

    print("\n✅ 所有 API 流程测试完成!")
||||
class Colors:
    """ANSI escape sequences used to colorize terminal output."""
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'  # reset back to the default style


def log(step: str, msg: str, color=Colors.OKBLUE):
    """Print *msg* tagged with *step*, wrapped in the given ANSI color."""
    print(color + f"[{step}] {msg}" + Colors.ENDC)
|
||||
|
||||
def run_e2e_test():
    """Poll the v2 search API and verify Phase 1.5 structured metadata.

    Steps:
      0. Backend health check (fail fast if the server is down).
      3. Poll ``/api/v2/search`` until indexed data is returned.
      4. Assert the first hit carries meta_info / header_path / headers.

    Steps 1-2 (map scan + background processing) are currently disabled
    (kept commented out in the original); the test assumes data was
    ingested beforehand under the development task id.
    """
    print(f"{Colors.HEADER}=== 开始 Wiki Crawler E2E 完整测试 ==={Colors.ENDC}")

    # 0. Health check: hit the docs page to confirm the backend is up.
    try:
        requests.get(f"{BASE_URL}/docs", timeout=3)
    except Exception:
        log("FATAL", "无法连接后端,请确保 main.py 正在运行 (http://127.0.0.1:8000)", Colors.FAIL)
        sys.exit(1)

    # BUG FIX: the polling code below uses task_id, but every assignment
    # of it had been commented out, which raised NameError at runtime.
    # Restore the dev default referenced by the backfill note in
    # scripts/update_sql.py ("task_id=6").
    task_id = 6
    test_query = "upload size limit"  # query expected to hit the seeded pages

    # ---------------------------------------------------------
    # Step 3: poll the search endpoint until data is indexed.
    # ---------------------------------------------------------
    log("STEP 3", "轮询搜索接口,等待数据入库...")

    max_retries = 12
    found_data = False
    search_results = []

    for i in range(max_retries):
        print(f" ⏳ 第 {i+1}/{max_retries} 次尝试搜索...", end="\r")
        time.sleep(5)  # give the crawler / embedding pipeline some time

        try:
            # Call the V2 smart-search endpoint.
            search_res = requests.post(
                f"{BASE_URL}/api/v2/search",
                json={
                    "query": test_query,
                    "task_id": task_id,
                    "limit": 3
                }
            )
            resp_json = search_res.json()

            # Response shape: {code: 1, msg: "...", data: {results: [...]}}
            if resp_json['code'] == 1:
                data_body = resp_json['data']
                # Compatibility check: results must exist and be non-empty.
                if data_body and 'results' in data_body and len(data_body['results']) > 0:
                    search_results = data_body['results']
                    found_data = True
                    print("")  # newline after the \r progress line
                    log("SUCCESS", f"✅ 成功搜索到 {len(search_results)} 条相关切片!", Colors.OKGREEN)
                    break
        except Exception:
            # Tolerate transient network errors and keep retrying.
            pass

    if not found_data:
        print("")
        log("FAIL", "❌ 超时:未能在规定时间内搜索到数据。请检查后端日志是否有报错。", Colors.FAIL)
        sys.exit(1)

    # ---------------------------------------------------------
    # Step 4: verify Phase 1.5 results (structured meta info).
    # ---------------------------------------------------------
    log("STEP 4", "验证结构化数据 (Phase 1.5 Check)")

    first_result = search_results[0]

    # Print the first hit for manual inspection.
    print(f"\n{Colors.WARNING}--- 检索结果样本 ---{Colors.ENDC}")
    print(f"Title: {first_result.get('title')}")
    print(f"URL: {first_result.get('source_url')}")
    print(f"Meta: {json.dumps(first_result.get('meta_info', {}), ensure_ascii=False)}")
    print(f"Content Preview: {first_result.get('content')[:50]}...")
    print(f"{Colors.WARNING}----------------------{Colors.ENDC}\n")

    # Automated assertions on the structured fields.
    checks = {
        "Has Content": bool(first_result.get('content')),
        "Has Meta Info": 'meta_info' in first_result,
        "Has Header Path": 'header_path' in first_result.get('meta_info', {}),
        "Headers Dict Exists": 'headers' in first_result.get('meta_info', {})
    }

    all_pass = True
    for name, passed in checks.items():
        status = f"{Colors.OKGREEN}PASS{Colors.ENDC}" if passed else f"{Colors.FAIL}FAIL{Colors.ENDC}"
        print(f"检查项 [{name}]: {status}")
        if not passed:
            all_pass = False

    if all_pass:
        meta = first_result['meta_info']
        print(f"\n{Colors.OKBLUE}🎉 测试通过!系统已具备 Phase 1.5 (结构化 RAG) 能力。{Colors.ENDC}")
        print(f"提取到的上下文路径: {Colors.HEADER}{meta.get('header_path', 'N/A')}{Colors.ENDC}")
    else:
        print(f"\n{Colors.FAIL}❌ 测试未完全通过:缺少必要的元数据字段。请检查 crawler_service.py 或 update_db.py。{Colors.ENDC}")


if __name__ == "__main__":
    run_tests()
    run_e2e_test()
|
||||
82
scripts/update_sql.py
Normal file
82
scripts/update_sql.py
Normal file
@@ -0,0 +1,82 @@
|
||||
import sys
|
||||
import os
|
||||
# Put the repository root on sys.path so `backend.*` imports resolve when
# this file is executed directly as a script from scripts/.
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
from sqlalchemy import create_engine, text
|
||||
from backend.core.config import settings
|
||||
|
||||
def update_database_schema():
    """Upgrade the ``knowledge_chunks`` schema in place, without data loss.

    Adds the ``meta_info`` and ``content_tsvector`` columns, their GIN
    indexes, a tsvector-maintenance trigger, and backfills old rows.
    Every command is idempotent (IF NOT EXISTS / CREATE OR REPLACE), so
    the script can be re-run safely.
    """
    print(f"🔌 连接数据库: {settings.DB_NAME}...")
    engine = create_engine(settings.DATABASE_URL)

    commands = [
        # 1. Safely add the meta_info column (old rows default to {}).
        """
        DO $$
        BEGIN
            IF NOT EXISTS (SELECT 1 FROM information_schema.columns WHERE table_name='knowledge_chunks' AND column_name='meta_info') THEN
                ALTER TABLE knowledge_chunks ADD COLUMN meta_info JSONB DEFAULT '{}';
                RAISE NOTICE '已添加 meta_info 列';
            END IF;
        END $$;
        """,

        # 2. Safely add the content_tsvector column.
        """
        DO $$
        BEGIN
            IF NOT EXISTS (SELECT 1 FROM information_schema.columns WHERE table_name='knowledge_chunks' AND column_name='content_tsvector') THEN
                ALTER TABLE knowledge_chunks ADD COLUMN content_tsvector TSVECTOR;
                RAISE NOTICE '已添加 content_tsvector 列';
            END IF;
        END $$;
        """,

        # 3. Create indexes (no impact on existing rows).
        "CREATE INDEX IF NOT EXISTS idx_chunks_meta ON knowledge_chunks USING GIN (meta_info);",
        "CREATE INDEX IF NOT EXISTS idx_chunks_tsvector ON knowledge_chunks USING GIN (content_tsvector);",

        # 4. Trigger function keeping content_tsvector fresh on writes.
        """
        CREATE OR REPLACE FUNCTION chunks_tsvector_trigger() RETURNS trigger AS $$
        BEGIN
            new.content_tsvector := to_tsvector('english', coalesce(new.title, '') || ' ' || new.content);
            return new;
        END
        $$ LANGUAGE plpgsql;
        """,

        # 5. Bind the trigger (only once).
        """
        DO $$
        BEGIN
            IF NOT EXISTS (SELECT 1 FROM pg_trigger WHERE tgname = 'tsvectorupdate') THEN
                CREATE TRIGGER tsvectorupdate BEFORE INSERT OR UPDATE
                ON knowledge_chunks FOR EACH ROW EXECUTE PROCEDURE chunks_tsvector_trigger();
            END IF;
        END $$;
        """,

        # 6. Backfill: give previously stored rows (e.g. task_id=6) a
        #    keyword index as well.
        """
        UPDATE knowledge_chunks
        SET content_tsvector = to_tsvector('english', coalesce(title, '') || ' ' || content)
        WHERE content_tsvector IS NULL;
        """
    ]

    # BUG FIX: previously all commands shared a single engine.begin()
    # transaction while exceptions were caught and the loop continued. In
    # PostgreSQL a failed statement aborts the whole transaction, so every
    # subsequent command would then fail too. Run each command in its own
    # transaction instead: commit on success, roll back on failure.
    with engine.connect() as conn:
        for cmd in commands:
            try:
                conn.execute(text(cmd))
                conn.commit()
            except Exception as e:
                conn.rollback()
                print(f"⚠️ 执行警告 (通常可忽略): {e}")

    print("✅ 数据库结构升级完成!旧数据已保留并兼容。")


if __name__ == "__main__":
    update_database_schema()
|
||||
Reference in New Issue
Block a user