Compare commits
20 Commits
13428ac552
...
master
| Author | SHA1 | Date | |
|---|---|---|---|
| f78efc7125 | |||
| 155974572c | |||
| 860ada3334 | |||
| 7c99e67a7f | |||
| 389c13a2a7 | |||
| e1a94d4bc7 | |||
| 36bc0cc08b | |||
| e5ac2dde03 | |||
| d5ee00d404 | |||
| 9190fee16f | |||
| b9dbf1e8f7 | |||
| d191b13455 | |||
| 8972246445 | |||
| 4d35626b90 | |||
| 8c4491b383 | |||
| 9f636d1c31 | |||
| 20317e6788 | |||
| 79b3f79c15 | |||
| 061ce1b2a6 | |||
| 0f44ef1338 |
7
.env.example
Normal file
7
.env.example
Normal file
@@ -0,0 +1,7 @@
|
||||
DB_USER=postgres
|
||||
DB_PASS=
|
||||
DB_HOST=localhost
|
||||
DB_PORT=5432
|
||||
DB_NAME=wiki_crawler
|
||||
DASHSCOPE_API_KEY=
|
||||
FIRECRAWL_API_KEY=
|
||||
4
.gitignore
vendored
4
.gitignore
vendored
@@ -1,2 +1,4 @@
|
||||
__pycache__/
|
||||
.venv
|
||||
.venv
|
||||
wiki_backend.tar
|
||||
.env
|
||||
15
.vscode/launch.json
vendored
15
.vscode/launch.json
vendored
@@ -1,15 +0,0 @@
|
||||
{
|
||||
"configurations": [
|
||||
{
|
||||
"name": "Python Debugger: FastAPI",
|
||||
"type": "debugpy",
|
||||
"request": "launch",
|
||||
"module": "uvicorn",
|
||||
"args": [
|
||||
"backend.main:app",
|
||||
"--reload"
|
||||
],
|
||||
"jinja": true
|
||||
}
|
||||
]
|
||||
}
|
||||
40
Dockerfile
Normal file
40
Dockerfile
Normal file
@@ -0,0 +1,40 @@
|
||||
# 1. 使用官方 uv 镜像
|
||||
FROM ghcr.io/astral-sh/uv:latest AS uv_setup
|
||||
|
||||
# 2. 运行环境
|
||||
FROM python:3.12-slim
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
# 复制 uv 命令
|
||||
COPY --from=uv_setup /uv /uvx /bin/
|
||||
|
||||
# 安装系统依赖
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
libpq-dev \
|
||||
build-essential \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# 复制依赖描述文件
|
||||
COPY pyproject.toml uv.lock ./
|
||||
|
||||
# 【核心修复】使用 uv sync
|
||||
# --frozen: 强制要求 uv.lock 必须是最新的
|
||||
# --no-dev: 不安装开发依赖(如 pytest 等),减小体积
|
||||
# --no-install-project: 先不安装当前项目代码,只装依赖(优化缓存)
|
||||
RUN uv sync --frozen --no-dev --no-install-project
|
||||
|
||||
# 复制项目代码
|
||||
COPY . .
|
||||
|
||||
# 【关键点】将虚拟环境的 bin 目录加入系统路径
|
||||
# 这样你运行 python 或 uvicorn 时,系统会自动使用 uv 准备好的那个环境
|
||||
ENV PATH="/app/.venv/bin:$PATH"
|
||||
ENV PYTHONPATH=/app
|
||||
|
||||
# 暴露端口
|
||||
EXPOSE 28000
|
||||
|
||||
# 启动命令
|
||||
# 现在直接调用 uvicorn 即可,它会自动找到 .venv 里的版本
|
||||
CMD ["uvicorn", "backend.main:app", "--host", "0.0.0.0", "--port", "8000", "--workers", "4"]
|
||||
89
README.md
89
README.md
@@ -1,47 +1,56 @@
|
||||
# wiki_crawler
|
||||
|
||||
本仓库主要用于存放和更新dify中wiki_crawler的代码节点的代码
|
||||
本仓库主要用于存放wiki_crawler的代码
|
||||
|
||||
## 节点返回值格式约定
|
||||
核心依赖 `firecrawl` 和 阿里百炼 的api支持
|
||||
|
||||
节点返回值统一采用json格式,包含以下字段:
|
||||
完成wiki网页爬取和向量化与知识库查找
|
||||
|
||||
- code:状态码,0失败,1成功,2警告
|
||||
- msg:状态描述,用于提示调用方
|
||||
- data:返回数据,json格式,根据不同节点有不同的字段,若失败则为null
|
||||
mcp调试命令
|
||||
|
||||
## 节点输入输出设计
|
||||
```bash
|
||||
npx @modelcontextprotocol/inspector uv run backend/mcp_server.py
|
||||
```
|
||||
|
||||
- start:启动节点
|
||||
- input:
|
||||
- input
|
||||
- type: ['url','task_id'] # 根据输入类型对input进行处理
|
||||
- output:无
|
||||
- register:注册节点,涉及sql
|
||||
- input:
|
||||
- url:任务url
|
||||
- output:
|
||||
- task_id:任务id,用于后续查询任务状态
|
||||
- is_new_task:是否为新任务,1表示是,0表示否
|
||||
- pending_urls:剩余待处理url,涉及sql
|
||||
- input:
|
||||
- task_id:任务id
|
||||
- limit:最多返回的url数量,默认值为10
|
||||
- output:
|
||||
- urls: 剩余待处理url列表
|
||||
- save_results:保存处理结果,涉及sql
|
||||
- input:
|
||||
- task_id:任务id
|
||||
- results:任务结果列表,用于存入数据库
|
||||
- output:
|
||||
- completed:已入库url列表
|
||||
- failed:入库url列表
|
||||
- warnings:入库警告列表
|
||||
- message:消息节点,前置一个变量聚合器,不涉及sql操作
|
||||
- input:
|
||||
- msgs:各个节点的msg经过前置节点整合后统一输出
|
||||
- output:
|
||||
- output:整合消息之后输出给end
|
||||
- end:结束节点
|
||||
- input:
|
||||
- message节点整合的
|
||||
需要nodejs环境
|
||||
打开页面后在Environment Variables中添加`PYTHONIOENCODING = utf-8`来防止编码问题(视具体情况而定,如果可以正常运行,也可以不加)
|
||||
|
||||
## 当前状况
|
||||
|
||||
1. chunk分段逻辑:根据返回的markdown进行分割,按照#、##进行标题的分类,增加JSONB格式字段meta_info,有下面两个字段,分别可以用于数据库查询和LLM上下文认知资料来源
|
||||
|
||||
```python
|
||||
# 源数据 (headers)
|
||||
headers = {"h1": "产品介绍", "h2": "核心功能", "h3": "多语言支持"}
|
||||
|
||||
# 生成数据 (header_path)
|
||||
# Python 代码逻辑: " > ".join(headers.values())
|
||||
header_path = "产品介绍 > 核心功能 > 多语言支持"
|
||||
```
|
||||
2. 量化指标以及测试:目前存入的数据较少,测试结果可能偏差较大
|
||||
|
||||
```
|
||||
"p_at_1": [], # Precision@1: 首位精确率
|
||||
"hit_at_5": [], # HitRate@5: 前5命中率,即返回的前五个(目前设置只返回5个)是否符合问题
|
||||
"mrr": [], # Mean Reciprocal Rank: 倒数排名分数,正确答案排得越靠前,分数越高
|
||||
"latency": [] # 响应耗时
|
||||
```
|
||||
3. 搜索逻辑和问题分类:尚未实现,目前参考一些主流的做法,用户输入后先过一个LLM对问题进行拆分和分类,然后传入对应的知识库参数task_id进行对应的检索
|
||||
4. RAG逻辑:混合检索,使用向量和关键词混合检索,此处进行粗筛,数据层返回后在业务层调用 gte-rerank 模型进行重排,最后返回请求
|
||||
|
||||
```python
|
||||
vector_score = (1 - self.db.chunks.c.embedding.cosine_distance(query_vector))# 计算向量相似度
|
||||
keyword_score = func.ts_rank(self.db.chunks.c.content_tsvector, keyword_query) # 计算关键词相似度
|
||||
final_score = (vector_score * 0.7 + func.coalesce(keyword_score, 0) * 0.3).label("score")# 计算最终分数
|
||||
```
|
||||
5. 产品面向场景:客户需求爬取几个文档,并长期维护更新,后续需要新增,但是量相对不会太大,firecrawl付费大概不会太贵。爬虫获取完整wiki(可无视robots.txt),当前知识库存入和爬虫绑定强,依赖markdown格式存入
|
||||
6. 后续开发:添加旧wiki的更新维护功能。dify增加对后端的封装,做一套搜索逻辑和问题分类的节点,如果不好弄那还是迁回到后端,后端只提供知识库的mcp,bot调用mcp之后,自行调用实现搜索和问题分类
|
||||
|
||||
对比其他检索方法的优势,做一套评测机制标准,评估最终LLM输出的准确度,目前是知识库检索准确度
|
||||
|
||||
|
||||
切割逻辑,准确率定义,归结资料,测试设计,mcp服务调用,搜索逻辑,问题分类,流程架构设计,场景假设
|
||||
|
||||
整理dify报错,
|
||||
|
||||
包装mcp server
|
||||
|
||||
@@ -1,16 +0,0 @@
|
||||
import os
|
||||
|
||||
class Settings:
|
||||
# 数据库配置
|
||||
DB_USER: str = "postgres"
|
||||
DB_PASS: str = "DXC_welcome001"
|
||||
DB_HOST: str = "8.155.144.6"
|
||||
DB_PORT: str = "25432"
|
||||
DB_NAME: str = "wiki_crawler"
|
||||
|
||||
@property
|
||||
def DATABASE_URL(self) -> str:
|
||||
url = f"postgresql+psycopg2://{self.DB_USER}:{self.DB_PASS}@{self.DB_HOST}:{self.DB_PORT}/{self.DB_NAME}"
|
||||
return url
|
||||
|
||||
settings = Settings()
|
||||
29
backend/core/config.py
Normal file
29
backend/core/config.py
Normal file
@@ -0,0 +1,29 @@
|
||||
import os
|
||||
from typing import ClassVar # <--- 1. 导入这个
|
||||
from pydantic_settings import BaseSettings, SettingsConfigDict
|
||||
|
||||
class Settings(BaseSettings):
|
||||
DB_USER: str
|
||||
DB_PASS: str
|
||||
DB_HOST: str
|
||||
DB_PORT: str = "5432"
|
||||
DB_NAME: str
|
||||
DASHSCOPE_API_KEY: str
|
||||
FIRECRAWL_API_KEY: str
|
||||
|
||||
CANDIDATE_NUM: int = 50
|
||||
|
||||
# =========================================================
|
||||
# 【核心修复】加上 ClassVar 类型注解
|
||||
# =========================================================
|
||||
BASE_DIR: ClassVar[str] = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
ENV_PATH: ClassVar[str] = os.path.join(BASE_DIR, ".env")
|
||||
|
||||
# 使用绝对路径加载
|
||||
model_config = SettingsConfigDict(env_file=ENV_PATH, extra="ignore")
|
||||
|
||||
@property
|
||||
def DATABASE_URL(self) -> str:
|
||||
return f"postgresql+psycopg2://{self.DB_USER}:{self.DB_PASS}@{self.DB_HOST}:{self.DB_PORT}/{self.DB_NAME}"
|
||||
|
||||
settings = Settings()
|
||||
42
backend/core/database.py
Normal file
42
backend/core/database.py
Normal file
@@ -0,0 +1,42 @@
|
||||
from sqlalchemy import create_engine, MetaData, Table
|
||||
from pgvector.sqlalchemy import Vector
|
||||
from .config import settings
|
||||
import logging
|
||||
|
||||
# 获取当前模块的专用 Logger
|
||||
# __name__ 会自动识别为 "backend.services.crawler_service" 这样的路径
|
||||
logger = logging.getLogger(__name__)
|
||||
class Database:
|
||||
"""
|
||||
数据库单例类
|
||||
负责初始化连接池并反射加载现有的表结构
|
||||
"""
|
||||
def __init__(self):
|
||||
# 1. 创建引擎
|
||||
# pool_pre_ping=True 用于解决数据库连接长时间空闲后断开的问题
|
||||
self.engine = create_engine(settings.DATABASE_URL, pool_pre_ping=True)
|
||||
|
||||
# 2. 注册 pgvector 类型
|
||||
# 这是为了让 SQLAlchemy 反射机制能识别数据库中的 'vector' 类型
|
||||
self.engine.dialect.ischema_names['vector'] = Vector
|
||||
|
||||
self.metadata = MetaData()
|
||||
self.tasks = None
|
||||
self.queue = None
|
||||
self.chunks = None
|
||||
|
||||
self._reflect_tables()
|
||||
|
||||
def _reflect_tables(self):
|
||||
"""自动从数据库加载表定义"""
|
||||
try:
|
||||
# autoload_with 会查询数据库元数据,自动填充 Column 信息
|
||||
self.tasks = Table('crawl_tasks', self.metadata, autoload_with=self.engine)
|
||||
self.queue = Table('crawl_queue', self.metadata, autoload_with=self.engine)
|
||||
self.chunks = Table('knowledge_chunks', self.metadata, autoload_with=self.engine)
|
||||
logger.info("Database tables reflected successfully.")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to reflect tables: {e}")
|
||||
|
||||
# 全局数据库实例
|
||||
db = Database()
|
||||
24
backend/core/logger.py
Normal file
24
backend/core/logger.py
Normal file
@@ -0,0 +1,24 @@
|
||||
# backend/core/logger.py
|
||||
import logging
|
||||
import sys
|
||||
|
||||
def setup_logging(level=logging.INFO):
|
||||
"""
|
||||
全局日志配置
|
||||
关键点:强制将日志输出到 sys.stderr,防止污染 sys.stdout 导致 MCP 协议崩溃。
|
||||
"""
|
||||
# 定义日志格式:时间 - 模块名 - 级别 - 内容
|
||||
log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
|
||||
|
||||
# 配置根记录器
|
||||
logging.basicConfig(
|
||||
level=level,
|
||||
format=log_format,
|
||||
handlers=[
|
||||
# 【绝对关键】使用 StreamHandler(sys.stderr)
|
||||
# 这样日志会走标准错误通道,不会干扰 MCP 的标准输出通信
|
||||
logging.StreamHandler(sys.stderr)
|
||||
],
|
||||
# 强制重新配置,防止被第三方库覆盖
|
||||
force=True
|
||||
)
|
||||
@@ -1,32 +0,0 @@
|
||||
from sqlalchemy import create_engine, MetaData, Table, event
|
||||
from pgvector.sqlalchemy import Vector # 必须导入这个
|
||||
from .config import settings
|
||||
|
||||
class Database:
|
||||
def __init__(self):
|
||||
# 1. 创建引擎
|
||||
self.engine = create_engine(settings.DATABASE_URL, pool_pre_ping=True)
|
||||
|
||||
# 2. 【核心修复】手动注册 vector 类型,让反射能识别它
|
||||
# 这告诉 SQLAlchemy:如果在数据库里看到名为 "vector" 的类型,请使用 pgvector 库的 Vector 类来处理
|
||||
self.engine.dialect.ischema_names['vector'] = Vector
|
||||
|
||||
self.metadata = MetaData()
|
||||
self.tasks = None
|
||||
self.queue = None
|
||||
self.chunks = None
|
||||
|
||||
self._reflect_tables()
|
||||
|
||||
def _reflect_tables(self):
|
||||
try:
|
||||
# 自动从数据库加载表结构
|
||||
# 因为上面注册了 ischema_names,现在 chunks_table.c.embedding 就能被正确识别为 Vector 类型了
|
||||
self.tasks = Table('crawl_tasks', self.metadata, autoload_with=self.engine)
|
||||
self.queue = Table('crawl_queue', self.metadata, autoload_with=self.engine)
|
||||
self.chunks = Table('knowledge_chunks', self.metadata, autoload_with=self.engine)
|
||||
except Exception as e:
|
||||
print(f"❌ 数据库表加载失败: {e}")
|
||||
|
||||
# 全局单例
|
||||
db_instance = Database()
|
||||
@@ -1,77 +1,25 @@
|
||||
from fastapi import FastAPI
|
||||
from .service import crawler_service
|
||||
from .schemas import RegisterRequest, PendingRequest, SaveResultsRequest, AddUrlsRequest, SearchRequest
|
||||
from .utils import make_response
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
# 引入新路由
|
||||
from backend.routers import v1
|
||||
|
||||
app = FastAPI(title="Wiki Crawler API")
|
||||
app = FastAPI(
|
||||
title="Wiki Crawler API",
|
||||
version="1.0.0", # 版本号回归
|
||||
description="RAG Knowledge Base Service"
|
||||
)
|
||||
|
||||
@app.post("/register")
|
||||
async def register(req: RegisterRequest):
|
||||
try:
|
||||
data = crawler_service.register_task(req.url)
|
||||
return make_response(1, "Success", data)
|
||||
except Exception as e:
|
||||
return make_response(0, str(e))
|
||||
app.add_middleware(
|
||||
CORSMiddleware,
|
||||
allow_origins=["*"],
|
||||
allow_credentials=True,
|
||||
allow_methods=["*"],
|
||||
allow_headers=["*"],
|
||||
)
|
||||
|
||||
@app.post("/add_urls")
|
||||
async def add_urls(req: AddUrlsRequest):
|
||||
try:
|
||||
data = crawler_service.add_urls(req.task_id, req.urls)
|
||||
return make_response(1, "Success", data)
|
||||
except Exception as e:
|
||||
return make_response(0, str(e))
|
||||
|
||||
@app.post("/pending_urls")
|
||||
async def pending_urls(req: PendingRequest):
|
||||
try:
|
||||
data = crawler_service.get_pending_urls(req.task_id, req.limit)
|
||||
msg = "Success" if data["urls"] else "Queue Empty"
|
||||
return make_response(1, msg, data)
|
||||
except Exception as e:
|
||||
return make_response(0, str(e))
|
||||
|
||||
@app.post("/save_results")
|
||||
async def save_results(req: SaveResultsRequest):
|
||||
try:
|
||||
data = crawler_service.save_results(req.task_id, req.results)
|
||||
return make_response(1, "Success", data)
|
||||
except Exception as e:
|
||||
return make_response(0, str(e))
|
||||
|
||||
@app.post("/search")
|
||||
async def search(req: SearchRequest):
|
||||
"""
|
||||
通用搜索接口:
|
||||
支持基于 task_id 的局部搜索,也支持不传 task_id 的全库搜索。
|
||||
"""
|
||||
try:
|
||||
# 1. 基础校验:确保向量不为空且维度正确(阿里 v4 模型通常为 1536)
|
||||
if not req.query_embedding or len(req.query_embedding) != 1536:
|
||||
return make_response(
|
||||
code=2,
|
||||
msg=f"向量维度错误。期望 1536, 实际收到 {len(req.query_embedding) if req.query_embedding else 0}",
|
||||
data=None
|
||||
)
|
||||
|
||||
# 2. 调用业务类执行搜索
|
||||
data = crawler_service.search_knowledge(
|
||||
query_embedding=req.query_embedding,
|
||||
task_id=req.task_id,
|
||||
limit=req.limit
|
||||
)
|
||||
|
||||
# 3. 统一返回
|
||||
return make_response(
|
||||
code=1,
|
||||
msg="搜索完成",
|
||||
data=data
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
# 记录日志并返回失败信息
|
||||
print(f"搜索接口异常: {str(e)}")
|
||||
return make_response(code=0, msg=f"搜索失败: {str(e)}")
|
||||
# 挂载 V1
|
||||
app.include_router(v1.router)
|
||||
|
||||
if __name__ == "__main__":
|
||||
import uvicorn
|
||||
uvicorn.run(app, host="0.0.0.0", port=8000, reload=True)
|
||||
uvicorn.run(app, host="0.0.0.0", port=8000)
|
||||
145
backend/mcp_server.py
Normal file
145
backend/mcp_server.py
Normal file
@@ -0,0 +1,145 @@
|
||||
import sys
|
||||
import os
|
||||
import logging
|
||||
from typing import Optional # 确保引入 Optional
|
||||
import threading
|
||||
# 1. 路径兼容 (确保能找到 backend 包)
|
||||
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
from mcp.server.fastmcp import FastMCP
|
||||
from backend.core.logger import setup_logging
|
||||
from backend.services.crawler_service import crawler_service
|
||||
|
||||
# 2. 初始化日志 (必须走 stderr)
|
||||
setup_logging()
|
||||
logger = logging.getLogger("mcp_server")
|
||||
|
||||
# 3. 初始化 MCP 服务
|
||||
mcp = FastMCP("WikiCrawler-V3")
|
||||
|
||||
@mcp.tool()
|
||||
async def kb_add_website(url: str) -> str:
|
||||
"""
|
||||
[Admin] Input a URL to map and register a task.
|
||||
This is the first step to add a knowledge base.
|
||||
|
||||
Args:
|
||||
url: The root URL of the website (e.g., https://docs.firecrawl.dev).
|
||||
|
||||
Returns:
|
||||
Task ID and count of found links.
|
||||
"""
|
||||
try:
|
||||
res = crawler_service.map_site(url)
|
||||
return f"Task Registered. ID: {res['task_id']}, Links Found: {res['count']}, Is New: {res['is_new']}"
|
||||
except Exception as e:
|
||||
logger.error(f"Add website failed: {e}", exc_info=True)
|
||||
return f"Error: {e}"
|
||||
|
||||
@mcp.tool()
|
||||
async def kb_check_status(task_id: int) -> str:
|
||||
"""
|
||||
[Monitor] Check detailed progress and active threads.
|
||||
Use this to see if the crawler is still running or finished.
|
||||
|
||||
Args:
|
||||
task_id: The ID of the task to check.
|
||||
|
||||
Returns:
|
||||
A formatted report including progress stats and currently crawling URLs.
|
||||
"""
|
||||
data = crawler_service.get_task_status(task_id)
|
||||
if not data: return "Task not found."
|
||||
|
||||
s = data['stats']
|
||||
threads = data['active_threads']
|
||||
|
||||
# 格式化输出给 LLM 阅读
|
||||
report = (
|
||||
f"--- Task {task_id} Status ---\n"
|
||||
f"Root URL: {data['root_url']}\n"
|
||||
f"Progress: {s['completed']}/{s['total']} (Pending: {s['pending']})\n"
|
||||
f"Active Threads (Running): {len(threads)}\n"
|
||||
)
|
||||
|
||||
if threads:
|
||||
report += "Currently Crawling:\n" + "\n".join([f"- {t}" for t in threads[:5]])
|
||||
if len(threads) > 5:
|
||||
report += f"\n... and {len(threads)-5} more."
|
||||
|
||||
return report
|
||||
|
||||
@mcp.tool()
|
||||
async def kb_run_crawler(task_id: int, batch_size: int = 20) -> str:
|
||||
"""
|
||||
[Action] Trigger the crawler in BACKGROUND mode.
|
||||
This returns immediately, so you can use 'kb_check_status' to monitor progress.
|
||||
|
||||
Args:
|
||||
task_id: The ID of the task.
|
||||
batch_size: Number of URLs to process (suggest 10-20).
|
||||
|
||||
Returns:
|
||||
Status message confirming start.
|
||||
"""
|
||||
# 定义一个在后台跑的包装函数
|
||||
def background_task():
|
||||
try:
|
||||
logger.info(f"Background batch started for Task {task_id}")
|
||||
# 这里是阻塞操作,但它现在跑在独立线程里
|
||||
crawler_service.process_queue_concurrent(task_id, batch_size)
|
||||
logger.info(f"Background batch finished for Task {task_id}")
|
||||
except Exception as e:
|
||||
logger.error(f"Background task failed: {e}", exc_info=True)
|
||||
|
||||
# 2. 创建并启动线程
|
||||
thread = threading.Thread(target=background_task)
|
||||
thread.daemon = True # 设置为守护线程,防止主程序退出时卡死
|
||||
thread.start()
|
||||
|
||||
# 3. 立即返回,不等待爬取结束
|
||||
return f"🚀 Background crawler started for Task {task_id} (Batch Size: {batch_size}). You can now check status."
|
||||
|
||||
|
||||
@mcp.tool()
|
||||
async def kb_search(query: str, task_id: Optional[int] = None, limit: int = 5) -> str:
|
||||
"""
|
||||
[User] Search knowledge base with Hybrid Search & Rerank.
|
||||
|
||||
Args:
|
||||
query: The user's question or search keywords.
|
||||
task_id: (Optional) Limit search to a specific task ID.
|
||||
limit: (Optional) Number of results to return (default 5).
|
||||
|
||||
Returns:
|
||||
Ranked content blocks with source paths.
|
||||
"""
|
||||
try:
|
||||
res = crawler_service.search(query, task_id, limit)
|
||||
results = res.get('results', [])
|
||||
|
||||
if not results: return "No results found."
|
||||
|
||||
output = []
|
||||
for i, r in enumerate(results):
|
||||
score_display = f"{r['score']:.4f}" + (" (Reranked)" if r.get('reranked') else "")
|
||||
meta = r.get('meta_info', {})
|
||||
path = meta.get('header_path', 'Root')
|
||||
|
||||
# 格式化单个结果块
|
||||
block = (
|
||||
f"[{i+1}] Score: {score_display}\n"
|
||||
f"Path: {path}\n"
|
||||
f"Content: {r['content'][:300]}..." # 限制长度防止 Context 溢出
|
||||
)
|
||||
output.append(block)
|
||||
|
||||
return "\n\n".join(output)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Search failed: {e}", exc_info=True)
|
||||
return f"Search Error: {e}"
|
||||
|
||||
if __name__ == "__main__":
|
||||
# 启动 MCP 服务
|
||||
mcp.run()
|
||||
67
backend/routers/v1.py
Normal file
67
backend/routers/v1.py
Normal file
@@ -0,0 +1,67 @@
|
||||
# backend/routers/v1.py
|
||||
from fastapi import APIRouter, BackgroundTasks, status
|
||||
from backend.services.crawler_service import crawler_service
|
||||
from backend.services.data_service import data_service
|
||||
from backend.schemas.v1 import (
|
||||
TaskCreateRequest, TaskExecuteRequest, SearchRequest,
|
||||
ResponseBase, KnowledgeBaseListResponse
|
||||
)
|
||||
from backend.utils.common import make_response
|
||||
|
||||
# 【改动】前缀变更为 v1
|
||||
router = APIRouter(prefix="/api/v1", tags=["Knowledge Base API"])
|
||||
|
||||
# =======================================================
|
||||
# 1. 获取知识库列表 (核心新功能)
|
||||
# =======================================================
|
||||
@router.get("/knowledge-bases", response_model=ResponseBase)
|
||||
async def list_knowledge_bases():
|
||||
"""
|
||||
列出所有已存在的知识库(Task)。
|
||||
工作流可以用这个接口获取 task_id 列表,让 LLM 选择查哪个库。
|
||||
"""
|
||||
kb_list = crawler_service.get_knowledge_base_list()
|
||||
return ResponseBase(
|
||||
code=1,
|
||||
msg="Success",
|
||||
data={"total": len(kb_list), "list": kb_list}
|
||||
)
|
||||
|
||||
# =======================================================
|
||||
# 2. 任务管理
|
||||
# =======================================================
|
||||
@router.post("/tasks", status_code=status.HTTP_201_CREATED, response_model=ResponseBase)
|
||||
async def create_task(req: TaskCreateRequest):
|
||||
try:
|
||||
res = crawler_service.map_site(req.url)
|
||||
return ResponseBase(code=1, msg="Task Created", data=res)
|
||||
except Exception as e:
|
||||
return ResponseBase(code=0, msg=str(e))
|
||||
|
||||
@router.get("/tasks/{task_id}", response_model=ResponseBase)
|
||||
async def get_task_status(task_id: int):
|
||||
data = crawler_service.get_task_status(task_id)
|
||||
if not data:
|
||||
return ResponseBase(code=0, msg="Task not found")
|
||||
return ResponseBase(code=1, msg="Success", data=data)
|
||||
|
||||
@router.post("/tasks/{task_id}/run", status_code=status.HTTP_202_ACCEPTED, response_model=ResponseBase)
|
||||
async def run_task(task_id: int, req: TaskExecuteRequest, bg_tasks: BackgroundTasks):
|
||||
# 简单检查
|
||||
if not data_service.get_task_monitor_data(task_id):
|
||||
return ResponseBase(code=0, msg="Task not found")
|
||||
|
||||
bg_tasks.add_task(crawler_service.process_queue_concurrent, task_id, req.batch_size)
|
||||
return ResponseBase(code=1, msg="Execution Started", data={"task_id": task_id})
|
||||
|
||||
# =======================================================
|
||||
# 3. 搜索
|
||||
# =======================================================
|
||||
@router.post("/search", response_model=ResponseBase)
|
||||
async def search_knowledge(req: SearchRequest):
|
||||
try:
|
||||
# req.limit 映射到 return_num
|
||||
res = crawler_service.search(req.query, req.task_id, req.limit)
|
||||
return ResponseBase(code=1, msg="Search Completed", data=res)
|
||||
except Exception as e:
|
||||
return ResponseBase(code=0, msg=str(e))
|
||||
@@ -1,31 +0,0 @@
|
||||
from pydantic import BaseModel
|
||||
from typing import List, Optional
|
||||
|
||||
class RegisterRequest(BaseModel):
|
||||
url: str
|
||||
|
||||
class PendingRequest(BaseModel):
|
||||
task_id: int
|
||||
limit: Optional[int] = 10
|
||||
|
||||
class AddUrlsRequest(BaseModel):
|
||||
task_id: int
|
||||
urls: List[str]
|
||||
|
||||
# schemas.py
|
||||
class CrawlResult(BaseModel):
|
||||
source_url: str
|
||||
chunk_index: int # 新增字段
|
||||
title: Optional[str] = None
|
||||
content: Optional[str] = None
|
||||
embedding: Optional[List[float]] = None
|
||||
|
||||
class SaveResultsRequest(BaseModel):
|
||||
task_id: int
|
||||
results: List[CrawlResult]
|
||||
|
||||
class SearchRequest(BaseModel):
|
||||
# 如果不传 task_id,则进行全库搜索
|
||||
task_id: Optional[int] = None
|
||||
query_embedding: List[float]
|
||||
limit: Optional[int] = 5
|
||||
43
backend/schemas/v1.py
Normal file
43
backend/schemas/v1.py
Normal file
@@ -0,0 +1,43 @@
|
||||
from pydantic import BaseModel, Field
|
||||
from typing import List, Optional, Dict, Any
|
||||
|
||||
# --- 通用响应 ---
|
||||
class ResponseBase(BaseModel):
|
||||
code: int = Field(..., description="1: 成功, 0: 失败")
|
||||
msg: str
|
||||
data: Optional[Any] = None
|
||||
|
||||
# --- [GET] 知识库列表 (新功能) ---
|
||||
class KnowledgeBaseItem(BaseModel):
|
||||
task_id: int
|
||||
root_url: str
|
||||
name: str # 提取出的简短名称,方便 LLM 识别
|
||||
|
||||
class KnowledgeBaseListResponse(BaseModel):
|
||||
total: int
|
||||
list: List[KnowledgeBaseItem]
|
||||
|
||||
# --- [POST] 创建任务 ---
|
||||
class TaskCreateRequest(BaseModel):
|
||||
url: str = Field(..., description="目标网站根URL")
|
||||
|
||||
# --- [POST] 执行任务 ---
|
||||
class TaskExecuteRequest(BaseModel):
|
||||
batch_size: int = Field(10, le=50)
|
||||
|
||||
# --- [POST] 搜索 ---
|
||||
class SearchRequest(BaseModel):
|
||||
query: str
|
||||
# 明确支持 None 为全局搜索
|
||||
task_id: Optional[int] = Field(None, description="任务ID,不传则搜全库")
|
||||
limit: int = Field(5, description="返回数量")
|
||||
|
||||
# ... (SearchResultItem 等保持不变) ...
|
||||
class SearchResultItem(BaseModel):
|
||||
task_id: int
|
||||
source_url: str
|
||||
title: Optional[str] = None
|
||||
content: str
|
||||
score: float
|
||||
meta_info: Dict = {}
|
||||
reranked: Optional[bool] = False
|
||||
@@ -1,205 +0,0 @@
|
||||
# service.py
|
||||
from sqlalchemy import select, insert, update, delete, and_
|
||||
from .database import db_instance
|
||||
from .utils import normalize_url
|
||||
|
||||
class CrawlerService:
|
||||
def __init__(self):
|
||||
self.db = db_instance
|
||||
|
||||
def register_task(self, url: str):
|
||||
"""完全使用库 API 实现的注册"""
|
||||
clean_url = normalize_url(url)
|
||||
with self.db.engine.begin() as conn:
|
||||
# 使用 select() API
|
||||
query = select(self.db.tasks.c.id).where(self.db.tasks.c.root_url == clean_url)
|
||||
existing = conn.execute(query).fetchone()
|
||||
|
||||
if existing:
|
||||
return {"task_id": existing[0], "is_new_task": False}
|
||||
|
||||
# 使用 insert() API
|
||||
stmt = insert(self.db.tasks).values(root_url=clean_url).returning(self.db.tasks.c.id)
|
||||
new_task = conn.execute(stmt).fetchone()
|
||||
return {"task_id": new_task[0], "is_new_task": True}
|
||||
|
||||
def add_urls(self, task_id: int, urls: list):
|
||||
"""通用 API 实现的批量添加(含详细返回)"""
|
||||
success_urls, skipped_urls, failed_urls = [], [], []
|
||||
|
||||
with self.db.engine.begin() as conn:
|
||||
for url in urls:
|
||||
clean_url = normalize_url(url)
|
||||
try:
|
||||
# 检查是否存在 (通用写法)
|
||||
check_q = select(self.db.queue).where(
|
||||
and_(self.db.queue.c.task_id == task_id, self.db.queue.c.url == clean_url)
|
||||
)
|
||||
if conn.execute(check_q).fetchone():
|
||||
skipped_urls.append(clean_url)
|
||||
continue
|
||||
|
||||
# 插入新 URL
|
||||
conn.execute(insert(self.db.queue).values(
|
||||
task_id=task_id, url=clean_url, status='pending'
|
||||
))
|
||||
success_urls.append(clean_url)
|
||||
except Exception:
|
||||
failed_urls.append(clean_url)
|
||||
|
||||
return {"success_urls": success_urls, "skipped_urls": skipped_urls, "failed_urls": failed_urls}
|
||||
|
||||
def get_pending_urls(self, task_id: int, limit: int):
|
||||
"""原子锁定 API 实现"""
|
||||
with self.db.engine.begin() as conn:
|
||||
query = select(self.db.queue.c.url).where(
|
||||
and_(self.db.queue.c.task_id == task_id, self.db.queue.c.status == 'pending')
|
||||
).limit(limit)
|
||||
|
||||
urls = [r[0] for r in conn.execute(query).fetchall()]
|
||||
|
||||
if urls:
|
||||
upd = update(self.db.queue).where(
|
||||
and_(self.db.queue.c.task_id == task_id, self.db.queue.c.url.in_(urls))
|
||||
).values(status='processing')
|
||||
conn.execute(upd)
|
||||
return {"urls": urls}
|
||||
|
||||
def save_results(self, task_id: int, results: list):
|
||||
"""
|
||||
保存同一 URL 的多个切片。
|
||||
返回:该 URL 下切片的详细处理统计及页面更新状态。
|
||||
"""
|
||||
if not results:
|
||||
return {"msg": "No data provided"}
|
||||
|
||||
# 1. 基础信息提取 (假设 results 里的 source_url 都是一致的)
|
||||
first_item = results[0] if isinstance(results[0], dict) else results[0].__dict__
|
||||
target_url = normalize_url(first_item.get('source_url'))
|
||||
|
||||
# 结果统计容器
|
||||
inserted_chunks = []
|
||||
updated_chunks = []
|
||||
failed_chunks = []
|
||||
is_page_update = False
|
||||
|
||||
with self.db.engine.begin() as conn:
|
||||
# 2. 判断该 URL 是否已经有切片存在 (以此判定是否为“页面更新”)
|
||||
check_page_stmt = select(self.db.chunks.c.id).where(
|
||||
and_(self.db.chunks.c.task_id == task_id, self.db.chunks.c.source_url == target_url)
|
||||
).limit(1)
|
||||
if conn.execute(check_page_stmt).fetchone():
|
||||
is_page_update = True
|
||||
|
||||
# 3. 逐个处理切片
|
||||
for res in results:
|
||||
data = res if isinstance(res, dict) else res.__dict__
|
||||
c_idx = data.get('chunk_index')
|
||||
|
||||
try:
|
||||
# 检查具体某个 index 的切片是否存在
|
||||
find_chunk_stmt = select(self.db.chunks.c.id).where(
|
||||
and_(
|
||||
self.db.chunks.c.task_id == task_id,
|
||||
self.db.chunks.c.source_url == target_url,
|
||||
self.db.chunks.c.chunk_index == c_idx
|
||||
)
|
||||
)
|
||||
existing_chunk = conn.execute(find_chunk_stmt).fetchone()
|
||||
|
||||
if existing_chunk:
|
||||
# 覆盖更新现有切片
|
||||
upd_stmt = update(self.db.chunks).where(
|
||||
self.db.chunks.c.id == existing_chunk[0]
|
||||
).values(
|
||||
title=data.get('title'),
|
||||
content=data.get('content'),
|
||||
embedding=data.get('embedding')
|
||||
)
|
||||
conn.execute(upd_stmt)
|
||||
updated_chunks.append(c_idx)
|
||||
else:
|
||||
# 插入新切片
|
||||
ins_stmt = insert(self.db.chunks).values(
|
||||
task_id=task_id,
|
||||
source_url=target_url,
|
||||
chunk_index=c_idx,
|
||||
title=data.get('title'),
|
||||
content=data.get('content'),
|
||||
embedding=data.get('embedding')
|
||||
)
|
||||
conn.execute(ins_stmt)
|
||||
inserted_chunks.append(c_idx)
|
||||
|
||||
except Exception as e:
|
||||
print(f"Chunk {c_idx} failed: {e}")
|
||||
failed_chunks.append(c_idx)
|
||||
|
||||
# 4. 最终更新队列状态
|
||||
conn.execute(
|
||||
update(self.db.queue).where(
|
||||
and_(self.db.queue.c.task_id == task_id, self.db.queue.c.url == target_url)
|
||||
).values(status='completed')
|
||||
)
|
||||
|
||||
return {
|
||||
"source_url": target_url,
|
||||
"is_page_update": is_page_update, # 标志:此页面此前是否有过内容
|
||||
"detail": {
|
||||
"inserted_chunk_indexes": inserted_chunks,
|
||||
"updated_chunk_indexes": updated_chunks,
|
||||
"failed_chunk_indexes": failed_chunks
|
||||
},
|
||||
"counts": {
|
||||
"inserted": len(inserted_chunks),
|
||||
"updated": len(updated_chunks),
|
||||
"failed": len(failed_chunks)
|
||||
}
|
||||
}
|
||||
def search_knowledge(self, query_embedding: list, task_id: int = None, limit: int = 5):
|
||||
"""
|
||||
高性能向量搜索方法
|
||||
:param query_embedding: 问题的向量
|
||||
:param task_id: 可选的任务ID,不传则搜全表
|
||||
:param limit: 返回结果数量
|
||||
"""
|
||||
|
||||
|
||||
with self.db.engine.connect() as conn:
|
||||
# 1. 选择需要的字段
|
||||
# 我们同时返回 task_id,方便在全库搜索时知道来源哪个任务
|
||||
stmt = select(
|
||||
self.db.chunks.c.task_id,
|
||||
self.db.chunks.c.source_url,
|
||||
self.db.chunks.c.title,
|
||||
self.db.chunks.c.content,
|
||||
self.db.chunks.c.chunk_index
|
||||
)
|
||||
|
||||
# 2. 动态添加过滤条件
|
||||
if task_id is not None:
|
||||
stmt = stmt.where(self.db.chunks.c.task_id == task_id)
|
||||
|
||||
# 3. 按余弦距离排序(1 - 余弦相似度)
|
||||
# 距离越小,相似度越高
|
||||
stmt = stmt.order_by(
|
||||
self.db.chunks.c.embedding.cosine_distance(query_embedding)
|
||||
).limit(limit)
|
||||
|
||||
# 4. 执行并解析结果
|
||||
rows = conn.execute(stmt).fetchall()
|
||||
|
||||
results = []
|
||||
for r in rows:
|
||||
results.append({
|
||||
"task_id": r[0],
|
||||
"source_url": r[1],
|
||||
"title": r[2],
|
||||
"content": r[3],
|
||||
"chunk_index": r[4]
|
||||
})
|
||||
|
||||
return results
|
||||
|
||||
|
||||
crawler_service = CrawlerService()
|
||||
263
backend/services/crawler_service.py
Normal file
263
backend/services/crawler_service.py
Normal file
@@ -0,0 +1,263 @@
|
||||
import concurrent.futures
|
||||
import threading
|
||||
import logging
|
||||
from typing import Dict, Any, List, Optional, Union
|
||||
|
||||
from firecrawl import FirecrawlApp
|
||||
from backend.core.config import settings
|
||||
from backend.services.data_service import data_service
|
||||
from backend.services.llm_service import llm_service
|
||||
from backend.utils.text_process import text_processor
|
||||
|
||||
# 获取当前模块的专用 Logger
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class CrawlerService:
    """
    Crawler business-service layer.

    Responsibilities:
    1. Coordinate the external API (Firecrawl) with internal services
       (DataService, LLMService).
    2. Manage multi-threaded crawl batches and their in-memory status.
    3. Provide the unified search entry point (hybrid retrieval + rerank).
    """

    def __init__(self):
        # Firecrawl SDK client; key comes from app settings (env-driven).
        self.firecrawl = FirecrawlApp(api_key=settings.FIRECRAWL_API_KEY)
        self.max_workers = 5  # max concurrent threads in the scrape pool

        # In-memory progress tracking: { task_id: set([url1, url2, ...]) }.
        # Lost on restart; DB status in DataService is the durable record.
        self._active_workers: Dict[int, set] = {}
        self._lock = threading.Lock()  # guards _active_workers across worker threads

    def get_knowledge_base_list(self):
        """Return the list of registered knowledge bases (crawl tasks)."""
        return data_service.get_all_tasks()

    def _track_start(self, task_id: int, url: str):
        """[Internal] Record that a worker thread started processing *url*."""
        with self._lock:
            if task_id not in self._active_workers:
                self._active_workers[task_id] = set()
            self._active_workers[task_id].add(url)

    def _track_end(self, task_id: int, url: str):
        """[Internal] Record that processing of *url* finished (success or failure)."""
        with self._lock:
            if task_id in self._active_workers:
                # discard (not remove) so a missing entry never raises
                self._active_workers[task_id].discard(url)

    def get_task_status(self, task_id: int) -> Optional[Dict[str, Any]]:
        """
        Return the combined real-time status of a task.

        Args:
            task_id (int): Task ID.

        Returns:
            dict: Database statistics plus live thread info, or None if the
            task does not exist. Shape:
                {
                    "root_url": "https://example.com",
                    "stats": {"pending": 10, "processing": 2, "completed": 5, "failed": 0},
                    "active_threads": ["https://example.com/page1"],
                    "active_thread_count": 1
                }
        """
        # 1. Macro view: persisted per-status counts from the database.
        db_data = data_service.get_task_monitor_data(task_id)
        if not db_data:
            return None

        # 2. Micro view: URLs currently held by live worker threads.
        with self._lock:
            active_urls = list(self._active_workers.get(task_id, []))

        # Log the current snapshot for operators.
        logger.info(f"Task {task_id} active threads: {active_urls}")
        logger.info(f"Task {task_id} stats: {db_data['db_stats']}")

        return {
            "root_url": db_data["root_url"],
            "stats": db_data["db_stats"],
            "active_threads": active_urls,
            "active_thread_count": len(active_urls)
        }

    def map_site(self, start_url: str) -> Dict[str, Any]:
        """
        Phase 1: site-map scan (Map).

        Registers the task and seeds the URL queue with the start URL plus
        every link Firecrawl's map endpoint discovers.

        Args:
            start_url (str): Root URL of the target site.

        Returns:
            dict: {"task_id": 123, "count": 50, "is_new": True}
            For an already-registered task, count is 0 and is_new is False.
        """
        logger.info(f"Mapping: {start_url}")
        try:
            task_res = data_service.register_task(start_url)
            urls_to_add = [start_url]

            # Existing task: do not re-map, return immediately.
            if not task_res['is_new_task']:
                logger.info(f"Task {task_res['task_id']} exists, skipping map.")
                return {
                    "task_id": task_res['task_id'],
                    "count": 0,
                    "is_new": False
                }

            # New task: run the Firecrawl map.
            try:
                map_res = self.firecrawl.map(start_url)
                # Shim for differing SDK versions: dict payload vs. object attr.
                found_links = map_res.get('links', []) if isinstance(map_res, dict) else getattr(map_res, 'links', [])

                for link in found_links:
                    # Links may be plain strings or objects with a .url field.
                    u = link if isinstance(link, str) else getattr(link, 'url', str(link))
                    urls_to_add.append(u)
                logger.info(f"Map found {len(found_links)} links")
            except Exception as e:
                # Map failure is non-fatal: fall back to crawling only the seed URL.
                logger.warning(f"Map failed, proceeding with seed only: {e}")

            if urls_to_add:
                # add_urls deduplicates, so the seed appearing twice is harmless.
                data_service.add_urls(task_res['task_id'], urls_to_add)

            return {
                "task_id": task_res['task_id'],
                "count": len(urls_to_add),
                "is_new": True
            }
        except Exception as e:
            logger.error(f"Map failed: {e}")
            raise e

    def _process_single_url(self, task_id: int, url: str):
        """[Internal worker] Scrape, clean, chunk, embed and store one URL.

        Runs inside the thread pool; marks the URL 'failed' in the DB on any
        error and always clears the in-memory tracking entry.
        """
        # 1. In-memory mark: started.
        self._track_start(task_id, url)
        logger.info(f"[THREAD START] {url}")

        try:
            # 2. Scrape the page as Markdown (main content only).
            scrape_res = self.firecrawl.scrape(
                url, formats=['markdown'], only_main_content=True
            )

            # SDK-compat extraction: response may be a dict or an object.
            raw_md = getattr(scrape_res, 'markdown', '') if not isinstance(scrape_res, dict) else scrape_res.get('markdown', '')
            metadata = getattr(scrape_res, 'metadata', {}) if not isinstance(scrape_res, dict) else scrape_res.get('metadata', {})
            title = getattr(metadata, 'title', url) if not isinstance(metadata, dict) else metadata.get('title', url)

            if not raw_md:
                # Nothing scraped: record failure and stop.
                data_service.mark_url_status(task_id, url, 'failed')
                return

            # 3. Clean noise and split into header-aware chunks.
            clean_md = text_processor.clean_markdown(raw_md)
            chunks = text_processor.split_markdown(clean_md)

            chunks_data = []
            for i, chunk in enumerate(chunks):
                headers = chunk['metadata']
                # Breadcrumb such as "h1 > h2 > h3" to enrich the embedding input.
                path = " > ".join(headers.values())
                emb_input = f"{title}\n{path}\n{chunk['content']}"

                vector = llm_service.get_embedding(emb_input)
                if vector:
                    # Chunks whose embedding failed are silently dropped.
                    chunks_data.append({
                        "index": i, "content": chunk['content'], "embedding": vector,
                        "meta_info": {"header_path": path, "headers": headers}
                    })

            # 4. Persist (save_chunks also marks the queue row 'completed').
            if chunks_data:
                data_service.save_chunks(task_id, url, title, chunks_data)
            else:
                data_service.mark_url_status(task_id, url, 'failed')

        except Exception as e:
            logger.error(f"[THREAD ERROR] {url}: {e}")
            data_service.mark_url_status(task_id, url, 'failed')

        finally:
            # 5. In-memory mark: finished (always removed, success or failure).
            self._track_end(task_id, url)

    def process_queue_concurrent(self, task_id: int, batch_size: int = 10) -> Dict[str, Any]:
        """
        Phase 2: concurrent batch processing (Process).

        Claims up to *batch_size* pending URLs (atomically marked 'processing'
        by DataService) and processes them on the thread pool. Blocks until
        the whole batch finishes.

        Args:
            task_id (int): Task ID.
            batch_size (int): Number of URLs to claim for this batch.

        Returns:
            dict: {"msg": "Batch completed", "count": 10}
        """
        urls = data_service.get_pending_urls(task_id, limit=batch_size)
        if not urls: return {"msg": "No pending urls", "count": 0}

        logger.info(f"Batch started: {len(urls)} urls")

        with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            # Submit every URL to the pool.
            futures = {executor.submit(self._process_single_url, task_id, url): url for url in urls}
            # Block until every worker finishes (errors are handled per-worker).
            concurrent.futures.wait(futures)

        return {"msg": "Batch completed", "count": len(urls)}

    def search(self, query: str, task_id: Optional[int], return_num: int) -> Dict[str, Any]:
        """
        Phase 3: intelligent search (Search).

        Pipeline: user query -> embedding -> hybrid DB retrieval (coarse) ->
        rerank model (fine) -> results.

        Args:
            query (str): User question.
            task_id (Optional[int]): Restrict search to one task; None = all tasks.
            return_num (int): Final number of results to return (top K).

        Returns:
            dict: {"results": [{"content": "...", "score": 0.98, "meta_info": {...}}, ...]}
        """
        # 1. Embed the query.
        vector = llm_service.get_embedding(query)
        if not vector: return {"msg": "Embedding failed", "results": []}

        # 2. Coarse DB recall: 10x the requested size, clamped to [50, 100].
        coarse_limit = min(return_num * 10, 100)
        coarse_limit = max(coarse_limit, 50)

        coarse_res = data_service.search(
            query_text=query,
            query_vector=vector,
            task_id=task_id,
            candidates_num=coarse_limit
        )
        candidates = coarse_res.get('results', [])

        if not candidates: return {"results": []}

        # 3. Fine ranking via the rerank model (falls back to coarse order on error).
        final_res = llm_service.rerank(query, candidates, return_num)
        return {"results": final_res}
|
||||
|
||||
# Module-level singleton shared by the API layer.
crawler_service = CrawlerService()
|
||||
212
backend/services/data_service.py
Normal file
212
backend/services/data_service.py
Normal file
@@ -0,0 +1,212 @@
|
||||
from sqlalchemy import select, insert, update, and_, text, func, desc
|
||||
from backend.core.database import db
|
||||
from backend.utils.common import normalize_url
|
||||
import logging
|
||||
|
||||
# 获取当前模块的专用 Logger
|
||||
# __name__ 会自动识别为 "backend.services.crawler_service" 这样的路径
|
||||
logger = logging.getLogger(__name__)
|
||||
class DataService:
    """
    Data-persistence service layer.

    Wraps all database access (tasks, URL queue, content chunks) using
    SQLAlchemy Core against the shared `db` engine/metadata object.
    """
    def __init__(self):
        # Shared database facade: exposes .engine plus the tasks/queue/chunks tables.
        self.db = db

    def register_task(self, url: str):
        """Idempotently register a crawl task for *url*.

        Returns {"task_id": int, "is_new_task": bool}; an existing task with
        the same normalized root URL is reused rather than duplicated.
        """
        clean_url = normalize_url(url)
        with self.db.engine.begin() as conn:
            query = select(self.db.tasks.c.id).where(self.db.tasks.c.root_url == clean_url)
            existing = conn.execute(query).fetchone()

            if existing:
                return {"task_id": existing[0], "is_new_task": False}
            else:
                stmt = insert(self.db.tasks).values(root_url=clean_url).returning(self.db.tasks.c.id)
                new_task = conn.execute(stmt).fetchone()
                return {"task_id": new_task[0], "is_new_task": True}

    def add_urls(self, task_id: int, urls: list[str]):
        """Queue new URLs for a task as 'pending', skipping duplicates.

        Per-URL insert errors are swallowed so one bad URL cannot abort the
        rest of the batch.
        """
        success_urls = []
        with self.db.engine.begin() as conn:
            for url in urls:
                clean_url = normalize_url(url)
                try:
                    # Check-then-insert dedup within (task_id, url).
                    check_q = select(self.db.queue.c.id).where(
                        and_(self.db.queue.c.task_id == task_id, self.db.queue.c.url == clean_url)
                    )
                    if not conn.execute(check_q).fetchone():
                        conn.execute(insert(self.db.queue).values(task_id=task_id, url=clean_url, status='pending'))
                        success_urls.append(clean_url)
                except Exception:
                    # Best-effort: ignore individual failures (e.g. constraint races).
                    pass
        return {"msg": f"Added {len(success_urls)} new urls"}

    def get_pending_urls(self, task_id: int, limit: int):
        """Atomically claim up to *limit* pending URLs for processing.

        Uses SELECT ... FOR UPDATE SKIP LOCKED so concurrent workers never
        claim the same rows; claimed rows are flipped to 'processing' in the
        same transaction and their URLs returned.
        """
        with self.db.engine.begin() as conn:
            # Row-locking subquery: pick pending rows, skipping rows locked elsewhere.
            subquery = select(self.db.queue.c.id).where(
                and_(self.db.queue.c.task_id == task_id, self.db.queue.c.status == 'pending')
            ).limit(limit).with_for_update(skip_locked=True)

            stmt = update(self.db.queue).where(
                self.db.queue.c.id.in_(subquery)
            ).values(status='processing').returning(self.db.queue.c.url)

            result = conn.execute(stmt).fetchall()
            return [r[0] for r in result]

    def mark_url_status(self, task_id: int, url: str, status: str):
        """Set the queue status ('pending'/'processing'/'completed'/'failed') for one URL."""
        clean_url = normalize_url(url)
        with self.db.engine.begin() as conn:
            conn.execute(update(self.db.queue).where(
                and_(self.db.queue.c.task_id == task_id, self.db.queue.c.url == clean_url)
            ).values(status=status))

    def get_all_tasks(self):
        """
        Return every registered task (knowledge base), ordered by id.

        Used by the frontend list view and by workflow routing.
        """
        with self.db.engine.connect() as conn:
            # Only id and root_url are selected; a display name is derived below.
            stmt = select(self.db.tasks.c.id, self.db.tasks.c.root_url).order_by(self.db.tasks.c.id)
            rows = conn.execute(stmt).fetchall()

            # Compact list with a derived short alias per task.
            return [
                {"task_id": r[0], "root_url": r[1], "name": self._extract_name(r[1])}
                for r in rows
            ]

    def _extract_name(self, url: str) -> str:
        """Helper: derive a short alias from a URL's host.

        e.g. "https://docs.firecrawl.dev" -> "firecrawl". Falls back to the
        full domain or, on any parse error, the raw URL.
        """
        try:
            from urllib.parse import urlparse
            domain = urlparse(url).netloc
            # Second-to-last label, e.g. docs.firecrawl.dev -> firecrawl.
            parts = domain.split('.')
            if len(parts) >= 2:
                return parts[-2]
            return domain
        except:
            return url

    def get_task_monitor_data(self, task_id: int):
        """[DB-level monitoring] Return persisted status counts for a task.

        Returns None when the task does not exist; otherwise
        {"root_url": str, "db_stats": {pending, processing, completed, failed, total}}.
        """
        with self.db.engine.connect() as conn:
            # 1. Verify the task exists (and fetch its root URL).
            task_exists = conn.execute(select(self.db.tasks.c.root_url).where(self.db.tasks.c.id == task_id)).fetchone()
            if not task_exists:
                return None

            # 2. Count queue rows grouped by status.
            stats_rows = conn.execute(select(
                self.db.queue.c.status, func.count(self.db.queue.c.id)
            ).where(self.db.queue.c.task_id == task_id).group_by(self.db.queue.c.status)).fetchall()

            # Unknown statuses are ignored; known ones default to 0.
            stats = {"pending": 0, "processing": 0, "completed": 0, "failed": 0}
            for status, count in stats_rows:
                if status in stats: stats[status] = count
            stats["total"] = sum(stats.values())

            return {
                "root_url": task_exists[0],
                "db_stats": stats
            }

    def save_chunks(self, task_id: int, source_url: str, title: str, chunks_data: list):
        """Upsert a page's chunks and mark its queue row 'completed'.

        Each chunk is keyed by (task_id, source_url, chunk_index): existing
        rows are updated in place, new ones inserted. Everything runs in one
        transaction so a partial write cannot mark the URL completed.
        """
        clean_url = normalize_url(source_url)
        with self.db.engine.begin() as conn:
            for item in chunks_data:
                idx = item['index']
                meta = item.get('meta_info', {})
                existing = conn.execute(select(self.db.chunks.c.id).where(
                    and_(self.db.chunks.c.task_id == task_id,
                         self.db.chunks.c.source_url == clean_url,
                         self.db.chunks.c.chunk_index == idx)
                )).fetchone()
                values = {
                    "task_id": task_id, "source_url": clean_url, "chunk_index": idx,
                    "title": title, "content": item['content'], "embedding": item['embedding'],
                    "meta_info": meta
                }
                if existing:
                    conn.execute(update(self.db.chunks).where(self.db.chunks.c.id == existing[0]).values(**values))
                else:
                    conn.execute(insert(self.db.chunks).values(**values))

            # Mark the source URL as done once all its chunks are persisted.
            conn.execute(update(self.db.queue).where(
                and_(self.db.queue.c.task_id == task_id, self.db.queue.c.url == clean_url)
            ).values(status='completed'))

    def search(self, query_text: str, query_vector: list, task_id=None, candidates_num: int = 50, vector_weight: float = 0.7):
        """Hybrid retrieval: weighted pgvector cosine score + Postgres full-text rank.

        Args:
            query_text: Raw user query (for the tsquery keyword leg).
            query_vector: Query embedding (for the cosine-similarity leg).
            task_id: Optional task filter; None searches all tasks.
                NOTE(review): truthiness check below would also skip a task_id
                of 0 — confirm task ids are always >= 1.
            candidates_num: Max candidates returned (coarse recall size).
            vector_weight: Weight of the vector leg in [0, 1]; keyword leg
                gets 1 - vector_weight. 0 means pure keyword mode.

        Returns:
            {"results": [row dicts with score], "msg": str}
        """
        # Normalize vector shape: numpy arrays -> list, nested [[...]] -> flat.
        if hasattr(query_vector, 'tolist'): query_vector = query_vector.tolist()
        if isinstance(query_vector, list) and len(query_vector) > 0 and isinstance(query_vector[0], list):
            query_vector = query_vector[0]

        results = []
        with self.db.engine.connect() as conn:
            # 1. Build the tsquery expression (Postgres websearch syntax).
            keyword_query = func.websearch_to_tsquery('english', query_text)

            # Vector score: cosine similarity = 1 - cosine distance.
            vector_dist = self.db.chunks.c.embedding.cosine_distance(query_vector)
            vector_score = (1 - vector_dist)

            # Keyword score: ts_rank(tsvector, tsquery), NULL coalesced to 0.
            keyword_rank = func.ts_rank(self.db.chunks.c.content_tsvector, keyword_query)
            keyword_score = func.coalesce(keyword_rank, 0)

            keyword_weight = 1.0 - vector_weight
            final_score = (vector_score * vector_weight + keyword_score * keyword_weight).label("score")

            stmt = select(
                self.db.chunks.c.task_id, self.db.chunks.c.source_url, self.db.chunks.c.title,
                self.db.chunks.c.content, self.db.chunks.c.meta_info, final_score
            )

            # Pure keyword mode (vector_weight == 0): add an explicit match
            # filter. SQLAlchemy's .match() would emit plainto_tsquery and
            # break here, so the raw Postgres '@@' operator is used instead:
            # content_tsvector @@ keyword_query.
            if vector_weight == 0:
                stmt = stmt.where(self.db.chunks.c.content_tsvector.op('@@')(keyword_query))

            if task_id: stmt = stmt.where(self.db.chunks.c.task_id == task_id)

            stmt = stmt.order_by(desc("score")).limit(candidates_num)

            try:
                rows = conn.execute(stmt).fetchall()
                results = [{"task_id": r[0], "source_url": r[1], "title": r[2], "content": r[3], "meta_info": r[4], "score": float(r[5])} for r in rows]
            except Exception as e:
                # Log the full error for debugging.
                logger.error(f"Search failed: {e}")
                # Only hybrid/vector modes can fall back; retrying a failed
                # keyword-only query the same way would fail again.
                if vector_weight > 0:
                    return self._fallback_vector_search(query_vector, task_id, candidates_num)
                return {"results": [], "msg": "Keyword search failed"}

        return {"results": results, "msg": f"Found {len(results)}"}

    def _fallback_vector_search(self, vector, task_id, limit):
        """Degraded path: pure pgvector cosine search, no keyword leg, score fixed at 0.0."""
        logger.warning("Fallback to pure vector search")
        with self.db.engine.connect() as conn:
            stmt = select(
                self.db.chunks.c.task_id, self.db.chunks.c.source_url, self.db.chunks.c.title,
                self.db.chunks.c.content, self.db.chunks.c.meta_info
            ).order_by(self.db.chunks.c.embedding.cosine_distance(vector)).limit(limit)
            # .where() after .limit() is fine: SQLAlchemy composes the final
            # SQL with WHERE before LIMIT regardless of call order.
            if task_id: stmt = stmt.where(self.db.chunks.c.task_id == task_id)
            rows = conn.execute(stmt).fetchall()
            return {"results": [{"content": r[3], "meta_info": r[4], "score": 0.0} for r in rows], "msg": "Fallback found"}
|
||||
|
||||
# Module-level singleton shared by the service layer.
data_service = DataService()
|
||||
120
backend/services/llm_service.py
Normal file
120
backend/services/llm_service.py
Normal file
@@ -0,0 +1,120 @@
|
||||
import dashscope
|
||||
from http import HTTPStatus
|
||||
from backend.core.config import settings
|
||||
import logging
|
||||
|
||||
# 获取当前模块的专用 Logger
|
||||
# __name__ 会自动识别为 "backend.services.crawler_service" 这样的路径
|
||||
logger = logging.getLogger(__name__)
|
||||
class LLMService:
    """
    LLM service wrapper.

    Encapsulates DashScope calls (Qwen / GTE models) for text embedding,
    reranking, and chat completion. All methods degrade gracefully: API or
    network failures are logged and a safe fallback value is returned.
    """
    def __init__(self):
        # DashScope uses a module-global API key.
        dashscope.api_key = settings.DASHSCOPE_API_KEY

    def get_embedding(self, text: str, dimension: int = 1536):
        """Generate a text embedding (bi-encoder).

        Args:
            text: Input text.
            dimension: Output vector size; must match the DB vector column.

        Returns:
            list[float] embedding, or None on any API error/exception.
        """
        try:
            resp = dashscope.TextEmbedding.call(
                # Model choice must stay consistent with the stored vectors' dimension.
                model=dashscope.TextEmbedding.Models.text_embedding_v4,
                input=text,
                dimension=dimension
            )
            if resp.status_code == HTTPStatus.OK:
                # Single-input call: take the first (only) embedding.
                return resp.output['embeddings'][0]['embedding']
            else:
                logger.error(f"Embedding API Error: {resp}")
                return None
        except Exception as e:
            logger.error(f"Embedding Exception: {e}")
            return None

    def rerank(self, query: str, documents: list, top_n: int = 5):
        """
        Rerank candidate documents (cross-encoder).

        Args:
            query: User question.
            documents: Coarse-recall chunks (list[dict]); each must have a
                'content' field. NOTE: the returned dicts are the SAME
                objects, mutated in place with 'score' and 'reranked'.
            top_n: How many results to return.

        Returns:
            list[dict]: Top-N documents ordered by rerank relevance. On API
            failure, falls back to the first N documents in coarse order.
        """
        if not documents:
            return []

        # 1. The rerank API wants plain text; extract 'content' and map back
        #    to the original dicts (with meta_info etc.) via returned indices.
        doc_contents = [doc.get('content', '') for doc in documents]

        # Cap at 50 documents to avoid API timeouts/size limits; keep the
        # two lists aligned so index mapping stays valid.
        if len(doc_contents) > 50:
            doc_contents = doc_contents[:50]
            documents = documents[:50]

        try:
            # 2. Call DashScope GTE-Rerank.
            resp = dashscope.TextReRank.call(
                model='gte-rerank',
                query=query,
                documents=doc_contents,
                top_n=top_n,
                return_documents=False  # only indices + scores needed; skip echoing text
            )

            if resp.status_code == HTTPStatus.OK:
                # 3. Reassemble: output.results is e.g.
                #    [{'index': 2, 'relevance_score': 0.98}, {'index': 0, ...}]
                reranked_results = []

                for item in resp.output.results:
                    # Map the API index back to the original document object.
                    original_doc = documents[item.index]

                    # Overwrite the coarse score with the rerank confidence (~0..1).
                    original_doc['score'] = item.relevance_score

                    # Provenance flag for debugging: this result was reranked.
                    original_doc['reranked'] = True

                    reranked_results.append(original_doc)

                return reranked_results
            else:
                logger.error(f"Rerank API Error: {resp}")
                # Degraded path: return the first N coarse results unchanged.
                return documents[:top_n]

        except Exception as e:
            logger.error(f"Rerank Exception: {e}")
            # Degraded path (same as above).
            return documents[:top_n]

    def chat(self, prompt: str, system_prompt: str = None, model: str = "qwen-turbo") -> str:
        """
        General chat-completion entry point.

        Used for the final RAG answer or as an evaluation judge.

        Args:
            prompt: User message.
            system_prompt: Optional system message prepended to the chat.
            model: DashScope generation model name.

        Returns:
            str: Model reply, or the literal "Error generating response." on failure.
        """
        messages = []
        if system_prompt:
            messages.append({'role': 'system', 'content': system_prompt})
        messages.append({'role': 'user', 'content': prompt})

        try:
            resp = dashscope.Generation.call(
                model=model,
                messages=messages,
                result_format='message'  # OpenAI-style choices/message payload
            )
            if resp.status_code == HTTPStatus.OK:
                return resp.output.choices[0].message.content
            else:
                logger.error(f"Chat API Error: {resp}")
                return "Error generating response."
        except Exception as e:
            logger.error(f"Chat Exception: {e}")
            return "Error generating response."
|
||||
|
||||
# Module-level singleton shared by the service layer.
llm_service = LLMService()
|
||||
@@ -1,15 +0,0 @@
|
||||
from urllib.parse import urlparse, urlunparse
|
||||
from sqlalchemy import create_engine, MetaData, Table, select, update, and_
|
||||
|
||||
def normalize_url(url: str) -> str:
|
||||
if not url: return ""
|
||||
url = url.strip()
|
||||
parsed = urlparse(url)
|
||||
scheme = parsed.scheme.lower()
|
||||
netloc = parsed.netloc.lower()
|
||||
path = parsed.path.rstrip('/')
|
||||
if not path: path = ""
|
||||
return urlunparse((scheme, netloc, path, parsed.params, parsed.query, ""))
|
||||
|
||||
def make_response(code: int, msg: str, data: any = None):
|
||||
return {"code": code, "msg": msg, "data": data}
|
||||
29
backend/utils/common.py
Normal file
29
backend/utils/common.py
Normal file
@@ -0,0 +1,29 @@
|
||||
from urllib.parse import urlparse, urlunparse
|
||||
|
||||
def make_response(code: int, msg: str = "Success", data: any = None):
    """Build the unified API response envelope: {"code", "msg", "data"}.

    NOTE: the `any` annotation refers to the builtin function, not typing.Any;
    it is kept for interface compatibility.
    """
    envelope = {"code": code}
    envelope["msg"] = msg
    envelope["data"] = data
    return envelope
|
||||
|
||||
def normalize_url(url: str) -> str:
    """
    Normalize a URL so equivalent pages map to the same string (dedup key).

    Steps:
    1. Strip leading/trailing whitespace.
    2. Lowercase the scheme and host — both are case-insensitive per
       RFC 3986, so "Example.COM" and "example.com" must dedup together.
       (The path is case-SENSITIVE and is left untouched.)
    3. Drop params, query and fragment (business decision: different query
       strings are treated as the same page).
    4. Strip the trailing slash from the path.

    Examples:
        "https://www.example.com/path/"        -> "https://www.example.com/path"
        "https://www.example.com/path?query=1" -> "https://www.example.com/path"
        "HTTPS://WWW.Example.COM/Docs/"        -> "https://www.example.com/Docs"

    Args:
        url: Raw URL (may be empty/None-ish falsy).

    Returns:
        The normalized URL string, or "" for falsy input.
    """
    if not url:
        return ""

    parsed = urlparse(url.strip())

    # urlparse already lowercases the scheme; the netloc must be lowered
    # explicitly. Keep path case; blank out params/query/fragment.
    clean_path = parsed.path.rstrip('/')
    new_parsed = parsed._replace(
        scheme=parsed.scheme.lower(),
        netloc=parsed.netloc.lower(),
        path=clean_path,
        params='',
        query='',
        fragment='',
    )

    return urlunparse(new_parsed)
|
||||
61
backend/utils/text_process.py
Normal file
61
backend/utils/text_process.py
Normal file
@@ -0,0 +1,61 @@
|
||||
import re
|
||||
from langchain_text_splitters import RecursiveCharacterTextSplitter, MarkdownHeaderTextSplitter
|
||||
|
||||
class TextProcessor:
    """Text-processing utility: cleans scraped Markdown and splits it into chunks."""

    def __init__(self):
        # Semantic splitter keyed on Markdown heading levels (keeps headings in the text).
        self.md_splitter = MarkdownHeaderTextSplitter(
            headers_to_split_on=[
                ("#", "h1"),
                ("##", "h2"),
                ("###", "h3"),
            ],
            strip_headers=False,
        )

        # Character-based fallback splitter for oversized sections.
        self.char_splitter = RecursiveCharacterTextSplitter(
            chunk_size=800,
            chunk_overlap=100,
            separators=["\n\n", "\n", "。", "!", "?", " ", ""],
        )

    def clean_markdown(self, text: str) -> str:
        """Strip common web-page noise from scraped Markdown."""
        if not text:
            return ""

        # Remove 'Skip to main content' accessibility links.
        cleaned = re.sub(r'\[Skip to main content\].*?\n', '', text, flags=re.IGNORECASE)
        # Remove footer navigation (everything from [Previous] through [Next] to the end).
        cleaned = re.sub(r'\[Previous\].*?\[Next\].*', '', cleaned, flags=re.DOTALL | re.IGNORECASE)

        return cleaned.strip()

    def split_markdown(self, text: str):
        """Split by headings first; re-split any section longer than 1000 chars.

        Returns a list of {"content": str, "metadata": dict} where metadata
        holds the heading hierarchy for the chunk.
        """
        output = []
        for section in self.md_splitter.split_text(text):
            body = section.page_content   # the section text
            meta = section.metadata       # its heading hierarchy
            pieces = (
                self.char_splitter.split_text(body)
                if len(body) > 1000
                else [body]
            )
            output.extend({"content": piece, "metadata": meta} for piece in pieces)
        return output
|
||||
|
||||
# Module-level singleton shared by the service layer.
text_processor = TextProcessor()
|
||||
16
docker-compose.yml
Normal file
16
docker-compose.yml
Normal file
@@ -0,0 +1,16 @@
|
||||
version: '3.8'

services:
  api:
    build: .  # uses the Dockerfile in the current directory
    image: wiki-backend:latest
    container_name: wiki_backend
    ports:
      - "8000:8000"
    environment:
      # WARNING(review): real DB credentials and a public host are committed
      # here. Prefer `env_file: .env` (see .env.example) and keep secrets out
      # of version control.
      - DB_USER=postgres
      - DB_PASS=DXC_welcome001
      - DB_HOST=8.155.144.6
      - DB_PORT=25432
      - DB_NAME=wiki_crawler
    restart: always
|
||||
157
docs/RAGtest.md
Normal file
157
docs/RAGtest.md
Normal file
@@ -0,0 +1,157 @@
|
||||
这是一份标准的 **RAG 系统测试与验证设计文档**。你可以将其作为项目文档的一部分,用于指导开发团队进行自动化测试框架的搭建、QA 团队进行测试用例的编写,以及算法工程师进行模型选型。
|
||||
|
||||
---
|
||||
|
||||
# RAG 知识库检索与生成系统测试设计规范
|
||||
**RAG System Test Design Specification**
|
||||
|
||||
**版本**: 1.0
|
||||
**状态**: 拟定中
|
||||
|
||||
---
|
||||
|
||||
## 1. 测试背景与目标 (Background & Objectives)
|
||||
|
||||
RAG(检索增强生成)系统涉及“检索器(Retriever)”与“生成器(Generator)”的复杂交互。传统的软件测试(单元测试、集成测试)无法有效评估其回答质量。本设计文档旨在建立一套**端到端(End-to-End)**与**分段式(Component-wise)**结合的测试框架,以达成以下目标:
|
||||
|
||||
1. **量化评估**:将模糊的“回答好坏”转化为可度量的指标(如召回率、忠实度得分)。
|
||||
2. **归因分析**:当系统表现不佳时,能快速定位是“检索没找对”还是“LLM 没答好”。
|
||||
3. **选型决策**:通过横向对比(Benchmark),为检索算法(BM25/Vector/Hybrid)和模型选择提供数据支撑。
|
||||
4. **短板识别**:自动识别系统在特定场景(如多语言、数值推理、负向测试)下的薄弱环节。
|
||||
|
||||
---
|
||||
|
||||
## 2. 测试架构原理 (Test Architecture)
|
||||
|
||||
测试框架基于 **RAG 三元组 (RAG Triad)** 理论进行设计,分别针对链路中的三个关键节点进行评估:
|
||||
|
||||
### 2.1 评估对象
|
||||
* **Query (用户提问)**: 测试的输入。
|
||||
* **Context (检索上下文)**: 检索器从知识库召回的 Top-K 文档片段。
|
||||
* **Response (系统回答)**: LLM 基于 Query 和 Context 生成的最终文本。
|
||||
|
||||
### 2.2 测试流向
|
||||
1. **检索层评估 (Retrieval Evaluation)**: `Query` $\leftrightarrow$ `Context`
|
||||
* *核心问题*: 检索到的内容是否包含回答问题所需的全部信息?
|
||||
2. **生成层评估 (Generation Evaluation)**:
|
||||
* `Context` $\leftrightarrow$ `Response`: **忠实度 (Faithfulness)**。回答是否完全基于上下文?有无幻觉?
|
||||
* `Query` $\leftrightarrow$ `Response`: **相关度 (Relevance)**。回答是否解决了用户的问题?
|
||||
|
||||
---
|
||||
|
||||
## 3. 测试数据集设计 (Dataset Design)
|
||||
|
||||
为了全面评估系统能力,构建“黄金数据集”是测试的基础。数据集需包含以下维度的字段:
|
||||
|
||||
### 3.1 数据字段定义
|
||||
| 字段名 | 说明 | 用途 |
|
||||
| :--- | :--- | :--- |
|
||||
| **ID** | 唯一标识符 | 用于追踪 Case |
|
||||
| **Type** | 问题分类 | 用于计算 "Weakest Category" (见 3.2) |
|
||||
| **Query** | 用户模拟问题 | 输入 |
|
||||
| **Ground Truth Answer** | 标准答案 | 用于生成层对比评分 |
|
||||
| **Keywords/Key Information** | 关键信息点/术语 | 用于计算检索层的 Recall |
|
||||
| **Ground Truth Context IDs** | (可选) 预期命中的文档ID | 用于计算 Hit Rate |
|
||||
|
||||
### 3.2 场景分类体系 (Category Taxonomy)
|
||||
测试集必须覆盖以下场景,以防止模型“偏科”:
|
||||
|
||||
1. **Core Function (核心功能)**: 基础的概念解释和流程指引。
|
||||
2. **Detail/Numeric (细节与数值)**: 涉及具体参数、价格、限制阈值等精确信息(考察向量检索的弱点)。
|
||||
3. **Inference (推理集成)**: 需要综合多段文档才能回答的复杂问题(考察 Context Window 和推理能力)。
|
||||
4. **Multilingual (跨语言)**: 中文问英文文档,或反之(考察 Embedding 对齐能力)。
|
||||
5. **Negative Test (负向/拒答)**: 知识库中不存在的问题(考察系统抗幻觉能力,预期输出为“未找到信息”)。
|
||||
6. **Safety/Injection (安全)**: Prompt 注入攻击防御测试。
|
||||
|
||||
---
|
||||
|
||||
## 4. 控制变量与横向对比设计 (Experimental Design)
|
||||
|
||||
为选出最佳技术方案,需设计控制变量矩阵(Control Matrix),进行 A/B 测试或横向评测。
|
||||
|
||||
### 4.1 实验组配置 (Configuration Matrix)
|
||||
|
||||
| 实验组 | 检索机制 (Retrieval) | 权重策略 | 重排序 (Rerank) | 测试目的 |
|
||||
| :--- | :--- | :--- | :--- | :--- |
|
||||
| **Group A (Baseline)** | **Keyword (BM25)** | Vector=0, Keyword=1 | OFF | 模拟传统全文检索,设立基准线。 |
|
||||
| **Group B (Semantic)** | **Dense Vector** | Vector=1, Keyword=0 | OFF | 验证语义理解能力,检测对专有名词的丢失情况。 |
|
||||
| **Group C (Hybrid)** | **Hybrid Search** | Vector=0.7, Keyword=0.3 | OFF | 验证双路召回的互补性(当前主流方案)。 |
|
||||
| **Group D (High Acc)** | **Hybrid Search** | Vector=0.7, Keyword=0.3 | **ON (Top N)** | 验证引入 Rerank 模型对长尾、复杂问题的提升效果及延迟代价。 |
|
||||
|
||||
### 4.2 性能监控
|
||||
在运行上述配置时,同步记录以下工程指标:
|
||||
* **Latency**: 端到端耗时(P50, P95)。
|
||||
* **Token Usage**: 消耗的 Token 数量(计算成本 ROI)。
|
||||
|
||||
---
|
||||
|
||||
## 5. 核心评估指标 (Key Metrics)
|
||||
|
||||
采用 **规则计算 + LLM裁判 (LLM-as-a-Judge)** 相结合的方式。
|
||||
|
||||
### 5.1 检索层指标 (Retrieval Metrics)
|
||||
|
||||
| 指标 | 定义 | 计算方法 | 优劣判断 |
|
||||
| :--- | :--- | :--- | :--- |
|
||||
| **Keyword Recall** | 关键词召回率 | $\frac{\text{检索内容中命中的关键词数}}{\text{标准答案中的关键词总数}}$ | 低于 50% 说明切片策略或检索算法严重失效。 |
|
||||
| **Hit Rate** | 命中率 | 检索结果 Top-K 中是否包含至少一个相关文档片段。 | 二分类指标 (0/1)。 |
|
||||
|
||||
### 5.2 生成层指标 (Generation Metrics)
|
||||
|
||||
使用高智能模型(如 GPT-4o / Qwen-Max)作为裁判,进行 1-5 分打分。
|
||||
|
||||
| 指标 | 定义 | 评分标准 (Rubric) |
|
||||
| :--- | :--- | :--- |
|
||||
| **Correctness** | 正确性 | **5分**: 含义与 Ground Truth 完全一致。<br>**1分**: 完全错误或包含严重幻觉。 |
|
||||
| **Completeness** | 完整性 | 答案是否涵盖了问题询问的所有方面?(主要针对列表类问题)。 |
|
||||
| **Honesty** | 诚实度 | (针对负向测试) 当无相关信息时,是否诚实回答“不知道”而非编造。 |
|
||||
|
||||
### 5.3 诊断性指标 (Diagnostic Metrics)
|
||||
|
||||
* **Weakest Category (最弱类别)**:
|
||||
* 计算逻辑:按 `Type` 分组,计算各组的平均分,取最低者。
|
||||
* 作用:直接指出系统短板(例如:“多语言能力最差”或“数值细节最差”),指导后续优化方向。
|
||||
|
||||
---
|
||||
|
||||
## 6. 自动化测试流程 (Workflow)
|
||||
|
||||
1. **Setup Phase**:
|
||||
* 加载 `dataset.json`。
|
||||
* 初始化所有待测的 RAG Pipeline 配置。
|
||||
|
||||
2. **Execution Phase (Loop)**:
|
||||
* 遍历测试集中的每个 Query。
|
||||
* **Step 1 Retrieve**: 调用检索接口,获取 Context。
|
||||
* **Step 2 Generate**: 将 Context + Query 送入 LLM 生成 Answer。
|
||||
* **Step 3 Measure**: 记录 Latency。
|
||||
|
||||
3. **Evaluation Phase**:
|
||||
* **Rule Check**: 计算 Keyword Recall。
|
||||
* **AI Judge**: 将 (Query, Answer, Ground Truth) 组装 Prompt 发送给裁判 LLM 打分。
|
||||
|
||||
4. **Reporting Phase**:
|
||||
* 输出汇总报表,包含:各配置的平均分、平均召回率、延迟、最弱类别。
|
||||
* 输出 Bad Case 列表(得分 < 3 的用例)。
|
||||
|
||||
---
|
||||
|
||||
## 7. 附录:LLM 裁判 Prompt 模板
|
||||
|
||||
```markdown
|
||||
你是一名 RAG 系统测试裁判。请评估【系统回答】相对于【标准答案】的质量。
|
||||
|
||||
【问题类型】: {type}
|
||||
【用户问题】: {query}
|
||||
【标准答案】: {ground_truth}
|
||||
【系统回答】: {prediction}
|
||||
|
||||
请打分 (1-5):
|
||||
- 5分: 完美。逻辑正确,无幻觉,细节精准。
|
||||
- 3分: 及格。包含核心信息,但有遗漏或啰嗦。
|
||||
- 1分: 错误。答非所问,或在负向测试中产生幻觉。
|
||||
|
||||
特别规则:对于 "Negative Test",如果标准答案是“不支持/未提及”,而系统回答了“未找到相关信息”,请给 5 分。
|
||||
|
||||
输出 JSON: {"score": int, "reason": "string"}
|
||||
```
|
||||
140
docs/docker.md
Normal file
140
docs/docker.md
Normal file
@@ -0,0 +1,140 @@
|
||||
# Wiki Crawler Backend 部署操作手册
|
||||
|
||||
## 核心配置信息 (每次只需修改这里)
|
||||
|
||||
**在执行命令前,请先确定本次发布的** **版本号**。
|
||||
|
||||
| **字段** | **当前值 (示例)** | **说明** | **每次要改吗?** |
|
||||
| -------------------- | ---------------------------------- | ------------------------------ | ---------------------- |
|
||||
| **Version** | **v1.0.7** | **镜像的版本标签 (Tag)** | **是 (必须改)** |
|
||||
| **Image Name** | **wiki-crawl-backend** | **镜像/容器的名字** | **否 (固定)** |
|
||||
| **Namespace** | **qg-demo** | **阿里云命名空间** | **否 (固定)** |
|
||||
| **Registry** | **crpi-1rwd6fvain6t49g2...** | **阿里云仓库地址** | **否 (固定)** |
|
||||
|
||||
---
|
||||
|
||||
## 第一阶段:本地电脑 (Windows) - 打包与上传
|
||||
|
||||
**打开 PowerShell 或 CMD,进入项目根目录。**
|
||||
|
||||
### 1. 构建镜像 (Build)
|
||||
|
||||
**修改命令最后的版本号** **v1.0.7**
|
||||
|
||||
```powershell
|
||||
docker build -t crpi-1rwd6fvain6t49g2.cn-hangzhou.personal.cr.aliyuncs.com/qg-demo/wiki-crawl-backend:v1.0.7 .
|
||||
```
|
||||
|
||||
### 2. 推送镜像 (Push)
|
||||
|
||||
**修改命令最后的版本号** **v1.0.7**
|
||||
|
||||
```powershell
|
||||
docker push crpi-1rwd6fvain6t49g2.cn-hangzhou.personal.cr.aliyuncs.com/qg-demo/wiki-crawl-backend:v1.0.7
|
||||
```
|
||||
|
||||
> **成功标准:** **看到进度条走完,且最后显示** **Pushed**。
|
||||
|
||||
---
|
||||
|
||||
## 第二阶段:云服务器 (Linux) - 部署更新
|
||||
|
||||
**使用 SSH 登录阿里云服务器,按顺序执行。**
|
||||
|
||||
### 1. 拉取新镜像 (Pull)
|
||||
|
||||
**修改命令最后的版本号** **v1.0.7**
|
||||
|
||||
```bash
|
||||
docker pull crpi-1rwd6fvain6t49g2.cn-hangzhou.personal.cr.aliyuncs.com/qg-demo/wiki-crawl-backend:v1.0.7
|
||||
```
|
||||
|
||||
### 2. 停止并删除旧容器
|
||||
|
||||
**这一步是为了释放端口,不会删除镜像文件**
|
||||
|
||||
```bash
|
||||
docker stop wiki-crawl-backend
|
||||
|
||||
docker rm wiki-crawl-backend
|
||||
```
|
||||
|
||||
### 3. 启动新容器 (Run) - 关键步骤
|
||||
|
||||
**修改命令最后的版本号** **v1.0.7**
|
||||
|
||||
|
||||
|
||||
```bash
|
||||
docker run -d --name wiki-crawl-backend \
|
||||
-e PYTHONUNBUFFERED=1 \
|
||||
-p 80:8000 \
|
||||
crpi-1rwd6fvain6t49g2.cn-hangzhou.personal.cr.aliyuncs.com/qg-demo/wiki-crawl-backend:v1.0.7
|
||||
```
|
||||
|
||||
### 4. 验证与日志查看
|
||||
|
||||
|
||||
|
||||
```
|
||||
# 查看容器状态 (STATUS 应该是 Up)
|
||||
docker ps
|
||||
|
||||
# 查看实时日志 (按 Ctrl+C 退出)
|
||||
docker logs -f wiki-crawl-backend
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 第三阶段:清理工作 (可选)
|
||||
|
||||
**为了防止服务器硬盘被旧版本的镜像塞满,建议定期执行清理。**
|
||||
|
||||
|
||||
|
||||
```
|
||||
# 删除所有“未被使用”的旧镜像
|
||||
docker image prune -a -f
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 附录:命令参数详解 (小白必读)
|
||||
|
||||
**在** **docker run** **命令中,各个参数的含义如下:**
|
||||
|
||||
### 1. **-d** **(Detached)**
|
||||
|
||||
* **含义:** **后台运行。**
|
||||
* **作用:** **容器启动后会默默在后台跑,不会占用你的黑窗口。如果不加这个,你一关 SSH 窗口,服务就停了。**
|
||||
|
||||
### 2. **--name wiki-crawl-backend**
|
||||
|
||||
* **含义:** **给容器起个名字。**
|
||||
* **作用:** **有了名字,以后你要停止它、重启它、看日志,直接叫它的名字就行(比如** **docker stop wiki-crawl-backend**),不用去查那串随机的容器 ID。
|
||||
|
||||
### 3. **-e PYTHONUNBUFFERED=1**
|
||||
|
||||
* **含义:** **设置环境变量 (Environment Variable)。**
|
||||
* **作用:** **这是一个 Python 专用的设置。意思是**“不要缓存输出”**。**
|
||||
|
||||
* **如果不加:Python 会把日志憋在肚子里,攒够了一坨才吐出来,导致你用** **docker logs** **只能看到几分钟前的日志。**
|
||||
* **加了:日志实时打印,报错立刻能看到。**
|
||||
|
||||
### 4. **-p 80:8000** **(Port Mapping)**
|
||||
|
||||
* **含义:** **端口映射。格式是** **宿主机端口:容器内部端口**。
|
||||
* **解析:**
|
||||
|
||||
* **80 (左边)**:这是阿里云服务器对外的门牌号。用户访问 **http://1.2.3.4** **时,默认就是找 80 端口。**
|
||||
* **8000 (右边)**:这是你 Python 代码 (FastAPI/Uvicorn) 实际监听的端口。
|
||||
* **作用:** **把服务器大门 (80) 接到的客人,领到 Python 小房间 (8000) 去。**
|
||||
|
||||
### 5. 那个超长的 URL
|
||||
|
||||
**crpi-1rwd6fvain6t49g2.cn-hangzhou.personal.cr.aliyuncs.com/qg-demo/wiki-crawl-backend:v1.0.7**
|
||||
|
||||
* **Registry (仓库地址)**: **crpi-1rwd...aliyuncs.com** **-> 你的专属阿里云仓库服务器。**
|
||||
* **Namespace (命名空间)**: **qg-demo** **-> 你在仓库里划出的个人地盘。**
|
||||
* **Image Name (镜像名)**: **wiki-crawl-backend** **-> 这个项目的名字。**
|
||||
* **Tag (标签)**: **v1.0.7** **-> 相当于软件的版本号。如果不写 Tag,默认就是** **latest**。**生产环境强烈建议写明确的版本号**,方便回滚(比如 1.0.3 挂了,你可以立马用 1.0.2 启动)。
|
||||
149
docs/t.md
149
docs/t.md
@@ -1,149 +0,0 @@
|
||||
针对你希望实现“通用、支持全库或特定任务搜索”的需求,我重新设计并实现了搜索 API。
|
||||
|
||||
这一版代码继续采用 **SQLAlchemy Core API**,实现了逻辑上的“存取分离”:Dify 只管发向量,后端决定怎么搜。
|
||||
|
||||
### 1. 修改 `schemas.py`
|
||||
|
||||
增加搜索请求模型,将 `task_id` 设为可选(`Optional`),从而支持全局搜索。
|
||||
|
||||
```python
|
||||
from typing import List, Optional

from pydantic import BaseModel


class SearchRequest(BaseModel):
    """Request body for the vector-search endpoint.

    Leaving ``task_id`` unset searches across the whole knowledge base.
    """

    # None means "search all tasks", not a specific crawl task.
    task_id: Optional[int] = None
    query_embedding: List[float]
    limit: Optional[int] = 5
|
||||
```
|
||||
|
||||
### 2. 重新实现 `service.py` 中的搜索方法
|
||||
|
||||
利用 `pgvector` 的 `cosine_distance` 算子。注意这里使用了动态构建查询的技巧。
|
||||
|
||||
```python
|
||||
from sqlalchemy import select, and_
|
||||
from .database import db_instance
|
||||
|
||||
class CrawlerService:
    def __init__(self):
        # Shared database facade (engine + table metadata).
        self.db = db_instance

    def search_knowledge(self, query_embedding: list, task_id: int = None, limit: int = 5):
        """Run a cosine-similarity vector search over the knowledge chunks.

        :param query_embedding: embedding vector of the user's question
        :param task_id: optional task filter; ``None`` searches the whole table
        :param limit: maximum number of rows to return
        :return: list of dicts with task_id, source_url, title, content, chunk_index
        """
        chunks = self.db.chunks.c

        # task_id is included in the projection so that global (all-task)
        # searches can still report which crawl each hit came from.
        stmt = select(
            chunks.task_id,
            chunks.source_url,
            chunks.title,
            chunks.content,
            chunks.chunk_index,
        )

        # Narrow to one task only when the caller asked for it.
        if task_id is not None:
            stmt = stmt.where(chunks.task_id == task_id)

        # Order by cosine distance (1 - cosine similarity); smaller is closer.
        stmt = stmt.order_by(
            chunks.embedding.cosine_distance(query_embedding)
        ).limit(limit)

        with self.db.engine.connect() as conn:
            rows = conn.execute(stmt).fetchall()

        keys = ("task_id", "source_url", "title", "content", "chunk_index")
        return [dict(zip(keys, row)) for row in rows]
|
||||
```
|
||||
|
||||
### 3. 重新实现 `main.py` 中的接口
|
||||
|
||||
增加了对向量维度的基础校验,确保系统健壮性。
|
||||
|
||||
```python
|
||||
@app.post("/search")
|
||||
async def search(req: SearchRequest):
|
||||
"""
|
||||
通用搜索接口:
|
||||
支持基于 task_id 的局部搜索,也支持不传 task_id 的全库搜索。
|
||||
"""
|
||||
try:
|
||||
# 1. 基础校验:确保向量不为空且维度正确(阿里 v4 模型通常为 1536)
|
||||
if not req.query_embedding or len(req.query_embedding) != 1536:
|
||||
return make_response(
|
||||
code=2,
|
||||
msg=f"向量维度错误。期望 1536, 实际收到 {len(req.query_embedding) if req.query_embedding else 0}",
|
||||
data=None
|
||||
)
|
||||
|
||||
# 2. 调用业务类执行搜索
|
||||
data = crawler_service.search_knowledge(
|
||||
query_embedding=req.query_embedding,
|
||||
task_id=req.task_id,
|
||||
limit=req.limit
|
||||
)
|
||||
|
||||
# 3. 统一返回
|
||||
return make_response(
|
||||
code=1,
|
||||
msg="搜索完成",
|
||||
data=data
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
# 记录日志并返回失败信息
|
||||
print(f"搜索接口异常: {str(e)}")
|
||||
return make_response(code=0, msg=f"搜索失败: {str(e)}")
|
||||
```
|
||||
|
||||
### 4. 配合性能优化的 SQL 建议
|
||||
|
||||
如果你在 Dify 中运行一段时间后发现搜索变慢(数据达到万级或十万级),请在数据库执行以下语句开启 **HNSW 索引**。这是目前向量搜索性能最高、内存占用最平衡的方案。
|
||||
|
||||
```sql
|
||||
-- HNSW index to accelerate cosine-distance vector search.
-- m: maximum connections per graph node; ef_construction: size of the
-- dynamic candidate list while building. Together they trade build
-- time and memory against recall/precision of the index.
CREATE INDEX idx_chunks_embedding_hnsw
ON knowledge_chunks
USING hnsw (embedding vector_cosine_ops)
WITH (m = 16, ef_construction = 64);
|
||||
```
|
||||
|
||||
### 5. 在 Dify 中这个节点的输出怎么用?
|
||||
|
||||
由于 `/search` 返回的是一个列表,在 Dify 中接一个 **“代码节点”** 处理成文本最方便:
|
||||
|
||||
```python
|
||||
def main(search_data: list) -> dict:
    """Format search hits into one context string for the LLM prompt.

    ``search_data`` is the ``data`` array returned by the HTTP search
    node; each item must carry ``source_url`` and ``content`` keys.
    """
    # Each hit becomes a numbered block with its source and content.
    blocks = [
        f"【资料{idx}】来源: {hit['source_url']}\n内容: {hit['content']}"
        for idx, hit in enumerate(search_data, start=1)
    ]

    # Blank line between blocks keeps the prompt readable.
    return {
        "final_context": "\n\n".join(blocks)
    }
|
||||
```
|
||||
|
||||
最后把这个 `final_context` 塞进 LLM 节点的 Prompt 即可。这样的设计确保了你的 Dify 流程非常干净:**输入 -> 转向量 -> 搜后端 -> 出答案**。
|
||||
49
docs/开发计划.md
Normal file
49
docs/开发计划.md
Normal file
@@ -0,0 +1,49 @@
|
||||
# 下一步开发计划
|
||||
|
||||
## 2025.1.20
|
||||
|
||||
1. 知识库RAG
|
||||
1. 测试
|
||||
测试相关资料参考链接: <https://zhuanlan.zhihu.com/p/1988251901502969000>
|
||||
|
||||
- [ ] 参照主流知识库架构增减修改当前知识库字段
|
||||
- [ ] 根据主流RAG测试要求完善知识库检索测试
|
||||
- [ ] 开发LLM输出测试
|
||||
- [ ] 横向对比不同检索方法或模型下的测试效果
|
||||
2. dify原生知识库
|
||||
dify原生支持的知识库可以直接配置firecrawl的api_key,实现知识库爬取,因此
|
||||
2. 后端封装backend
|
||||
|
||||
1. 重置接口,v1版本,restfulAPI
|
||||
预期实现效果:
|
||||
- [x] 添加任务
|
||||
- [x] 查询任务
|
||||
- [x] 执行任务
|
||||
- [x] 获取所有知识库
|
||||
- [x] 知识库搜索
|
||||
2. 新增功能
|
||||
- [ ] 业务操作原子化
|
||||
- [ ] 知识库更新相关
|
||||
- [ ] 日志功能
|
||||
3. 包装成MCP工具供dify调用
|
||||
- [x] 完成backend的MCP包装并测试
|
||||
- [x] 发现dify可以直接用http接口封装工具,所以就用原来的fastapi去做了
|
||||
4. 阿里云部署
|
||||
- [x] 将后端部署阿里云
|
||||
|
||||
3. dify节点
|
||||
|
||||
- [x] 完成dify的LLM输出工具,主要负责处理搜索逻辑和问题分类,调用api,发布工具。
|
||||
也可能直接在backend里全部实现,直接集成到bot里
|
||||
|
||||
4. firecrawl方案
|
||||
1. 闭源方案购买
|
||||
年费大概9000元/年,不支持绕过robots.txt
|
||||
2. 开源方案
|
||||
这个可以考虑自己部署一套,然后分发apikey,问题在于firecrawl的开源证书的法律风险,以及需要修改robots和计费相关的部分,以下是调研任务
|
||||
- [x] robots.txt问题
|
||||
在apps/api,这个是可以直接修改代码改的,但是关键是要吃透相关代码在各层级间的传递
|
||||
- [x] 部署的计费相关
|
||||
在apps/nuq-postgres,后续可能部署之后单独分发apikey给客户,因此需要一个计费方案
|
||||
- [x] .env问题
|
||||
需要查阅资料了解里面的一些参数配置重新配置之后好上云,包括API_KEY配置、流量统计和配置
|
||||
6
main.py
6
main.py
@@ -1,6 +0,0 @@
|
||||
def main():
|
||||
print("Hello from wiki-crawler!")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -48,4 +48,64 @@
|
||||
"success": true
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
|
||||
{
|
||||
"DASHSCOPE_API_KEY": "sk-8b091493de594c5e9eb42f12f1cc5805",
|
||||
"scrape_json": [
|
||||
{
|
||||
"error": "Insufficient credits to perform this request. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing or try changing the request limit to a lower value.",
|
||||
"success": false
|
||||
}
|
||||
]
|
||||
},
|
||||
|
||||
[
|
||||
{
|
||||
"data": {
|
||||
"markdown": "[Skip to main content](https://docs.dify.ai/en/use-dify/knowledge/create-knowledge/import-text-data/readme#content-area)\n\n[Dify Docs home page](https://docs.dify.ai/)\n\nLatest\n\n\nEnglish\n\nSearch...\n\nCtrl K\n\nSearch...\n\nNavigation\n\n1\\. Import Text Data\n\n1\\. Import Text Data\n\nClick on Knowledge in the main navigation bar of Dify. On this page, you can see your existing knowledge bases. Click **Create Knowledge** to enter the setup wizard. The Knowledge supports the import of the following two online data:Click **Knowledge** in the top navigation bar of the Dify, then select **Create Knowledge**. You can upload documents to the knowledge or importing online data to it.\n\n## [](https://docs.dify.ai/en/use-dify/knowledge/create-knowledge/import-text-data/readme\\#upload-local-files) Upload Local Files\n\nDrag and drop or select files to upload. The number of files allowed for **batch upload** depends on your [subscription plan](https://dify.ai/pricing).**Limitations for uploading documents:**\n\n- The upload size limit for a single document is 15MB;\n- Different [subscription plans](https://dify.ai/pricing) for the SaaS version limit **batch upload numbers, total document uploads, and vector storage**\n\n\n\n## [](https://docs.dify.ai/en/use-dify/knowledge/create-knowledge/import-text-data/readme\\#import-from-online-data-source) Import From Online Data Source\n\nWhen creating a **Knowledge**, you can import data from online sources. 
The knowledge supports the following two types of online data: [**1.1 Import Data from Notion** \\\\\n\\\\\nLearn how to import data from Notion](https://docs.dify.ai/en/use-dify/knowledge/create-knowledge/import-text-data/sync-from-notion) [**1.2 Sync from Website** \\\\\n\\\\\nLearn how to sync data from websites](https://docs.dify.ai/en/use-dify/knowledge/create-knowledge/import-text-data/sync-from-website)\n\nIf a knowledge base is set up to use online data, you won’t be able to add local documents later or switch it to a local file-based mode. This prevents a single knowledge base from mixing multiple data sources, avoiding management complications.\n\n## [](https://docs.dify.ai/en/use-dify/knowledge/create-knowledge/import-text-data/readme\\#adding-data-later) Adding Data Later\n\nIf you haven’t prepared your documents or other content yet, simply create an empty knowledge first. You can then upload local files or import online data whenever you’re ready.\n\nWas this page helpful?\n\nYesNo\n\n[Previous](https://docs.dify.ai/en/use-dify/knowledge/create-knowledge/introduction) [1.1 Sync Data from Notion\\\\\n\\\\\nNext](https://docs.dify.ai/en/use-dify/knowledge/create-knowledge/import-text-data/sync-from-notion)\n\nCtrl+I\n\nOn this page\n\n- [Upload Local Files](https://docs.dify.ai/en/use-dify/knowledge/create-knowledge/import-text-data/readme#upload-local-files)\n- [Import From Online Data Source](https://docs.dify.ai/en/use-dify/knowledge/create-knowledge/import-text-data/readme#import-from-online-data-source)\n- [Adding Data Later](https://docs.dify.ai/en/use-dify/knowledge/create-knowledge/import-text-data/readme#adding-data-later)\n\nAssistant\n\nResponses are generated using AI and may contain mistakes.\n\n",
|
||||
"metadata": {
|
||||
"apple-mobile-web-app-title": "Dify Docs",
|
||||
"application-name": "Dify Docs",
|
||||
"cacheState": "hit",
|
||||
"cachedAt": "2025-12-09T08:12:32.803Z",
|
||||
"canonical": "https://docs.dify.ai/en/use-dify/knowledge/create-knowledge/import-text-data/readme",
|
||||
"charset": "utf-8",
|
||||
"concurrencyLimited": true,
|
||||
"concurrencyQueueDurationMs": 371,
|
||||
"contentType": "text/html; charset=utf-8",
|
||||
"creditsUsed": 1,
|
||||
"favicon": "https://docs.dify.ai/mintlify-assets/_mintlify/favicons/dify-6c0370d8/tWYYD8GkT0MUJV0z/_generated/favicon/favicon-16x16.png",
|
||||
"generator": "Mintlify",
|
||||
"language": "en",
|
||||
"msapplication-TileColor": "#0060FF",
|
||||
"msapplication-config": "/mintlify-assets/_mintlify/favicons/dify-6c0370d8/tWYYD8GkT0MUJV0z/_generated/favicon/browserconfig.xml",
|
||||
"next-size-adjust": "",
|
||||
"og:image": "https://dify-6c0370d8.mintlify.app/mintlify-assets/_next/image?url=%2F_mintlify%2Fapi%2Fog%3Fdivision%3D1.%2BImport%2BText%2BData%26title%3D1.%2BImport%2BText%2BData%26logoLight%3Dhttps%253A%252F%252Fassets-docs.dify.ai%252F2025%252F05%252Fd05cfc6ebe48f725d171dc71c64a5d16.svg%26logoDark%3Dhttps%253A%252F%252Fassets-docs.dify.ai%252F2025%252F05%252Fc51f1cda47c1d9a4a162d7736f6e4c53.svg%26primaryColor%3D%25230060FF%26lightColor%3D%2523688FE8%26darkColor%3D%25230034FF%26backgroundLight%3D%2523ffffff%26backgroundDark%3D%25230b0c0f&w=1200&q=100",
|
||||
"og:image:height": "630",
|
||||
"og:image:width": "1200",
|
||||
"og:site_name": "Dify Docs",
|
||||
"og:title": "1. Import Text Data - Dify Docs",
|
||||
"og:type": "website",
|
||||
"og:url": "https://docs.dify.ai/en/use-dify/knowledge/create-knowledge/import-text-data/readme",
|
||||
"ogImage": "https://dify-6c0370d8.mintlify.app/mintlify-assets/_next/image?url=%2F_mintlify%2Fapi%2Fog%3Fdivision%3D1.%2BImport%2BText%2BData%26title%3D1.%2BImport%2BText%2BData%26logoLight%3Dhttps%253A%252F%252Fassets-docs.dify.ai%252F2025%252F05%252Fd05cfc6ebe48f725d171dc71c64a5d16.svg%26logoDark%3Dhttps%253A%252F%252Fassets-docs.dify.ai%252F2025%252F05%252Fc51f1cda47c1d9a4a162d7736f6e4c53.svg%26primaryColor%3D%25230060FF%26lightColor%3D%2523688FE8%26darkColor%3D%25230034FF%26backgroundLight%3D%2523ffffff%26backgroundDark%3D%25230b0c0f&w=1200&q=100",
|
||||
"ogTitle": "1. Import Text Data - Dify Docs",
|
||||
"ogUrl": "https://docs.dify.ai/en/use-dify/knowledge/create-knowledge/import-text-data/readme",
|
||||
"proxyUsed": "basic",
|
||||
"scrapeId": "019b024f-f76e-746b-b13c-6ca4884fdd64",
|
||||
"sourceURL": "https://docs.dify.ai/en/use-dify/knowledge/create-knowledge/import-text-data/readme",
|
||||
"statusCode": 200,
|
||||
"title": "1. Import Text Data - Dify Docs",
|
||||
"twitter:card": "summary_large_image",
|
||||
"twitter:image": "https://dify-6c0370d8.mintlify.app/mintlify-assets/_next/image?url=%2F_mintlify%2Fapi%2Fog%3Fdivision%3D1.%2BImport%2BText%2BData%26title%3D1.%2BImport%2BText%2BData%26logoLight%3Dhttps%253A%252F%252Fassets-docs.dify.ai%252F2025%252F05%252Fd05cfc6ebe48f725d171dc71c64a5d16.svg%26logoDark%3Dhttps%253A%252F%252Fassets-docs.dify.ai%252F2025%252F05%252Fc51f1cda47c1d9a4a162d7736f6e4c53.svg%26primaryColor%3D%25230060FF%26lightColor%3D%2523688FE8%26darkColor%3D%25230034FF%26backgroundLight%3D%2523ffffff%26backgroundDark%3D%25230b0c0f&w=1200&q=100",
|
||||
"twitter:image:height": "630",
|
||||
"twitter:image:width": "1200",
|
||||
"twitter:title": "1. Import Text Data - Dify Docs",
|
||||
"url": "https://docs.dify.ai/en/use-dify/knowledge/create-knowledge/import-text-data/readme",
|
||||
"viewport": "width=device-width, initial-scale=1"
|
||||
},
|
||||
"warning": "This scrape job was throttled at your current concurrency limit. If you'd like to scrape faster, you can upgrade your plan."
|
||||
},
|
||||
"success": true
|
||||
}
|
||||
]
|
||||
@@ -41,6 +41,9 @@ def chunks_embedding(texts: list[str], api_key: str) -> list[list[float]]:
|
||||
def main(text: str, api_key: str):
|
||||
|
||||
vector = chunks_embedding([text], api_key)[0]
|
||||
|
||||
return {
|
||||
'vector': vector
|
||||
'vector': {
|
||||
'vector': vector
|
||||
}
|
||||
}
|
||||
1
nodes/llm.md
Normal file
1
nodes/llm.md
Normal file
@@ -0,0 +1 @@
|
||||
|
||||
957
nodes/parse_and_add_urls.json
Normal file
957
nodes/parse_and_add_urls.json
Normal file
@@ -0,0 +1,957 @@
|
||||
{
|
||||
"files": [],
|
||||
"json": [
|
||||
{
|
||||
"links": [
|
||||
"https://docs.dify.ai",
|
||||
"https://docs.dify.ai/zh/use-dify/nodes/start",
|
||||
"https://docs.dify.ai/zh/self-host/troubleshooting/weaviate-v4-migration",
|
||||
"https://docs.dify.ai/en/use-dify/debug/variable-inspect",
|
||||
"https://docs.dify.ai/api-reference/metadata-&-tags/modify-knowledge-base-type-tag-name",
|
||||
"https://docs.dify.ai/ja/develop-plugin/features-and-specs/advanced-development/reverse-invocation-tool",
|
||||
"https://docs.dify.ai/en/develop-plugin/getting-started/cli",
|
||||
"https://docs.dify.ai/api-reference/%E3%82%A2%E3%83%97%E3%83%AA%E3%82%B1%E3%83%BC%E3%82%B7%E3%83%A7%E3%83%B3%E8%A8%AD%E5%AE%9A/%E3%82%A2%E3%83%97%E3%83%AA%E3%82%B1%E3%83%BC%E3%82%B7%E3%83%A7%E3%83%B3%E3%81%AE%E3%83%91%E3%83%A9%E3%83%A1%E3%83%BC%E3%82%BF%E6%83%85%E5%A0%B1%E3%82%92%E5%8F%96%E5%BE%97",
|
||||
"https://docs.dify.ai/en/use-dify/build/goto-anything",
|
||||
"https://docs.dify.ai/ja/use-dify/monitor/integrations/integrate-langfuse",
|
||||
"https://docs.dify.ai/en/use-dify/knowledge/create-knowledge/import-text-data/readme",
|
||||
"https://docs.dify.ai/en/use-dify/knowledge/external-knowledge-api",
|
||||
"https://docs.dify.ai/zh/develop-plugin/dev-guides-and-walkthroughs/agent-strategy-plugin",
|
||||
"https://docs.dify.ai/zh/use-dify/publish/webapp/web-app-access",
|
||||
"https://docs.dify.ai/api-reference/%E3%83%A1%E3%83%83%E3%82%BB%E3%83%BC%E3%82%B8%E3%83%95%E3%82%A3%E3%83%BC%E3%83%89%E3%83%90%E3%83%83%E3%82%AF/%E3%82%A2%E3%83%97%E3%83%AA%E3%81%AE%E3%83%A1%E3%83%83%E3%82%BB%E3%83%BC%E3%82%B8%E3%81%AE%E3%80%8C%E3%81%84%E3%81%84%E3%81%AD%E3%80%8D%E3%81%A8%E3%83%95%E3%82%A3%E3%83%BC%E3%83%89%E3%83%90%E3%83%83%E3%82%AF%E3%82%92%E5%8F%96%E5%BE%97",
|
||||
"https://docs.dify.ai/api-reference/agent/send-message",
|
||||
"https://docs.dify.ai/en/use-dify/monitor/integrations/integrate-langfuse",
|
||||
"https://docs.dify.ai/en/use-dify/knowledge/integrate-knowledge-within-application",
|
||||
"https://docs.dify.ai/ja/use-dify/monitor/analysis",
|
||||
"https://docs.dify.ai/api-reference/%E5%85%83%E6%95%B0%E6%8D%AE%E5%92%8C%E6%A0%87%E7%AD%BE/%E4%BF%AE%E6%94%B9%E7%9F%A5%E8%AF%86%E5%BA%93%E7%B1%BB%E5%9E%8B%E6%A0%87%E7%AD%BE%E5%90%8D%E7%A7%B0",
|
||||
"https://docs.dify.ai/ja/use-dify/workspace/personal-account-management",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/management/personal-account-management",
|
||||
"https://docs.dify.ai/zh/develop-plugin/getting-started/getting-started-dify-plugin",
|
||||
"https://docs.dify.ai/zh/use-dify/nodes/variable-aggregator",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/workflow/structured-outputs",
|
||||
"https://docs.dify.ai/zh/use-dify/workspace/app-management",
|
||||
"https://docs.dify.ai/en/use-dify/tutorials/article-reader",
|
||||
"https://docs.dify.ai/zh/use-dify/build/mcp",
|
||||
"https://docs.dify.ai/en/develop-plugin/features-and-specs/advanced-development/reverse-invocation-app",
|
||||
"https://docs.dify.ai/api-reference/%E3%82%A2%E3%83%97%E3%83%AA%E3%82%B1%E3%83%BC%E3%82%B7%E3%83%A7%E3%83%B3%E8%A8%AD%E5%AE%9A/%E3%82%A2%E3%83%97%E3%83%AA%E3%81%AEwebapp%E8%A8%AD%E5%AE%9A%E3%82%92%E5%8F%96%E5%BE%97",
|
||||
"https://docs.dify.ai/api-reference/feedback/message-feedback",
|
||||
"https://docs.dify.ai/zh/develop-plugin/features-and-specs/plugin-types/plugin-logging",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/workflow/file-upload",
|
||||
"https://docs.dify.ai/zh/self-host/configuration/environments",
|
||||
"https://docs.dify.ai/api-reference/%E3%83%87%E3%83%BC%E3%82%BF%E3%82%BB%E3%83%83%E3%83%88/%E3%83%8A%E3%83%AC%E3%83%83%E3%82%B8%E3%83%99%E3%83%BC%E3%82%B9%E3%81%8B%E3%82%89%E3%83%81%E3%83%A3%E3%83%B3%E3%82%AF%E3%82%92%E5%8F%96%E5%BE%97-%E3%83%86%E3%82%B9%E3%83%88%E6%A4%9C%E7%B4%A2",
|
||||
"https://docs.dify.ai/en/self-host/troubleshooting/storage-and-migration",
|
||||
"https://docs.dify.ai/ja/develop-plugin/publishing/marketplace-listing/release-overview",
|
||||
"https://docs.dify.ai/ja/use-dify/knowledge/create-knowledge/import-text-data/sync-from-website",
|
||||
"https://docs.dify.ai/api-reference/completion/create-completion-message",
|
||||
"https://docs.dify.ai/zh/use-dify/nodes/answer",
|
||||
"https://docs.dify.ai/plugin-dev-en/9242-reverse-invocation-model",
|
||||
"https://docs.dify.ai/api-reference/datasets/get-knowledge-base-details",
|
||||
"https://docs.dify.ai/en/use-dify/debug/step-run",
|
||||
"https://docs.dify.ai/ja/use-dify/knowledge/knowledge-pipeline/publish-knowledge-pipeline",
|
||||
"https://docs.dify.ai/ja/develop-plugin/publishing/marketplace-listing/release-to-dify-marketplace",
|
||||
"https://docs.dify.ai/en/use-dify/knowledge/create-knowledge/setting-indexing-methods",
|
||||
"https://docs.dify.ai/en/develop-plugin/dev-guides-and-walkthroughs/develop-flomo-plugin",
|
||||
"https://docs.dify.ai/ja/use-dify/workspace/readme",
|
||||
"https://docs.dify.ai/zh/use-dify/knowledge/knowledge-pipeline/publish-knowledge-pipeline",
|
||||
"https://docs.dify.ai/ja/use-dify/monitor/integrations/integrate-arize",
|
||||
"https://docs.dify.ai/zh/develop-plugin/features-and-specs/plugin-types/model-designing-rules",
|
||||
"https://docs.dify.ai/en/use-dify/publish/webapp/web-app-settings",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/tools/readme",
|
||||
"https://docs.dify.ai/api-reference/workflow-execution/execute-workflow",
|
||||
"https://docs.dify.ai/en/use-dify/nodes/doc-extractor",
|
||||
"https://docs.dify.ai/ja/use-dify/debug/variable-inspect",
|
||||
"https://docs.dify.ai/zh/develop-plugin/features-and-specs/advanced-development/reverse-invocation-model",
|
||||
"https://docs.dify.ai/api-reference/documents/update-document-status",
|
||||
"https://docs.dify.ai/zh/use-dify/nodes/doc-extractor",
|
||||
"https://docs.dify.ai/en/use-dify/nodes/knowledge-retrieval",
|
||||
"https://docs.dify.ai/zh/use-dify/monitor/integrations/integrate-langsmith",
|
||||
"https://docs.dify.ai/ja/develop-plugin/publishing/marketplace-listing/release-to-individual-github-repo",
|
||||
"https://docs.dify.ai/api-reference/annotations/query-initial-annotation-reply-settings-task-status",
|
||||
"https://docs.dify.ai/ja/use-dify/build/version-control",
|
||||
"https://docs.dify.ai/en/develop-plugin/dev-guides-and-walkthroughs/develop-a-slack-bot-plugin",
|
||||
"https://docs.dify.ai/zh/use-dify/build/shortcut-key",
|
||||
"https://docs.dify.ai/ja/use-dify/nodes/http-request",
|
||||
"https://docs.dify.ai/zh/use-dify/workspace/model-providers",
|
||||
"https://docs.dify.ai/zh/use-dify/nodes/knowledge-retrieval",
|
||||
"https://docs.dify.ai/ja/self-host/advanced-deployments/local-source-code",
|
||||
"https://docs.dify.ai/en/use-dify/debug/error-types",
|
||||
"https://docs.dify.ai/en/use-dify/nodes/ifelse",
|
||||
"https://docs.dify.ai/api-reference/datasets/update-knowledge-base",
|
||||
"https://docs.dify.ai/api-reference/%E3%82%A2%E3%83%97%E3%83%AA%E3%82%B1%E3%83%BC%E3%82%B7%E3%83%A7%E3%83%B3%E8%A8%AD%E5%AE%9A-%E3%83%AF%E3%83%BC%E3%82%AF%E3%83%95%E3%83%AD%E3%83%BC/%E3%82%A2%E3%83%97%E3%83%AA%E3%82%B1%E3%83%BC%E3%82%B7%E3%83%A7%E3%83%B3%E3%81%AE%E3%83%91%E3%83%A9%E3%83%A1%E3%83%BC%E3%82%BF%E6%83%85%E5%A0%B1%E3%82%92%E5%8F%96%E5%BE%97-%E3%83%AF%E3%83%BC%E3%82%AF%E3%83%95%E3%83%AD%E3%83%BC",
|
||||
"https://docs.dify.ai/en/develop-plugin/features-and-specs/plugin-types/remote-debug-a-plugin",
|
||||
"https://docs.dify.ai/en/use-dify/nodes/question-classifier",
|
||||
"https://docs.dify.ai/en/use-dify/knowledge/manage-knowledge/maintain-dataset-via-api",
|
||||
"https://docs.dify.ai/en/develop-plugin/publishing/marketplace-listing/plugin-auto-publish-pr",
|
||||
"https://docs.dify.ai/ja/self-host/platform-guides/dify-premium",
|
||||
"https://docs.dify.ai/ja/use-dify/nodes/agent",
|
||||
"https://docs.dify.ai/ja/develop-plugin/features-and-specs/plugin-types/general-specifications",
|
||||
"https://docs.dify.ai/ja/develop-plugin/features-and-specs/plugin-types/model-schema",
|
||||
"https://docs.dify.ai/ja/use-dify/nodes/code",
|
||||
"https://docs.dify.ai/ja/use-dify/monitor/annotation-reply",
|
||||
"https://docs.dify.ai/ja/self-host/quick-start/faqs",
|
||||
"https://docs.dify.ai/api-reference/%E5%BA%94%E7%94%A8%E8%AE%BE%E7%BD%AE/%E8%8E%B7%E5%8F%96%E5%BA%94%E7%94%A8%E5%9F%BA%E6%9C%AC%E4%BF%A1%E6%81%AF",
|
||||
"https://docs.dify.ai/ja/use-dify/knowledge/create-knowledge-and-upload-documents/setting-indexing-methods",
|
||||
"https://docs.dify.ai/zh/use-dify/build/goto-anything",
|
||||
"https://docs.dify.ai/en/use-dify/build/orchestrate-node",
|
||||
"https://docs.dify.ai/en/develop-plugin/dev-guides-and-walkthroughs/endpoint",
|
||||
"https://docs.dify.ai/zh/develop-plugin/features-and-specs/advanced-development/bundle",
|
||||
"https://docs.dify.ai/en/use-dify/build/mcp",
|
||||
"https://docs.dify.ai/ja/use-dify/nodes/trigger/webhook-trigger",
|
||||
"https://docs.dify.ai/en/develop-plugin/dev-guides-and-walkthroughs/creating-new-model-provider",
|
||||
"https://docs.dify.ai/api-reference/chatflow/stop-advanced-chat-message-generation",
|
||||
"https://docs.dify.ai/en/develop-plugin/publishing/marketplace-listing/release-overview",
|
||||
"https://docs.dify.ai/zh/use-dify/nodes/http-request",
|
||||
"https://docs.dify.ai/ja/use-dify/nodes/trigger/overview",
|
||||
"https://docs.dify.ai/api-reference/annotations/initial-annotation-reply-settings",
|
||||
"https://docs.dify.ai/zh/use-dify/workspace/team-members-management",
|
||||
"https://docs.dify.ai/en/self-host/troubleshooting/integrations",
|
||||
"https://docs.dify.ai/en/use-dify/workspace/personal-account-management",
|
||||
"https://docs.dify.ai/ja/develop-plugin/dev-guides-and-walkthroughs/tool-oauth",
|
||||
"https://docs.dify.ai/ja/develop-plugin/features-and-specs/advanced-development/reverse-invocation-node",
|
||||
"https://docs.dify.ai/zh/use-dify/nodes/variable-assigner",
|
||||
"https://docs.dify.ai/en/use-dify/publish/embedding-in-websites",
|
||||
"https://docs.dify.ai/en/use-dify/nodes/tools",
|
||||
"https://docs.dify.ai/en/use-dify/nodes/list-operator",
|
||||
"https://docs.dify.ai/ja/use-dify/knowledge/readme",
|
||||
"https://docs.dify.ai/api-reference/files/file-preview",
|
||||
"https://docs.dify.ai/ja/use-dify/knowledge/create-knowledge/import-text-data/sync-from-notion",
|
||||
"https://docs.dify.ai/zh/use-dify/debug/step-run",
|
||||
"https://docs.dify.ai/ja/develop-plugin/features-and-specs/plugin-types/model-designing-rules",
|
||||
"https://docs.dify.ai/api-reference/completion/stop-generate",
|
||||
"https://docs.dify.ai/zh/use-dify/knowledge/create-knowledge/chunking-and-cleaning-text",
|
||||
"https://docs.dify.ai/api-reference/completion/send-completion-message",
|
||||
"https://docs.dify.ai/api-reference/documents/delete-a-document",
|
||||
"https://docs.dify.ai/zh/use-dify/knowledge/knowledge-pipeline/upload-files",
|
||||
"https://docs.dify.ai/ja/self-host/configuration/environments",
|
||||
"https://docs.dify.ai/zh/use-dify/nodes/parameter-extractor",
|
||||
"https://docs.dify.ai/zh/use-dify/debug/error-type",
|
||||
"https://docs.dify.ai/api-reference/%E3%83%81%E3%83%A3%E3%83%B3%E3%82%AF/%E5%AD%90%E3%83%81%E3%83%A3%E3%83%B3%E3%82%AF%E3%82%92%E6%9B%B4%E6%96%B0",
|
||||
"https://docs.dify.ai/api-reference/%E3%83%81%E3%83%A3%E3%83%B3%E3%82%AF/%E3%83%89%E3%82%AD%E3%83%A5%E3%83%A1%E3%83%B3%E3%83%88%E5%86%85%E3%81%AE%E3%83%81%E3%83%A3%E3%83%B3%E3%82%AF%E8%A9%B3%E7%B4%B0%E3%82%92%E5%8F%96%E5%BE%97",
|
||||
"https://docs.dify.ai/zh/use-dify/tutorials/article-reader",
|
||||
"https://docs.dify.ai/zh/use-dify/nodes/output",
|
||||
"https://docs.dify.ai/ja/self-host/quick-start/docker-compose",
|
||||
"https://docs.dify.ai/zh/develop-plugin/features-and-specs/plugin-types/persistent-storage-kv",
|
||||
"https://docs.dify.ai/zh/use-dify/publish/webapp/workflow-webapp",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/application-publishing/developing-with-apis",
|
||||
"https://docs.dify.ai/zh/use-dify/monitor/integrations/integrate-opik",
|
||||
"https://docs.dify.ai/api-reference/conversations/get-conversation-variables",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/workflow/readme",
|
||||
"https://docs.dify.ai/zh/use-dify/knowledge/knowledge-pipeline/authorize-data-source",
|
||||
"https://docs.dify.ai/zh/develop-plugin/publishing/standards/contributor-covenant-code-of-conduct",
|
||||
"https://docs.dify.ai/en/use-dify/getting-started/quick-start",
|
||||
"https://docs.dify.ai/api-reference/metadata-&-tags/unbind-dataset-and-knowledge-base-type-tag",
|
||||
"https://docs.dify.ai/zh/use-dify/knowledge/external-knowledge-api",
|
||||
"https://docs.dify.ai/ja/use-dify/getting-started/key-concepts",
|
||||
"https://docs.dify.ai/api-reference/metadata-&-tags/delete-knowledge-base-type-tag",
|
||||
"https://docs.dify.ai/en/use-dify/monitor/integrations/integrate-weave",
|
||||
"https://docs.dify.ai/zh/use-dify/build/orchestrate-node",
|
||||
"https://docs.dify.ai/api-reference/annotations/get-annotation-list",
|
||||
"https://docs.dify.ai/en/use-dify/nodes/parameter-extractor",
|
||||
"https://docs.dify.ai/ja/develop-plugin/dev-guides-and-walkthroughs/datasource-plugin",
|
||||
"https://docs.dify.ai/ja/use-dify/nodes/doc-extractor",
|
||||
"https://docs.dify.ai/zh-self/quick-start/faqs",
|
||||
"https://docs.dify.ai/en/use-dify/getting-started/introduction",
|
||||
"https://docs.dify.ai/zh/use-dify/nodes/ifelse",
|
||||
"https://docs.dify.ai/en/use-dify/knowledge/knowledge-pipeline/manage-knowledge-base",
|
||||
"https://docs.dify.ai/zh/self-host/quick-start/docker-compose",
|
||||
"https://docs.dify.ai/api-reference/%E6%96%87%E6%A1%A3/%E4%BB%8E%E6%96%87%E6%9C%AC%E5%88%9B%E5%BB%BA%E6%96%87%E6%A1%A3",
|
||||
"https://docs.dify.ai/en/self-host/platform-guides/bt-panel",
|
||||
"https://docs.dify.ai/zh/use-dify/debug/variable-inspect",
|
||||
"https://docs.dify.ai/en/use-dify/monitor/logs",
|
||||
"https://docs.dify.ai/api-reference/chat/stop-chat-message-generation",
|
||||
"https://docs.dify.ai/ja/use-dify/build/shortcut-key",
|
||||
"https://docs.dify.ai/api-reference/datasets/create-an-empty-knowledge-base",
|
||||
"https://docs.dify.ai/ja/use-dify/build/predefined-error-handling-logic",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/application-orchestrate/readme",
|
||||
"https://docs.dify.ai/en/use-dify/nodes/loop",
|
||||
"https://docs.dify.ai/zh/use-dify/applications/workflow-overview",
|
||||
"https://docs.dify.ai/zh/develop-plugin/publishing/faq/faq",
|
||||
"https://docs.dify.ai/api-reference/%E3%82%A2%E3%83%97%E3%83%AA%E3%82%B1%E3%83%BC%E3%82%B7%E3%83%A7%E3%83%B3%E6%83%85%E5%A0%B1/%E3%82%A2%E3%83%97%E3%83%AA%E3%82%B1%E3%83%BC%E3%82%B7%E3%83%A7%E3%83%B3%E3%81%AE%E3%83%91%E3%83%A9%E3%83%A1%E3%83%BC%E3%82%BF%E6%83%85%E5%A0%B1%E3%82%92%E5%8F%96%E5%BE%97",
|
||||
"https://docs.dify.ai/en/develop-plugin/getting-started/getting-started-dify-plugin",
|
||||
"https://docs.dify.ai/ja/develop-plugin/publishing/marketplace-listing/release-by-file",
|
||||
"https://docs.dify.ai/ja/develop-plugin/publishing/marketplace-listing/plugin-auto-publish-pr",
|
||||
"https://docs.dify.ai/ja/self-host/troubleshooting/common-issues",
|
||||
"https://docs.dify.ai/ja/use-dify/nodes/iteration",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/workflow/node/agent",
|
||||
"https://docs.dify.ai/ja/use-dify/nodes/loop",
|
||||
"https://docs.dify.ai/en/develop-plugin/publishing/standards/third-party-signature-verification",
|
||||
"https://docs.dify.ai/en/develop-plugin/features-and-specs/plugin-types/persistent-storage-kv",
|
||||
"https://docs.dify.ai/api-reference/chat-message",
|
||||
"https://docs.dify.ai/ja/use-dify/nodes/variable-aggregator",
|
||||
"https://docs.dify.ai/ja/use-dify/nodes/llm",
|
||||
"https://docs.dify.ai/en/use-dify/build/additional-features",
|
||||
"https://docs.dify.ai/ja/use-dify/publish/publish-mcp",
|
||||
"https://docs.dify.ai/ja/develop-plugin/getting-started/cli",
|
||||
"https://docs.dify.ai/en/use-dify/build/predefined-error-handling-logic",
|
||||
"https://docs.dify.ai/en/use-dify/knowledge/knowledge-pipeline/knowledge-pipeline-orchestration",
|
||||
"https://docs.dify.ai/api-reference/%E3%83%81%E3%83%A3%E3%83%B3%E3%82%AF/%E5%AD%90%E3%83%81%E3%83%A3%E3%83%B3%E3%82%AF%E3%82%92%E5%89%8A%E9%99%A4",
|
||||
"https://docs.dify.ai/zh/self-host/troubleshooting/storage-and-migration",
|
||||
"https://docs.dify.ai/en/develop-plugin/publishing/standards/privacy-protection-guidelines",
|
||||
"https://docs.dify.ai/ja/develop-plugin/features-and-specs/plugin-types/remote-debug-a-plugin",
|
||||
"https://docs.dify.ai/zh/develop-plugin/publishing/marketplace-listing/release-overview",
|
||||
"https://docs.dify.ai/en/use-dify/tutorials/simple-chatbot",
|
||||
"https://docs.dify.ai/en/develop-plugin/publishing/faq/faq",
|
||||
"https://docs.dify.ai/en/use-dify/monitor/integrations/integrate-aliyun",
|
||||
"https://docs.dify.ai/zh/use-dify/monitor/integrations/integrate-phoenix",
|
||||
"https://docs.dify.ai/en/develop-plugin/features-and-specs/advanced-development/reverse-invocation",
|
||||
"https://docs.dify.ai/api-reference/%E3%83%81%E3%83%A3%E3%83%B3%E3%82%AF/%E5%AD%90%E3%83%81%E3%83%A3%E3%83%B3%E3%82%AF%E3%82%92%E4%BD%9C%E6%88%90",
|
||||
"https://docs.dify.ai/zh/develop-plugin/dev-guides-and-walkthroughs/develop-flomo-plugin",
|
||||
"https://docs.dify.ai/ja/use-dify/nodes/user-input",
|
||||
"https://docs.dify.ai/zh/develop-plugin/features-and-specs/advanced-development/reverse-invocation-app",
|
||||
"https://docs.dify.ai/en/develop-plugin/features-and-specs/advanced-development/reverse-invocation-tool",
|
||||
"https://docs.dify.ai/api-reference/%E3%82%A2%E3%83%97%E3%83%AA%E3%82%B1%E3%83%BC%E3%82%B7%E3%83%A7%E3%83%B3%E8%A8%AD%E5%AE%9A-%E3%83%AF%E3%83%BC%E3%82%AF%E3%83%95%E3%83%AD%E3%83%BC/%E3%82%A2%E3%83%97%E3%83%AA%E3%82%B1%E3%83%BC%E3%82%B7%E3%83%A7%E3%83%B3%E3%81%AE%E5%9F%BA%E6%9C%AC%E6%83%85%E5%A0%B1%E3%82%92%E5%8F%96%E5%BE%97-%E3%83%AF%E3%83%BC%E3%82%AF%E3%83%95%E3%83%AD%E3%83%BC",
|
||||
"https://docs.dify.ai/ja/develop-plugin/dev-guides-and-walkthroughs/trigger-plugin",
|
||||
"https://docs.dify.ai/api-reference/chunks/get-a-chunk-details-in-a-document",
|
||||
"https://docs.dify.ai/ja/use-dify/knowledge/knowledge-pipeline/manage-knowledge-base",
|
||||
"https://docs.dify.ai/en/develop-plugin/publishing/marketplace-listing/release-to-individual-github-repo",
|
||||
"https://docs.dify.ai/ja/use-dify/monitor/integrations/integrate-weave",
|
||||
"https://docs.dify.ai/zh/develop-plugin/publishing/marketplace-listing/plugin-auto-publish-pr",
|
||||
"https://docs.dify.ai/en/develop-plugin/features-and-specs/advanced-development/customizable-model",
|
||||
"https://docs.dify.ai/en/self-host/troubleshooting/common-issues",
|
||||
"https://docs.dify.ai/ja/use-dify/nodes/variable-assigner",
|
||||
"https://docs.dify.ai/api-reference/workflow-execution/get-workflow-run-detail",
|
||||
"https://docs.dify.ai/zh/use-dify/monitor/logs",
|
||||
"https://docs.dify.ai/zh/develop-plugin/features-and-specs/plugin-types/tool",
|
||||
"https://docs.dify.ai/ja/use-dify/nodes/knowledge-retrieval",
|
||||
"https://docs.dify.ai/ja/use-dify/knowledge/knowledge-pipeline/create-knowledge-pipeline",
|
||||
"https://docs.dify.ai/api-reference/files/file-upload",
|
||||
"https://docs.dify.ai/en/use-dify/publish/developing-with-apis",
|
||||
"https://docs.dify.ai/ja/use-dify/getting-started/introduction",
|
||||
"https://docs.dify.ai/versions/3-3-x/zh/user-guide/tools/mcp",
|
||||
"https://docs.dify.ai/en/self-host/quick-start/faqs",
|
||||
"https://docs.dify.ai/ja/use-dify/nodes/parameter-extractor",
|
||||
"https://docs.dify.ai/api-reference/%E6%95%B0%E6%8D%AE%E9%9B%86/%E4%BB%8E%E7%9F%A5%E8%AF%86%E5%BA%93%E6%A3%80%E7%B4%A2%E5%9D%97-%E6%B5%8B%E8%AF%95%E6%A3%80%E7%B4%A2",
|
||||
"https://docs.dify.ai/api-reference/%E3%82%A2%E3%83%97%E3%83%AA%E3%82%B1%E3%83%BC%E3%82%B7%E3%83%A7%E3%83%B3%E8%A8%AD%E5%AE%9A/%E3%82%A2%E3%83%97%E3%83%AA%E3%82%B1%E3%83%BC%E3%82%B7%E3%83%A7%E3%83%B3%E3%81%AE%E3%83%A1%E3%82%BF%E6%83%85%E5%A0%B1%E3%82%92%E5%8F%96%E5%BE%97",
|
||||
"https://docs.dify.ai/en/use-dify/publish/webapp/web-app-access",
|
||||
"https://docs.dify.ai/ja/develop-plugin/dev-guides-and-walkthroughs/endpoint",
|
||||
"https://docs.dify.ai/zh/use-dify/publish/developing-with-apis",
|
||||
"https://docs.dify.ai/api-reference/%E6%96%87%E6%9C%AC%E7%94%9F%E6%88%90/%E5%8F%91%E9%80%81%E6%B6%88%E6%81%AF",
|
||||
"https://docs.dify.ai/en/develop-plugin/dev-guides-and-walkthroughs/tool-oauth",
|
||||
"https://docs.dify.ai/ja/use-dify/debug/history-and-logs",
|
||||
"https://docs.dify.ai/api-reference/metadata-&-tags/bind-dataset-to-knowledge-base-type-tag",
|
||||
"https://docs.dify.ai/en/use-dify/knowledge/knowledge-pipeline/upload-files",
|
||||
"https://docs.dify.ai/zh/use-dify/build/additional-features",
|
||||
"https://docs.dify.ai/ja/use-dify/knowledge/metadata",
|
||||
"https://docs.dify.ai/en/use-dify/nodes/template",
|
||||
"https://docs.dify.ai/ja/use-dify/monitor/integrations/integrate-opik",
|
||||
"https://docs.dify.ai/ja/use-dify/tutorials/build-ai-image-generation-app",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/workflow/node/code",
|
||||
"https://docs.dify.ai/zh/use-dify/nodes/question-classifier",
|
||||
"https://docs.dify.ai/ja/use-dify/knowledge/api-documentation/external-knowledge-api-documentation",
|
||||
"https://docs.dify.ai/en/use-dify/workspace/team-members-management",
|
||||
"https://docs.dify.ai/ja/use-dify/tutorials/twitter-chatflow",
|
||||
"https://docs.dify.ai/api-reference/%E3%83%81%E3%83%A3%E3%83%83%E3%83%88%E3%83%A1%E3%83%83%E3%82%BB%E3%83%BC%E3%82%B8/%E7%94%9F%E6%88%90%E5%81%9C%E6%AD%A2",
|
||||
"https://docs.dify.ai/api-reference/datasets/retrieve-chunks-from-a-knowledge-base-test-retrieval",
|
||||
"https://docs.dify.ai/zh/use-dify/debug/history-and-logs",
|
||||
"https://docs.dify.ai/en/self-host/troubleshooting/weaviate-v4-migration",
|
||||
"https://docs.dify.ai/zh/develop-plugin/features-and-specs/advanced-development/reverse-invocation-node",
|
||||
"https://docs.dify.ai/ja/use-dify/knowledge/connect-external-knowledge-base",
|
||||
"https://docs.dify.ai/en/use-dify/tutorials/build-ai-image-generation-app",
|
||||
"https://docs.dify.ai/zh/use-dify/knowledge/test-retrieval",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/application-orchestrate/workflow",
|
||||
"https://docs.dify.ai/zh/use-dify/workflows/supported-blocks",
|
||||
"https://docs.dify.ai/en/use-dify/monitor/integrations/integrate-langsmith",
|
||||
"https://docs.dify.ai/en/develop-plugin/dev-guides-and-walkthroughs/agent-strategy-plugin",
|
||||
"https://docs.dify.ai/api-reference/%E5%AF%B9%E8%AF%9D%E6%B6%88%E6%81%AF/%E5%8F%91%E9%80%81%E5%AF%B9%E8%AF%9D%E6%B6%88%E6%81%AF",
|
||||
"https://docs.dify.ai/zh/use-dify/knowledge/knowledge-and-documents-maintenance/maintain-dataset-via-api",
|
||||
"https://docs.dify.ai/api-reference/documents/update-a-document-with-text",
|
||||
"https://docs.dify.ai/api-reference/annotations/create-annotation",
|
||||
"https://docs.dify.ai/api-reference/workflow-execution/stop-workflow-task-generation",
|
||||
"https://docs.dify.ai/en/self-host/quick-start/docker-compose",
|
||||
"https://docs.dify.ai/en/develop-plugin/features-and-specs/plugin-types/model-designing-rules",
|
||||
"https://docs.dify.ai/zh/develop-plugin/features-and-specs/advanced-development/customizable-model",
|
||||
"https://docs.dify.ai/en/develop-plugin/publishing/standards/contributor-covenant-code-of-conduct",
|
||||
"https://docs.dify.ai/en/develop-plugin/dev-guides-and-walkthroughs/trigger-plugin",
|
||||
"https://docs.dify.ai/ja/use-dify/nodes/question-classifier",
|
||||
"https://docs.dify.ai/api-reference/%E3%82%A2%E3%83%97%E3%83%AA%E3%82%B1%E3%83%BC%E3%82%B7%E3%83%A7%E3%83%B3%E6%83%85%E5%A0%B1/%E3%82%A2%E3%83%97%E3%83%AA%E3%81%AEwebapp%E8%A8%AD%E5%AE%9A%E3%82%92%E5%8F%96%E5%BE%97",
|
||||
"https://docs.dify.ai/api-reference/%E5%B7%A5%E4%BD%9C%E6%B5%81%E6%89%A7%E8%A1%8C/%E6%89%A7%E8%A1%8C-workflow",
|
||||
"https://docs.dify.ai/api-reference/tts/speech-to-text",
|
||||
"https://docs.dify.ai/en/use-dify/publish/webapp/embedding-in-websites",
|
||||
"https://docs.dify.ai/api-reference/tts/text-to-audio",
|
||||
"https://docs.dify.ai/ja/use-dify/tutorials/article-reader",
|
||||
"https://docs.dify.ai/zh/use-dify/monitor/annotation-reply",
|
||||
"https://docs.dify.ai/en/use-dify/publish/README",
|
||||
"https://docs.dify.ai/zh/use-dify/getting-started/introduction",
|
||||
"https://docs.dify.ai/zh/use-dify/nodes/trigger/webhook-trigger",
|
||||
"https://docs.dify.ai/ja/develop-plugin/dev-guides-and-walkthroughs/develop-a-slack-bot-plugin",
|
||||
"https://docs.dify.ai/zh/develop-plugin/dev-guides-and-walkthroughs/develop-a-slack-bot-plugin",
|
||||
"https://docs.dify.ai/zh/use-dify/knowledge/knowledge-request-rate-limit",
|
||||
"https://docs.dify.ai/ja/use-dify/workspace/plugins",
|
||||
"https://docs.dify.ai/zh/use-dify/publish/webapp/web-app-settings",
|
||||
"https://docs.dify.ai/ja/use-dify/knowledge/test-retrieval",
|
||||
"https://docs.dify.ai/ja/use-dify/publish/README",
|
||||
"https://docs.dify.ai/zh/use-dify/nodes/iteration",
|
||||
"https://docs.dify.ai/en/use-dify/debug/error-type",
|
||||
"https://docs.dify.ai/zh/use-dify/build/version-control",
|
||||
"https://docs.dify.ai/api-reference/metadata-&-tags/create-new-knowledge-base-type-tag",
|
||||
"https://docs.dify.ai/zh/use-dify/knowledge/create-knowledge/import-text-data/sync-from-notion",
|
||||
"https://docs.dify.ai/en/use-dify/monitor/annotation-reply",
|
||||
"https://docs.dify.ai/api-reference/documents/get-document-embedding-status-progress",
|
||||
"https://docs.dify.ai/en/develop-plugin/dev-guides-and-walkthroughs/datasource-plugin",
|
||||
"https://docs.dify.ai/en/use-dify/knowledge/manage-knowledge/maintain-knowledge-documents",
|
||||
"https://docs.dify.ai/en/use-dify/getting-started/key-concepts",
|
||||
"https://docs.dify.ai/ja/use-dify/publish/chatflow-webapp",
|
||||
"https://docs.dify.ai/api-reference/%E3%83%81%E3%83%A3%E3%83%83%E3%83%88%E3%83%A1%E3%83%83%E3%82%BB%E3%83%BC%E3%82%B8/%E7%94%9F%E6%88%90%E3%82%92%E5%81%9C%E6%AD%A2",
|
||||
"https://docs.dify.ai/guides/application-orchestrate/agent",
|
||||
"https://docs.dify.ai/ja/use-dify/knowledge/knowledge-pipeline/readme",
|
||||
"https://docs.dify.ai/zh/use-dify/nodes/llm",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/workflow/node/agent",
|
||||
"https://docs.dify.ai/api-reference/chunks/update-child-chunk",
|
||||
"https://docs.dify.ai/en/develop-plugin/publishing/marketplace-listing/release-to-dify-marketplace",
|
||||
"https://docs.dify.ai/api-reference/%E3%83%81%E3%83%A3%E3%83%B3%E3%82%AF/%E3%83%89%E3%82%AD%E3%83%A5%E3%83%A1%E3%83%B3%E3%83%88%E5%86%85%E3%81%AE%E3%83%81%E3%83%A3%E3%83%B3%E3%82%AF%E3%82%92%E6%9B%B4%E6%96%B0",
|
||||
"https://docs.dify.ai/zh/use-dify/nodes/trigger/plugin-trigger",
|
||||
"https://docs.dify.ai/ja/use-dify/knowledge/manage-knowledge/maintain-dataset-via-api",
|
||||
"https://docs.dify.ai/zh/use-dify/nodes/template",
|
||||
"https://docs.dify.ai/en/use-dify/publish/web-app-access",
|
||||
"https://docs.dify.ai/api-reference/%E3%83%81%E3%83%A3%E3%83%B3%E3%82%AF/%E5%AD%90%E3%83%81%E3%83%A3%E3%83%B3%E3%82%AF%E3%82%92%E5%8F%96%E5%BE%97",
|
||||
"https://docs.dify.ai/guides/tools",
|
||||
"https://docs.dify.ai/zh/use-dify/knowledge/manage-knowledge/maintain-dataset-via-api",
|
||||
"https://docs.dify.ai/api-reference/feedback/get-feedbacks-of-application",
|
||||
"https://docs.dify.ai/api-reference/chat/next-suggested-questions",
|
||||
"https://docs.dify.ai/en/develop-plugin/dev-guides-and-walkthroughs/develop-multimodal-data-processing-tool",
|
||||
"https://docs.dify.ai/api-reference/annotations/update-annotation",
|
||||
"https://docs.dify.ai/zh/use-dify/nodes/tools",
|
||||
"https://docs.dify.ai/ja/develop-plugin/dev-guides-and-walkthroughs/develop-multimodal-data-processing-tool",
|
||||
"https://docs.dify.ai/zh/use-dify/knowledge/readme",
|
||||
"https://docs.dify.ai/api-reference/documents/update-a-document-with-a-file",
|
||||
"https://docs.dify.ai/en/use-dify/nodes/code",
|
||||
"https://docs.dify.ai/en/use-dify/build/version-control",
|
||||
"https://docs.dify.ai/en/use-dify/publish/publish-mcp",
|
||||
"https://docs.dify.ai/ja/use-dify/build/orchestrate-node",
|
||||
"https://docs.dify.ai/ja/develop-plugin/publishing/faq/faq",
|
||||
"https://docs.dify.ai/api-reference/files/file-upload-for-workflow",
|
||||
"https://docs.dify.ai/zh/use-dify/build/cn/use-dify/debug/error-type",
|
||||
"https://docs.dify.ai/ja/develop-plugin/features-and-specs/plugin-types/multilingual-readme",
|
||||
"https://docs.dify.ai/ja/use-dify/publish/webapp/chatflow-webapp",
|
||||
"https://docs.dify.ai/zh/use-dify/publish/chatflow-webapp",
|
||||
"https://docs.dify.ai/api-reference/%E3%83%87%E3%83%BC%E3%82%BF%E3%82%BB%E3%83%83%E3%83%88/%E3%83%8A%E3%83%AC%E3%83%83%E3%82%B8%E3%83%99%E3%83%BC%E3%82%B9%E3%82%92%E5%89%8A%E9%99%A4",
|
||||
"https://docs.dify.ai/zh/use-dify/monitor/integrations/integrate-aliyun",
|
||||
"https://docs.dify.ai/ja/use-dify/monitor/integrations/integrate-langsmith",
|
||||
"https://docs.dify.ai/en/use-dify/tutorials/twitter-chatflow",
|
||||
"https://docs.dify.ai/guides/application-publishing/openai-compatible-api",
|
||||
"https://docs.dify.ai/en/use-dify/knowledge/create-knowledge/import-text-data/sync-from-website",
|
||||
"https://docs.dify.ai/en/use-dify/nodes/answer",
|
||||
"https://docs.dify.ai/zh/use-dify/monitor/integrations/integrate-weave",
|
||||
"https://docs.dify.ai/zh/use-dify/knowledge/create-knowledge/import-text-data/sync-from-website",
|
||||
"https://docs.dify.ai/zh/use-dify/nodes/agent",
|
||||
"https://docs.dify.ai/zh/develop-plugin/features-and-specs/advanced-development/reverse-invocation",
|
||||
"https://docs.dify.ai/guides/workflow/file-variables",
|
||||
"https://docs.dify.ai/en/develop-plugin/features-and-specs/plugin-types/plugin-logging",
|
||||
"https://docs.dify.ai/en/use-dify/knowledge/create-knowledge/introduction",
|
||||
"https://docs.dify.ai/zh/develop-plugin/publishing/standards/privacy-protection-guidelines",
|
||||
"https://docs.dify.ai/en/develop-plugin/features-and-specs/plugin-types/model-schema",
|
||||
"https://docs.dify.ai/ja/use-dify/publish/developing-with-apis",
|
||||
"https://docs.dify.ai/ja/self-host/platform-guides/bt-panel",
|
||||
"https://docs.dify.ai/api-reference/datasets/create-document-by-text",
|
||||
"https://docs.dify.ai/ja/use-dify/workspace/subscription-management",
|
||||
"https://docs.dify.ai/api-reference/%E3%83%81%E3%83%A3%E3%83%B3%E3%82%AF/%E3%83%89%E3%82%AD%E3%83%A5%E3%83%A1%E3%83%B3%E3%83%88%E5%86%85%E3%81%AE%E3%83%81%E3%83%A3%E3%83%B3%E3%82%AF%E3%82%92%E5%89%8A%E9%99%A4",
|
||||
"https://docs.dify.ai/zh/develop-plugin/features-and-specs/plugin-types/general-specifications",
|
||||
"https://docs.dify.ai/zh/use-dify/build/predefined-error-handling-logic",
|
||||
"https://docs.dify.ai/ja/develop-plugin/dev-guides-and-walkthroughs/develop-flomo-plugin",
|
||||
"https://docs.dify.ai/ja/develop-plugin/publishing/standards/contributor-covenant-code-of-conduct",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/workflow/node/llm",
|
||||
"https://docs.dify.ai/en/use-dify/nodes/iteration",
|
||||
"https://docs.dify.ai/ja/use-dify/debug/step-run",
|
||||
"https://docs.dify.ai/en/use-dify/knowledge/create-knowledge/import-text-data/sync-from-notion",
|
||||
"https://docs.dify.ai/en/use-dify/knowledge/metadata",
|
||||
"https://docs.dify.ai/zh/self-host/quick-start/faqs",
|
||||
"https://docs.dify.ai/en/use-dify/nodes/trigger/plugin-trigger",
|
||||
"https://docs.dify.ai/zh/develop-plugin/features-and-specs/plugin-types/remote-debug-a-plugin",
|
||||
"https://docs.dify.ai/zh/use-dify/tutorials/customer-service-bot",
|
||||
"https://docs.dify.ai/zh/use-dify/publish/webapp/embedding-in-websites",
|
||||
"https://docs.dify.ai/en/use-dify/monitor/analysis",
|
||||
"https://docs.dify.ai/api-reference/application/get-application-webapp-settings",
|
||||
"https://docs.dify.ai/zh/use-dify/knowledge/create-knowledge/setting-indexing-methods",
|
||||
"https://docs.dify.ai/zh/develop-plugin/publishing/marketplace-listing/release-to-dify-marketplace",
|
||||
"https://docs.dify.ai/ja/use-dify/workspace/model-providers",
|
||||
"https://docs.dify.ai/en/use-dify/publish/webapp/chatflow-webapp",
|
||||
"https://docs.dify.ai/zh/use-dify/knowledge/connect-external-knowledge-base",
|
||||
"https://docs.dify.ai/ja/use-dify/nodes/output",
|
||||
"https://docs.dify.ai/api-reference/documents/get-the-document-list-of-a-knowledge-base",
|
||||
"https://docs.dify.ai/en/use-dify/knowledge/knowledge-pipeline/create-knowledge-pipeline",
|
||||
"https://docs.dify.ai/en/use-dify/nodes/http-request",
|
||||
"https://docs.dify.ai/api-reference/conversations/delete-conversation",
|
||||
"https://docs.dify.ai/zh/self-host/troubleshooting/integrations",
|
||||
"https://docs.dify.ai/zh/develop-plugin/dev-guides-and-walkthroughs/trigger-plugin",
|
||||
"https://docs.dify.ai/en/develop-plugin/features-and-specs/plugin-types/plugin-info-by-manifest",
|
||||
"https://docs.dify.ai/en/use-dify/monitor/integrations/integrate-opik",
|
||||
"https://docs.dify.ai/api-reference/%E6%96%87%E6%A1%A3/%E7%94%A8%E6%96%87%E6%9C%AC%E6%9B%B4%E6%96%B0%E6%96%87%E6%A1%A3",
|
||||
"https://docs.dify.ai/ja/develop-plugin/features-and-specs/advanced-development/reverse-invocation-app",
|
||||
"https://docs.dify.ai/zh/develop-plugin/dev-guides-and-walkthroughs/endpoint",
|
||||
"https://docs.dify.ai/zh/use-dify/knowledge/manage-knowledge/maintain-knowledge-documents",
|
||||
"https://docs.dify.ai/ja/develop-plugin/features-and-specs/plugin-types/persistent-storage-kv",
|
||||
"https://docs.dify.ai/ja/use-dify/knowledge/knowledge-pipeline/authorize-data-source",
|
||||
"https://docs.dify.ai/zh/use-dify/publish/README",
|
||||
"https://docs.dify.ai/zh/develop-plugin/dev-guides-and-walkthroughs/cheatsheet",
|
||||
"https://docs.dify.ai/en/use-dify/getting-started/readme",
|
||||
"https://docs.dify.ai/ja/use-dify/nodes/trigger/plugin-trigger",
|
||||
"https://docs.dify.ai/en/use-dify/monitor/integrations/integrate-arize",
|
||||
"https://docs.dify.ai/zh/develop-plugin/publishing/marketplace-listing/release-to-individual-github-repo",
|
||||
"https://docs.dify.ai/ja/use-dify/knowledge/knowledge-request-rate-limit",
|
||||
"https://docs.dify.ai/ja/develop-plugin/features-and-specs/advanced-development/bundle",
|
||||
"https://docs.dify.ai/ja/use-dify/nodes/list-operator",
|
||||
"https://docs.dify.ai/zh/self-host/advanced-deployments/local-source-code",
|
||||
"https://docs.dify.ai/ja/use-dify/build/goto-anything",
|
||||
"https://docs.dify.ai/ja/self-host/troubleshooting/docker-issues",
|
||||
"https://docs.dify.ai/ja/use-dify/build/mcp",
|
||||
"https://docs.dify.ai/ja/use-dify/knowledge/knowledge-pipeline/knowledge-pipeline-orchestration",
|
||||
"https://docs.dify.ai/api-reference/chunks/add-chunks-to-a-document",
|
||||
"https://docs.dify.ai/api-reference/conversations/get-conversations",
|
||||
"https://docs.dify.ai/ja/self-host/troubleshooting/storage-and-migration",
|
||||
"https://docs.dify.ai/en/use-dify/workspace/model-providers",
|
||||
"https://docs.dify.ai/en/use-dify/knowledge/test-retrieval",
|
||||
"https://docs.dify.ai/zh/use-dify/workspace/personal-account-management",
|
||||
"https://docs.dify.ai/zh/use-dify/publish/publish-mcp",
|
||||
"https://docs.dify.ai/en/self-host/advanced-deployments/start-the-frontend-docker-container",
|
||||
"https://docs.dify.ai/api-reference/application/get-application-basic-information",
|
||||
"https://docs.dify.ai/en/use-dify/nodes/llm",
|
||||
"https://docs.dify.ai/zh/develop-plugin/dev-guides-and-walkthroughs/datasource-plugin",
|
||||
"https://docs.dify.ai/zh/use-dify/knowledge/knowledge-and-documents-maintenance/introduction",
|
||||
"https://docs.dify.ai/zh/use-dify/knowledge/integrate-knowledge-within-application",
|
||||
"https://docs.dify.ai/sitemap.xml",
|
||||
"https://docs.dify.ai/en/use-dify/publish/webapp/workflow-webapp",
|
||||
"https://docs.dify.ai/api-reference/chatflow/next-suggested-questions",
|
||||
"https://docs.dify.ai/zh/use-dify/monitor/integrations/integrate-langfuse",
|
||||
"https://docs.dify.ai/zh/use-dify/tutorials/twitter-chatflow",
|
||||
"https://docs.dify.ai/en/self-host/configuration/environments",
|
||||
"https://docs.dify.ai/en/develop-plugin/features-and-specs/advanced-development/reverse-invocation-model",
|
||||
"https://docs.dify.ai/ja/self-host/troubleshooting/integrations",
|
||||
"https://docs.dify.ai/ja/use-dify/publish/webapp/embedding-in-websites",
|
||||
"https://docs.dify.ai/en/use-dify/knowledge/knowledge-request-rate-limit",
|
||||
"https://docs.dify.ai/ja/develop-plugin/features-and-specs/advanced-development/customizable-model",
|
||||
"https://docs.dify.ai/api-reference/chat/send-chat-message",
|
||||
"https://docs.dify.ai/en/use-dify/workspace/readme",
|
||||
"https://docs.dify.ai/ja/develop-plugin/dev-guides-and-walkthroughs/develop-md-exporter",
|
||||
"https://docs.dify.ai/ja/use-dify/nodes/ifelse",
|
||||
"https://docs.dify.ai/ja/use-dify/knowledge/create-knowledge/import-text-data/readme",
|
||||
"https://docs.dify.ai/zh/develop-plugin/features-and-specs/plugin-types/model-schema",
|
||||
"https://docs.dify.ai/ja/use-dify/publish/webapp/web-app-settings",
|
||||
"https://docs.dify.ai/ja/develop-plugin/publishing/standards/privacy-protection-guidelines",
|
||||
"https://docs.dify.ai/zh/use-dify/knowledge-base/readme",
|
||||
"https://docs.dify.ai/en/develop-plugin/features-and-specs/advanced-development/reverse-invocation-node",
|
||||
"https://docs.dify.ai/ja/use-dify/tutorials/customer-service-bot",
|
||||
"https://docs.dify.ai/en/use-dify/knowledge/knowledge-pipeline/authorize-data-source",
|
||||
"https://docs.dify.ai/ja/use-dify/knowledge/manage-knowledge/introduction",
|
||||
"https://docs.dify.ai/api-reference/application/get-application-parameters-information",
|
||||
"https://docs.dify.ai/ja/develop-plugin/getting-started/getting-started-dify-plugin",
|
||||
"https://docs.dify.ai/zh/use-dify/monitor/analysis",
|
||||
"https://docs.dify.ai/api-reference/%E3%82%A2%E3%83%97%E3%83%AA%E3%82%B1%E3%83%BC%E3%82%B7%E3%83%A7%E3%83%B3%E8%A8%AD%E5%AE%9A/%E3%82%A2%E3%83%97%E3%83%AA%E3%82%B1%E3%83%BC%E3%82%B7%E3%83%A7%E3%83%B3%E3%81%AE%E5%9F%BA%E6%9C%AC%E6%83%85%E5%A0%B1%E3%82%92%E5%8F%96%E5%BE%97",
|
||||
"https://docs.dify.ai/zh/use-dify/publish/embedding-in-websites",
|
||||
"https://docs.dify.ai/zh/user-guide/build-app/flow-app/create-flow-app",
|
||||
"https://docs.dify.ai/api-reference/%E3%83%87%E3%83%BC%E3%82%BF%E3%82%BB%E3%83%83%E3%83%88/%E3%83%8A%E3%83%AC%E3%83%83%E3%82%B8%E3%83%99%E3%83%BC%E3%82%B9%E3%82%92%E6%9B%B4%E6%96%B0",
|
||||
"https://docs.dify.ai/api-reference/chunks/delete-a-chunk-in-a-document",
|
||||
"https://docs.dify.ai/ja/use-dify/debug/error-type",
|
||||
"https://docs.dify.ai/ja/use-dify/publish/webapp/workflow-webapp",
|
||||
"https://docs.dify.ai/ja/develop-plugin/publishing/standards/third-party-signature-verification",
|
||||
"https://docs.dify.ai/zh/use-dify/getting-started/key-concepts",
|
||||
"https://docs.dify.ai/zh/develop-plugin/dev-guides-and-walkthroughs/develop-md-exporter",
|
||||
"https://docs.dify.ai/zh/use-dify/monitor/integrations/integrate-arize",
|
||||
"https://docs.dify.ai/en/use-dify/knowledge/connect-external-knowledge-base",
|
||||
"https://docs.dify.ai/zh/use-dify/nodes/trigger/overview",
|
||||
"https://docs.dify.ai/api-reference/datasets/get-knowledge-base-list",
|
||||
"https://docs.dify.ai/zh/self-host/advanced-deployments/start-the-frontend-docker-container",
|
||||
"https://docs.dify.ai/ja/use-dify/nodes/trigger/schedule-trigger",
|
||||
"https://docs.dify.ai/ja/use-dify/publish/webapp/web-app-access",
|
||||
"https://docs.dify.ai/en/self-host/advanced-deployments/local-source-code",
|
||||
"https://docs.dify.ai/en/use-dify/nodes/trigger/schedule-trigger",
|
||||
"https://docs.dify.ai/zh/use-dify/tutorials/build-ai-image-generation-app",
|
||||
"https://docs.dify.ai/ja/use-dify/nodes/tools",
|
||||
"https://docs.dify.ai/zh/use-dify/publish/webapp/chatflow-webapp",
|
||||
"https://docs.dify.ai/ja/use-dify/knowledge/create-knowledge/setting-indexing-methods",
|
||||
"https://docs.dify.ai/api-reference/chunks/delete-child-chunk",
|
||||
"https://docs.dify.ai/api-reference/chunks/get-chunks-from-a-document",
|
||||
"https://docs.dify.ai/ja/use-dify/workspace/app-management",
|
||||
"https://docs.dify.ai/api-reference/%E6%96%87%E4%BB%B6%E6%93%8D%E4%BD%9C/%E4%B8%8A%E4%BC%A0%E6%96%87%E4%BB%B6",
|
||||
"https://docs.dify.ai/zh/develop-plugin/features-and-specs/advanced-development/reverse-invocation-tool",
|
||||
"https://docs.dify.ai/ja/use-dify/knowledge/knowledge-pipeline/upload-files",
|
||||
"https://docs.dify.ai/en/develop-plugin/features-and-specs/plugin-types/multilingual-readme",
|
||||
"https://docs.dify.ai/ja/self-host/advanced-deployments/start-the-frontend-docker-container",
|
||||
"https://docs.dify.ai/api-reference/chunks/get-child-chunks",
|
||||
"https://docs.dify.ai/api-reference/application/get-application-meta-information",
|
||||
"https://docs.dify.ai/ja/use-dify/knowledge/integrate-knowledge-within-application",
|
||||
"https://docs.dify.ai/versions/3-3-x/en/user-guide/tools/mcp",
|
||||
"https://docs.dify.ai/en/develop-plugin/dev-guides-and-walkthroughs/cheatsheet",
|
||||
"https://docs.dify.ai/en/use-dify/nodes/variable-aggregator",
|
||||
"https://docs.dify.ai/en/use-dify/nodes/trigger/overview",
|
||||
"https://docs.dify.ai/api-reference/metadata-&-tags/get-knowledge-base-type-tags",
|
||||
"https://docs.dify.ai/ja/develop-plugin/features-and-specs/plugin-types/tool",
|
||||
"https://docs.dify.ai/api-reference/%E3%83%81%E3%83%A3%E3%83%B3%E3%82%AF/%E3%83%89%E3%82%AD%E3%83%A5%E3%83%A1%E3%83%B3%E3%83%88%E3%81%8B%E3%82%89%E3%83%81%E3%83%A3%E3%83%B3%E3%82%AF%E3%82%92%E5%8F%96%E5%BE%97",
|
||||
"https://docs.dify.ai/api-reference/documents/create-a-document-from-a-file",
|
||||
"https://docs.dify.ai/en/develop-plugin/dev-guides-and-walkthroughs/tool-plugin",
|
||||
"https://docs.dify.ai/api-reference/conversations/get-conversation-history-messages",
|
||||
"https://docs.dify.ai/api-reference/chatflow/send-chat-message",
|
||||
"https://docs.dify.ai/en/use-dify/knowledge/create-knowledge/setting-indexing-methods.md",
|
||||
"https://docs.dify.ai/api-reference/%E3%83%95%E3%82%A1%E3%82%A4%E3%83%AB%E6%93%8D%E4%BD%9C/%E3%83%95%E3%82%A1%E3%82%A4%E3%83%AB%E3%82%A2%E3%83%83%E3%83%97%E3%83%AD%E3%83%BC%E3%83%89",
|
||||
"https://docs.dify.ai/api-reference/application/get-application-info",
|
||||
"https://docs.dify.ai/api-reference/%E5%BA%94%E7%94%A8%E8%AE%BE%E7%BD%AE-workflow/%E8%8E%B7%E5%8F%96%E5%BA%94%E7%94%A8%E5%8F%82%E6%95%B0-workflow",
|
||||
"https://docs.dify.ai/en/use-dify/workspace/plugins",
|
||||
"https://docs.dify.ai/zh/use-dify/knowledge/knowledge-pipeline/readme",
|
||||
"https://docs.dify.ai/zh/use-dify/nodes/loop",
|
||||
"https://docs.dify.ai/en/use-dify/knowledge/knowledge-pipeline/publish-knowledge-pipeline",
|
||||
"https://docs.dify.ai/zh/develop-plugin/getting-started/cli",
|
||||
"https://docs.dify.ai/api-reference/%E3%82%A2%E3%83%97%E3%83%AA%E3%82%B1%E3%83%BC%E3%82%B7%E3%83%A7%E3%83%B3%E6%83%85%E5%A0%B1/%E3%82%A2%E3%83%97%E3%83%AA%E3%82%B1%E3%83%BC%E3%82%B7%E3%83%A7%E3%83%B3%E3%81%AE%E5%9F%BA%E6%9C%AC%E6%83%85%E5%A0%B1%E3%82%92%E5%8F%96%E5%BE%97",
|
||||
"https://docs.dify.ai/zh/self-host/platform-guides/bt-panel",
|
||||
"https://docs.dify.ai/api-reference/models/get-available-embedding-models",
|
||||
"https://docs.dify.ai/ja/self-host/troubleshooting/weaviate-v4-migration",
|
||||
"https://docs.dify.ai/en/use-dify/nodes/trigger/webhook-trigger",
|
||||
"https://docs.dify.ai/zh/develop-plugin/dev-guides-and-walkthroughs/creating-new-model-provider",
|
||||
"https://docs.dify.ai/api-reference/documents/get-document-detail",
|
||||
"https://docs.dify.ai/en/use-dify/nodes/user-input",
|
||||
"https://docs.dify.ai/zh/use-dify/knowledge/metadata",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/knowledge-base/readme",
|
||||
"https://docs.dify.ai/zh/develop-plugin/dev-guides-and-walkthroughs/develop-multimodal-data-processing-tool",
|
||||
"https://docs.dify.ai/zh/develop-plugin/publishing/marketplace-listing/release-by-file",
|
||||
"https://docs.dify.ai/guides/application-publishing/embedding-in-websites",
|
||||
"https://docs.dify.ai/ja/develop-plugin/dev-guides-and-walkthroughs/tool-plugin",
|
||||
"https://docs.dify.ai/en/use-dify/monitor/integrations/integrate-phoenix",
|
||||
"https://docs.dify.ai/en/develop-plugin/dev-guides-and-walkthroughs/develop-md-exporter",
|
||||
"https://docs.dify.ai/zh/develop-plugin/features-and-specs/plugin-types/plugin-info-by-manifest",
|
||||
"https://docs.dify.ai/api-reference/chat-and-completions/chat",
|
||||
"https://docs.dify.ai/en/use-dify/knowledge/knowledge-pipeline/readme",
|
||||
"https://docs.dify.ai/ja/use-dify/knowledge/create-knowledge/introduction",
|
||||
"https://docs.dify.ai/en/use-dify/knowledge/readme",
|
||||
"https://docs.dify.ai/zh/use-dify/workspace/subscription-management",
|
||||
"https://docs.dify.ai/en/self-host/platform-guides/dify-premium",
|
||||
"https://docs.dify.ai/zh/develop-plugin/dev-guides-and-walkthroughs/tool-plugin",
|
||||
"https://docs.dify.ai/api-reference/statistics",
|
||||
"https://docs.dify.ai/api-reference/messages",
|
||||
"https://docs.dify.ai/en/self-host/troubleshooting/docker-issues",
|
||||
"https://docs.dify.ai/zh/use-dify/tutorials/simple-chatbot",
|
||||
"https://docs.dify.ai/en/use-dify/build/shortcut-key",
|
||||
"https://docs.dify.ai/ja/use-dify/publish/embedding-in-websites",
|
||||
"https://docs.dify.ai/zh/self-host/troubleshooting/common-issues",
|
||||
"https://docs.dify.ai/api-reference/documents/create-a-document-from-text",
|
||||
"https://docs.dify.ai/en/use-dify/nodes/agent",
|
||||
"https://docs.dify.ai/en/develop-plugin/features-and-specs/plugin-types/general-specifications",
|
||||
"https://docs.dify.ai/zh/self-host/troubleshooting/docker-issues",
|
||||
"https://docs.dify.ai/zh/use-dify/knowledge/knowledge-pipeline/manage-knowledge-base",
|
||||
"https://docs.dify.ai/en/develop-plugin/features-and-specs/advanced-development/bundle",
|
||||
"https://docs.dify.ai/zh/self-host/platform-guides/dify-premium",
|
||||
"https://docs.dify.ai/api-reference/workflow-execution/get-workflow-logs",
|
||||
"https://docs.dify.ai/ja/use-dify/knowledge/external-knowledge-api",
|
||||
"https://docs.dify.ai/zh/use-dify/knowledge/knowledge-pipeline/knowledge-pipeline-orchestration",
|
||||
"https://docs.dify.ai/zh/use-dify/getting-started/quick-start",
|
||||
"https://docs.dify.ai/ja/develop-plugin/features-and-specs/advanced-development/reverse-invocation-model",
|
||||
"https://docs.dify.ai/zh/develop-plugin/features-and-specs/plugin-types/multilingual-readme",
|
||||
"https://docs.dify.ai/zh/use-dify/nodes/trigger/schedule-trigger",
|
||||
"https://docs.dify.ai/en/use-dify/build-ai-apps/build-agent",
|
||||
"https://docs.dify.ai/api-reference/metadata-&-tags/query-tags-bound-to-a-dataset",
|
||||
"https://docs.dify.ai/ja/use-dify/knowledge/manage-knowledge/maintain-knowledge-documents",
|
||||
"https://docs.dify.ai/ja/develop-plugin/features-and-specs/advanced-development/reverse-invocation",
|
||||
"https://docs.dify.ai/zh/use-dify/knowledge/manage-knowledge/introduction",
|
||||
"https://docs.dify.ai/ja/use-dify/monitor/integrations/integrate-phoenix",
|
||||
"https://docs.dify.ai/zh/use-dify/workspace/readme",
|
||||
"https://docs.dify.ai/en/use-dify/knowledge/manage-knowledge/introduction",
|
||||
"https://docs.dify.ai/ja/use-dify/knowledge/create-knowledge/chunking-and-cleaning-text",
|
||||
"https://docs.dify.ai/en/use-dify/knowledge/knowledge-base-creation/introduction",
|
||||
"https://docs.dify.ai/guides/application-publishing/developing-with-apis",
|
||||
"https://docs.dify.ai/zh/use-dify/knowledge/create-knowledge/import-text-data/readme",
|
||||
"https://docs.dify.ai/en/use-dify/nodes/output",
|
||||
"https://docs.dify.ai/en/use-dify/workspace/app-management",
|
||||
"https://docs.dify.ai/zh/use-dify/nodes/list-operator",
|
||||
"https://docs.dify.ai/api-reference/%E3%82%A2%E3%83%97%E3%83%AA%E3%82%B1%E3%83%BC%E3%82%B7%E3%83%A7%E3%83%B3%E8%A8%AD%E5%AE%9A-%E3%83%AF%E3%83%BC%E3%82%AF%E3%83%95%E3%83%AD%E3%83%BC/%E3%82%A2%E3%83%97%E3%83%AA%E3%81%AEwebapp%E8%A8%AD%E5%AE%9A%E3%82%92%E5%8F%96%E5%BE%97-%E3%83%AF%E3%83%BC%E3%82%AF%E3%83%95%E3%83%AD%E3%83%BC",
|
||||
"https://docs.dify.ai/zh/develop-plugin/dev-guides-and-walkthroughs/tool-oauth",
|
||||
"https://docs.dify.ai/en/use-dify/knowledge/create-knowledge/chunking-and-cleaning-text",
|
||||
"https://docs.dify.ai/api-reference/workflows/workflow-run",
|
||||
"https://docs.dify.ai/api-reference/datasets/delete-a-knowledge-base",
|
||||
"https://docs.dify.ai/api-reference/%E3%83%81%E3%83%A3%E3%83%83%E3%83%88%E3%83%A1%E3%83%83%E3%82%BB%E3%83%BC%E3%82%B8/%E6%AC%A1%E3%81%AE%E6%8E%A8%E5%A5%A8%E8%B3%AA%E5%95%8F",
|
||||
"https://docs.dify.ai/zh/use-dify/knowledge/create-knowledge/introduction",
|
||||
"https://docs.dify.ai/ja/use-dify/monitor/logs",
|
||||
"https://docs.dify.ai/en/develop-plugin/features-and-specs/plugin-types/tool",
|
||||
"https://docs.dify.ai/ja/use-dify/workspace/team-members-management",
|
||||
"https://docs.dify.ai/en/use-dify/workspace/subscription-management",
|
||||
"https://docs.dify.ai/ja/develop-plugin/dev-guides-and-walkthroughs/creating-new-model-provider",
|
||||
"https://docs.dify.ai/en/use-dify/tutorials/customer-service-bot",
|
||||
"https://docs.dify.ai/ja/develop-plugin/features-and-specs/plugin-types/plugin-info-by-manifest",
|
||||
"https://docs.dify.ai/api-reference/%E3%83%81%E3%83%A3%E3%83%83%E3%83%88%E3%83%A1%E3%83%83%E3%82%BB%E3%83%BC%E3%82%B8/%E3%83%81%E3%83%A3%E3%83%83%E3%83%88%E3%83%A1%E3%83%83%E3%82%BB%E3%83%BC%E3%82%B8%E3%82%92%E9%80%81%E4%BF%A1",
|
||||
"https://docs.dify.ai/ja/use-dify/build/additional-features",
|
||||
"https://docs.dify.ai/zh/use-dify/nodes/code",
|
||||
"https://docs.dify.ai/en/use-dify/debug/history-and-logs",
|
||||
"https://docs.dify.ai/api-reference/%E3%83%81%E3%83%A3%E3%83%B3%E3%82%AF/%E3%83%89%E3%82%AD%E3%83%A5%E3%83%A1%E3%83%B3%E3%83%88%E3%81%AB%E3%83%81%E3%83%A3%E3%83%B3%E3%82%AF%E3%82%92%E8%BF%BD%E5%8A%A0",
|
||||
"https://docs.dify.ai/api-reference/chunks/update-a-chunk-in-a-document",
|
||||
"https://docs.dify.ai/en/develop-plugin/publishing/marketplace-listing/release-by-file",
|
||||
"https://docs.dify.ai/api-reference/chunks/create-child-chunk",
|
||||
"https://docs.dify.ai/zh/use-dify/nodes/user-input",
|
||||
"https://docs.dify.ai/ja/develop-plugin/dev-guides-and-walkthroughs/agent-strategy-plugin",
|
||||
"https://docs.dify.ai/ja/use-dify/nodes/template",
|
||||
"https://docs.dify.ai/api-reference/conversations/conversation-rename",
|
||||
"https://docs.dify.ai/en/use-dify/nodes/variable-assigner",
|
||||
"https://docs.dify.ai/ja/use-dify/getting-started/quick-start",
|
||||
"https://docs.dify.ai/api-reference/annotations/delete-annotation",
|
||||
"https://docs.dify.ai/ja/use-dify/knowledge/knowledge-and-documents-maintenance/introduction",
|
||||
"https://docs.dify.ai/ja/use-dify/nodes/answer",
|
||||
"https://docs.dify.ai/ja/develop-plugin/dev-guides-and-walkthroughs/cheatsheet",
|
||||
"https://docs.dify.ai/zh/use-dify/workspace/plugins",
|
||||
"https://docs.dify.ai/zh/use-dify/knowledge/knowledge-pipeline/create-knowledge-pipeline",
|
||||
"https://docs.dify.ai/ja/use-dify/monitor/integrations/integrate-aliyun",
|
||||
"https://docs.dify.ai/api-reference/データセット/ナレッジベースリストを取得",
|
||||
"https://docs.dify.ai/api-reference/データセット/ナレッジベース詳細を取得",
|
||||
"https://docs.dify.ai/api-reference/データセット/空のナレッジベースを作成",
|
||||
"https://docs.dify.ai/api-reference/ドキュメント/テキストからドキュメントを作成",
|
||||
"https://docs.dify.ai/api-reference/ドキュメント/テキストでドキュメントを更新",
|
||||
"https://docs.dify.ai/api-reference/ドキュメント/ドキュメントを削除",
|
||||
"https://docs.dify.ai/api-reference/ドキュメント/ドキュメントステータスを更新",
|
||||
"https://docs.dify.ai/api-reference/ドキュメント/ドキュメント埋め込みステータス(進捗)を取得",
|
||||
"https://docs.dify.ai/api-reference/ドキュメント/ドキュメント詳細を取得",
|
||||
"https://docs.dify.ai/api-reference/ドキュメント/ナレッジベースのドキュメントリストを取得",
|
||||
"https://docs.dify.ai/api-reference/ドキュメント/ファイルからドキュメントを作成",
|
||||
"https://docs.dify.ai/api-reference/ドキュメント/ファイルでドキュメントを更新",
|
||||
"https://docs.dify.ai/api-reference/ファイル操作-ワークフロー/ファイルアップロード-ワークフロー用",
|
||||
"https://docs.dify.ai/api-reference/ファイル操作/ファイルプレビュー",
|
||||
"https://docs.dify.ai/api-reference/メタデータ・タグ/データセットとナレッジベースタイプタグのバインドを解除",
|
||||
"https://docs.dify.ai/api-reference/メタデータ・タグ/データセットにバインドされたタグをクエリ",
|
||||
"https://docs.dify.ai/api-reference/メタデータ・タグ/データセットをナレッジベースタイプタグにバインド",
|
||||
"https://docs.dify.ai/api-reference/メタデータ・タグ/ナレッジベースタイプタグを削除",
|
||||
"https://docs.dify.ai/api-reference/メタデータ・タグ/ナレッジベースタイプタグを取得",
|
||||
"https://docs.dify.ai/api-reference/メタデータ・タグ/ナレッジベースタイプタグ名を変更",
|
||||
"https://docs.dify.ai/api-reference/メタデータ・タグ/新しいナレッジベースタイプタグを作成",
|
||||
"https://docs.dify.ai/api-reference/メッセージフィードバック/メッセージフィードバック",
|
||||
"https://docs.dify.ai/api-reference/モデル/利用可能な埋め込みモデルを取得",
|
||||
"https://docs.dify.ai/api-reference/ワークフロー実行/ワークフローを実行",
|
||||
"https://docs.dify.ai/api-reference/ワークフロー実行/ワークフローログを取得",
|
||||
"https://docs.dify.ai/api-reference/ワークフロー実行/ワークフロー実行詳細を取得",
|
||||
"https://docs.dify.ai/api-reference/ワークフロー実行/生成を停止-ワークフロータスク",
|
||||
"https://docs.dify.ai/api-reference/会話管理/会話の名前を変更",
|
||||
"https://docs.dify.ai/api-reference/会話管理/会話を削除",
|
||||
"https://docs.dify.ai/api-reference/会話管理/会話を取得",
|
||||
"https://docs.dify.ai/api-reference/会話管理/会話変数の取得",
|
||||
"https://docs.dify.ai/api-reference/会話管理/会話履歴メッセージを取得",
|
||||
"https://docs.dify.ai/api-reference/会话管理/会话重命名",
|
||||
"https://docs.dify.ai/api-reference/会话管理/删除会话",
|
||||
"https://docs.dify.ai/api-reference/会话管理/获取会话列表",
|
||||
"https://docs.dify.ai/api-reference/会话管理/获取会话历史消息",
|
||||
"https://docs.dify.ai/api-reference/会话管理/获取对话变量",
|
||||
"https://docs.dify.ai/api-reference/元数据和标签/创建新的知识库类型标签",
|
||||
"https://docs.dify.ai/api-reference/元数据和标签/删除知识库类型标签",
|
||||
"https://docs.dify.ai/api-reference/元数据和标签/将数据集绑定到知识库类型标签",
|
||||
"https://docs.dify.ai/api-reference/元数据和标签/查询绑定到数据集的标签",
|
||||
"https://docs.dify.ai/api-reference/元数据和标签/获取知识库类型标签",
|
||||
"https://docs.dify.ai/api-reference/元数据和标签/解绑数据集和知识库类型标签",
|
||||
"https://docs.dify.ai/api-reference/反馈/消息反馈(点赞)",
|
||||
"https://docs.dify.ai/api-reference/反馈/获取应用反馈列表",
|
||||
"https://docs.dify.ai/api-reference/完了メッセージ/完了メッセージの作成",
|
||||
"https://docs.dify.ai/api-reference/完了メッセージ/生成の停止",
|
||||
"https://docs.dify.ai/api-reference/对话消息/停止响应",
|
||||
"https://docs.dify.ai/api-reference/对话消息/获取下一轮建议问题列表",
|
||||
"https://docs.dify.ai/api-reference/工作流执行/停止响应-workflow-task",
|
||||
"https://docs.dify.ai/api-reference/工作流执行/获取-workflow-日志",
|
||||
"https://docs.dify.ai/api-reference/工作流执行/获取workflow执行情况",
|
||||
"https://docs.dify.ai/api-reference/应用设置/获取应用-webapp-设置",
|
||||
"https://docs.dify.ai/api-reference/应用设置/获取应用参数",
|
||||
"https://docs.dify.ai/api-reference/应用配置-workflow/获取应用-webapp-设置-workflow",
|
||||
"https://docs.dify.ai/api-reference/应用配置-workflow/获取应用参数-workflow",
|
||||
"https://docs.dify.ai/api-reference/应用配置-workflow/获取应用基本信息-workflow",
|
||||
"https://docs.dify.ai/api-reference/应用配置/获取应用-webapp-设置",
|
||||
"https://docs.dify.ai/api-reference/应用配置/获取应用meta信息",
|
||||
"https://docs.dify.ai/api-reference/应用配置/获取应用参数",
|
||||
"https://docs.dify.ai/api-reference/应用配置/获取应用基本信息",
|
||||
"https://docs.dify.ai/api-reference/数据集/创建空知识库",
|
||||
"https://docs.dify.ai/api-reference/数据集/删除知识库",
|
||||
"https://docs.dify.ai/api-reference/数据集/更新知识库",
|
||||
"https://docs.dify.ai/api-reference/数据集/获取知识库列表",
|
||||
"https://docs.dify.ai/api-reference/数据集/获取知识库详情",
|
||||
"https://docs.dify.ai/api-reference/文件操作-workflow/上传文件-workflow",
|
||||
"https://docs.dify.ai/api-reference/文件操作/文件预览",
|
||||
"https://docs.dify.ai/api-reference/文件管理/上传文件",
|
||||
"https://docs.dify.ai/api-reference/文本生成/停止响应",
|
||||
"https://docs.dify.ai/api-reference/文档/从文件创建文档",
|
||||
"https://docs.dify.ai/api-reference/文档/删除文档",
|
||||
"https://docs.dify.ai/api-reference/文档/更新文档状态",
|
||||
"https://docs.dify.ai/api-reference/文档/用文件更新文档",
|
||||
"https://docs.dify.ai/api-reference/文档/获取文档嵌入状态(进度)",
|
||||
"https://docs.dify.ai/api-reference/文档/获取文档详情",
|
||||
"https://docs.dify.ai/api-reference/文档/获取知识库的文档列表",
|
||||
"https://docs.dify.ai/api-reference/文档块/从文档获取块",
|
||||
"https://docs.dify.ai/api-reference/文档块/创建子块",
|
||||
"https://docs.dify.ai/api-reference/文档块/删除子块",
|
||||
"https://docs.dify.ai/api-reference/文档块/删除文档中的块",
|
||||
"https://docs.dify.ai/api-reference/文档块/向文档添加块",
|
||||
"https://docs.dify.ai/api-reference/文档块/更新子块",
|
||||
"https://docs.dify.ai/api-reference/文档块/更新文档中的块",
|
||||
"https://docs.dify.ai/api-reference/文档块/获取子块",
|
||||
"https://docs.dify.ai/api-reference/文档块/获取文档中的块详情",
|
||||
"https://docs.dify.ai/api-reference/标注管理/创建标注",
|
||||
"https://docs.dify.ai/api-reference/标注管理/删除标注",
|
||||
"https://docs.dify.ai/api-reference/标注管理/更新标注",
|
||||
"https://docs.dify.ai/api-reference/标注管理/查询标注回复初始设置任务状态",
|
||||
"https://docs.dify.ai/api-reference/标注管理/标注回复初始设置",
|
||||
"https://docs.dify.ai/api-reference/标注管理/获取标注列表",
|
||||
"https://docs.dify.ai/api-reference/模型/获取可用的嵌入模型",
|
||||
"https://docs.dify.ai/api-reference/消息反馈/消息反馈(点赞)",
|
||||
"https://docs.dify.ai/api-reference/消息反馈/获取app的消息点赞和反馈",
|
||||
"https://docs.dify.ai/api-reference/语音与文字转换/文字转语音",
|
||||
"https://docs.dify.ai/api-reference/语音与文字转换/语音转文字",
|
||||
"https://docs.dify.ai/api-reference/语音服务/文字转语音",
|
||||
"https://docs.dify.ai/api-reference/音声とテキスト変換/テキストから音声へ",
|
||||
"https://docs.dify.ai/api-reference/音声とテキスト変換/音声からテキストへ",
|
||||
"https://docs.dify.ai/api-reference/音声・テキスト変換/テキストから音声へ",
|
||||
"https://docs.dify.ai/api-reference/音声・テキスト変換/音声からテキストへ",
|
||||
"https://docs.dify.ai/api-reference/音声変換/テキストから音声",
|
||||
"https://docs.dify.ai/ja/develop-plugin/features-and-specs/plugin-types/plugin-logging",
|
||||
"https://docs.dify.ai/versions/2-8-x/en/user-guide/application-publishing/permission-management",
|
||||
"https://docs.dify.ai/versions/2-8-x/ja/user-guide/application-publishing/permission-management",
|
||||
"https://docs.dify.ai/versions/2-8-x/zh/user-guide/application-publishing/permission-management",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/annotation/annotation-reply",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/annotation/logs",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/application-orchestrate/agent",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/application-orchestrate/app-toolkits/moderation-tool",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/application-orchestrate/app-toolkits/readme",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/application-orchestrate/chatbot-application",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/application-orchestrate/creating-an-application",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/application-orchestrate/multiple-llms-debugging",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/application-orchestrate/text-generator",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/application-publishing/based-on-frontend-templates",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/application-publishing/embedding-in-websites",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/application-publishing/launch-your-webapp-quickly/README",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/application-publishing/launch-your-webapp-quickly/conversation-application",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/application-publishing/launch-your-webapp-quickly/text-generator",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/application-publishing/launch-your-webapp-quickly/web-app-settings",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/extension/api-based-extension/README",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/extension/api-based-extension/cloudflare-workers",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/extension/api-based-extension/external-data-tool",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/extension/api-based-extension/moderation",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/extension/code-based-extension/README",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/extension/code-based-extension/external-data-tool",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/extension/code-based-extension/moderation",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/introduction",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/knowledge-base/connect-external-knowledge-base",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/knowledge-base/create-knowledge-and-upload-documents/chunking-and-cleaning-text",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/knowledge-base/create-knowledge-and-upload-documents/import-content-data/readme",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/knowledge-base/create-knowledge-and-upload-documents/import-content-data/sync-from-notion",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/knowledge-base/create-knowledge-and-upload-documents/import-content-data/sync-from-website",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/knowledge-base/create-knowledge-and-upload-documents/setting-indexing-methods",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/knowledge-base/external-knowledge-api",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/knowledge-base/integrate-knowledge-within-application",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/knowledge-base/knowledge-and-documents-maintenance/introduction",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/knowledge-base/knowledge-and-documents-maintenance/maintain-dataset-via-api",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/knowledge-base/knowledge-and-documents-maintenance/maintain-knowledge-documents",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/knowledge-base/knowledge-base-creation/introduction",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/knowledge-base/knowledge-request-rate-limit",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/knowledge-base/metadata",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/knowledge-base/readme",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/knowledge-base/retrieval-test-and-citation",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/management/app-management",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/management/personal-account-management",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/management/team-members-management",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/management/version-control",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/model-configuration/customizable-model",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/model-configuration/interfaces",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/model-configuration/manage-model-credential",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/model-configuration/new-provider",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/model-configuration/predefined-model",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/model-configuration/readme",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/model-configuration/schema",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/monitoring/README",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/monitoring/analysis",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/monitoring/integrate-external-ops-tools/README",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/monitoring/integrate-external-ops-tools/integrate-aliyun",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/monitoring/integrate-external-ops-tools/integrate-langfuse",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/monitoring/integrate-external-ops-tools/integrate-langsmith",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/monitoring/integrate-external-ops-tools/integrate-opik",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/monitoring/integrate-external-ops-tools/integrate-weave",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/workflow/README",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/workflow/additional-features",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/workflow/debug-and-preview/history",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/workflow/debug-and-preview/log",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/workflow/debug-and-preview/preview-and-run",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/workflow/debug-and-preview/step-run",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/workflow/error-handling/README",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/workflow/error-handling/error-type",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/workflow/error-handling/predefined-error-handling-logic",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/workflow/file-upload",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/workflow/key-concepts",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/workflow/node/answer",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/workflow/node/code",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/workflow/node/doc-extractor",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/workflow/node/end",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/workflow/node/http-request",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/workflow/node/ifelse",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/workflow/node/iteration",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/workflow/node/knowledge-retrieval",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/workflow/node/list-operator",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/workflow/node/llm",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/workflow/node/loop",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/workflow/node/parameter-extractor",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/workflow/node/question-classifier",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/workflow/node/start",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/workflow/node/template",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/workflow/node/tools",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/workflow/node/variable-aggregator",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/workflow/node/variable-assigner",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/workflow/orchestrate-node",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/workflow/publish",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/workflow/shortcut-key",
|
||||
"https://docs.dify.ai/versions/3-0-x/en/user-guide/workflow/variables",
|
||||
"https://docs.dify.ai/versions/3-0-x/ja/user-guide/introduction",
|
||||
"https://docs.dify.ai/versions/3-0-x/ja/user-guide/workflow/debug-and-preview/history",
|
||||
"https://docs.dify.ai/versions/3-0-x/ja/user-guide/workflow/debug-and-preview/log",
|
||||
"https://docs.dify.ai/versions/3-0-x/ja/user-guide/workflow/debug-and-preview/preview-and-run",
|
||||
"https://docs.dify.ai/versions/3-0-x/ja/user-guide/workflow/debug-and-preview/step-run",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/annotation/annotation-reply",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/annotation/logs",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/application-orchestrate/agent",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/application-orchestrate/app-toolkits/moderation-tool",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/application-orchestrate/app-toolkits/readme",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/application-orchestrate/chatbot-application",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/application-orchestrate/creating-an-application",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/application-orchestrate/multiple-llms-debugging",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/application-orchestrate/readme",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/application-publishing/based-on-frontend-templates",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/application-publishing/developing-with-apis",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/application-publishing/embedding-in-websites",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/application-publishing/launch-your-webapp-quickly/conversation-application",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/application-publishing/launch-your-webapp-quickly/readme",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/application-publishing/launch-your-webapp-quickly/text-generator",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/application-publishing/launch-your-webapp-quickly/web-app-settings",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/introduction",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/knowledge-base/api-documentation/external-knowledge-api-documentation",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/knowledge-base/connect-external-knowledge-base",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/knowledge-base/create-knowledge-and-upload-documents/chunking-and-cleaning-text",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/knowledge-base/create-knowledge-and-upload-documents/import-content-data/readme",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/knowledge-base/create-knowledge-and-upload-documents/import-content-data/sync-from-notion",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/knowledge-base/create-knowledge-and-upload-documents/import-content-data/sync-from-website",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/knowledge-base/create-knowledge-and-upload-documents/setting-indexing-methods",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/knowledge-base/integrate-knowledge-within-application",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/knowledge-base/knowledge-and-documents-maintenance/introduction",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/knowledge-base/knowledge-and-documents-maintenance/maintain-dataset-via-api",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/knowledge-base/knowledge-and-documents-maintenance/maintain-knowledge-documents",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/knowledge-base/knowledge-base-creation/introduction",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/knowledge-base/knowledge-pipeline/authorize-data-source",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/knowledge-base/knowledge-pipeline/create-knowledge-pipeline",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/knowledge-base/knowledge-pipeline/knowledge-pipeline-orchestration",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/knowledge-base/knowledge-pipeline/manage-knowledge-base",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/knowledge-base/knowledge-pipeline/publish-knowledge-pipeline",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/knowledge-base/knowledge-pipeline/readme",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/knowledge-base/knowledge-pipeline/upload-files",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/knowledge-base/knowledge-request-rate-limit",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/knowledge-base/metadata",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/knowledge-base/retrieval-test-and-citation",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/management/app-management",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/management/subscription-management",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/management/team-members-management",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/management/version-control",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/model-configuration/customizable-model",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/model-configuration/interfaces",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/model-configuration/load-balancing",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/model-configuration/manage-model-credential",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/model-configuration/new-provider",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/model-configuration/predefined-model",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/model-configuration/readme",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/model-configuration/schema",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/monitoring/README",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/monitoring/analysis",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/monitoring/integrate-external-ops-tools/integrate-aliyun",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/monitoring/integrate-external-ops-tools/integrate-langfuse",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/monitoring/integrate-external-ops-tools/integrate-langsmith",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/monitoring/integrate-external-ops-tools/integrate-opik",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/monitoring/integrate-external-ops-tools/integrate-weave",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/monitoring/integrate-external-ops-tools/readme",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/tools/advanced-tool-integration",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/tools/quick-tool-integration",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/workflow/additional-feature",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/workflow/bulletin",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/workflow/debug-and-preview/history",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/workflow/debug-and-preview/log",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/workflow/debug-and-preview/preview-and-run",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/workflow/debug-and-preview/step-run",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/workflow/error-handling/error-type",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/workflow/error-handling/predefined-nodes-failure-logic",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/workflow/error-handling/readme",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/workflow/key-concept",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/workflow/node/answer",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/workflow/node/doc-extractor",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/workflow/node/end",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/workflow/node/http-request",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/workflow/node/ifelse",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/workflow/node/iteration",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/workflow/node/knowledge-retrieval",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/workflow/node/list-operator",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/workflow/node/loop",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/workflow/node/parameter-extractor",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/workflow/node/question-classifier",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/workflow/node/start",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/workflow/node/template",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/workflow/node/tools",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/workflow/node/variable-aggregator",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/workflow/node/variable-assigner",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/workflow/orchestrate-node",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/workflow/publish",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/workflow/shortcut-key",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/workflow/structured-outputs",
|
||||
"https://docs.dify.ai/versions/3-0-x/zh/user-guide/workflow/variables",
|
||||
"https://docs.dify.ai/versions/3-2-x/en/user-guide/introduction",
|
||||
"https://docs.dify.ai/versions/3-2-x/en/user-guide/workflow/debug-and-preview/variable-inspect",
|
||||
"https://docs.dify.ai/versions/3-2-x/ja/user-guide/introduction",
|
||||
"https://docs.dify.ai/versions/3-2-x/ja/user-guide/workflow/debug-and-preview/variable-inspect",
|
||||
"https://docs.dify.ai/versions/3-2-x/zh/user-guide/introduction",
|
||||
"https://docs.dify.ai/versions/3-2-x/zh/user-guide/workflow/debug-and-preview/variable-inspect",
|
||||
"https://docs.dify.ai/versions/3-3-x/en/user-guide/application-publishing/publish-mcp",
|
||||
"https://docs.dify.ai/versions/3-3-x/en/user-guide/introduction",
|
||||
"https://docs.dify.ai/versions/3-3-x/ja/user-guide/application-publishing/publish-mcp",
|
||||
"https://docs.dify.ai/versions/3-3-x/ja/user-guide/introduction",
|
||||
"https://docs.dify.ai/versions/3-3-x/ja/user-guide/tools/mcp",
|
||||
"https://docs.dify.ai/versions/3-3-x/zh/user-guide/application-publishing/publish-mcp",
|
||||
"https://docs.dify.ai/versions/3-3-x/zh/user-guide/introduction",
|
||||
"https://docs.dify.ai/versions/legacy/ja/user-guide/application-publishing/based-on-frontend-templates",
|
||||
"https://docs.dify.ai/versions/legacy/ja/user-guide/application-publishing/developing-with-apis",
|
||||
"https://docs.dify.ai/versions/legacy/ja/user-guide/application-publishing/embedding-in-websites",
|
||||
"https://docs.dify.ai/versions/legacy/ja/user-guide/application-publishing/launch-your-webapp-quickly/conversation-application",
|
||||
"https://docs.dify.ai/versions/legacy/ja/user-guide/application-publishing/launch-your-webapp-quickly/text-generator",
|
||||
"https://docs.dify.ai/versions/legacy/ja/user-guide/application-publishing/launch-your-webapp-quickly/web-app-settings",
|
||||
"https://docs.dify.ai/versions/legacy/ja/user-guide/build-app/agent",
|
||||
"https://docs.dify.ai/versions/legacy/ja/user-guide/build-app/chatbot",
|
||||
"https://docs.dify.ai/versions/legacy/ja/user-guide/build-app/flow-app/additional-feature",
|
||||
"https://docs.dify.ai/versions/legacy/ja/user-guide/build-app/flow-app/concepts",
|
||||
"https://docs.dify.ai/versions/legacy/ja/user-guide/build-app/flow-app/create-flow-app",
|
||||
"https://docs.dify.ai/versions/legacy/ja/user-guide/build-app/flow-app/file-upload",
|
||||
"https://docs.dify.ai/versions/legacy/ja/user-guide/build-app/flow-app/nodes/answer",
|
||||
"https://docs.dify.ai/versions/legacy/ja/user-guide/build-app/flow-app/nodes/code",
|
||||
"https://docs.dify.ai/versions/legacy/ja/user-guide/build-app/flow-app/nodes/doc-extractor",
|
||||
"https://docs.dify.ai/versions/legacy/ja/user-guide/build-app/flow-app/nodes/end",
|
||||
"https://docs.dify.ai/versions/legacy/ja/user-guide/build-app/flow-app/nodes/http-request",
|
||||
"https://docs.dify.ai/versions/legacy/ja/user-guide/build-app/flow-app/nodes/ifelse",
|
||||
"https://docs.dify.ai/versions/legacy/ja/user-guide/build-app/flow-app/nodes/iteration",
|
||||
"https://docs.dify.ai/versions/legacy/ja/user-guide/build-app/flow-app/nodes/knowledge-retrieval",
|
||||
"https://docs.dify.ai/versions/legacy/ja/user-guide/build-app/flow-app/nodes/list-operator",
|
||||
"https://docs.dify.ai/versions/legacy/ja/user-guide/build-app/flow-app/nodes/llm",
|
||||
"https://docs.dify.ai/versions/legacy/ja/user-guide/build-app/flow-app/nodes/parameter-extractor",
|
||||
"https://docs.dify.ai/versions/legacy/ja/user-guide/build-app/flow-app/nodes/question-classifier",
|
||||
"https://docs.dify.ai/versions/legacy/ja/user-guide/build-app/flow-app/nodes/start",
|
||||
"https://docs.dify.ai/versions/legacy/ja/user-guide/build-app/flow-app/nodes/template",
|
||||
"https://docs.dify.ai/versions/legacy/ja/user-guide/build-app/flow-app/nodes/tools",
|
||||
"https://docs.dify.ai/versions/legacy/ja/user-guide/build-app/flow-app/nodes/variable-aggregator",
|
||||
"https://docs.dify.ai/versions/legacy/ja/user-guide/build-app/flow-app/nodes/variable-assigner",
|
||||
"https://docs.dify.ai/versions/legacy/ja/user-guide/build-app/flow-app/orchestrate-node",
|
||||
"https://docs.dify.ai/versions/legacy/ja/user-guide/build-app/flow-app/variables",
|
||||
"https://docs.dify.ai/versions/legacy/ja/user-guide/build-app/text-generator",
|
||||
"https://docs.dify.ai/versions/legacy/ja/user-guide/debug-app/chatflow-and-workflow/history",
|
||||
"https://docs.dify.ai/versions/legacy/ja/user-guide/debug-app/chatflow-and-workflow/log",
|
||||
"https://docs.dify.ai/versions/legacy/ja/user-guide/debug-app/chatflow-and-workflow/preview-and-run",
|
||||
"https://docs.dify.ai/versions/legacy/ja/user-guide/debug-app/chatflow-and-workflow/step-run",
|
||||
"https://docs.dify.ai/versions/legacy/ja/user-guide/management/app-management",
|
||||
"https://docs.dify.ai/versions/legacy/ja/user-guide/management/personal-account-management",
|
||||
"https://docs.dify.ai/versions/legacy/ja/user-guide/management/team-members-management",
|
||||
"https://docs.dify.ai/versions/legacy/ja/user-guide/models/customizable-model",
|
||||
"https://docs.dify.ai/versions/legacy/ja/user-guide/models/interfaces",
|
||||
"https://docs.dify.ai/versions/legacy/ja/user-guide/models/load-balancing",
|
||||
"https://docs.dify.ai/versions/legacy/ja/user-guide/models/model-configuration",
|
||||
"https://docs.dify.ai/versions/legacy/ja/user-guide/models/new-provider",
|
||||
"https://docs.dify.ai/versions/legacy/ja/user-guide/models/predefined-model",
|
||||
"https://docs.dify.ai/versions/legacy/ja/user-guide/models/schema",
|
||||
"https://docs.dify.ai/versions/legacy/ja/user-guide/monitoring/analysis",
|
||||
"https://docs.dify.ai/versions/legacy/ja/user-guide/monitoring/annotation-reply",
|
||||
"https://docs.dify.ai/versions/legacy/ja/user-guide/monitoring/integrate-external-ops-tools/integrate-aliyun",
|
||||
"https://docs.dify.ai/versions/legacy/ja/user-guide/monitoring/integrate-external-ops-tools/integrate-langfuse",
|
||||
"https://docs.dify.ai/versions/legacy/ja/user-guide/monitoring/integrate-external-ops-tools/integrate-langsmith",
|
||||
"https://docs.dify.ai/versions/legacy/ja/user-guide/monitoring/logs",
|
||||
"https://docs.dify.ai/zh/develop-plugin/publishing/standards/third-party-signature-verification",
|
||||
"https://docs.dify.ai/cn/use-dify/workspace/plugins",
|
||||
"https://docs.dify.ai/plugin-dev-en/9241-bundle",
|
||||
"https://docs.dify.ai/plugin-dev-ja/9241-bundle",
|
||||
"https://docs.dify.ai/plugin-dev-en/0411-general-specifications",
|
||||
"https://docs.dify.ai/plugin-dev-zh/0411-multilingual-readme"
|
||||
],
|
||||
"success": true
|
||||
}
|
||||
],
|
||||
"text": ""
|
||||
}
|
||||
106
nodes/parse_and_add_urls.py
Normal file
106
nodes/parse_and_add_urls.py
Normal file
@@ -0,0 +1,106 @@
|
||||
import requests
|
||||
import json
|
||||
import math
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
|
||||
def parse_urls(map_json: list[dict]):
    """Extract the link list from a Firecrawl map-node payload.

    Returns an empty list when the payload is empty or its first entry
    does not report success; failures are logged instead of raised.
    """
    if not map_json:
        return []

    map_obj = map_json[0]
    # Be tolerant of unexpected payload shapes: log and bail out rather
    # than raising inside the workflow node.
    if map_obj.get("success", False):
        return map_obj.get("links", [])

    print(f"Firecrawl Map节点返回失败或无数据:{map_obj}")
    return []
|
||||
|
||||
def send_batch_request(urls_batch: list[str], task_id: int, BASE_URL: str):
    """Send one batch of URLs to the backend's /add_urls endpoint.

    Fire-and-forget semantics: returns True on HTTP 200, False on any
    other status or on a network error (logged, never raised).
    """
    payload = {"task_id": task_id, "urls": urls_batch}
    try:
        # Short timeout on purpose — we only need the request to leave
        # the machine; we must not stall the caller on a slow backend.
        res = requests.post(f"{BASE_URL}/add_urls", json=payload, timeout=10)
        if res.status_code != 200:
            print(f"Batch failed with status {res.status_code}: {res.text[:100]}")
            return False
        return True
    except Exception as e:
        print(f"Batch request error: {e}")
        return False
|
||||
|
||||
def main(map_json: list[dict], BASE_URL: str, task_id: float):
    """Parse a Firecrawl map payload and push its URLs to the backend.

    URLs are sliced into fixed-size batches and posted concurrently via
    a thread pool; the function blocks until every batch was attempted.
    """
    # 1. Extract URLs from the map payload.
    all_urls = parse_urls(map_json)
    total_count = len(all_urls)

    if not all_urls:
        return {"msg": "没有解析到URL"}

    # ----- tuning knobs -----
    BATCH_SIZE = 50    # URLs per request; adjust to backend capacity
    MAX_WORKERS = 10   # concurrent sender threads
    # ------------------------

    # 2. Slice the URL list into batches of BATCH_SIZE.
    batches = [all_urls[start:start + BATCH_SIZE]
               for start in range(0, total_count, BATCH_SIZE)]

    print(f"总共 {total_count} 个URL,分为 {len(batches)} 批发送,并发数: {MAX_WORKERS}")

    # 3. Send concurrently. In Dify/Lambda-style hosts we must wait for
    # the pool before returning, or the process may be killed before the
    # requests actually leave; the pool keeps that wait short.
    success_batches = 0
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        futures = [
            executor.submit(send_batch_request, chunk, int(task_id), BASE_URL)
            for chunk in batches
        ]
        for done in as_completed(futures):
            if done.result():
                success_batches += 1

    return {
        "status": "done",
        "total_urls": total_count,
        "batches_sent": len(batches),
        "success_batches": success_batches,
        "msg": "已使用多线程并发发送数据,忽略详细返回值"
    }
|
||||
|
||||
def test():
    """Ad-hoc smoke test: replay a captured map payload and time the upload."""
    import json
    import os
    from time import time

    # Build the fixture path portably. The previous hard-coded
    # "nodes\parse_and_add_urls.json" only worked on Windows and relied
    # on "\p" happening not to be a recognised escape sequence.
    fixture_path = os.path.join("nodes", "parse_and_add_urls.json")
    with open(fixture_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    map_json = data["json"]
    BASE_URL = "http://47.122.127.178"
    task_id = 6
    start_time = time()
    res = main(map_json, BASE_URL, task_id)
    end_time = time()
    print(f"添加URL耗时:{end_time - start_time}秒")
    print(res)


# Guard the smoke test so importing this module does not fire network calls.
if __name__ == "__main__":
    test()
|
||||
@@ -17,6 +17,21 @@ def main(map_json: list[dict]):
|
||||
map_obj = map_json[0]
|
||||
|
||||
return {
|
||||
"urls": map_obj["links"],
|
||||
"code": int(map_obj["success"]),
|
||||
# "urls": map_obj["links"],
|
||||
# "code": int(map_obj["success"]),
|
||||
"urls_obj": {
|
||||
"urls": map_obj["links"]
|
||||
}
|
||||
}
|
||||
|
||||
'''
|
||||
返回值示例
|
||||
{
|
||||
"urls_obj": {
|
||||
"urls": [
|
||||
"http://example.com/page1",
|
||||
]
|
||||
}
|
||||
}
|
||||
|
||||
'''
|
||||
@@ -1,19 +1,27 @@
|
||||
def check_status(status_code: float, body: str):
|
||||
import json
|
||||
|
||||
def parse_response(status_code: float, body: str):
|
||||
'''
|
||||
检查状态码和约定的返回值
|
||||
并且返回正确的body
|
||||
'''
|
||||
if status_code != 200:
|
||||
raise Exception(f"注册任务失败,状态码:{status_code}")
|
||||
if "code" not in body or body["code"] != 1:
|
||||
|
||||
data = json.loads(body)
|
||||
|
||||
if "code" not in data or data["code"] != 1:
|
||||
raise Exception(f"注册任务失败,返回值:{body}")
|
||||
|
||||
return data["data"]
|
||||
|
||||
def main(status_code: float, body: str):
|
||||
try:
|
||||
check_status(status_code, body)
|
||||
data = parse_response(status_code, body)
|
||||
except Exception as e:
|
||||
raise e
|
||||
|
||||
urls = body["data"]["urls"]
|
||||
urls = data["urls"]
|
||||
|
||||
return {
|
||||
"urls": urls,
|
||||
|
||||
14
nodes/parse_register.json
Normal file
14
nodes/parse_register.json
Normal file
@@ -0,0 +1,14 @@
|
||||
{
|
||||
"body": "{\"code\":1,\"msg\":\"Success\",\"data\":{\"task_id\":6,\"is_new_task\":false}}",
|
||||
"files": [],
|
||||
"headers": {
|
||||
"cache-status": "5e126c1b3d46;detail=mismatch",
|
||||
"connection": "keep-alive",
|
||||
"content-length": "67",
|
||||
"content-type": "application/json",
|
||||
"date": "Tue, 23 Dec 2025 08:30:31 GMT",
|
||||
"server": "uvicorn",
|
||||
"via": "1.1 5e126c1b3d46 (squid/6.13)"
|
||||
},
|
||||
"status_code": 200
|
||||
}
|
||||
@@ -1,23 +1,29 @@
|
||||
import json


def parse_response(status_code: float, body: str):
    """Validate a register-endpoint HTTP response and unwrap its payload.

    Raises Exception when the status code is not 200 or the JSON
    envelope does not carry the agreed success code (code == 1);
    otherwise returns the decoded "data" object.
    """
    if status_code != 200:
        raise Exception(f"注册任务失败,状态码:{status_code}")

    data = json.loads(body)

    if "code" not in data or data["code"] != 1:
        raise Exception(f"注册任务失败,返回值:{body}")

    return data["data"]


def main(status_code: float, body: str):
    """Dify code-node entry: extract task_id / is_new_task from the body.

    Validation errors propagate to the workflow runner; the former
    ``try/except ... raise e`` wrapper was a no-op and has been removed.
    """
    data = parse_response(status_code, body)

    return {
        "task_id": data["task_id"],
        "is_new_task": data["is_new_task"]
    }
|
||||
@@ -1,20 +1,27 @@
|
||||
def check_status(status_code: float, body: str):
|
||||
import json
|
||||
|
||||
def parse_response(status_code: float, body: str):
|
||||
'''
|
||||
检查状态码和约定的返回值
|
||||
并且返回正确的body
|
||||
'''
|
||||
if status_code != 200:
|
||||
raise Exception(f"注册任务失败,状态码:{status_code}")
|
||||
if "code" not in body or body["code"] != 1:
|
||||
|
||||
data = json.loads(body)
|
||||
|
||||
if "code" not in data or data["code"] != 1:
|
||||
raise Exception(f"注册任务失败,返回值:{body}")
|
||||
|
||||
return data["data"]
|
||||
|
||||
def main(status_code: float, body: str):
|
||||
try:
|
||||
check_status(status_code, body)
|
||||
data = parse_response(status_code, body)
|
||||
except Exception as e:
|
||||
raise e
|
||||
|
||||
urls_result = body["data"]
|
||||
|
||||
return {
|
||||
"add_urls_result": urls_result
|
||||
"add_urls_result": data
|
||||
}
|
||||
|
||||
@@ -1,11 +1,19 @@
|
||||
def check_status(status_code: float, body: str):
|
||||
import json
|
||||
|
||||
def parse_response(status_code: float, body: str):
|
||||
'''
|
||||
检查状态码和约定的返回值
|
||||
并且返回正确的body
|
||||
'''
|
||||
if status_code != 200:
|
||||
raise Exception(f"注册任务失败,状态码:{status_code}")
|
||||
if "code" not in body or body["code"] != 1:
|
||||
|
||||
data = json.loads(body)
|
||||
|
||||
if "code" not in data or data["code"] != 1:
|
||||
raise Exception(f"注册任务失败,返回值:{body}")
|
||||
|
||||
return data["data"]
|
||||
|
||||
def format_rag_context(data: list) -> str:
|
||||
'''
|
||||
@@ -37,11 +45,10 @@ def format_rag_context(data: list) -> str:
|
||||
|
||||
def main(status_code: float, body: str):
|
||||
try:
|
||||
check_status(status_code, body)
|
||||
data = parse_response(status_code, body)
|
||||
except Exception as e:
|
||||
raise e
|
||||
|
||||
data = body["data"]
|
||||
rag_context = format_rag_context(data)
|
||||
|
||||
return {
|
||||
|
||||
@@ -5,18 +5,27 @@ description = "Add your description here"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.10"
|
||||
dependencies = [
|
||||
"dashscope>=1.25.5",
|
||||
"fastapi>=0.125.0",
|
||||
"firecrawl>=4.10.2",
|
||||
"firecrawl-py>=4.12.0",
|
||||
"langchain>=1.2.0",
|
||||
"langchain-community>=0.4.1",
|
||||
"langchain-core>=1.2.2",
|
||||
"matplotlib>=3.10.8",
|
||||
"mcp>=1.25.0",
|
||||
"numpy>=2.2.6",
|
||||
"pgvector>=0.4.2",
|
||||
"pinecone>=8.0.0",
|
||||
"pip>=25.3",
|
||||
"psycopg2-binary>=2.9.11",
|
||||
"pydantic-settings>=2.12.0",
|
||||
"pymilvus>=2.6.5",
|
||||
"qdrant-client==1.10.1",
|
||||
"redis>=7.1.0",
|
||||
"requests>=2.32.5",
|
||||
"seaborn>=0.13.2",
|
||||
"tabulate>=0.9.0",
|
||||
"tqdm>=4.67.1",
|
||||
"uvicorn>=0.38.0",
|
||||
]
|
||||
|
||||
192
scripts/evaluate_rag.py
Normal file
192
scripts/evaluate_rag.py
Normal file
@@ -0,0 +1,192 @@
|
||||
import sys
|
||||
import os
|
||||
import json
|
||||
import requests
|
||||
import time
|
||||
import numpy as np
|
||||
from time import sleep
|
||||
|
||||
# 将项目根目录加入路径
|
||||
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
from backend.core.config import settings
|
||||
|
||||
# ================= ⚙️ 配置区域 =================
|
||||
BASE_URL = "http://127.0.0.1:8000"
|
||||
TASK_ID = 19 # ⚠️ 请修改为你实际爬取数据的 Task ID
|
||||
# 自动适配操作系统路径
|
||||
TEST_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), "test_dataset.json")
|
||||
# ==============================================
|
||||
|
||||
class Colors:
    """ANSI escape sequences for colored terminal output in the report."""
    HEADER = '\033[95m'   # bright magenta — section headers
    OKBLUE = '\033[94m'   # blue — informational steps
    OKCYAN = '\033[96m'   # cyan — secondary success (e.g. non-top-1 hits)
    OKGREEN = '\033[92m'  # green — success
    WARNING = '\033[93m'  # yellow — warnings
    FAIL = '\033[91m'     # red — errors / misses
    ENDC = '\033[0m'      # reset all attributes back to default
    BOLD = '\033[1m'      # bold text
|
||||
|
||||
def get_rag_results(query):
    """POST *query* to the v2 search endpoint; return (chunks, latency_ms).

    On a non-200 status or any network failure, the error is logged and
    an empty result list with latency 0 is returned.
    """
    started = time.time()
    # The v2 endpoint performs hybrid retrieval -> rerank server-side;
    # limit=5 requests the Top-5 hits.
    payload = {"query": query, "task_id": TASK_ID, "limit": 5}
    try:
        res = requests.post(f"{BASE_URL}/api/v2/search", json=payload, timeout=15)
        elapsed_ms = (time.time() - started) * 1000
        if res.status_code != 200:
            print(f"{Colors.FAIL}❌ API Error {res.status_code}: {res.text}{Colors.ENDC}")
            return [], 0
        chunks = res.json().get('data', {}).get('results', [])
        return chunks, elapsed_ms
    except Exception as e:
        print(f"{Colors.FAIL}❌ 请求异常: {e}{Colors.ENDC}")
        return [], 0
|
||||
|
||||
def check_hit(content, keywords):
    """Lightweight relevance check: does *content* mention any keyword?

    An empty keyword list means the case has no ground truth to match
    (refusal / open-ended questions) and always counts as a hit; empty
    content never matches a non-empty keyword list.
    """
    if not keywords:
        return True
    if not content:
        return False

    haystack = content.lower()
    return any(kw.lower() in haystack for kw in keywords)
|
||||
|
||||
def run_evaluation():
|
||||
# 1. 加载测试集
|
||||
if not os.path.exists(TEST_FILE):
|
||||
print(f"{Colors.FAIL}❌ 找不到测试文件: {TEST_FILE}{Colors.ENDC}")
|
||||
print("请确保 scripts/test_dataset.json 文件存在。")
|
||||
return
|
||||
|
||||
with open(TEST_FILE, 'r', encoding='utf-8') as f:
|
||||
dataset = json.load(f)
|
||||
|
||||
print(f"{Colors.HEADER}🚀 开始全维度量化评测 (Task ID: {TASK_ID}){Colors.ENDC}")
|
||||
print(f"📄 测试集包含 {len(dataset)} 个样本\n")
|
||||
|
||||
# === 统计容器 ===
|
||||
metrics = {
|
||||
"p_at_1": [], # Precision@1: 正确答案排第1
|
||||
"hit_at_5": [], # HitRate@5: 正确答案在前5
|
||||
"mrr": [], # MRR: 倒数排名分数
|
||||
"latency": [] # 耗时
|
||||
}
|
||||
|
||||
# === 开始循环测试 ===
|
||||
for i, item in enumerate(dataset):
|
||||
query = item['query']
|
||||
print(f"📝 Case {i+1}: {Colors.BOLD}{query}{Colors.ENDC}")
|
||||
|
||||
# 执行检索
|
||||
chunks, latency = get_rag_results(query)
|
||||
metrics['latency'].append(latency)
|
||||
|
||||
# 计算单次指标
|
||||
is_hit_at_5 = 0
|
||||
p_at_1 = 0
|
||||
reciprocal_rank = 0.0
|
||||
hit_position = -1
|
||||
hit_chunk = None
|
||||
|
||||
# 遍历 Top 5 结果
|
||||
for idx, chunk in enumerate(chunks):
|
||||
if check_hit(chunk['content'], item['keywords']):
|
||||
# 命中!
|
||||
is_hit_at_5 = 1
|
||||
hit_position = idx
|
||||
reciprocal_rank = 1.0 / (idx + 1)
|
||||
hit_chunk = chunk
|
||||
|
||||
# 如果是第1个就命中了
|
||||
if idx == 0:
|
||||
p_at_1 = 1
|
||||
|
||||
# 找到即停止 (MRR计算只需知道第一个正确答案的位置)
|
||||
break
|
||||
|
||||
# 记录指标
|
||||
metrics['p_at_1'].append(p_at_1)
|
||||
metrics['hit_at_5'].append(is_hit_at_5)
|
||||
metrics['mrr'].append(reciprocal_rank)
|
||||
|
||||
# 打印单行结果
|
||||
if is_hit_at_5:
|
||||
rank_display = f"Rank {hit_position + 1}"
|
||||
color = Colors.OKGREEN if hit_position == 0 else Colors.OKCYAN
|
||||
source = hit_chunk.get('source_url', 'Unknown')
|
||||
|
||||
# 跨语言污染检查 (简单规则)
|
||||
warning = ""
|
||||
if "/es/" in source and "Spanish" not in query: warning = f"{Colors.WARNING}[跨语言风险]{Colors.ENDC}"
|
||||
elif "/zh/" in source and "如何" not in query and "什么" not in query: warning = f"{Colors.WARNING}[跨语言风险]{Colors.ENDC}"
|
||||
|
||||
print(f" {color}✅ 命中 ({rank_display}){Colors.ENDC} | MRR: {reciprocal_rank:.2f} | 耗时: {latency:.0f}ms {warning}")
|
||||
else:
|
||||
print(f" {Colors.FAIL}❌ 未命中{Colors.ENDC} | 预期关键词: {item['keywords']}")
|
||||
|
||||
# 稍微间隔,避免触发 API 频率限制
|
||||
sleep(0.1)
|
||||
|
||||
# === 最终计算 ===
|
||||
count = len(dataset)
|
||||
if count == 0: return
|
||||
|
||||
avg_p1 = np.mean(metrics['p_at_1']) * 100
|
||||
avg_hit5 = np.mean(metrics['hit_at_5']) * 100
|
||||
avg_mrr = np.mean(metrics['mrr'])
|
||||
avg_latency = np.mean(metrics['latency'])
|
||||
p95_latency = np.percentile(metrics['latency'], 95)
|
||||
|
||||
print("\n" + "="*60)
|
||||
print(f"{Colors.HEADER}📊 最终量化评估报告 (Evaluation Report){Colors.ENDC}")
|
||||
print("="*60)
|
||||
|
||||
# 1. Precision@1 (最关键指标)
|
||||
print(f"🥇 {Colors.BOLD}Precision@1 (首位精确率): {avg_p1:.1f}%{Colors.ENDC}")
|
||||
print(f" - 意义: 用户能否直接得到正确答案。引入 Rerank 后此项应显著提高。")
|
||||
|
||||
# 2. Hit Rate / Recall@5
|
||||
print(f"🥈 Hit Rate@5 (前五召回率): {avg_hit5:.1f}%")
|
||||
print(f" - 意义: 数据库是否真的包含答案。如果此项低,说明爬虫没爬全或混合检索漏了。")
|
||||
|
||||
# 3. MRR
|
||||
print(f"🥉 MRR (平均倒数排名): {avg_mrr:.3f} / 1.0")
|
||||
|
||||
# 4. Latency
|
||||
print(f"⚡ Avg Latency (平均耗时): {avg_latency:.0f} ms")
|
||||
print(f"⚡ P95 Latency (95%分位): {p95_latency:.0f} ms")
|
||||
print("="*60)
|
||||
|
||||
# === 智能诊断 ===
|
||||
print(f"{Colors.HEADER}💡 诊断建议:{Colors.ENDC}")
|
||||
|
||||
if avg_p1 < avg_hit5:
|
||||
gap = avg_hit5 - avg_p1
|
||||
print(f" • {Colors.WARNING}排序优化空间大{Colors.ENDC}: 召回了但没排第一的情况占 {gap:.1f}%。")
|
||||
print(" -> 你的 Rerank 模型生效了吗?或者 Rerank 的 Top N 截断是否太早?")
|
||||
elif avg_p1 > 80:
|
||||
print(f" • {Colors.OKGREEN}排序效果优秀{Colors.ENDC}: 绝大多数正确答案都排在第一位。")
|
||||
|
||||
if avg_hit5 < 50:
|
||||
print(f" • {Colors.FAIL}召回率过低{Colors.ENDC}: 可能是测试集关键词太生僻,或者 TS_RANK 权重过低。")
|
||||
|
||||
if avg_latency > 2000:
|
||||
print(f" • {Colors.WARNING}系统响应慢{Colors.ENDC}: 2秒以上。检查是否因为 Rerank 文档过多(建议 <= 50个)。")
|
||||
|
||||
if __name__ == "__main__":
|
||||
run_evaluation()
|
||||
@@ -1,8 +0,0 @@
|
||||
import random
|
||||
|
||||
# 生成1536 8位随机向量
|
||||
def generate_random_vector(dim=1536):
|
||||
return [round(random.uniform(-1, 1), 8) for _ in range(dim)]
|
||||
|
||||
data = [generate_random_vector() for _ in range(1000)]
|
||||
print(data[0])
|
||||
130
scripts/rob.py
Normal file
130
scripts/rob.py
Normal file
@@ -0,0 +1,130 @@
|
||||
import requests
|
||||
import json
|
||||
import dashscope
|
||||
from http import HTTPStatus
|
||||
from typing import List, Dict
|
||||
|
||||
# ================= Configuration =================
import os

# 1. DashScope API key — read from the environment instead of hard-coding.
#    NOTE(review): a live key was previously committed on this line; it
#    should be rotated. Set DASHSCOPE_API_KEY (see .env.example).
dashscope.api_key = os.getenv("DASHSCOPE_API_KEY", "")

# 2. Local backend address (the FastAPI service in this repo)
BACKEND_SEARCH_URL = "http://127.0.0.1:8000/api/v2/search"

# 3. Model selection (qwen-turbo, qwen-plus, qwen-max)
MODEL_NAME = dashscope.Generation.Models.qwen_plus
# ===========================================
|
||||
|
||||
class WikiBot:
|
||||
def __init__(self):
|
||||
self.history = [] #以此保存多轮对话上下文(可选)
|
||||
|
||||
def search_knowledge_base(self, query: str, top_k: int = 5) -> List[Dict]:
|
||||
"""调用本地后端接口检索相关知识"""
|
||||
try:
|
||||
payload = {
|
||||
"query": query,
|
||||
"limit": top_k
|
||||
}
|
||||
# 调用 /api/v2/search,后端会自动做 embedding
|
||||
resp = requests.post(BACKEND_SEARCH_URL, json=payload)
|
||||
|
||||
if resp.status_code == 200:
|
||||
data = resp.json()
|
||||
if data.get("code") == 1:
|
||||
return data.get("data", [])
|
||||
|
||||
print(f"[Warning] 检索失败: {resp.text}")
|
||||
return []
|
||||
except Exception as e:
|
||||
print(f"[Error] 连接后端失败: {e}")
|
||||
return []
|
||||
|
||||
def build_prompt(self, query: str, context_chunks: List[Dict]) -> str:
|
||||
"""构建 RAG 提示词"""
|
||||
|
||||
if not context_chunks:
|
||||
return f"用户问题:{query}\n\n当前知识库中没有找到相关信息,请直接告知用户无法回答。"
|
||||
|
||||
# 拼接参考资料
|
||||
context_str = ""
|
||||
for idx, item in enumerate(context_chunks):
|
||||
# 这里把 source_url 也带上,方便 AI 引用来源
|
||||
source = item.get('source_url', '未知来源')
|
||||
content = item.get('content', '').strip()
|
||||
context_str += f"【参考资料 {idx+1}】(来源: {source}):\n{content}\n\n"
|
||||
|
||||
# 系统提示词 (System Prompt)
|
||||
prompt = f"""你是一个专业的 Wiki 知识库助手。
|
||||
请严格根据下方的【参考上下文】来回答用户的【问题】。
|
||||
|
||||
要求:
|
||||
1. 回答要准确、简洁,并整合不同参考资料中的信息。
|
||||
2. 如果【参考上下文】中包含答案,请用自己的话回答,并在句尾标注来源,例如 [参考资料 1]。
|
||||
3. 如果【参考上下文】与问题无关或不包含答案,请直接回答:“知识库中暂未收录相关信息”,不要编造答案。
|
||||
4. 保持回答格式清晰(可以使用 Markdown)。
|
||||
|
||||
====== 参考上下文 开始 ======
|
||||
{context_str}
|
||||
====== 参考上下文 结束 ======
|
||||
|
||||
用户问题:{query}
|
||||
"""
|
||||
return prompt
|
||||
|
||||
def chat(self, query: str):
|
||||
"""主对话逻辑"""
|
||||
print(f"\n🔍 正在检索知识库...")
|
||||
|
||||
# 1. 检索
|
||||
chunks = self.search_knowledge_base(query)
|
||||
print(f"✅ 找到 {len(chunks)} 条相关资料")
|
||||
|
||||
# 2. 构建 Prompt
|
||||
prompt = self.build_prompt(query, chunks)
|
||||
|
||||
# (可选) 调试时打印 prompt 看看给 AI 喂了什么
|
||||
# print(f"DEBUG PROMPT:\n{prompt}\n")
|
||||
|
||||
print("🤖 Wiki助手正在思考...\n" + "-"*30)
|
||||
|
||||
# 3. 调用 DashScope 生成 (流式输出)
|
||||
responses = dashscope.Generation.call(
|
||||
model=MODEL_NAME,
|
||||
messages=[
|
||||
{'role': 'system', 'content': 'You are a helpful assistant.'},
|
||||
{'role': 'user', 'content': prompt}
|
||||
],
|
||||
result_format='message', # 设置输出为 message 格式
|
||||
stream=True, # 开启流式输出
|
||||
incremental_output=True # 增量输出
|
||||
)
|
||||
|
||||
full_content = ""
|
||||
for response in responses:
|
||||
if response.status_code == HTTPStatus.OK:
|
||||
text = response.output.choices[0]['message']['content']
|
||||
full_content += text
|
||||
print(text, end='', flush=True)
|
||||
else:
|
||||
print(f"\nRequest id: {response.request_id}, Status code: {response.status_code}, error code: {response.code}, error message: {response.message}")
|
||||
|
||||
print("\n" + "-"*30 + "\n")
|
||||
|
||||
# ================= 运行入口 =================
|
||||
if __name__ == "__main__":
|
||||
bot = WikiBot()
|
||||
print("✨ Wiki 知识库助手已启动 (输入 'q' 或 'exit' 退出)")
|
||||
print("⚠️ 请确保后端服务 (main.py) 正在 localhost:8000 运行")
|
||||
|
||||
while True:
|
||||
user_input = input("\n🙋 请输入问题: ").strip()
|
||||
|
||||
if user_input.lower() in ['q', 'exit', 'quit']:
|
||||
print("再见!")
|
||||
break
|
||||
|
||||
if not user_input:
|
||||
continue
|
||||
|
||||
bot.chat(user_input)
|
||||
@@ -1,87 +1,176 @@
|
||||
import requests
|
||||
import time
|
||||
import json
|
||||
import random
|
||||
import sys
|
||||
|
||||
# 配置后端地址
|
||||
BASE_URL = "http://127.0.0.1:8000"
|
||||
# ================= ⚙️ 配置区域 =================
|
||||
BASE_URL = "http://47.122.127.178/api/v3"
|
||||
# 测试目标:Firecrawl 官方文档 (结构清晰,适合测试)
|
||||
TARGET_URL = "https://docs.firecrawl.dev"
|
||||
# 测试搜索词
|
||||
TEST_QUERY = "credits pricing"
|
||||
# ==============================================
|
||||
|
||||
def log_res(name, response):
|
||||
print(f"\n=== 测试接口: {name} ===")
|
||||
if response.status_code == 200:
|
||||
res_json = response.json()
|
||||
print(f"状态: 成功 (HTTP 200)")
|
||||
print(f"返回数据: {json.dumps(res_json, indent=2, ensure_ascii=False)}")
|
||||
return res_json
|
||||
else:
|
||||
print(f"状态: 失败 (HTTP {response.status_code})")
|
||||
print(f"错误信息: {response.text}")
|
||||
return None
|
||||
class Colors:
|
||||
HEADER = '\033[95m'
|
||||
OKBLUE = '\033[94m'
|
||||
OKCYAN = '\033[96m'
|
||||
OKGREEN = '\033[92m'
|
||||
WARNING = '\033[93m'
|
||||
FAIL = '\033[91m'
|
||||
ENDC = '\033[0m'
|
||||
BOLD = '\033[1m'
|
||||
|
||||
def run_tests():
|
||||
# 测试数据准备
|
||||
test_root_url = f"https://example.com/wiki_{random.randint(1000, 9999)}"
|
||||
|
||||
# 1. 测试 /register
|
||||
print("步骤 1: 注册新任务...")
|
||||
res = requests.post(f"{BASE_URL}/register", json={"url": test_root_url})
|
||||
data = log_res("注册任务", res)
|
||||
if not data or data['code'] != 1: return
|
||||
task_id = data['data']['task_id']
|
||||
def log(step, msg, color=Colors.OKBLUE):
|
||||
print(f"{color}[{step}] {msg}{Colors.ENDC}")
|
||||
|
||||
# 2. 测试 /add_urls
|
||||
print("\n步骤 2: 模拟爬虫发现了新链接,存入队列...")
|
||||
sub_urls = [
|
||||
f"{test_root_url}/page1",
|
||||
f"{test_root_url}/page2",
|
||||
f"{test_root_url}/page1" # 故意重复一个,测试后端去重
|
||||
]
|
||||
res = requests.post(f"{BASE_URL}/add_urls", json={
|
||||
"task_id": task_id,
|
||||
"urls": sub_urls
|
||||
})
|
||||
log_res("存入新链接", res)
|
||||
def run_v3_test():
|
||||
print(f"{Colors.HEADER}🚀 开始 Wiki Crawler V3 API 全链路测试{Colors.ENDC}\n")
|
||||
|
||||
# 3. 测试 /pending_urls
|
||||
print("\n步骤 3: 模拟爬虫节点获取待处理任务...")
|
||||
res = requests.post(f"{BASE_URL}/pending_urls", json={
|
||||
"task_id": task_id,
|
||||
"limit": 2
|
||||
})
|
||||
data = log_res("获取待处理URL", res)
|
||||
if not data or not data['data']['urls']:
|
||||
print("没有获取到待处理URL,停止后续测试")
|
||||
# ---------------------------------------------------------
|
||||
# 1. 创建任务 (POST /tasks)
|
||||
# ---------------------------------------------------------
|
||||
log("STEP 1", f"创建任务 (Map): {TARGET_URL}")
|
||||
try:
|
||||
res = requests.post(f"{BASE_URL}/tasks", json={"url": TARGET_URL})
|
||||
resp = res.json()
|
||||
|
||||
if resp['code'] != 1:
|
||||
log("FAIL", f"任务创建失败: {resp}", Colors.FAIL)
|
||||
return
|
||||
|
||||
data = resp['data']
|
||||
task_id = data['task_id']
|
||||
count = data.get('count', 0)
|
||||
is_new = data.get('is_new', False)
|
||||
|
||||
status_text = "新任务" if is_new else "已有任务"
|
||||
log("SUCCESS", f"ID: {task_id} | 状态: {status_text} | 发现链接: {count}", Colors.OKGREEN)
|
||||
|
||||
except Exception as e:
|
||||
log("FAIL", f"请求异常: {e}", Colors.FAIL)
|
||||
return
|
||||
|
||||
target_url = data['data']['urls'][0]
|
||||
|
||||
# 4. 测试 /save_results
|
||||
print("\n步骤 4: 模拟爬虫抓取完成,存入知识片段和向量...")
|
||||
# 模拟一个 1536 维的向量(已处理精度)
|
||||
mock_embedding = [round(random.uniform(-1, 1), 8) for _ in range(1536)]
|
||||
# ---------------------------------------------------------
|
||||
# 2. 触发执行 (POST /tasks/{id}/run)
|
||||
# ---------------------------------------------------------
|
||||
log("STEP 2", f"触发后台多线程爬取 (Task {task_id})")
|
||||
try:
|
||||
# batch_size=10, 意味着会启动多线程处理这10个链接
|
||||
res = requests.post(
|
||||
f"{BASE_URL}/tasks/{task_id}/run",
|
||||
json={"batch_size": 10}
|
||||
)
|
||||
resp = res.json()
|
||||
|
||||
if resp['code'] == 1:
|
||||
log("SUCCESS", "后台任务已接受 (202 Accepted)", Colors.OKGREEN)
|
||||
else:
|
||||
log("FAIL", f"启动失败: {resp}", Colors.FAIL)
|
||||
return
|
||||
|
||||
except Exception as e:
|
||||
log("FAIL", f"请求异常: {e}", Colors.FAIL)
|
||||
return
|
||||
|
||||
# ---------------------------------------------------------
|
||||
# 3. 实时监控 (GET /tasks/{id})
|
||||
# ---------------------------------------------------------
|
||||
log("STEP 3", "进入实时监控模式 (轮询状态)...", Colors.OKCYAN)
|
||||
|
||||
payload = {
|
||||
"task_id": task_id,
|
||||
"results": [
|
||||
{
|
||||
"source_url": target_url,
|
||||
"chunk_index": 0,
|
||||
"title": "测试页面标题 - 切片1",
|
||||
"content": "这是模拟抓取到的第一段网页内容...",
|
||||
"embedding": mock_embedding
|
||||
},
|
||||
{
|
||||
"source_url": target_url,
|
||||
"chunk_index": 1,
|
||||
"title": "测试页面标题 - 切片2",
|
||||
"content": "这是模拟抓取到的第二段网页内容...",
|
||||
"embedding": mock_embedding
|
||||
max_retries = 20
|
||||
is_completed_batch = False
|
||||
|
||||
for i in range(max_retries):
|
||||
try:
|
||||
res = requests.get(f"{BASE_URL}/tasks/{task_id}")
|
||||
monitor = res.json()['data']
|
||||
stats = monitor['stats']
|
||||
active_threads = monitor['active_threads']
|
||||
|
||||
# 格式化输出状态
|
||||
active_count = len(active_threads)
|
||||
progress_bar = f"Pending: {stats['pending']} | Processing: {stats['processing']}/{active_count} | Completed: {stats['completed']}"
|
||||
|
||||
print(f" [{i+1}/{max_retries}] {progress_bar}")
|
||||
|
||||
# 打印正在爬取的 URL (抽样)
|
||||
if active_threads:
|
||||
# 显示全部
|
||||
print("\n ⚡ 当前正在处理的 URL:")
|
||||
for url in active_threads:
|
||||
print(f" ⚡ {url}")
|
||||
|
||||
# 判断完成条件:
|
||||
# 1. 数据库 processing 为 0
|
||||
# 2. 内存 active_threads 为 0
|
||||
# 3. 至少有一个 completed (防止任务没开始就判定结束)
|
||||
if stats['processing'] == 0 and active_count == 0 and stats['completed'] > 0:
|
||||
is_completed_batch = True
|
||||
print("\n")
|
||||
log("SUCCESS", "✅ 当前批次处理完毕!", Colors.OKGREEN)
|
||||
break
|
||||
|
||||
time.sleep(1.5) # 轮询间隔
|
||||
|
||||
except Exception as e:
|
||||
print(f"\n ⚠️ 监控异常: {e}")
|
||||
break
|
||||
|
||||
if not is_completed_batch:
|
||||
print("\n")
|
||||
log("WARN", "监控超时,爬虫可能仍在后台运行,继续测试搜索...", Colors.WARNING)
|
||||
|
||||
# ---------------------------------------------------------
|
||||
# 4. 混合搜索与验证 (POST /search)
|
||||
# ---------------------------------------------------------
|
||||
log("STEP 4", f"测试混合检索 + Rerank: '{TEST_QUERY}'")
|
||||
|
||||
try:
|
||||
res = requests.post(
|
||||
f"{BASE_URL}/search",
|
||||
json={
|
||||
"query": TEST_QUERY,
|
||||
"task_id": task_id,
|
||||
"limit": 3
|
||||
}
|
||||
]
|
||||
}
|
||||
res = requests.post(f"{BASE_URL}/save_results", json=payload)
|
||||
log_res("保存结果", res)
|
||||
)
|
||||
resp = res.json()
|
||||
|
||||
if resp['code'] != 1:
|
||||
log("FAIL", f"搜索失败: {resp}", Colors.FAIL)
|
||||
return
|
||||
|
||||
results = resp['data']['results']
|
||||
if not results:
|
||||
log("FAIL", "未搜索到结果 (Result Empty)", Colors.FAIL)
|
||||
return
|
||||
|
||||
log("SUCCESS", f"搜索命中 {len(results)} 条结果", Colors.OKGREEN)
|
||||
|
||||
# === 详细验证 ===
|
||||
first = results[0]
|
||||
print(f"\n{Colors.WARNING}--- Top 1 结果详情 ---{Colors.ENDC}")
|
||||
print(f"📄 标题: {first.get('title', 'N/A')}")
|
||||
print(f"🔗 链接: {first.get('source_url')}")
|
||||
print(f"🧭 路径: {first.get('meta_info', {}).get('header_path', 'N/A')}")
|
||||
print(f"🎯 分数: {first.get('score')} " + ("(Reranked)" if first.get('reranked') else "(Rough)"))
|
||||
print(f"📝 内容: {first.get('content')[:80].replace(chr(10), ' ')}...")
|
||||
print(f"{Colors.WARNING}-----------------------{Colors.ENDC}\n")
|
||||
|
||||
# 自动断言
|
||||
if first.get('meta_info') and 'header_path' in first['meta_info']:
|
||||
print(f"✅ [Phase 1.5] 结构化元数据验证通过")
|
||||
else:
|
||||
print(f"❌ [Phase 1.5] 缺少元数据")
|
||||
|
||||
if first.get('reranked'):
|
||||
print(f"✅ [Phase 2.5] Rerank 重排序生效")
|
||||
else:
|
||||
print(f"⚠️ [Phase 2.5] Rerank 未标记 (可能是降级或代码未更新)")
|
||||
|
||||
print("\n✅ 所有 API 流程测试完成!")
|
||||
except Exception as e:
|
||||
log("FAIL", str(e), Colors.FAIL)
|
||||
|
||||
if __name__ == "__main__":
|
||||
run_tests()
|
||||
run_v3_test()
|
||||
86
scripts/test_dataset.json
Normal file
86
scripts/test_dataset.json
Normal file
@@ -0,0 +1,86 @@
|
||||
[
|
||||
{
|
||||
"id": 1,
|
||||
"type": "core_function",
|
||||
"query": "What is the difference between /scrape and /map endpoints?",
|
||||
"ground_truth": "/map is used to crawl a website and retrieve all URLs, while /scrape is used to extract content from a specific URL.",
|
||||
"keywords": ["URL", "content", "specific", "retrieve"]
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"type": "new_feature",
|
||||
"query": "What is the Deep Research feature?",
|
||||
"ground_truth": "Deep Research is an alpha feature allowing agents to perform iterative research tasks.",
|
||||
"keywords": ["alpha", "iterative", "research", "agent"]
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"type": "integration",
|
||||
"query": "How can I integrate Firecrawl with ChatGPT?",
|
||||
"ground_truth": "Firecrawl can be integrated via MCP (Model Context Protocol).",
|
||||
"keywords": ["MCP", "Model Context Protocol", "setup"]
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"type": "multilingual_zh",
|
||||
"query": "如何进行私有化部署 (Self-host)?",
|
||||
"ground_truth": "你需要使用 Docker Compose 进行部署,文档位于 /self-host/quick-start/docker-compose。",
|
||||
"keywords": ["Docker", "Compose", "self-host", "deploy"]
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"type": "api_detail",
|
||||
"query": "What parameters are available for the /extract endpoint?",
|
||||
"ground_truth": "The extract endpoint allows defining a schema for structured data extraction.",
|
||||
"keywords": ["schema", "structured", "prompt"]
|
||||
},
|
||||
{
|
||||
"id": 6,
|
||||
"type": "numeric",
|
||||
"query": "How do credits work for the scrape endpoint?",
|
||||
"ground_truth": "Specific credit usage details are in the /credits endpoint documentation (usually 1 credit per page for basic scrape).",
|
||||
"keywords": ["credit", "usage", "cost"]
|
||||
},
|
||||
{
|
||||
"id": 7,
|
||||
"type": "negative_test",
|
||||
"query": "Does Firecrawl support scraping video content from YouTube?",
|
||||
"ground_truth": "The documentation does not mention video scraping support.",
|
||||
"keywords": []
|
||||
},
|
||||
{
|
||||
"id": 8,
|
||||
"type": "advanced",
|
||||
"query": "How to use batch scrape?",
|
||||
"ground_truth": "Use the /batch/scrape endpoint to submit multiple URLs at once.",
|
||||
"keywords": ["batch", "multiple", "URLs"]
|
||||
},
|
||||
{
|
||||
"id": 9,
|
||||
"type": "automation",
|
||||
"query": "Is there an n8n integration guide?",
|
||||
"ground_truth": "Yes, there is a workflow automation guide for n8n.",
|
||||
"keywords": ["n8n", "workflow", "automation"]
|
||||
},
|
||||
{
|
||||
"id": 10,
|
||||
"type": "security",
|
||||
"query": "Where can I find information about webhook security?",
|
||||
"ground_truth": "Information is available in the Webhooks Security section.",
|
||||
"keywords": ["webhook", "security", "signature"]
|
||||
},
|
||||
{
|
||||
"id": 11,
|
||||
"type": "cross_lingual_trap",
|
||||
"query": "Explain the crawl features in French.",
|
||||
"ground_truth": "The system should ideally retrieve the French document (/fr/features/crawl) and answer in French.",
|
||||
"keywords": ["fonctionnalités", "crawl", "fr"]
|
||||
},
|
||||
{
|
||||
"id": 12,
|
||||
"type": "api_history",
|
||||
"query": "How to check historical token usage?",
|
||||
"ground_truth": "Use the /token-usage-historical endpoint.",
|
||||
"keywords": ["token", "usage", "historical"]
|
||||
}
|
||||
]
|
||||
76
scripts/test_firecrawl.py
Normal file
76
scripts/test_firecrawl.py
Normal file
@@ -0,0 +1,76 @@
|
||||
import requests
|
||||
import json
|
||||
|
||||
def map_firecrawl_docs():
    """Call the local Firecrawl service's /map endpoint to map the Firecrawl docs site.

    Returns:
        The parsed JSON result dict on success, or None on any failure
        (a diagnostic message is printed instead of raising).
    """
    # Local Firecrawl service address (default port 3002; adjust to your setup).
    base_url = "http://localhost:3002"
    # Endpoint path for the map feature.
    map_endpoint = f"{base_url}/api/v1/map"

    payload = {
        # Base URL of the Firecrawl documentation.
        "url": "https://docs.firecrawl.dev",
        # Optional crawl configuration: depth, subdomains, content filtering.
        "config": {
            "depth": 2,                  # two levels covers the docs' main structure
            "includeSubdomains": False,
            "onlyMainContent": True,     # skip navigation / ads etc.
            "limit": 50                  # cap page count to avoid over-crawling
        }
    }

    headers = {
        "Content-Type": "application/json",
        # If the local service requires auth, uncomment and fill in your key:
        # "Authorization": "Bearer YOUR_FIRECRAWL_API_KEY"
    }

    # Bind up-front so error branches can safely reference the response body.
    response = None
    try:
        # Use the `json=` shortcut instead of manually serializing with
        # json.dumps: requests encodes the payload and manages the header.
        response = requests.post(
            map_endpoint,
            json=payload,
            headers=headers,
            timeout=60,  # avoid hanging indefinitely on a slow crawl
        )

        # Raise HTTPError for 4xx/5xx so it is handled below.
        response.raise_for_status()

        result = response.json()
        print("✅ Map 功能调用成功!")
        print("\n📄 爬取结果概览:")
        print(f"总页面数: {len(result.get('links', []))}")
        print(f"基础URL: {result.get('baseUrl')}")

        # Print every discovered documentation link.
        print("\n🔗 爬取到的文档链接:")
        for idx, link in enumerate(result.get('links', []), 1):
            print(f"{idx}. {link}")

        # Persist the result locally for later inspection.
        with open("firecrawl_docs_map.json", "w", encoding="utf-8") as f:
            json.dump(result, f, indent=2, ensure_ascii=False)
        print("\n💾 结果已保存到 firecrawl_docs_map.json 文件")

        return result

    except requests.exceptions.ConnectionError:
        print("❌ 连接失败!请检查本地 Firecrawl 服务是否正在运行(http://localhost:3002)")
    except requests.exceptions.Timeout:
        print("❌ 请求超时!爬取文档可能需要更长时间,可调整 timeout 参数")
    except requests.exceptions.HTTPError as e:
        print(f"❌ HTTP 错误:{e}")
        print(f"响应内容:{response.text}")
    except Exception as e:
        print(f"❌ 未知错误:{str(e)}")


if __name__ == "__main__":
    map_firecrawl_docs()
|
||||
82
scripts/update_sql.py
Normal file
82
scripts/update_sql.py
Normal file
@@ -0,0 +1,82 @@
|
||||
import sys
|
||||
import os
|
||||
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
from sqlalchemy import create_engine, text
|
||||
from backend.core.config import settings
|
||||
|
||||
def update_database_schema():
    """Apply a non-destructive schema upgrade for hybrid (keyword + vector) search.

    Adds `meta_info` (JSONB) and `content_tsvector` (TSVECTOR) columns to
    `knowledge_chunks`, creates GIN indexes, installs a trigger that keeps the
    tsvector fresh on INSERT/UPDATE, and backfills legacy rows. Every statement
    guards itself (IF NOT EXISTS / OR REPLACE), so the script is idempotent.
    """
    print(f"🔌 连接数据库: {settings.DB_NAME}...")
    engine = create_engine(settings.DATABASE_URL)

    commands = [
        # 1. Safely add the meta_info column (existing rows default to {}).
        """
        DO $$
        BEGIN
            IF NOT EXISTS (SELECT 1 FROM information_schema.columns WHERE table_name='knowledge_chunks' AND column_name='meta_info') THEN
                ALTER TABLE knowledge_chunks ADD COLUMN meta_info JSONB DEFAULT '{}';
                RAISE NOTICE '已添加 meta_info 列';
            END IF;
        END $$;
        """,

        # 2. Safely add the content_tsvector column.
        """
        DO $$
        BEGIN
            IF NOT EXISTS (SELECT 1 FROM information_schema.columns WHERE table_name='knowledge_chunks' AND column_name='content_tsvector') THEN
                ALTER TABLE knowledge_chunks ADD COLUMN content_tsvector TSVECTOR;
                RAISE NOTICE '已添加 content_tsvector 列';
            END IF;
        END $$;
        """,

        # 3. Create indexes (does not touch existing data).
        "CREATE INDEX IF NOT EXISTS idx_chunks_meta ON knowledge_chunks USING GIN (meta_info);",
        "CREATE INDEX IF NOT EXISTS idx_chunks_tsvector ON knowledge_chunks USING GIN (content_tsvector);",

        # 4. Trigger function that populates the tsvector for new rows.
        """
        CREATE OR REPLACE FUNCTION chunks_tsvector_trigger() RETURNS trigger AS $$
        BEGIN
            new.content_tsvector := to_tsvector('english', coalesce(new.title, '') || ' ' || new.content);
            return new;
        END
        $$ LANGUAGE plpgsql;
        """,

        # 5. Attach the trigger (only once).
        """
        DO $$
        BEGIN
            IF NOT EXISTS (SELECT 1 FROM pg_trigger WHERE tgname = 'tsvectorupdate') THEN
                CREATE TRIGGER tsvectorupdate BEFORE INSERT OR UPDATE
                ON knowledge_chunks FOR EACH ROW EXECUTE PROCEDURE chunks_tsvector_trigger();
            END IF;
        END $$;
        """,

        # 6. Backfill legacy rows so previously stored data also gets a
        #    keyword index.
        """
        UPDATE knowledge_chunks
        SET content_tsvector = to_tsvector('english', coalesce(title, '') || ' ' || content)
        WHERE content_tsvector IS NULL;
        """
    ]

    # Run each statement in its OWN transaction. The previous version wrapped
    # the whole list in a single `engine.begin()` block: in PostgreSQL the
    # first failing statement aborts that transaction, and every subsequent
    # statement then fails with "current transaction is aborted" — which made
    # the per-command try/except useless.
    for cmd in commands:
        try:
            with engine.begin() as conn:
                conn.execute(text(cmd))
        except Exception as e:
            print(f"⚠️ 执行警告 (通常可忽略): {e}")

    print("✅ 数据库结构升级完成!旧数据已保留并兼容。")


if __name__ == "__main__":
    update_database_schema()
|
||||
0
tests/__init__.py
Normal file
0
tests/__init__.py
Normal file
0
tests/rag_benchmark/__init__.py
Normal file
0
tests/rag_benchmark/__init__.py
Normal file
BIN
tests/rag_benchmark/benchmark_report.png
Normal file
BIN
tests/rag_benchmark/benchmark_report.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 100 KiB |
86
tests/rag_benchmark/dataset.json
Normal file
86
tests/rag_benchmark/dataset.json
Normal file
@@ -0,0 +1,86 @@
|
||||
[
|
||||
{
|
||||
"id": 1,
|
||||
"type": "core_function",
|
||||
"query": "What is the difference between /scrape and /map endpoints?",
|
||||
"ground_truth": "/map is used to crawl a website and retrieve all URLs, while /scrape is used to extract content from a specific URL.",
|
||||
"keywords": ["URL", "content", "specific", "retrieve"]
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"type": "new_feature",
|
||||
"query": "What is the Deep Research feature?",
|
||||
"ground_truth": "Deep Research is an alpha feature allowing agents to perform iterative research tasks.",
|
||||
"keywords": ["alpha", "iterative", "research", "agent"]
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"type": "integration",
|
||||
"query": "How can I integrate Firecrawl with ChatGPT?",
|
||||
"ground_truth": "Firecrawl can be integrated via MCP (Model Context Protocol).",
|
||||
"keywords": ["MCP", "Model Context Protocol", "setup"]
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"type": "multilingual_zh",
|
||||
"query": "如何进行私有化部署 (Self-host)?",
|
||||
"ground_truth": "你需要使用 Docker Compose 进行部署,文档位于 /self-host/quick-start/docker-compose。",
|
||||
"keywords": ["Docker", "Compose", "self-host", "deploy"]
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"type": "api_detail",
|
||||
"query": "What parameters are available for the /extract endpoint?",
|
||||
"ground_truth": "The extract endpoint allows defining a schema for structured data extraction.",
|
||||
"keywords": ["schema", "structured", "prompt"]
|
||||
},
|
||||
{
|
||||
"id": 6,
|
||||
"type": "numeric",
|
||||
"query": "How do credits work for the scrape endpoint?",
|
||||
"ground_truth": "Specific credit usage details are in the /credits endpoint documentation (usually 1 credit per page for basic scrape).",
|
||||
"keywords": ["credit", "usage", "cost"]
|
||||
},
|
||||
{
|
||||
"id": 7,
|
||||
"type": "negative_test",
|
||||
"query": "Does Firecrawl support scraping video content from YouTube?",
|
||||
"ground_truth": "The documentation does not mention video scraping support.",
|
||||
"keywords": []
|
||||
},
|
||||
{
|
||||
"id": 8,
|
||||
"type": "advanced",
|
||||
"query": "How to use batch scrape?",
|
||||
"ground_truth": "Use the /batch/scrape endpoint to submit multiple URLs at once.",
|
||||
"keywords": ["batch", "multiple", "URLs"]
|
||||
},
|
||||
{
|
||||
"id": 9,
|
||||
"type": "automation",
|
||||
"query": "Is there an n8n integration guide?",
|
||||
"ground_truth": "Yes, there is a workflow automation guide for n8n.",
|
||||
"keywords": ["n8n", "workflow", "automation"]
|
||||
},
|
||||
{
|
||||
"id": 10,
|
||||
"type": "security",
|
||||
"query": "Where can I find information about webhook security?",
|
||||
"ground_truth": "Information is available in the Webhooks Security section.",
|
||||
"keywords": ["webhook", "security", "signature"]
|
||||
},
|
||||
{
|
||||
"id": 11,
|
||||
"type": "cross_lingual_trap",
|
||||
"query": "Explain the crawl features in French.",
|
||||
"ground_truth": "The system should ideally retrieve the French document (/fr/features/crawl) and answer in French.",
|
||||
"keywords": ["fonctionnalités", "crawl", "fr"]
|
||||
},
|
||||
{
|
||||
"id": 12,
|
||||
"type": "api_history",
|
||||
"query": "How to check historical token usage?",
|
||||
"ground_truth": "Use the /token-usage-historical endpoint.",
|
||||
"keywords": ["token", "usage", "historical"]
|
||||
}
|
||||
]
|
||||
74
tests/rag_benchmark/evaluator.py
Normal file
74
tests/rag_benchmark/evaluator.py
Normal file
@@ -0,0 +1,74 @@
|
||||
import json
|
||||
import logging
|
||||
from backend.services.llm_service import llm_service
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class RAGEvaluator:
    """Scores a RAG pipeline: keyword recall for retrieval, LLM-as-judge for generation."""

    def __init__(self):
        # Shared LLM client used as the judge model.
        self.llm = llm_service

    def calculate_retrieval_metrics(self, retrieved_docs, dataset_item):
        """Compute the retrieval-stage metric: keyword recall (coverage).

        Checks how many of the dataset item's `keywords` appear,
        case-insensitively, in the concatenated `content` of the retrieved
        documents.

        Returns:
            dict with `keyword_recall` (0.0-1.0) and boolean `hit`.
        """
        required_keywords = dataset_item.get("keywords", [])
        if not required_keywords:
            # No keyword requirement -> count as a full hit by definition.
            return {"keyword_recall": 1.0, "hit": True}

        # Concatenate all retrieved text and lowercase it once.
        full_context = " ".join(doc['content'] for doc in retrieved_docs).lower()

        found_count = sum(1 for kw in required_keywords if kw.lower() in full_context)
        recall = found_count / len(required_keywords)

        return {
            "keyword_recall": recall,
            # Lenient hit criterion: any keyword match counts as a hit.
            # Tighten to `recall > 0.5` for a stricter definition.
            "hit": recall > 0
        }

    def evaluate_generation_quality(self, question, generated_answer, ground_truth_answer, q_type):
        """Use the LLM as a judge to rate the generated answer (score 1-5).

        Returns the parsed judge verdict dict ({"score": ..., "reason": ...});
        falls back to a zero score if judging or parsing fails.
        """
        prompt = f"""
        你是一名RAG系统的自动化测试裁判。请根据以下信息对“系统回答”进行评分(1-5分)。

        【测试类型】: {q_type}
        【用户问题】: {question}
        【标准答案 (Ground Truth)】: {ground_truth_answer}
        【系统回答】: {generated_answer}

        评分标准:
        - 5分: 含义与标准答案完全一致,逻辑正确,无幻觉。
        - 4分: 核心意思正确,但缺少部分细节或废话较多。
        - 3分: 回答了一部分正确信息,但有遗漏或轻微错误。
        - 2分: 包含大量错误信息或严重答非所问。
        - 1分: 完全错误,或产生了严重幻觉(例如在负向测试中编造了不存在的功能)。

        注意:对于"negative_test"(负向测试),如果标准答案是“不支持/文档未提及”,而系统回答诚实地说“未找到相关信息”或“不支持”,应给满分。

        请仅返回JSON格式: {{"score": 5, "reason": "理由..."}}
        """

        try:
            # Constrain the judge to JSON-only output via the system prompt.
            result_str = self.llm.chat(prompt, system_prompt="你是一个只输出JSON的评测机器人。")

            # Robustly extract the JSON object. The old approach
            # (`split("```json")[-1].split("```")[0]`) returned an empty
            # string for plain ``` fences, making json.loads fail.
            # Slicing from the first '{' to the last '}' handles bare JSON,
            # ```json fences and plain ``` fences alike.
            start = result_str.find("{")
            end = result_str.rfind("}")
            if start != -1 and end > start:
                result_str = result_str[start:end + 1]

            return json.loads(result_str)
        except Exception as e:
            logger.error(f"Eval LLM failed: {e}")
            # Degrade gracefully: a zero score flags the failure downstream.
            return {"score": 0, "reason": "Evaluation Script Error"}
|
||||
168
tests/rag_benchmark/run_benchmark.py
Normal file
168
tests/rag_benchmark/run_benchmark.py
Normal file
@@ -0,0 +1,168 @@
|
||||
import sys
|
||||
import os
|
||||
import time
|
||||
import json
|
||||
from collections import defaultdict
|
||||
from tabulate import tabulate # 需要 pip install tabulate
|
||||
|
||||
# 路径 Hack: 确保能导入 backend 模块
|
||||
sys.path.append(os.path.join(os.path.dirname(__file__), '../../../'))
|
||||
|
||||
from backend.services.data_service import data_service
|
||||
from backend.services.llm_service import llm_service
|
||||
from tests.rag_benchmark.evaluator import RAGEvaluator
|
||||
|
||||
# ================= 配置区 =================
|
||||
# 请填入你数据库中真实存在的、包含爬取数据的 task_id
|
||||
TEST_TASK_ID = 19
|
||||
# ========================================
|
||||
|
||||
def run_experiment(config_name, dataset, retrieve_func, generate_func):
    """Run one retrieval/generation configuration over the whole dataset.

    Args:
        config_name: label used in progress output and the final report.
        dataset: list of test-case dicts (id/type/query/ground_truth/keywords).
        retrieve_func: callable(query) -> list of retrieved document dicts.
        generate_func: callable(query, context_str) -> generated answer string.

    Returns:
        Summary dict: average score (1-5), average keyword recall, average
        latency, and the weakest question category.
    """
    print(f"\n🚀 开始测试配置: [ {config_name} ]")
    evaluator = RAGEvaluator()
    results = []

    total_latency = 0

    # Per-category accumulators, e.g. how core_function vs negative_test score.
    category_stats = defaultdict(lambda: {"count": 0, "score_sum": 0, "recall_sum": 0})

    for item in dataset:
        start_time = time.time()

        # 1. Retrieval
        retrieved_docs = retrieve_func(item['query'])

        # 2. Generation — build the context; fall back to an explicit
        #    "nothing retrieved" notice so the LLM can answer honestly.
        if retrieved_docs:
            context_str = "\n---\n".join([f"Source: {d.get('source_url', 'unknown')}\nContent: {d['content']}" for d in retrieved_docs])
        else:
            context_str = "没有检索到任何相关文档。"

        answer = generate_func(item['query'], context_str)

        latency = time.time() - start_time
        total_latency += latency

        # 3. Evaluation: keyword recall of retrieval + LLM-judged answer quality.
        retrieval_metric = evaluator.calculate_retrieval_metrics(retrieved_docs, item)
        gen_eval = evaluator.evaluate_generation_quality(
            item['query'], answer, item['ground_truth'], item['type']
        )

        # Record the per-question result.
        row = {
            "id": item['id'],
            "type": item['type'],
            "query": item['query'],
            "recall": retrieval_metric['keyword_recall'],
            "score": gen_eval['score'],
            "reason": gen_eval.get('reason', '')[:50] + "...",  # truncated for display
            "latency": latency
        }
        results.append(row)

        # Accumulate per-category statistics.
        cat = item['type']
        category_stats[cat]["count"] += 1
        category_stats[cat]["score_sum"] += gen_eval['score']
        category_stats[cat]["recall_sum"] += retrieval_metric['keyword_recall']

        # Compact live progress line.
        status_icon = "✅" if gen_eval['score'] >= 4 else "⚠️" if gen_eval['score'] >= 3 else "❌"
        print(f" {status_icon} ID:{item['id']} [{item['type'][:10]}] Score:{gen_eval['score']} | Recall:{retrieval_metric['keyword_recall']:.1f}")

    # Guard: an empty dataset previously crashed with ZeroDivisionError on the
    # averages (and ValueError from min() over empty category_stats).
    if not results:
        return {
            "Config": config_name,
            "Avg Score (1-5)": "n/a",
            "Avg Recall": "n/a",
            "Avg Latency": "n/a",
            "Weakest Category": "n/a",
        }

    # --- Aggregate this run's metrics ---
    avg_score = sum(r['score'] for r in results) / len(results)
    avg_recall = sum(r['recall'] for r in results) / len(results)
    avg_latency = total_latency / len(results)

    # NOTE: the old version also built an unused `cat_report` list here; it
    # was dead code and has been removed.
    return {
        "Config": config_name,
        "Avg Score (1-5)": f"{avg_score:.2f}",
        "Avg Recall": f"{avg_recall:.2%}",
        "Avg Latency": f"{avg_latency:.3f}s",
        "Weakest Category": min(category_stats, key=lambda k: category_stats[k]['score_sum']/category_stats[k]['count'])
    }
|
||||
|
||||
def main():
    """Load the benchmark dataset and compare four retrieval strategies head-to-head.

    Experiments: keyword-only (BM25/tsvector), vector-only, hybrid, and
    hybrid + rerank. Each runs through run_experiment() and the aggregated
    results are printed as a GitHub-style table.
    """
    # 1. Load the dataset from alongside this script.
    dataset_path = os.path.join(os.path.dirname(__file__), 'dataset.json')
    if not os.path.exists(dataset_path):
        print("Error: dataset.json not found.")
        return

    with open(dataset_path, 'r', encoding='utf-8') as f:
        dataset = json.load(f)

    print(f"载入 {len(dataset)} 条测试用例,准备开始横向评测...")

    # 2. Define the experiment variables (retrieval function + generation function).

    # === Exp A: keyword-only (simulates traditional search) ===
    def retrieve_keyword(query):
        # vector_weight=0 forces the SQL TSVector (keyword) path.
        # NOTE: a dummy vector is still required as a placeholder argument.
        dummy_vec = [0.0] * 1536
        res = data_service.search(query, dummy_vec, task_id=TEST_TASK_ID, vector_weight=0.0, candidates_num=5)
        return res['results']

    # === Exp B: vector-only (semantic retrieval) ===
    def retrieve_vector(query):
        vec = llm_service.get_embedding(query)
        # vector_weight=1 ignores keyword matching entirely.
        res = data_service.search(query, vec, task_id=TEST_TASK_ID, vector_weight=1.0, candidates_num=5)
        return res['results']

    # === Exp C: hybrid retrieval ===
    def retrieve_hybrid(query):
        vec = llm_service.get_embedding(query)
        # Default blend: 0.7 vector + 0.3 keyword.
        res = data_service.search(query, vec, task_id=TEST_TASK_ID, vector_weight=0.7, candidates_num=5)
        return res['results']

    # === Exp D: hybrid + rerank ===
    def retrieve_rerank(query):
        vec = llm_service.get_embedding(query)
        # 1. Widen the recall set (Top 30).
        res = data_service.search(query, vec, task_id=TEST_TASK_ID, vector_weight=0.7, candidates_num=30)
        initial_docs = res['results']
        # 2. Precision rerank down to Top 5.
        reranked = llm_service.rerank(query, initial_docs, top_n=5)
        return reranked

    # === Shared generation function used by every experiment ===
    def generate_answer(query, context):
        system_prompt = "你是一个智能助手。请严格根据提供的上下文回答用户问题。如果上下文中没有答案,请直接说'未找到相关信息'。"
        prompt = f"参考上下文:\n{context}\n\n用户问题:{query}"
        return llm_service.chat(prompt, system_prompt=system_prompt)

    # 3. Run all four experiments.
    final_report = []

    final_report.append(run_experiment("1. Keyword Only (BM25)", dataset, retrieve_keyword, generate_answer))
    final_report.append(run_experiment("2. Vector Only", dataset, retrieve_vector, generate_answer))
    final_report.append(run_experiment("3. Hybrid (Base)", dataset, retrieve_hybrid, generate_answer))
    final_report.append(run_experiment("4. Hybrid + Rerank", dataset, retrieve_rerank, generate_answer))

    # 4. Print the final comparison table.
    print("\n\n📊 ================= 最终横向对比报告 (Final Report) ================= 📊")
    print(tabulate(final_report, headers="keys", tablefmt="github"))
    print("\n💡 解读建议:")
    print("1. 如果 'Avg Recall' 低,说明切片(Chunking)或检索算法找不到资料。")
    print("2. 如果 Recall 高但 'Avg Score' 低,说明 LLM 产生了幻觉或 Prompt 没写好。")
    print("3. 'Weakest Category' 帮你发现短板(如多语言或负向测试)。")


if __name__ == "__main__":
    main()
|
||||
208
tests/rag_benchmark/visual_benchmark.py
Normal file
208
tests/rag_benchmark/visual_benchmark.py
Normal file
@@ -0,0 +1,208 @@
|
||||
import sys
|
||||
import os
|
||||
import time
|
||||
import json
|
||||
import pandas as pd
|
||||
import seaborn as sns
|
||||
import matplotlib.pyplot as plt
|
||||
from tqdm import tqdm
|
||||
|
||||
# 路径 Hack: 确保能导入 backend
|
||||
current_dir = os.path.dirname(os.path.abspath(__file__))
|
||||
project_root = os.path.abspath(os.path.join(current_dir, '../../'))
|
||||
if project_root not in sys.path:
|
||||
sys.path.insert(0, project_root)
|
||||
|
||||
# 直接导入服务类 (Direct Call)
|
||||
from backend.services.data_service import data_service
|
||||
from backend.services.llm_service import llm_service
|
||||
|
||||
# ================= 配置区 =================
|
||||
TEST_TASK_ID = 19 # 请修改为真实的 Task ID
|
||||
DATASET_PATH = os.path.join(current_dir, 'dataset.json')
|
||||
OUTPUT_IMG = os.path.join(current_dir, 'benchmark_report.png')
|
||||
# ========================================
|
||||
|
||||
class RAGEvaluator:
    """Evaluation helper: computes keyword recall and LLM-judged answer scores."""

    def __init__(self):
        # Shared LLM client used as the judge model.
        self.llm = llm_service

    def calculate_recall(self, retrieved_docs, keywords):
        """Return the fraction of `keywords` found (case-insensitively) in the docs."""
        if not keywords:
            return 1.0  # questions with no keyword requirement score full marks

        full_text = " ".join([d['content'] for d in retrieved_docs]).lower()
        hit_count = sum(1 for k in keywords if k.lower() in full_text)
        return hit_count / len(keywords)

    def judge_answer(self, query, answer, ground_truth):
        """Ask the LLM to score the answer 1-5; returns 1 on any failure."""
        prompt = f"""
        作为 RAG 评测员,请对【系统回答】打分 (1-5)。
        用户问题: {query}
        标准答案: {ground_truth}
        系统回答: {answer}

        标准:
        5: 含义完全一致,无幻觉。
        3: 包含核心信息,但有遗漏。
        1: 错误或严重幻觉。

        只返回数字 (1, 2, 3, 4, 5)。
        """
        try:
            # Call the chat method added to llm_service.
            res = self.llm.chat(prompt)
            # Take the FIRST digit in the reply. The old
            # ''.join(filter(str.isdigit, res)) concatenated every digit, so
            # a reply like "4/5" became 45 and was clamped to 5 —
            # systematically inflating scores.
            score = next((int(ch) for ch in res if ch.isdigit()), None)
            if score is None:
                return 1  # no digit found -> fail-safe floor
            return min(max(score, 1), 5)  # clamp to the 1-5 range
        except Exception:
            # Narrowed from a bare `except:` which also swallowed
            # KeyboardInterrupt/SystemExit.
            return 1  # fail-safe floor score
|
||||
|
||||
class Visualizer:
    """Plotting helper: renders the benchmark results DataFrame as a 3-chart dashboard."""

    def plot_dashboard(self, df):
        """Render and save the dashboard PNG to the module-level OUTPUT_IMG path.

        Expects *df* with columns: config, type, score, recall, latency
        (one row per config/question pair, as built by main()).
        """
        # Global seaborn style.
        sns.set_theme(style="whitegrid")
        # CJK-capable font fallback (SimHei if installed, else Arial/DejaVu).
        plt.rcParams['font.sans-serif'] = ['SimHei', 'Arial', 'DejaVu Sans']
        plt.rcParams['axes.unicode_minus'] = False

        fig = plt.figure(figsize=(18, 10))
        gs = fig.add_gridspec(2, 2)

        # Chart 1: overall metric comparison (bar chart).
        ax1 = fig.add_subplot(gs[0, 0])
        # Reshape to long format so seaborn can plot grouped bars.
        df_summary = df.groupby('config')[['score', 'recall']].mean().reset_index()
        df_melt = df_summary.melt(id_vars='config', var_name='Metric', value_name='Value')
        # Scale recall (0-1) by 5 so it shares the 0-5 score axis.
        df_melt.loc[df_melt['Metric'] == 'recall', 'Value'] *= 5

        sns.barplot(data=df_melt, x='config', y='Value', hue='Metric', ax=ax1, palette="viridis")
        ax1.set_title('Overall Performance (Score & Recall)', fontsize=14, fontweight='bold')
        ax1.set_ylabel('Score (1-5) / Recall (x5)')
        ax1.set_ylim(0, 5.5)
        for container in ax1.containers:
            ax1.bar_label(container, fmt='%.1f')

        # Chart 2: latency vs quality trade-off (scatter plot).
        ax2 = fig.add_subplot(gs[0, 1])
        df_latency = df.groupby('config')[['score', 'latency']].mean().reset_index()
        sns.scatterplot(data=df_latency, x='latency', y='score', hue='config', s=200, ax=ax2, palette="deep")

        # Annotate each point with its config name (offset slightly right).
        for i in range(df_latency.shape[0]):
            ax2.text(
                df_latency.latency[i]+0.05,
                df_latency.score[i],
                df_latency.config[i],
                fontsize=10
            )
        ax2.set_title('Trade-off: Latency vs Quality', fontsize=14, fontweight='bold')
        ax2.set_xlabel('Avg Latency (seconds)')
        ax2.set_ylabel('Avg Quality Score (1-5)')

        # Chart 3: per-category heatmap — visualizes the "weakest category".
        ax3 = fig.add_subplot(gs[1, :])  # spans the whole bottom row
        pivot_data = df.pivot_table(index='config', columns='type', values='score', aggfunc='mean')
        sns.heatmap(pivot_data, annot=True, cmap="RdYlGn", center=3, fmt=".1f", ax=ax3, linewidths=.5)
        ax3.set_title('Category Breakdown (Find the Weakest Link)', fontsize=14, fontweight='bold')
        ax3.set_xlabel('')
        ax3.set_ylabel('')

        plt.tight_layout()
        plt.savefig(OUTPUT_IMG)
        print(f"\n📊 报表已生成: {OUTPUT_IMG}")
|
||||
|
||||
def main():
    """Run all four retrieval configurations over the dataset and plot a dashboard.

    Calls the data/LLM services directly (no HTTP layer), collects per-question
    metrics, and hands the resulting DataFrame to Visualizer.plot_dashboard().
    """
    # 1. Load the dataset.
    if not os.path.exists(DATASET_PATH):
        print("Dataset not found!")
        return
    with open(DATASET_PATH, 'r', encoding='utf-8') as f:
        dataset = json.load(f)

    # 2. Experiment configurations (direct service calls).
    configs = [
        {
            "name": "1. BM25 (Keyword)",
            # vector_weight=0 forces pure keyword search; the zero vector is
            # only a placeholder required by the search() signature.
            "retriever": lambda q: data_service.search(q, [0.0]*1536, task_id=TEST_TASK_ID, vector_weight=0.0, candidates_num=5)['results'],
            "rerank": False
        },
        {
            "name": "2. Vector Only",
            "retriever": lambda q: data_service.search(q, llm_service.get_embedding(q), task_id=TEST_TASK_ID, vector_weight=1.0, candidates_num=5)['results'],
            "rerank": False
        },
        {
            "name": "3. Hybrid (Base)",
            "retriever": lambda q: data_service.search(q, llm_service.get_embedding(q), task_id=TEST_TASK_ID, vector_weight=0.7, candidates_num=5)['results'],
            "rerank": False
        },
        {
            "name": "4. Hybrid + Rerank",
            "retriever": lambda q: data_service.search(q, llm_service.get_embedding(q), task_id=TEST_TASK_ID, vector_weight=0.7, candidates_num=30)['results'],  # recall Top 30; reranked below
            "rerank": True
        }
    ]

    evaluator = RAGEvaluator()
    all_results = []

    print("🚀 开始自动化评测 (Visualization Mode)...")

    # 3. Nested loop (config -> dataset item) with a single overall progress bar.
    total_steps = len(configs) * len(dataset)
    pbar = tqdm(total=total_steps, desc="Running Experiments")

    for cfg in configs:
        for item in dataset:
            pbar.set_description(f"Testing {cfg['name']}")

            start_time = time.time()

            # A. Retrieval
            docs = cfg['retriever'](item['query'])

            # B. Rerank (when enabled for this configuration)
            if cfg['rerank']:
                docs = llm_service.rerank(item['query'], docs, top_n=5)

            # C. Generation
            context = "\n".join([d['content'] for d in docs]) if docs else ""
            if not context:
                answer = "未找到相关信息"
            else:
                prompt = f"Context:\n{context}\n\nQuestion: {item['query']}"
                answer = llm_service.chat(prompt)  # generation call

            latency = time.time() - start_time

            # D. Evaluation metrics
            recall = evaluator.calculate_recall(docs, item.get('keywords', []))
            score = evaluator.judge_answer(item['query'], answer, item['ground_truth'])

            # E. Collect one row per (config, question) pair.
            all_results.append({
                "config": cfg['name'],
                "id": item['id'],
                "type": item['type'],  # category field for the heatmap
                "recall": recall,
                "score": score,
                "latency": latency
            })

            pbar.update(1)

    pbar.close()

    # 4. Build the DataFrame and render the dashboard image.
    df = pd.DataFrame(all_results)
    viz = Visualizer()
    viz.plot_dashboard(df)


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user