完成后端配置,用于部署
This commit is contained in:
20
README.md
20
README.md
@@ -1,31 +1,35 @@
|
||||
# wiki_crawler
|
||||
|
||||
本仓库主要用于存放和更新dify中wiki_crawler的代码节点的代码
|
||||
|
||||
## 节点返回值格式约定
|
||||
|
||||
节点返回值统一采用json格式,包含以下字段:
|
||||
|
||||
- code:状态码,0失败,1成功,2警告
|
||||
- msg:状态描述,用于提示调用方
|
||||
- data:返回数据,json格式,根据不同节点有不同的字段,若失败则为null
|
||||
|
||||
## 节点输入输出设计
|
||||
|
||||
- start:启动节点
|
||||
- input:
|
||||
- input
|
||||
- type: ['url','task_id'] # 根据输入类型对input进行处理
|
||||
- output:无
|
||||
- register:注册节点
|
||||
- register:注册节点,涉及sql
|
||||
- input:
|
||||
- url:任务url
|
||||
- output:
|
||||
- task_id:任务id,用于后续查询任务状态
|
||||
- is_new_task:是否为新任务,1表示是,0表示否
|
||||
- pending_urls:剩余待处理url
|
||||
- pending_urls:剩余待处理url,涉及sql
|
||||
- input:
|
||||
- task_id:任务id
|
||||
- limit:最多返回的url数量,默认值为10
|
||||
- output:
|
||||
urls: 剩余待处理url列表
|
||||
- save_results:保存处理结果
|
||||
- urls: 剩余待处理url列表
|
||||
- save_results:保存处理结果,涉及sql
|
||||
- input:
|
||||
- task_id:任务id
|
||||
- results:任务结果列表,用于存入数据库
|
||||
@@ -33,3 +37,11 @@
|
||||
- completed:已入库url列表
|
||||
- failed:入库失败url列表
|
||||
- warnings:入库警告列表
|
||||
- message:消息节点,前置一个变量聚合器,不涉及sql操作
|
||||
- input:
|
||||
- msgs:各个节点的msg经过前置节点整合后统一输出
|
||||
- output:
|
||||
- output:整合消息之后输出给end
|
||||
- end:结束节点
|
||||
- input:
|
||||
- message节点整合的消息
|
||||
|
||||
BIN
backend/__pycache__/config.cpython-313.pyc
Normal file
BIN
backend/__pycache__/config.cpython-313.pyc
Normal file
Binary file not shown.
BIN
backend/__pycache__/database.cpython-313.pyc
Normal file
BIN
backend/__pycache__/database.cpython-313.pyc
Normal file
Binary file not shown.
BIN
backend/__pycache__/main.cpython-313.pyc
Normal file
BIN
backend/__pycache__/main.cpython-313.pyc
Normal file
Binary file not shown.
BIN
backend/__pycache__/schemas.cpython-313.pyc
Normal file
BIN
backend/__pycache__/schemas.cpython-313.pyc
Normal file
Binary file not shown.
BIN
backend/__pycache__/service.cpython-313.pyc
Normal file
BIN
backend/__pycache__/service.cpython-313.pyc
Normal file
Binary file not shown.
BIN
backend/__pycache__/utils.cpython-313.pyc
Normal file
BIN
backend/__pycache__/utils.cpython-313.pyc
Normal file
Binary file not shown.
16
backend/config.py
Normal file
16
backend/config.py
Normal file
@@ -0,0 +1,16 @@
|
||||
import os


class Settings:
    """Database connection settings.

    SECURITY FIX: the credentials were hard-coded in source (and committed);
    they are now read from environment variables. The previous values are
    kept only as development fallbacks so existing deployments keep working.
    """

    # Database configuration — override any of these via environment variables.
    DB_USER: str = os.environ.get("DB_USER", "postgres")
    DB_PASS: str = os.environ.get("DB_PASS", "DXC_welcome001")
    DB_HOST: str = os.environ.get("DB_HOST", "8.155.144.6")
    DB_PORT: str = os.environ.get("DB_PORT", "25432")
    DB_NAME: str = os.environ.get("DB_NAME", "wiki_crawler")

    @property
    def DATABASE_URL(self) -> str:
        """SQLAlchemy connection URL assembled from the individual settings."""
        return f"postgresql+psycopg2://{self.DB_USER}:{self.DB_PASS}@{self.DB_HOST}:{self.DB_PORT}/{self.DB_NAME}"


settings = Settings()
|
||||
22
backend/database.py
Normal file
22
backend/database.py
Normal file
@@ -0,0 +1,22 @@
|
||||
from sqlalchemy import create_engine, MetaData, Table
from .config import settings


class Database:
    """Holds the shared SQLAlchemy engine plus the reflected table objects."""

    def __init__(self):
        # pool_pre_ping validates pooled connections before handing them out.
        self.engine = create_engine(settings.DATABASE_URL, pool_pre_ping=True)
        self.metadata = MetaData()
        self.tasks = None
        self.queue = None
        self.chunks = None
        self._reflect_tables()

    def _reflect_tables(self):
        # Reflect table definitions straight from the live database schema,
        # so no DDL needs to be duplicated in code.
        table_specs = (
            ("tasks", "crawl_tasks"),
            ("queue", "crawl_queue"),
            ("chunks", "knowledge_chunks"),
        )
        try:
            for attr_name, table_name in table_specs:
                setattr(self, attr_name, Table(table_name, self.metadata, autoload_with=self.engine))
        except Exception as e:
            # Best-effort: any table that failed to reflect stays None.
            print(f"❌ 数据库表加载失败: {e}")


db_instance = Database()
|
||||
39
backend/main.py
Normal file
39
backend/main.py
Normal file
@@ -0,0 +1,39 @@
|
||||
from fastapi import FastAPI
from .service import crawler_service
from .schemas import RegisterRequest, PendingRequest, SaveResultsRequest, AddUrlsRequest
from .utils import make_response


app = FastAPI(title="Wiki Crawler API")


def _run(action, *args, success_msg: str = "Success"):
    """Run a service call and wrap it in the standard response envelope.

    The four endpoints previously duplicated the exact same try/except
    boilerplate; any exception becomes a code-0 response so the caller
    (a Dify workflow) always receives well-formed JSON.
    """
    try:
        data = action(*args)
        return make_response(1, success_msg, data)
    except Exception as e:
        return make_response(0, str(e))


@app.post("/register")
async def register(req: RegisterRequest):
    """Register a crawl task for a root URL (idempotent on the URL)."""
    return _run(crawler_service.register_task, req.url)


@app.post("/add_urls")
async def add_urls(req: AddUrlsRequest):
    """Queue newly discovered URLs for a task (deduplicated server-side)."""
    return _run(crawler_service.add_urls, req.task_id, req.urls)


@app.post("/pending_urls")
async def pending_urls(req: PendingRequest):
    """Fetch up to `limit` pending URLs and mark them as processing."""
    try:
        data = crawler_service.get_pending_urls(req.task_id, req.limit)
        # An empty queue is still a success, but with a distinguishing msg.
        msg = "Success" if data["urls"] else "Queue Empty"
        return make_response(1, msg, data)
    except Exception as e:
        return make_response(0, str(e))


@app.post("/save_results")
async def save_results(req: SaveResultsRequest):
    """Persist scraped chunks and mark their queue entries completed."""
    return _run(crawler_service.save_results, req.task_id, req.results)
|
||||
23
backend/schemas.py
Normal file
23
backend/schemas.py
Normal file
@@ -0,0 +1,23 @@
|
||||
from pydantic import BaseModel
from typing import List, Optional


class RegisterRequest(BaseModel):
    """Body of POST /register: the root URL to crawl."""
    url: str


class PendingRequest(BaseModel):
    """Body of POST /pending_urls: which task, and at most how many URLs."""
    task_id: int
    limit: Optional[int] = 10


class AddUrlsRequest(BaseModel):
    """Body of POST /add_urls: newly discovered URLs for an existing task."""
    task_id: int
    urls: List[str]


class CrawlResult(BaseModel):
    """One scraped page: content plus its (optional) embedding vector."""
    url: str
    title: Optional[str] = None
    content: Optional[str] = None
    embedding: Optional[List[float]] = None


class SaveResultsRequest(BaseModel):
    """Body of POST /save_results: scraped pages to persist for a task."""
    task_id: int
    results: List[CrawlResult]
|
||||
91
backend/service.py
Normal file
91
backend/service.py
Normal file
@@ -0,0 +1,91 @@
|
||||
from sqlalchemy import select, update, and_
from sqlalchemy.dialects.postgresql import insert as pg_insert
from .database import db_instance
from .utils import normalize_url


class CrawlerService:
    """Database-backed operations behind the crawler HTTP API."""

    def __init__(self):
        # Shared Database singleton (engine + reflected tables).
        self.db = db_instance

    def register_task(self, url: str):
        """Register a new task and seed its queue.

        Idempotent on the normalized root URL.
        Returns {"task_id": int, "is_new_task": bool}.
        """
        clean_url = normalize_url(url)
        with self.db.engine.begin() as conn:
            # 1. Dedupe on the normalized root URL.
            find_stmt = select(self.db.tasks.c.id).where(self.db.tasks.c.root_url == clean_url)
            existing = conn.execute(find_stmt).fetchone()

            if existing:
                return {"task_id": existing[0], "is_new_task": False}

            # 2. Create the task; RETURNING fetches the new id in one round trip.
            new_task = conn.execute(
                pg_insert(self.db.tasks).values(root_url=clean_url).returning(self.db.tasks.c.id)
            ).fetchone()
            task_id = new_task[0]

            # 3. Seed the queue with the root URL itself.
            conn.execute(
                pg_insert(self.db.queue).values(task_id=task_id, url=clean_url, status='pending')
            )
            return {"task_id": task_id, "is_new_task": True}

    def add_urls(self, task_id: int, urls: list):
        """Bulk-insert newly discovered URLs, deduplicated by the
        (task_id, url) unique index via ON CONFLICT DO NOTHING."""
        added_count = 0
        with self.db.engine.begin() as conn:
            for url in urls:
                clean_url = normalize_url(url)
                stmt = pg_insert(self.db.queue).values(
                    task_id=task_id,
                    url=clean_url,
                    status='pending'
                ).on_conflict_do_nothing(index_elements=['task_id', 'url'])

                # rowcount is 0 when the (task_id, url) pair already existed.
                res = conn.execute(stmt)
                if res.rowcount > 0:
                    added_count += 1
        return {"added_count": added_count}

    def get_pending_urls(self, task_id: int, limit: int):
        """Atomically claim up to `limit` pending URLs for this task.

        BUG FIX: the previous read-then-update had no row locking, so two
        concurrent workers could claim and scrape the same URLs. The select
        now uses SELECT ... FOR UPDATE SKIP LOCKED (PostgreSQL) so each row
        is claimed by exactly one transaction.
        """
        with self.db.engine.begin() as conn:
            stmt = select(self.db.queue.c.url).where(
                and_(self.db.queue.c.task_id == task_id, self.db.queue.c.status == 'pending')
            ).limit(limit).with_for_update(skip_locked=True)

            urls = [r[0] for r in conn.execute(stmt).fetchall()]

            if urls:
                conn.execute(
                    update(self.db.queue).where(
                        and_(self.db.queue.c.task_id == task_id, self.db.queue.c.url.in_(urls))
                    ).values(status='processing')
                )
            return {"urls": urls}

    def save_results(self, task_id: int, results: list):
        """Persist chunk rows and mark their queue entries completed."""
        with self.db.engine.begin() as conn:
            for res in results:
                clean_url = normalize_url(res.url)
                # Store the scraped content and its embedding.
                conn.execute(
                    pg_insert(self.db.chunks).values(
                        task_id=task_id,
                        source_url=clean_url,
                        title=res.title,
                        content=res.content,
                        embedding=res.embedding
                    )
                )
                # Close the loop: this URL is done.
                conn.execute(
                    update(self.db.queue).where(
                        and_(self.db.queue.c.task_id == task_id, self.db.queue.c.url == clean_url)
                    ).values(status='completed')
                )
        return {"inserted": len(results)}


# Module-level singleton used by the FastAPI handlers.
crawler_service = CrawlerService()
|
||||
15
backend/utils.py
Normal file
15
backend/utils.py
Normal file
@@ -0,0 +1,15 @@
|
||||
from urllib.parse import urlparse, urlunparse
|
||||
from sqlalchemy import create_engine, MetaData, Table, select, update, and_
|
||||
|
||||
def normalize_url(url: str) -> str:
    """Canonicalise a URL so equivalent spellings compare equal.

    Lowercases scheme and host, strips the trailing slash and the fragment,
    keeps the query string. Empty/falsy input yields "".
    """
    if not url:
        return ""
    url = url.strip()
    parsed = urlparse(url)
    scheme = parsed.scheme.lower()
    netloc = parsed.netloc.lower()
    # rstrip('/') is already a no-op when there is no trailing slash; the old
    # `if not path: path = ""` check after it was dead code and is removed.
    path = parsed.path.rstrip('/')
    return urlunparse((scheme, netloc, path, parsed.params, parsed.query, ""))
|
||||
|
||||
def make_response(code: int, msg: str, data=None) -> dict:
    """Build the standard node response envelope: code (0 fail / 1 success /
    2 warning), msg, and optional data payload.

    FIX: the previous signature annotated `data: any`, using the builtin
    `any()` function as a type hint, which is not a valid type; the
    annotation is dropped (behavior unchanged).
    """
    return {"code": code, "msg": msg, "data": data}
|
||||
136
nodes/register.py
Normal file
136
nodes/register.py
Normal file
@@ -0,0 +1,136 @@
|
||||
import json
|
||||
from urllib.parse import urlparse, urlunparse
|
||||
from langchain_community.utilities import SQLDatabase
|
||||
from sqlalchemy import Table, MetaData, select, insert
|
||||
|
||||
# --- 1. 工具函数:URL 标准化 ---
|
||||
def normalize_url(url: str) -> str:
    """
    Normalize a URL so that trailing slashes, letter case, and anchors do
    not create duplicate queue entries.
    """
    if not url:
        return ""

    parts = urlparse(url.strip())

    # Scheme and host are case-insensitive; the path is not.
    scheme = parts.scheme.lower()
    host = parts.netloc.lower()

    # Drop any trailing slash from the path.
    cleaned_path = parts.path.rstrip('/')

    # Keep the query string but discard the fragment (#...):
    # anchors always point at the same page.
    return urlunparse((scheme, host, cleaned_path, parts.params, parts.query, ""))
|
||||
|
||||
# --- 2. 数据库连接工厂 ---
|
||||
def get_db_connection(db_url: str):
    """
    Create a SQLDatabase connection, normalising common scheme prefixes.

    Raises ValueError on an empty connection string and RuntimeError when
    the connection itself cannot be established.
    """
    if not db_url:
        raise ValueError("数据库连接字符串 (db_url) 不能为空")

    # Rewrite the scheme so SQLAlchemy selects the psycopg2 driver.
    if db_url.startswith("postgres://"):
        db_url = db_url.replace("postgres://", "postgresql+psycopg2://", 1)
    elif db_url.startswith("postgresql://") and "+psycopg2" not in db_url:
        db_url = db_url.replace("postgresql://", "postgresql+psycopg2://", 1)

    try:
        # pool_pre_ping checks each connection before use, avoiding
        # stale connections dropped by timeouts.
        return SQLDatabase.from_uri(db_url, engine_args={"pool_pre_ping": True})
    except Exception as e:
        raise RuntimeError(f"数据库连接失败: {str(e)}")
|
||||
|
||||
# --- 3. 核心业务逻辑 ---
|
||||
def _logic_handler(db: SQLDatabase, inputs: dict):
    """
    Register a crawl task for inputs["url"] and seed its queue.

    Idempotent on the normalized root URL. Returns a dict with task_id,
    is_new_task (1 new / 0 existing) and the normalized url.
    """
    engine = db._engine
    metadata = MetaData()

    raw_url = inputs.get("url", "")
    if not raw_url:
        raise ValueError("输入参数 'url' 缺失")
    clean_url = normalize_url(raw_url)

    # Reflect the table definitions straight from the database (no raw SQL).
    tasks_table = Table('crawl_tasks', metadata, autoload_with=engine)
    queue_table = Table('crawl_queue', metadata, autoload_with=engine)

    with engine.begin() as conn:
        # Already registered? Report the existing task.
        row = conn.execute(
            select(tasks_table.c.id).where(tasks_table.c.root_url == clean_url)
        ).fetchone()
        if row:
            return {
                "task_id": row[0],
                "is_new_task": 0,
                "url": clean_url
            }

        # New task: RETURNING is the standard PostgreSQL way to get the
        # auto-generated id back in the same statement.
        new_task_id = conn.execute(
            insert(tasks_table)
            .values(root_url=clean_url, status='running')
            .returning(tasks_table.c.id)
        ).fetchone()[0]

        # Seed the queue with the (already normalized) root URL as the
        # first page to crawl.
        conn.execute(
            insert(queue_table).values(
                task_id=new_task_id,
                url=clean_url,
                status='pending'
            )
        )

        return {
            "task_id": new_task_id,
            "is_new_task": 1,
            "url": clean_url
        }
|
||||
|
||||
# --- 4. Dify 节点主入口 ---
|
||||
def main(url: str, DB_URL: str):
    """
    Dify code-node entry point for the register node.

    Args:
        url: Root URL to register.
        DB_URL: SQLAlchemy-style database connection string (from Dify env).

    Returns:
        The standard node envelope {"code", "msg", "data"}:
        code 1 with the registration data on success, code 0 with the
        error message on failure.
    """
    ret = {"code": 0, "msg": "unknown", "data": None}

    # Connection string comes from the node input / environment variable.
    db_url = DB_URL

    try:
        db = get_db_connection(db_url)

        # BUG FIX: _logic_handler expects a dict of inputs (it calls
        # inputs.get("url")); passing the bare string crashed with
        # AttributeError: 'str' object has no attribute 'get'.
        result_data = _logic_handler(db, {"url": url})

        ret["code"] = 1
        ret["msg"] = "注册成功"
        ret["data"] = result_data

    except Exception as e:
        ret["code"] = 0
        ret["msg"] = str(e)
        ret["data"] = None

    return {
        "code": ret["code"],
        "msg": ret["msg"],
        "data": ret["data"]
    }
|
||||
106
nodes/template.py
Normal file
106
nodes/template.py
Normal file
@@ -0,0 +1,106 @@
|
||||
import json
|
||||
from urllib.parse import urlparse, urlunparse
|
||||
from langchain_community.utilities import SQLDatabase
|
||||
from sqlalchemy import Table, MetaData, select, insert, update, delete, and_
|
||||
|
||||
# --- 工具函数:URL 标准化 ---
|
||||
def normalize_url(url: str) -> str:
    """
    Canonicalise a URL so trailing slashes and letter case cannot break
    uniqueness checks. Falsy input is returned unchanged.
    """
    if not url:
        return url

    parts = urlparse(url.strip())

    # Protocol and domain are case-insensitive.
    scheme = parts.scheme.lower()
    host = parts.netloc.lower()

    # Collapse a trailing slash on the path.
    clean_path = parts.path.rstrip('/') if parts.path.endswith('/') else parts.path

    # Keep the query string; drop the fragment (#...) — it points at the
    # same page. (Set query to "" here if parameters should be ignored too.)
    return urlunparse((scheme, host, clean_path, parts.params, parts.query, ""))
|
||||
|
||||
# --- 数据库连接工厂 ---
|
||||
def get_db_connection(db_url: str):
    """
    Build a SQLDatabase connection, normalising common URL scheme variants.

    FIX: unlike the identical helper in nodes/register.py, this version had
    no empty-input guard, so a missing db_url crashed with AttributeError on
    .startswith; it now raises the same ValueError as its sibling.

    Raises:
        ValueError: if db_url is empty/None.
        RuntimeError: if the connection cannot be established.
    """
    if not db_url:
        raise ValueError("数据库连接字符串 (db_url) 不能为空")

    if db_url.startswith("postgres://"):
        db_url = db_url.replace("postgres://", "postgresql+psycopg2://", 1)
    elif db_url.startswith("postgresql://") and "+psycopg2" not in db_url:
        db_url = db_url.replace("postgresql://", "postgresql+psycopg2://", 1)

    try:
        # engine_args keep pooled connections healthy under Dify concurrency.
        return SQLDatabase.from_uri(db_url, engine_args={
            "pool_pre_ping": True,
            "pool_recycle": 3600
        })
    except Exception as e:
        raise RuntimeError(f"DB_CONNECT_ERROR: {str(e)}")
|
||||
|
||||
# --- Dify 节点主入口 ---
|
||||
def main(inputs: dict):
    """
    Dify node entry: connect to the database, run the business handler, and
    wrap the outcome in the standard {"code", "msg", "data"} envelope.
    """
    envelope = {"code": 0, "msg": "unknown", "data": None}

    # Connection string is expected in the inputs (ideally injected via a
    # Dify environment variable).
    db_url = inputs.get("db_url")

    try:
        database = get_db_connection(db_url)
        payload = _logic_handler(database, inputs)
    except Exception as exc:
        envelope["code"] = 0
        envelope["msg"] = str(exc)
        envelope["data"] = None
    else:
        envelope["code"] = 1
        envelope["msg"] = "success"
        envelope["data"] = payload

    return envelope
|
||||
|
||||
# -------------------------------------------------
|
||||
# 业务逻辑处理器:每个节点只需修改这里
|
||||
# -------------------------------------------------
|
||||
def _logic_handler(db: SQLDatabase, inputs: dict):
    """
    Template business-logic hook: each concrete node customises only this
    function; main() provides connection handling and the response envelope.
    """
    engine = db._engine
    metadata = MetaData()

    # Example: read and normalise the input URL.
    clean_url = normalize_url(inputs.get("url", ""))

    # Reflect table objects on demand, e.g.:
    # tasks = Table('crawl_tasks', metadata, autoload_with=engine)

    # Use SQLAlchemy Core (no raw SQL strings), e.g.:
    # with engine.begin() as conn:
    #     stmt = select(tasks).where(tasks.c.root_url == clean_url)
    #     result = conn.execute(stmt).fetchone()

    return {
        "processed_url": clean_url,
        "info": "逻辑已执行"
    }
|
||||
@@ -1,6 +0,0 @@
|
||||
# 注册查询节点
|
||||
'''
|
||||
实现逻辑,从数据库查询url或task_id是否已经存在,
|
||||
存在则在返回值给出标志位提示,返回task_id
|
||||
不存在则在数据库中插入新的记录,返回新的task_id
|
||||
'''
|
||||
@@ -5,6 +5,7 @@ description = "Add your description here"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.10"
|
||||
dependencies = [
|
||||
"fastapi>=0.125.0",
|
||||
"firecrawl>=4.10.2",
|
||||
"langchain>=1.2.0",
|
||||
"langchain-community>=0.4.1",
|
||||
@@ -17,4 +18,5 @@ dependencies = [
|
||||
"qdrant-client==1.10.1",
|
||||
"redis>=7.1.0",
|
||||
"requests>=2.32.5",
|
||||
"uvicorn>=0.38.0",
|
||||
]
|
||||
|
||||
@@ -1,51 +0,0 @@
|
||||
{
|
||||
"res_json": [
|
||||
{
|
||||
"data": {
|
||||
"markdown": "[Skip to main content](https://docs.dify.ai/en/use-dify/knowledge/create-knowledge/import-text-data/readme#content-area)\n\n[Dify Docs home page](https://docs.dify.ai/)\n\nLatest\n\n\nEnglish\n\nSearch...\n\nCtrl K\n\nSearch...\n\nNavigation\n\n1\\. Import Text Data\n\n1\\. Import Text Data\n\nClick on Knowledge in the main navigation bar of Dify. On this page, you can see your existing knowledge bases. Click **Create Knowledge** to enter the setup wizard. The Knowledge supports the import of the following two online data:Click **Knowledge** in the top navigation bar of the Dify, then select **Create Knowledge**. You can upload documents to the knowledge or importing online data to it.\n\n## [](https://docs.dify.ai/en/use-dify/knowledge/create-knowledge/import-text-data/readme\\#upload-local-files) Upload Local Files\n\nDrag and drop or select files to upload. The number of files allowed for **batch upload** depends on your [subscription plan](https://dify.ai/pricing).**Limitations for uploading documents:**\n\n- The upload size limit for a single document is 15MB;\n- Different [subscription plans](https://dify.ai/pricing) for the SaaS version limit **batch upload numbers, total document uploads, and vector storage**\n\n\n\n## [](https://docs.dify.ai/en/use-dify/knowledge/create-knowledge/import-text-data/readme\\#import-from-online-data-source) Import From Online Data Source\n\nWhen creating a **Knowledge**, you can import data from online sources. 
The knowledge supports the following two types of online data: [**1.1 Import Data from Notion** \\\\\n\\\\\nLearn how to import data from Notion](https://docs.dify.ai/en/use-dify/knowledge/create-knowledge/import-text-data/sync-from-notion) [**1.2 Sync from Website** \\\\\n\\\\\nLearn how to sync data from websites](https://docs.dify.ai/en/use-dify/knowledge/create-knowledge/import-text-data/sync-from-website)\n\nIf a knowledge base is set up to use online data, you won’t be able to add local documents later or switch it to a local file-based mode. This prevents a single knowledge base from mixing multiple data sources, avoiding management complications.\n\n## [](https://docs.dify.ai/en/use-dify/knowledge/create-knowledge/import-text-data/readme\\#adding-data-later) Adding Data Later\n\nIf you haven’t prepared your documents or other content yet, simply create an empty knowledge first. You can then upload local files or import online data whenever you’re ready.\n\nWas this page helpful?\n\nYesNo\n\n[Previous](https://docs.dify.ai/en/use-dify/knowledge/create-knowledge/introduction) [1.1 Sync Data from Notion\\\\\n\\\\\nNext](https://docs.dify.ai/en/use-dify/knowledge/create-knowledge/import-text-data/sync-from-notion)\n\nCtrl+I\n\nOn this page\n\n- [Upload Local Files](https://docs.dify.ai/en/use-dify/knowledge/create-knowledge/import-text-data/readme#upload-local-files)\n- [Import From Online Data Source](https://docs.dify.ai/en/use-dify/knowledge/create-knowledge/import-text-data/readme#import-from-online-data-source)\n- [Adding Data Later](https://docs.dify.ai/en/use-dify/knowledge/create-knowledge/import-text-data/readme#adding-data-later)\n\nAssistant\n\nResponses are generated using AI and may contain mistakes.\n\n",
|
||||
"metadata": {
|
||||
"apple-mobile-web-app-title": "Dify Docs",
|
||||
"application-name": "Dify Docs",
|
||||
"cacheState": "hit",
|
||||
"cachedAt": "2025-12-09T08:12:32.803Z",
|
||||
"canonical": "https://docs.dify.ai/en/use-dify/knowledge/create-knowledge/import-text-data/readme",
|
||||
"charset": "utf-8",
|
||||
"concurrencyLimited": true,
|
||||
"concurrencyQueueDurationMs": 371,
|
||||
"contentType": "text/html; charset=utf-8",
|
||||
"creditsUsed": 1,
|
||||
"favicon": "https://docs.dify.ai/mintlify-assets/_mintlify/favicons/dify-6c0370d8/tWYYD8GkT0MUJV0z/_generated/favicon/favicon-16x16.png",
|
||||
"generator": "Mintlify",
|
||||
"language": "en",
|
||||
"msapplication-TileColor": "#0060FF",
|
||||
"msapplication-config": "/mintlify-assets/_mintlify/favicons/dify-6c0370d8/tWYYD8GkT0MUJV0z/_generated/favicon/browserconfig.xml",
|
||||
"next-size-adjust": "",
|
||||
"og:image": "https://dify-6c0370d8.mintlify.app/mintlify-assets/_next/image?url=%2F_mintlify%2Fapi%2Fog%3Fdivision%3D1.%2BImport%2BText%2BData%26title%3D1.%2BImport%2BText%2BData%26logoLight%3Dhttps%253A%252F%252Fassets-docs.dify.ai%252F2025%252F05%252Fd05cfc6ebe48f725d171dc71c64a5d16.svg%26logoDark%3Dhttps%253A%252F%252Fassets-docs.dify.ai%252F2025%252F05%252Fc51f1cda47c1d9a4a162d7736f6e4c53.svg%26primaryColor%3D%25230060FF%26lightColor%3D%2523688FE8%26darkColor%3D%25230034FF%26backgroundLight%3D%2523ffffff%26backgroundDark%3D%25230b0c0f&w=1200&q=100",
|
||||
"og:image:height": "630",
|
||||
"og:image:width": "1200",
|
||||
"og:site_name": "Dify Docs",
|
||||
"og:title": "1. Import Text Data - Dify Docs",
|
||||
"og:type": "website",
|
||||
"og:url": "https://docs.dify.ai/en/use-dify/knowledge/create-knowledge/import-text-data/readme",
|
||||
"ogImage": "https://dify-6c0370d8.mintlify.app/mintlify-assets/_next/image?url=%2F_mintlify%2Fapi%2Fog%3Fdivision%3D1.%2BImport%2BText%2BData%26title%3D1.%2BImport%2BText%2BData%26logoLight%3Dhttps%253A%252F%252Fassets-docs.dify.ai%252F2025%252F05%252Fd05cfc6ebe48f725d171dc71c64a5d16.svg%26logoDark%3Dhttps%253A%252F%252Fassets-docs.dify.ai%252F2025%252F05%252Fc51f1cda47c1d9a4a162d7736f6e4c53.svg%26primaryColor%3D%25230060FF%26lightColor%3D%2523688FE8%26darkColor%3D%25230034FF%26backgroundLight%3D%2523ffffff%26backgroundDark%3D%25230b0c0f&w=1200&q=100",
|
||||
"ogTitle": "1. Import Text Data - Dify Docs",
|
||||
"ogUrl": "https://docs.dify.ai/en/use-dify/knowledge/create-knowledge/import-text-data/readme",
|
||||
"proxyUsed": "basic",
|
||||
"scrapeId": "019b024f-f76e-746b-b13c-6ca4884fdd64",
|
||||
"sourceURL": "https://docs.dify.ai/en/use-dify/knowledge/create-knowledge/import-text-data/readme",
|
||||
"statusCode": 200,
|
||||
"title": "1. Import Text Data - Dify Docs",
|
||||
"twitter:card": "summary_large_image",
|
||||
"twitter:image": "https://dify-6c0370d8.mintlify.app/mintlify-assets/_next/image?url=%2F_mintlify%2Fapi%2Fog%3Fdivision%3D1.%2BImport%2BText%2BData%26title%3D1.%2BImport%2BText%2BData%26logoLight%3Dhttps%253A%252F%252Fassets-docs.dify.ai%252F2025%252F05%252Fd05cfc6ebe48f725d171dc71c64a5d16.svg%26logoDark%3Dhttps%253A%252F%252Fassets-docs.dify.ai%252F2025%252F05%252Fc51f1cda47c1d9a4a162d7736f6e4c53.svg%26primaryColor%3D%25230060FF%26lightColor%3D%2523688FE8%26darkColor%3D%25230034FF%26backgroundLight%3D%2523ffffff%26backgroundDark%3D%25230b0c0f&w=1200&q=100",
|
||||
"twitter:image:height": "630",
|
||||
"twitter:image:width": "1200",
|
||||
"twitter:title": "1. Import Text Data - Dify Docs",
|
||||
"url": "https://docs.dify.ai/en/use-dify/knowledge/create-knowledge/import-text-data/readme",
|
||||
"viewport": "width=device-width, initial-scale=1"
|
||||
},
|
||||
"warning": "This scrape job was throttled at your current concurrency limit. If you'd like to scrape faster, you can upgrade your plan."
|
||||
},
|
||||
"success": true
|
||||
}
|
||||
]
|
||||
}
|
||||
185
scripts/chunk.py
185
scripts/chunk.py
@@ -1,185 +0,0 @@
|
||||
import json
|
||||
import re
|
||||
import requests
|
||||
def embedding_alibaba(texts: list[str], api_key: str) -> list[list[float]]:
    """
    Embed `texts` with Alibaba DashScope's text-embedding API.

    Returns one vector per input text. On any API or response-format
    failure it returns [None] * len(texts) so downstream processing is
    never interrupted.
    API docs: https://help.aliyun.com/zh/dashscope/developer-reference/text-embedding-api-details
    """
    if not texts:
        return []

    # Single knob for the model generation; when Alibaba ships a newer
    # model, change only this string.
    MODEL_NAME = "text-embedding-v4"

    endpoint = "https://dashscope.aliyuncs.com/api/v1/services/embeddings/text-embedding/text-embedding"

    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }

    body = {
        "model": MODEL_NAME,
        "input": {"texts": texts},
        "parameters": {"text_type": "document", "dimension": 1536}
    }

    fallback = [None] * len(texts)
    try:
        resp = requests.post(endpoint, headers=headers, json=body, timeout=60)
        resp.raise_for_status()
        result = resp.json()

        # Expected response shape:
        # {"output": {"embeddings": [{"embedding": [...], "text_index": i}, ...]}, "usage": ...}
        if "output" in result and "embeddings" in result["output"]:
            items = result["output"]["embeddings"]
            # Sort by text_index to guard against out-of-order responses.
            items.sort(key=lambda item: item["text_index"])
            return [item["embedding"] for item in items]

        print(f"Alibaba API Response Format Warning: {result}")
        return fallback

    except Exception as e:
        print(f"Alibaba Embedding Error: {e}")
        # Degrade gracefully so the pipeline keeps running.
        return fallback
|
||||
|
||||
def main(res_json: list, DASHSCOPE_API_KEY: str) -> dict:
    """
    Clean and chunk a Firecrawl result, embed the chunks via Alibaba
    DashScope, and emit SQL-ready rows.

    Args:
        res_json: Firecrawl output — tolerated as a list, a dict wrapping
            'res_json', or a JSON string of either.
        DASHSCOPE_API_KEY: DashScope API key.

    Returns:
        {"sql_values": <JSON-encoded list of chunk rows>}; "[]" when no
        usable content was found, plus an "error" key on parse failure.
    """
    # --- 1. Tolerant parsing of the Firecrawl JSON ---
    try:
        payload = res_json
        if isinstance(payload, str):
            try:
                payload = json.loads(payload)
            except:
                pass

        if isinstance(payload, dict) and 'res_json' in payload:
            records = payload['res_json']
        elif isinstance(payload, list):
            records = payload
        else:
            records = [payload]

        if not records or not isinstance(records, list):
            return {"sql_values": "[]"}

        try:
            head = records[0]
            if not isinstance(head, dict):
                return {"sql_values": "[]"}

            data_obj = head.get("data", {})
            metadata = data_obj.get("metadata", {})

            # Raw page content plus identifying metadata.
            text = data_obj.get("markdown", "")
            title = metadata.get("title", "No Title")
            url = metadata.get("sourceURL", metadata.get("url", ""))

            if not text:
                return {"sql_values": "[]"}
        except IndexError:
            return {"sql_values": "[]"}

    except Exception as e:
        return {"sql_values": "[]", "error": f"Parse Error: {str(e)}"}

    # --- 2. Generic markdown cleaning ---
    text = re.sub(r'!\[.*?\]\(.*?\)', '', text)       # drop images entirely
    text = re.sub(r'\[(.*?)\]\(.*?\)', r'\1', text)   # [text](url) -> text
    text = re.sub(r'<[^>]+>', '', text)               # strip stray HTML tags
    text = text.replace('\u200b', '')                 # zero-width spaces
    text = re.sub(r'\n{3,}', '\n\n', text)            # collapse blank runs
    text = text.strip()

    # --- 3. Safe chunking: 800-char windows with 100-char overlap ---
    chunk_size = 800
    overlap = 100
    step = chunk_size - overlap

    chunks = []
    total = len(text)

    if total < 50:
        chunks.append(text)
    else:
        pos = 0
        while pos < total:
            piece = text[pos:min(pos + chunk_size, total)]
            # Skip tiny fragments, but always keep the final window.
            if len(piece) > 50 or pos + step >= total:
                chunks.append(piece)
            pos += step

    # --- 4. Vectorise via Alibaba DashScope ---
    vectors = []
    if chunks:
        vectors = embedding_alibaba(chunks, DASHSCOPE_API_KEY)
        # Defensive: keep chunk/vector lists aligned no matter what.
        if len(vectors) != len(chunks):
            vectors = [None] * len(chunks)

    # --- 5. Build SQL-ready rows (escape single quotes for SQL literals) ---
    rows = []
    safe_title = str(title).replace("'", "''")

    for idx, piece in enumerate(chunks):
        body = piece.strip()
        if not body:
            continue

        rows.append({
            "url": url,
            "title": safe_title,
            "content": body.replace("'", "''"),
            "chunk_index": idx,
            "embedding": vectors[idx]
        })

    return {
        "sql_values": json.dumps(rows)
    }
|
||||
if __name__ == "__main__":
    import os

    # SECURITY FIX: the DashScope API key was hard-coded here (a secret
    # committed to the repository); read it from the environment instead.
    key = os.environ.get("DASHSCOPE_API_KEY", "")

    # BUG FIX: the path literal contained unescaped backslashes
    # ("anyscript\wiki_crawler\chunk.json"), which Python treats as invalid
    # escape sequences; a raw string keeps the separators intact.
    # (The redundant re-import of json is also removed — it is imported at
    # the top of the file.)
    with open(r"anyscript\wiki_crawler\chunk.json", "r", encoding="utf-8") as f:
        data = json.load(f)
    res = main(data, key)

    result = json.loads(res["sql_values"])
    print(result)
|
||||
File diff suppressed because one or more lines are too long
@@ -1,233 +0,0 @@
|
||||
它采用了 **“衔尾蛇 (Ouroboros)”** 模式:工作流通过 API 自我调用,利用 Dify 的短生命周期特性,实现无限长度的任务队列处理。
|
||||
|
||||
---
|
||||
|
||||
# Wiki Crawler RAG - 全自动递归爬虫架构设计文档 V1.2
|
||||
|
||||
## 1. 概述 (Overview)
|
||||
|
||||
本项目旨在构建一个基于 URL 锚定的增量式、全自动爬虫系统。
|
||||
为了突破 Dify 单次运行的超时限制(Timeout)和内存瓶颈(OOM),本设计采用了 **Map + Scrape + Recursion (递归)** 架构。
|
||||
|
||||
**核心特性:**
|
||||
|
||||
* **一次点击,自动托管**:用户仅需输入根 URL,工作流自动完成从发现到入库的全过程。
|
||||
* **分批吞噬**:每次运行只处理固定数量(如 50 个)页面,处理完毕后自动触发下一轮运行。
|
||||
* **断点续传**:基于数据库状态(Pending/Completed),任何时候中断都可接力继续。
|
||||
|
||||
---
|
||||
|
||||
## 2. 架构流程图 (Architecture Diagram)
|
||||
|
||||
```mermaid
|
||||
graph TD
|
||||
Start([开始节点<br>Input: url, run_mode]) --> Condition{判断模式<br>run_mode?}
|
||||
|
||||
%% 分支 A: 初始化模式
|
||||
Condition -- "init (默认)" --> Map[HTTP: Firecrawl Map<br>获取全量 URL]
|
||||
Map --> InitDB[SQL: init_crawl_queue<br>批量插入 Pending 队列]
|
||||
InitDB --> TriggerWorker1[HTTP: Call Self<br>模式切换为 worker]
|
||||
TriggerWorker1 --> End1([结束: 初始化完成])
|
||||
|
||||
%% 分支 B: 打工模式
|
||||
Condition -- "worker" --> FetchBatch[SQL: Fetch Pending<br>LIMIT 50]
|
||||
FetchBatch --> Iterator[迭代器<br>并发处理 50 个任务]
|
||||
|
||||
subgraph Iteration Loop
|
||||
Iterator --> Scrape[HTTP: Firecrawl Scrape]
|
||||
Scrape --> Clean[Python: 清洗 & 提取]
|
||||
Clean --> Save[SQL: save_scrape_result<br>入库 & 标记 Completed]
|
||||
end
|
||||
|
||||
Iterator --> CheckLeft[SQL: Count Remaining]
|
||||
CheckLeft --> IfLeft{还有剩余吗?<br>Count > 0}
|
||||
|
||||
IfLeft -- "Yes" --> TriggerWorker2[HTTP: Call Self<br>递归调用 worker]
|
||||
TriggerWorker2 --> End2([结束: 本轮批次完成])
|
||||
|
||||
IfLeft -- "No" --> EndSuccess([结束: 全部爬取完成])
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 3. 数据流动 (Data Flow)
|
||||
|
||||
### 3.1 状态流转
|
||||
|
||||
1. **Init 阶段**:`Firecrawl Map` -> `JSON List` -> `DB (crawl_queue)`。此时所有 URL 状态为 `'pending'`。
|
||||
2. **Worker 阶段**:`DB (pending)` -> `Dify List` -> `Firecrawl Scrape` -> `DB (knowledge_chunks)` & `DB (status='completed')`。
|
||||
|
||||
### 3.2 递归逻辑
|
||||
|
||||
* **Run 1**: `run_mode='init'` -> 发现 1000 个 URL -> 存库 -> 触发 Run 2。
|
||||
* **Run 2**: `run_mode='worker'` -> 取前 50 个 -> 抓取 -> 剩余 950 -> 触发 Run 3。
|
||||
* **Run ...**: ...
|
||||
* **Run 21**: `run_mode='worker'` -> 取最后 50 个 -> 抓取 -> 剩余 0 -> 停止。
|
||||
|
||||
---
|
||||
|
||||
## 4. 数据库层准备 (Database Layer)
|
||||
|
||||
在部署工作流前,必须确保以下 SQL 函数已在 PostgreSQL 中执行。
|
||||
|
||||
### 4.1 核心表结构 (回顾)
|
||||
|
||||
* `crawl_tasks`: 存储根任务信息。
|
||||
* `crawl_queue`: 存储待爬取 URL 及其状态。
|
||||
* `knowledge_chunks`: 存储切片后的文档内容。
|
||||
|
||||
### 4.2 新增初始化函数 (必需)
|
||||
|
||||
用于 Map 阶段结束后批量写入队列。
|
||||
|
||||
```sql
|
||||
CREATE OR REPLACE FUNCTION init_crawl_queue(
|
||||
p_urls JSONB,
|
||||
p_root_url TEXT
|
||||
)
|
||||
RETURNS VOID AS $$
|
||||
BEGIN
|
||||
-- 1. 注册/更新主任务
|
||||
INSERT INTO crawl_tasks (root_url) VALUES (p_root_url)
|
||||
ON CONFLICT (root_url) DO UPDATE SET updated_at = DEFAULT;
|
||||
|
||||
-- 2. 批量插入待爬取队列 (忽略已存在的)
|
||||
INSERT INTO crawl_queue (url, root_url, status)
|
||||
SELECT x, p_root_url, 'pending'
|
||||
FROM jsonb_array_elements_text(p_urls) AS x
|
||||
ON CONFLICT (url) DO NOTHING;
|
||||
END;
|
||||
$$ LANGUAGE plpgsql;
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 5. 详细节点定义 (Node Definitions)
|
||||
|
||||
以下是 Dify 工作流中每个节点的详细配置参数。
|
||||
|
||||
### 5.1 开始节点 (Start)
|
||||
|
||||
* **变量 1**: `url` (Text, 必填) - 目标网站 URL。
|
||||
* **变量 2**: `run_mode` (Select, 选填) - 运行模式。
|
||||
* 选项: `init`, `worker`
|
||||
* **默认值**: `init` (保证手动运行时从头开始)
|
||||
|
||||
### 5.2 逻辑分支 (If-Else)
|
||||
|
||||
* **条件**: `run_mode` **is** `init`
|
||||
* **True 路径**: 进入初始化流程。
|
||||
* **False 路径**: 进入打工流程。
|
||||
|
||||
---
|
||||
|
||||
### 分支 A:初始化流程 (Init)
|
||||
|
||||
#### Node A1: HTTP 请求 (Firecrawl Map)
|
||||
|
||||
* **API**: `POST https://api.firecrawl.dev/v1/map`
|
||||
* **Body**:
|
||||
```json
|
||||
{
|
||||
"url": "{{#start.url#}}",
|
||||
"limit": 5000,
|
||||
"includeSubdomains": true,
|
||||
"ignoreSitemap": false
|
||||
}
|
||||
```
|
||||
|
||||
#### Node A2: SQL (Init Queue)
|
||||
|
||||
* **Query**: `SELECT init_crawl_queue($arg0::jsonb, $arg1);`
|
||||
* **arg0**: `{{#NodeA1.body.links#}}` (注意:Map 接口返回的是 links 数组)
|
||||
* **arg1**: `{{#start.url#}}`
|
||||
|
||||
#### Node A3: HTTP 请求 (Trigger Self)
|
||||
|
||||
* **API**: `POST https://api.dify.ai/v1/workflows/run` (替换为您的私有部署域名)
|
||||
* **Headers**: `Authorization: Bearer app-xxxxxxxx` (使用本应用的 API Key)
|
||||
* **Body**:
|
||||
```json
|
||||
{
|
||||
"inputs": {
|
||||
"url": "{{#start.url#}}",
|
||||
"run_mode": "worker"
|
||||
},
|
||||
"response_mode": "blocking",
|
||||
"user": "system-recursion-trigger"
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 分支 B:打工流程 (Worker)
|
||||
|
||||
#### Node B1: SQL (Fetch Batch)
|
||||
|
||||
* **Query**:
|
||||
```sql
|
||||
SELECT url FROM crawl_queue
|
||||
WHERE root_url = $arg0 AND status = 'pending'
|
||||
LIMIT 50;
|
||||
```
|
||||
* **arg0**: `{{#start.url#}}`
|
||||
* **Output**: 记为 `batch_list`
|
||||
|
||||
#### Node B2: 迭代器 (Iterator)
|
||||
|
||||
* **Input**: `{{#NodeB1.result#}}`
|
||||
* **Parallelism**: 开启 (推荐 5-10 并发)
|
||||
|
||||
> **内部节点 B2-1: HTTP (Scrape)**
|
||||
>
|
||||
> * API: `POST https://api.firecrawl.dev/v1/scrape`
|
||||
> * Body: `{"url": "{{#item.url#}}", "formats": ["markdown"]}`
|
||||
>
|
||||
> **内部节点 B2-2: Python (Clean)**
|
||||
>
|
||||
> * Code: 清洗 Markdown,去除图片,截取正文,返回标准 JSON 结构 (含 content, title, url)。
|
||||
>
|
||||
> **内部节点 B2-3: SQL (Save)**
|
||||
>
|
||||
> * Query: `SELECT save_scrape_result($arg0::jsonb, $arg1, $arg2);`
|
||||
> * arg0: `{{#NodeB2-2.json_string#}}`
|
||||
> * arg1: `{{#item.url#}}`
|
||||
> * arg2: `{{#start.url#}}`
|
||||
>
|
||||
|
||||
#### Node B3: SQL (Check Remaining)
|
||||
|
||||
* **Query**:
|
||||
```sql
|
||||
SELECT count(*) as count FROM crawl_queue
|
||||
WHERE root_url = $arg0 AND status = 'pending';
|
||||
```
|
||||
* **arg0**: `{{#start.url#}}`
|
||||
|
||||
#### Node B4: 逻辑分支 (Recursion Check)
|
||||
|
||||
* **条件**: `{{#NodeB3.result[0].count#}}` **>** `0`
|
||||
|
||||
#### Node B5: HTTP 请求 (Trigger Self - Recursion)
|
||||
|
||||
* *(配置同 Node A3)*
|
||||
* **作用**: 当检测到还有剩余任务时,再次调用自己,开启下一轮 50 个页面的抓取。
|
||||
|
||||
---
|
||||
|
||||
## 6. 异常处理与安全机制
|
||||
|
||||
1. **死循环熔断**:
|
||||
|
||||
* 建议在 HTTP Trigger Body 中增加一个 `loop_count` 字段。
|
||||
* `inputs: { "loop_count": {{#start.loop_count#}} + 1 }`
|
||||
* 在 Start 节点后增加校验:如果 `loop_count > 100`,强制停止,防止意外消耗过多额度。
|
||||
2. **API Rate Limit**:
|
||||
|
||||
* 如果 Firecrawl 报错 429,该页抓取失败,迭代器内后续的 SQL 节点不会执行 `UPDATE ... SET status='completed'`。
|
||||
* 该 URL 状态仍为 `pending`。
|
||||
* 下一轮 Worker 运行时,会再次尝试抓取该 URL(自动重试机制)。
|
||||
3. **超时控制**:
|
||||
|
||||
* 每个 Worker 批次处理 50 个页面,以单页 5 秒、并发 10 计算,耗时约 25–30 秒。
|
||||
* 远低于 Dify 默认的 300秒/600秒 超时限制,极其安全。
|
||||
1032
scripts/diff.json
1032
scripts/diff.json
File diff suppressed because it is too large
Load Diff
146
scripts/diff.py
146
scripts/diff.py
@@ -1,146 +0,0 @@
|
||||
"""
|
||||
因为map_json和history_json的传入格式不同,在main里改太麻烦
|
||||
为了解耦,所以写两个函数将map和history转换为标准结构
|
||||
urls = [
|
||||
"https://www.baidu.com",
|
||||
"https://www.taobao.com",
|
||||
"https://www.jd.com",
|
||||
"https://www.1688.com",
|
||||
"https://www.taobao.com",
|
||||
"https://www.jd.com",
|
||||
"https://www.1688.com",
|
||||
]
|
||||
"""
|
||||
|
||||
|
||||
def map_json_transform(map_json: list[dict]) -> dict:
    """Normalize the Firecrawl Map node output into {"targets": [...], "msg": ...}.

    Input: raw Map node output — a one-element list whose first item is a dict
    carrying a "links" array (Dify may hand the whole payload over as a JSON
    string instead of a list).
    Output: dict with the extracted links under "targets" and a status message;
    on a parse failure, an empty list plus the error message.
    """
    import json  # local import: the module top has no `import json`

    # Default to an empty list so a payload without "links" yields an empty
    # result instead of a NameError outside the try block (original bug).
    links = []
    try:
        # If Dify passed a string, decode it first.
        map_obj = json.loads(map_json) if isinstance(map_json, str) else map_json
        # The node output is wrapped in a one-element list.
        map_obj = map_obj[0]
        # Pull the "links" array straight out of the payload.
        if isinstance(map_obj, dict) and "links" in map_obj:
            links = map_obj["links"]
    except Exception as e:
        return {"targets": [], "msg": f"Map数据解析失败: {str(e)}"}

    return {"targets": links, "msg": "Map数据解析成功"}
|
||||
|
||||
def history_json_transform(history_json: list[dict]) -> dict:
    """Normalize the History node output into {"targets": [...], "msg": ...}.

    Input: raw History node output — a one-element list (possibly a JSON
    string) whose first item holds a "data" array; that array's first row in
    turn holds a "data" array of {"url": ...} rows.
    Output: dict with the extracted rows under "targets" and a status message;
    on a parse failure, an empty list plus the error message.
    """
    import json  # local import: the module top has no `import json`

    # Default to an empty list so a payload without "data" yields an empty
    # result instead of a NameError outside the try block (original bug).
    hist_data = []
    try:
        # If Dify passed a string, decode it first.
        hist_obj = json.loads(history_json) if isinstance(history_json, str) else history_json
        # The node output is wrapped in a one-element list.
        hist_obj = hist_obj[0]
        # Unwrap the outer "data" envelope to reach the result row.
        hist_obj = hist_obj['data'][0]
        # The row itself carries the actual history rows under "data".
        if isinstance(hist_obj, dict) and "data" in hist_obj:
            hist_data = hist_obj["data"]
    except Exception as e:
        return {"targets": [], "msg": f"History数据解析失败: {str(e)}"}

    return {"targets": hist_data, "msg": "History数据解析成功"}
|
||||
|
||||
def main(map_json: list[dict], history_json: list[dict], batch_size: float):
    """Diff the full site map against crawl history and emit one batch of URLs.

    Args:
        map_json: Firecrawl Map node output — one-element list whose first
            item is a dict with a "links" array (may arrive as a JSON string).
        history_json: SQL node output — one-element list whose first item has
            a "data" array of {"url": ...} rows already crawled.
        batch_size: max URLs to hand to the iterator this run (Dify passes
            numbers as float).

    Returns:
        dict with the current batch under "targets" plus bookkeeping counters;
        on a parse failure, an empty batch and an error message under "msg".
    """
    import json  # local import: the module top has no `import json`

    # --- 1. Parse the Map output (all discovered links) ---
    all_links = []
    try:
        # If Dify passed a string, decode it first.
        map_obj = json.loads(map_json) if isinstance(map_json, str) else map_json
        # The node output is wrapped in a one-element list.
        map_obj = map_obj[0]
        if isinstance(map_obj, dict) and "links" in map_obj:
            all_links = map_obj["links"]
    except Exception as e:
        return {"targets": [], "msg": f"Map数据解析失败: {str(e)}"}

    # --- 2. Parse the history output (already-completed links) ---
    completed_set = set()
    try:
        hist_obj = json.loads(history_json) if isinstance(history_json, str) else history_json
        # The node output is wrapped in a one-element list.
        hist_obj = hist_obj[0]
        # Default so an empty/absent history is valid rather than a spurious
        # parse failure (hist_data was unbound in the original when "data"
        # was missing).
        hist_data = []
        if isinstance(hist_obj, dict) and "data" in hist_obj:
            hist_data = hist_obj["data"]

        # Collect completed URLs in a set for O(1) membership tests.
        for item in hist_data:
            if isinstance(item, dict):
                url_val = item.get("url")
                if url_val:
                    completed_set.add(url_val)
    except Exception as e:
        return {"targets": [], "msg": f"History数据解析失败: {str(e)}"}

    # --- 3. Core diff: keep only new, crawlable URLs ---
    targets = []
    for link in all_links:
        # A. basic sanity: must be an http(s) URL string
        if not isinstance(link, str) or not link.startswith("http"):
            continue
        # B. skip sitemap files themselves
        if link.endswith(".xml") or "sitemap" in link.split("/")[-1]:
            continue
        # C. skip URLs that were already crawled (the actual diff)
        if link in completed_set:
            continue
        targets.append(link)

    # --- 4. Batch ---
    try:
        limit = int(batch_size)
    except (TypeError, ValueError):  # narrowed from a bare `except:`
        limit = 10  # default batch size

    current_batch = targets[:limit]
    remaining_count = len(targets) - len(current_batch)

    # --- 5. Result ---
    return {
        "targets": current_batch,
        "count_urls": len(all_links),
        "count_completed": len(completed_set),
        "count_remaining": remaining_count,
        "count_error": 0
        # "msg": f"全站发现: {len(all_links)} | 已入库: {len(completed_set)} | 本次执行: {len(current_batch)} | 剩余待爬: {remaining_count}"
    }
|
||||
|
||||
if __name__ == "__main__":
    import json

    # Forward slashes work on every OS. The original backslash path relied on
    # invalid escape sequences ("\w", "\d" — DeprecationWarning) and only
    # resolved on Windows.
    with open("anyscript/wiki_crawler/diff.json", "r", encoding="utf-8") as f:
        data = json.load(f)

    map_json = data["map_json"]
    history_json = data["history_json"]
    batch_size = data["batch_size"]

    result = main(map_json, history_json, batch_size)
    print(json.dumps(result, ensure_ascii=False, indent=2))
|
||||
@@ -1,24 +0,0 @@
|
||||
{
|
||||
"count_completed": 1,
|
||||
"count_error": 1,
|
||||
"count_remaining": 1,
|
||||
"sql_res": [
|
||||
{
|
||||
"data": [
|
||||
{
|
||||
"save_scrape_result": {
|
||||
"data": {
|
||||
"stats": {
|
||||
"deleted_chunks": 0,
|
||||
"inserted_chunks": 2,
|
||||
"queue_updates": 1
|
||||
}
|
||||
},
|
||||
"message": "Saved successfully",
|
||||
"status": "success"
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -1,37 +0,0 @@
|
||||
|
||||
def parse_sql_res(sql_res: list[dict]):
    """Unwrap the nested SQL-node payload and return the save_scrape_result dict."""
    first_row = sql_res[0]["data"][0]
    return first_row["save_scrape_result"]
|
||||
|
||||
def main(sql_res: list[dict], count_completed: float, count_remaining: float, count_error: float):
    """Advance the running task counters based on one save_scrape_result payload.

    A "success" status moves one URL from remaining to completed; anything
    else bumps the error counter. Returns the updated counters wrapped under
    a "result" key.
    """
    outcome = parse_sql_res(sql_res)

    if outcome["status"] == "success":
        count_completed, count_remaining = count_completed + 1, count_remaining - 1
    else:
        count_error = count_error + 1

    return {
        "result": {
            "count_completed": count_completed,
            "count_remaining": count_remaining,
            "count_error": count_error,
        }
    }
|
||||
|
||||
|
||||
if __name__ == "__main__":
    import json

    with open(r"anyscript\wiki_crawler\regard.json", "r", encoding="utf-8") as f:
        payload = json.load(f)

    print(
        main(
            sql_res=payload["sql_res"],
            count_completed=payload["count_completed"],
            count_remaining=payload["count_remaining"],
            count_error=payload["count_error"],
        )
    )
|
||||
147
scripts/test_env.py
Normal file
147
scripts/test_env.py
Normal file
@@ -0,0 +1,147 @@
|
||||
import sys
|
||||
import time
|
||||
import uuid
|
||||
|
||||
def main() -> dict:
    """Self-test the Dify code-node sandbox for RAG readiness.

    Probes, in order: (1) that the core third-party libraries import, (2) that
    a mock embedding model can be built, (3) that a full embed/store/search
    round-trip works against an in-memory Qdrant, and (4) that each vector-DB
    driver can at least be imported. Collects everything into a single report
    dict (status, steps, env_info, vector_pipeline_test, drivers_status,
    message) and never raises — all failures are folded into the report.
    """
    report = {
        "status": "running",
        "steps": [],
        "env_info": {},
        "vector_pipeline_test": "pending"
    }

    try:
        # ==========================================
        # 1. Environment self-check
        # ==========================================
        # Any ImportError here aborts straight to the outer handler below.
        import numpy
        import requests
        import langchain
        import langchain_community
        import pydantic

        report["env_info"] = {
            "python": sys.version.split()[0],
            "numpy": numpy.__version__,
            "langchain": langchain.__version__,
            "pydantic": pydantic.__version__
        }
        report["steps"].append("✅ 基础环境库加载成功")

        # ==========================================
        # 2. Define mock components (simulated vector generation)
        # ==========================================
        from langchain_core.embeddings import Embeddings
        from langchain_core.documents import Document

        class MockEmbeddings(Embeddings):
            """Emit fixed-dimension random vectors, standing in for OpenAI/HuggingFace."""
            def __init__(self, dim=1536):
                # Dimensionality of every vector this mock produces.
                self.dim = dim

            def embed_documents(self, texts):
                # Return one random vector per input text.
                return [numpy.random.rand(self.dim).tolist() for _ in texts]

            def embed_query(self, text):
                # Return a single random vector for the query.
                return numpy.random.rand(self.dim).tolist()

        embeddings = MockEmbeddings(dim=768)
        report["steps"].append("✅ 向量嵌入模型 (Mock) 初始化成功")

        # ==========================================
        # 3. Live test: run the full pipeline on in-memory Qdrant
        # ==========================================
        # This step verifies that LangChain and the underlying libraries can
        # move data end-to-end; a failure here indicates a deep dependency
        # conflict rather than a missing package.
        try:
            from langchain_community.vectorstores import Qdrant

            # Tiny fixture corpus.
            docs = [
                Document(page_content="Hello Dify", metadata={"id": 1}),
                Document(page_content="Vector Database Test", metadata={"id": 2}),
                Document(page_content="Conflict Check", metadata={"id": 3}),
            ]

            # --- Key step: build the store in memory ---
            # Qdrant supports location=":memory:", so no external server needed.
            db = Qdrant.from_documents(
                docs,
                embeddings,
                location=":memory:",
                collection_name="test_collection"
            )

            # --- Key step: run a similarity search ---
            # NOTE(review): embeddings are random, so which doc comes back is
            # arbitrary — the check is only that *something* is returned.
            results = db.similarity_search("Hello", k=1)

            if results and len(results) > 0:
                report["vector_pipeline_test"] = "✅ Success (Qdrant In-Memory)"
                report["steps"].append(f"✅ 向量存取测试通过: 检索到 '{results[0].page_content}'")
            else:
                report["vector_pipeline_test"] = "❌ Failed (No results)"

        except Exception as e:
            report["vector_pipeline_test"] = f"❌ Failed: {str(e)}"
            # If this fails, the dependency set has a deeper conflict.

        # ==========================================
        # 4. Other database driver load tests
        # ==========================================
        # Import-only checks: confirms each driver is installed, not that a
        # server is reachable.
        drivers = {}

        # [PostgreSQL / PGVector]
        try:
            import psycopg2
            from langchain_community.vectorstores import PGVector
            drivers["postgres"] = "✅ Loaded"
        except Exception as e:
            drivers["postgres"] = f"❌ Error: {str(e)}"

        # [Milvus]
        try:
            import pymilvus
            from langchain_community.vectorstores import Milvus
            drivers["milvus"] = "✅ Loaded"
        except Exception as e:
            drivers["milvus"] = f"❌ Error: {str(e)}"

        # [Redis]
        try:
            import redis
            from langchain_community.vectorstores import Redis
            drivers["redis"] = "✅ Loaded"
        except Exception as e:
            drivers["redis"] = f"❌ Error: {str(e)}"

        # [Pinecone]
        try:
            import pinecone  # new-style package name
            from langchain_community.vectorstores import Pinecone
            drivers["pinecone"] = "✅ Loaded"
        except Exception as e:
            drivers["pinecone"] = f"❌ Error: {str(e)}"

        report["drivers_status"] = drivers

        # ==========================================
        # 5. Final verdict
        # ==========================================
        # Success requires every driver loaded AND the pipeline round-trip to
        # have passed. (The "❌" substring scan over str(dict.values()) is a
        # blunt but effective check here.)
        if "❌" not in str(drivers.values()) and "Success" in report["vector_pipeline_test"]:
            report["status"] = "success"
            report["message"] = "🎉 完美验证!所有库已就绪,且向量计算逻辑运行正常。"
        else:
            report["status"] = "warning"
            report["message"] = "存在潜在问题,请检查详细信息。"

    except Exception as e:
        # Catch-all boundary: the node must always return a report, never raise.
        report["status"] = "error"
        report["message"] = f"全局错误: {str(e)}"

    return report
|
||||
|
||||
if __name__ == "__main__":
    print(main())
|
||||
Reference in New Issue
Block a user