106 lines
3.2 KiB
Python
106 lines
3.2 KiB
Python
|
|
import json
|
|||
|
|
from urllib.parse import urlparse, urlunparse
|
|||
|
|
from langchain_community.utilities import SQLDatabase
|
|||
|
|
from sqlalchemy import Table, MetaData, select, insert, update, delete, and_
|
|||
|
|
|
|||
|
|
# --- 工具函数:URL 标准化 ---
|
|||
|
|
def normalize_url(url: str) -> str:
    """Return a canonical form of *url* for uniqueness comparisons.

    The scheme and host are lower-cased, trailing slashes are removed from
    the path, and the fragment (``#...``) is dropped. Path parameters and
    the query string are kept unchanged.

    Args:
        url: The URL to normalize. Falsy values (``None``, ``""``) are
            returned as-is.

    Returns:
        The normalized URL string, or *url* itself when it is falsy.
    """
    if not url:
        return url

    parsed = urlparse(url.strip())

    scheme = parsed.scheme.lower()

    # Only the host part is case-insensitive. Any "user:password@" prefix
    # is case-sensitive and must be preserved verbatim, so lower-case just
    # what follows the last "@".
    userinfo, at_sign, host_port = parsed.netloc.rpartition("@")
    netloc = f"{userinfo}{at_sign}{host_port.lower()}"

    # Drop trailing slashes so "/a/" and "/a" compare equal.
    path = parsed.path.rstrip("/")

    # Keep params and query; the empty last element discards the fragment.
    return urlunparse((scheme, netloc, path, parsed.params, parsed.query, ""))
|
|||
|
|
|
|||
|
|
# --- 数据库连接工厂 ---
|
|||
|
|
def get_db_connection(db_url: str):
    """Create a ``SQLDatabase`` for *db_url*, normalizing PostgreSQL schemes.

    URLs starting with ``postgres://`` or ``postgresql://`` are rewritten to
    ``postgresql+psycopg2://``; every other URL is passed through unchanged.

    Args:
        db_url: SQLAlchemy-style database URL.

    Returns:
        The object returned by ``SQLDatabase.from_uri``.

    Raises:
        RuntimeError: With a ``DB_CONNECT_ERROR:`` prefix when *db_url* is
            missing, empty, or not a string, or when ``SQLDatabase.from_uri``
            raises.
    """
    # Validate up front so a missing URL reports the same error prefix as a
    # failed connection instead of an AttributeError on ``startswith``.
    if not isinstance(db_url, str) or not db_url.strip():
        raise RuntimeError("DB_CONNECT_ERROR: db_url is missing or empty")

    # A URL that starts with "postgresql://" cannot already name a driver in
    # its scheme, so no extra "+psycopg2" check is needed.
    for legacy_prefix in ("postgres://", "postgresql://"):
        if db_url.startswith(legacy_prefix):
            db_url = db_url.replace(legacy_prefix, "postgresql+psycopg2://", 1)
            break

    try:
        # pool_pre_ping / pool_recycle are forwarded as engine arguments to
        # keep pooled connections healthy.
        return SQLDatabase.from_uri(db_url, engine_args={
            "pool_pre_ping": True,
            "pool_recycle": 3600,
        })
    except Exception as e:
        raise RuntimeError(f"DB_CONNECT_ERROR: {e}") from e
|
|||
|
|
|
|||
|
|
# --- Dify 节点主入口 ---
|
|||
|
|
def main(inputs: dict):
    """Dify node entry point: open the database and run the node logic.

    Args:
        inputs: Node inputs. The ``db_url`` key is read here; the whole
            mapping is forwarded to ``_logic_handler``.

    Returns:
        A dict with three keys: ``code`` (1 on success, 0 on failure),
        ``msg`` (``"success"`` or the text of the exception) and ``data``
        (the handler's result, or ``None`` on failure).
    """
    ret = {"code": 0, "msg": "unknown", "data": None}
    db = None

    try:
        # Read inside the try so that a malformed ``inputs`` is reported
        # through the result envelope like every other failure.
        db_url = inputs.get("db_url")

        # 1. Initialize the database.
        db = get_db_connection(db_url)

        # 2. Run the node-specific business logic.
        result_data = _logic_handler(db, inputs)

        ret["code"] = 1
        ret["msg"] = "success"
        ret["data"] = result_data

    except Exception as e:
        ret["code"] = 0
        ret["msg"] = str(e)
        ret["data"] = None

    finally:
        # A new engine is created on every call; release its pooled
        # connections so repeated invocations do not accumulate them.
        engine = getattr(db, "_engine", None)
        if engine is not None:
            try:
                engine.dispose()
            except Exception:
                # Cleanup is best-effort and must not mask the result.
                pass

    return ret
|
|||
|
|
|
|||
|
|
# -------------------------------------------------
# Business-logic handler: each node only needs to modify this part
# -------------------------------------------------
|
|||
|
|
def _logic_handler(db: SQLDatabase, inputs: dict):
    """Run the node-specific business logic (the part to customize per node).

    As written, it reads ``inputs["url"]`` (defaulting to ``""``), passes it
    through ``normalize_url`` and returns the result with a status note.
    Database access is only sketched in the commented example below.

    Args:
        db: Database wrapper; its ``_engine`` attribute is read.
        inputs: Node inputs; only the ``url`` key is used here.

    Returns:
        A dict with ``processed_url`` (the normalized URL) and ``info``.
    """
    # Handles prepared for the reflection example below; not used yet.
    engine = db._engine
    metadata = MetaData()

    # Example: fetch the URL from the inputs and normalize it.
    canonical_url = normalize_url(inputs.get("url", ""))

    # Example: reflect the table and query it with SQLAlchemy Core
    # (no hand-written SQL needed):
    #
    # tasks = Table('crawl_tasks', metadata, autoload_with=engine)
    # with engine.begin() as conn:
    #     stmt = select(tasks).where(tasks.c.root_url == canonical_url)
    #     result = conn.execute(stmt).fetchone()

    outcome = {}
    outcome["processed_url"] = canonical_url
    outcome["info"] = "逻辑已执行"
    return outcome
|