Files
wiki_crawler/nodes/template.py
2025-12-20 17:08:54 +08:00

106 lines
3.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import json
from urllib.parse import urlparse, urlunparse
from langchain_community.utilities import SQLDatabase
from sqlalchemy import Table, MetaData, select, insert, update, delete, and_
# --- Utility function: URL normalization ---
def normalize_url(url: str) -> str:
    """Return a canonical form of *url* for uniqueness comparisons.

    The scheme and host are lower-cased, trailing slashes are removed from
    the path, and the fragment is dropped. Path parameters and the query
    string are kept unchanged. A ``user:password@`` prefix keeps its original
    case, because credentials are case-sensitive.

    Args:
        url: The raw URL. Surrounding whitespace is ignored.

    Returns:
        The normalized URL. Falsy input (``None`` or ``""``) is returned
        unchanged.
    """
    if not url:
        return url

    parsed = urlparse(url.strip())

    # Only the host[:port] part is case-insensitive; lower-casing the whole
    # netloc would corrupt embedded credentials.
    userinfo, at, hostport = parsed.netloc.rpartition("@")
    netloc = f"{userinfo}{at}{hostport.lower()}"

    # "/docs/" and "/docs" must compare equal; a bare "/" collapses to "".
    path = parsed.path.rstrip("/")

    # The empty last component drops the fragment. Replace parsed.query with
    # "" here if query parameters should not affect uniqueness either.
    return urlunparse(
        (parsed.scheme.lower(), netloc, path, parsed.params, parsed.query, "")
    )
# --- 数据库连接工厂 ---
def get_db_connection(db_url: str):
    """Create a ``SQLDatabase`` for *db_url*, normalizing PostgreSQL schemes.

    URLs starting with ``postgres://`` or ``postgresql://`` are rewritten to
    ``postgresql+psycopg2://`` so the driver is explicit. Any other URL is
    passed through unchanged.

    Args:
        db_url: Database connection URL.

    Returns:
        The object returned by ``SQLDatabase.from_uri``.

    Raises:
        RuntimeError: With a ``DB_CONNECT_ERROR:`` prefix when *db_url* is
            missing/blank or when creating the database object fails. The
            original exception, if any, is chained as ``__cause__``.
    """
    # Fail with the same error family as a real connection failure instead
    # of an AttributeError from calling str methods on None.
    if not isinstance(db_url, str) or not db_url.strip():
        raise RuntimeError("DB_CONNECT_ERROR: db_url is missing or empty")

    # A URL that starts with a bare scheme cannot already name a driver, so
    # the prefix check alone decides whether to rewrite.
    driver_scheme = "postgresql+psycopg2://"
    for bare_scheme in ("postgres://", "postgresql://"):
        if db_url.startswith(bare_scheme):
            db_url = driver_scheme + db_url[len(bare_scheme):]
            break

    try:
        # pool_pre_ping / pool_recycle are forwarded to the engine to keep
        # pooled connections healthy under concurrent Dify workloads.
        return SQLDatabase.from_uri(
            db_url,
            engine_args={"pool_pre_ping": True, "pool_recycle": 3600},
        )
    except Exception as e:
        raise RuntimeError(f"DB_CONNECT_ERROR: {str(e)}") from e
# --- Dify 节点主入口 ---
def main(inputs: dict):
    """Dify node entry point: run the node logic and wrap the outcome.

    Args:
        inputs: Node inputs. ``db_url`` is read here as the database
            connection string (preferably supplied from a Dify environment
            variable); the whole mapping is forwarded to ``_logic_handler``.

    Returns:
        A dict with keys ``code`` (1 on success, 0 on failure), ``msg``
        (``"success"`` or the error text) and ``data`` (the handler result,
        or ``None`` on failure). Exceptions are reported through this
        envelope rather than raised.
    """
    ret = {"code": 0, "msg": "unknown", "data": None}
    db = None
    try:
        # Read inputs inside the try so a malformed `inputs` is reported
        # through the envelope instead of escaping as an exception.
        db_url = inputs.get("db_url")

        # 1. Open the database.
        db = get_db_connection(db_url)

        # 2. Run the node-specific business logic.
        result_data = _logic_handler(db, inputs)

        ret["code"] = 1
        ret["msg"] = "success"
        ret["data"] = result_data
    except Exception as e:
        ret["code"] = 0
        ret["msg"] = str(e)
        ret["data"] = None
    finally:
        # A new engine (and pool) is created on every call; release its
        # connections so repeated invocations do not accumulate them.
        engine = getattr(db, "_engine", None)
        if engine is not None:
            try:
                engine.dispose()
            except Exception:
                # Best-effort cleanup: a dispose failure must not replace
                # the result envelope already prepared above.
                pass
    return ret
# -------------------------------------------------
# 业务逻辑处理器:每个节点只需修改这里
# -------------------------------------------------
def _logic_handler(db: SQLDatabase, inputs: dict):
    """Node-specific business logic; this is the part each node customizes.

    Args:
        db: Database wrapper created by ``get_db_connection``.
        inputs: Node inputs; ``url`` is read (defaulting to ``""``).

    Returns:
        A dict with ``processed_url`` (the normalized URL) and ``info``.
    """
    # Scaffolding for the example below: the underlying engine and a fresh
    # MetaData container for table reflection.
    engine = db._engine
    metadata = MetaData()

    # Example: fetch the URL from the inputs and normalize it.
    clean_url = normalize_url(inputs.get("url", ""))

    # Example: reflect a table and query it with SQLAlchemy Core, so no raw
    # SQL has to be written.
    # tasks = Table('crawl_tasks', metadata, autoload_with=engine)
    # with engine.begin() as conn:
    #     stmt = select(tasks).where(tasks.c.root_url == clean_url)
    #     result = conn.execute(stmt).fetchone()

    return dict(processed_url=clean_url, info="逻辑已执行")