import json
from urllib.parse import urlparse, urlunparse

from langchain_community.utilities import SQLDatabase
from sqlalchemy import Table, MetaData, select, insert, update, delete, and_


# --- Utility: URL normalization ---
def normalize_url(url: str) -> str:
    """Return a canonical form of *url* for uniqueness comparisons.

    The scheme and the host part of the netloc are lower-cased, all
    trailing slashes are removed from the path, and the fragment is
    dropped.  Params and the query string are kept unchanged.

    Args:
        url: The raw URL.  Falsy values (``None``, ``""``) are returned
            as-is.

    Returns:
        The normalized URL string.
    """
    if not url:
        return url

    parsed = urlparse(url.strip())

    scheme = parsed.scheme.lower()

    # Only the host[:port] part is case-insensitive.  Any "user:password@"
    # prefix is case-sensitive, so it is kept verbatim.
    userinfo, at_sign, host_port = parsed.netloc.rpartition("@")
    netloc = f"{userinfo}{at_sign}{host_port.lower()}"

    # "/a/b/" and "/a/b" should compare equal; a bare "/" becomes "".
    path = parsed.path.rstrip("/")

    # Drop the fragment but keep the query.  Pass "" instead of
    # parsed.query here if the query should be ignored as well.
    return urlunparse((scheme, netloc, path, parsed.params, parsed.query, ""))


# --- Database connection factory ---
def get_db_connection(db_url: str):
    """Create a ``SQLDatabase`` for *db_url*, normalizing PostgreSQL schemes.

    ``postgres://`` and ``postgresql://`` URLs are rewritten to
    ``postgresql+psycopg2://``.  Any other URL is passed through after
    surrounding whitespace is stripped.

    Args:
        db_url: SQLAlchemy-style database URL.

    Returns:
        The object returned by ``SQLDatabase.from_uri``.

    Raises:
        RuntimeError: If *db_url* is missing/empty, or if creating the
            database object fails.  The message starts with
            ``DB_CONNECT_ERROR:``.
    """
    if not isinstance(db_url, str) or not db_url.strip():
        raise RuntimeError("DB_CONNECT_ERROR: db_url is missing or empty")

    db_url = db_url.strip()
    for prefix in ("postgres://", "postgresql://"):
        if db_url.startswith(prefix):
            db_url = "postgresql+psycopg2://" + db_url[len(prefix):]
            break

    try:
        # Pool settings carried over from the original code, which noted
        # they are meant to keep connections stable under Dify concurrency.
        return SQLDatabase.from_uri(
            db_url,
            engine_args={
                "pool_pre_ping": True,
                "pool_recycle": 3600,
            },
        )
    except Exception as e:
        raise RuntimeError(f"DB_CONNECT_ERROR: {e}") from e


def _release_db(db) -> None:
    """Dispose the engine behind *db*, ignoring cleanup failures."""
    if db is None:
        return
    try:
        db._engine.dispose()
    except Exception:
        # Best-effort cleanup: a failure here must not replace the result
        # (or the error message) that main() is about to return.
        pass


# --- Dify node entry point ---
def main(inputs: dict):
    """Run the node and wrap the outcome in a result envelope.

    Args:
        inputs: Node inputs.  ``inputs["db_url"]`` is the database
            connection string (the original code suggests configuring it
            through a Dify environment variable); the remaining keys are
            consumed by ``_logic_handler``.

    Returns:
        A dict ``{"code", "msg", "data"}``.  On success ``code`` is 1,
        ``msg`` is ``"success"`` and ``data`` is the handler's result.
        On any exception ``code`` is 0, ``msg`` is the exception text
        and ``data`` is ``None``.
    """
    ret = {"code": 0, "msg": "unknown", "data": None}
    db = None

    try:
        # 1. Open the database.
        db = get_db_connection(inputs.get("db_url"))

        # 2. Run the node-specific business logic.
        result_data = _logic_handler(db, inputs)

        ret["code"] = 1
        ret["msg"] = "success"
        ret["data"] = result_data
    except Exception as e:
        ret["code"] = 0
        ret["msg"] = str(e)
        ret["data"] = None
    finally:
        # A new engine (and pool) is created on every call, so it has to
        # be released here or its connections stay open.
        _release_db(db)

    return ret


# -------------------------------------------------
# Business logic handler: the only part to edit per node
# -------------------------------------------------
def _logic_handler(db: SQLDatabase, inputs: dict):
    """Node-specific business logic (template placeholder).

    Args:
        db: The database object created by ``get_db_connection``.
        inputs: Node inputs; ``inputs["url"]`` is read (default ``""``).

    Returns:
        A dict with the normalized URL under ``"processed_url"`` and a
        fixed status string under ``"info"``.
    """
    # `engine` and `metadata` are only referenced by the commented-out
    # example below; they are kept so the example works once uncommented.
    engine = db._engine
    metadata = MetaData()

    # Example: read and normalize the URL.
    raw_url = inputs.get("url", "")
    clean_url = normalize_url(raw_url)

    # Example: reflect the table object.
    # tasks = Table('crawl_tasks', metadata, autoload_with=engine)

    # Example: use SQLAlchemy Core instead of raw SQL.
    # with engine.begin() as conn:
    #     stmt = select(tasks).where(tasks.c.root_url == clean_url)
    #     result = conn.execute(stmt).fetchone()

    return {
        "processed_url": clean_url,
        "info": "逻辑已执行",
    }