Complete the nodes

This commit is contained in:
2025-12-22 22:08:51 +08:00
parent 694a2ab979
commit 75b32b4bcd
9 changed files with 426 additions and 60 deletions

View File

@@ -12,8 +12,10 @@ class AddUrlsRequest(BaseModel):
task_id: int
urls: List[str]
# schemas.py
class CrawlResult(BaseModel):
url: str
source_url: str
chunk_index: int # new field
title: Optional[str] = None
content: Optional[str] = None
embedding: Optional[List[float]] = None

View File

@@ -1,5 +1,5 @@
from sqlalchemy import select, update, and_
from sqlalchemy.dialects.postgresql import insert as pg_insert
# service.py
from sqlalchemy import select, insert, update, delete, and_
from .database import db_instance
from .utils import normalize_url
@@ -8,84 +8,120 @@ class CrawlerService:
self.db = db_instance
def register_task(self, url: str):
"""注册新任务并初始化队列"""
"""完全使用库 API 实现的注册"""
clean_url = normalize_url(url)
with self.db.engine.begin() as conn:
# 1. Check for an existing task
find_stmt = select(self.db.tasks.c.id).where(self.db.tasks.c.root_url == clean_url)
existing = conn.execute(find_stmt).fetchone()
# Use the select() API
query = select(self.db.tasks.c.id).where(self.db.tasks.c.root_url == clean_url)
existing = conn.execute(query).fetchone()
if existing:
return {"task_id": existing[0], "is_new_task": False}
# 2. Insert the new task
new_task = conn.execute(
pg_insert(self.db.tasks).values(root_url=clean_url).returning(self.db.tasks.c.id)
).fetchone()
task_id = new_task[0]
# 3. Seed the queue with the initial URL
conn.execute(
pg_insert(self.db.queue).values(task_id=task_id, url=clean_url, status='pending')
)
return {"task_id": task_id, "is_new_task": True}
# Use the insert() API
stmt = insert(self.db.tasks).values(root_url=clean_url).returning(self.db.tasks.c.id)
new_task = conn.execute(stmt).fetchone()
return {"task_id": new_task[0], "is_new_task": True}
def add_urls(self, task_id: int, urls: list):
"""批量存入新发现的待处理 URL自动去重"""
added_count = 0
"""通用 API 实现的批量添加(含详细返回"""
success_urls, skipped_urls, failed_urls = [], [], []
with self.db.engine.begin() as conn:
for url in urls:
clean_url = normalize_url(url)
stmt = pg_insert(self.db.queue).values(
task_id=task_id,
url=clean_url,
status='pending'
).on_conflict_do_nothing(index_elements=['task_id', 'url'])
try:
# Check for existence (dialect-agnostic approach)
check_q = select(self.db.queue).where(
and_(self.db.queue.c.task_id == task_id, self.db.queue.c.url == clean_url)
)
if conn.execute(check_q).fetchone():
skipped_urls.append(clean_url)
continue
res = conn.execute(stmt)
if res.rowcount > 0:
added_count += 1
return {"added_count": added_count}
# Insert the new URL
conn.execute(insert(self.db.queue).values(
task_id=task_id, url=clean_url, status='pending'
))
success_urls.append(clean_url)
except Exception:
failed_urls.append(clean_url)
return {"success_urls": success_urls, "skipped_urls": skipped_urls, "failed_urls": failed_urls}
def get_pending_urls(self, task_id: int, limit: int):
"""原子化获取待处理 URL 并锁定"""
"""原子锁定 API 实现"""
with self.db.engine.begin() as conn:
stmt = select(self.db.queue.c.url).where(
query = select(self.db.queue.c.url).where(
and_(self.db.queue.c.task_id == task_id, self.db.queue.c.status == 'pending')
).limit(limit)
urls = [r[0] for r in conn.execute(stmt).fetchall()]
urls = [r[0] for r in conn.execute(query).fetchall()]
if urls:
conn.execute(
update(self.db.queue).where(
upd = update(self.db.queue).where(
and_(self.db.queue.c.task_id == task_id, self.db.queue.c.url.in_(urls))
).values(status='processing')
)
conn.execute(upd)
return {"urls": urls}
def save_results(self, task_id: int, results: list):
"""保存正文、向量并闭环队列状态"""
"""通用 API 实现的 UPSERT 逻辑:区分插入、更新、失败"""
inserted_urls, updated_urls, failed_urls = [], [], []
with self.db.engine.begin() as conn:
for res in results:
clean_url = normalize_url(res.url)
# Persist the data
conn.execute(
pg_insert(self.db.chunks).values(
# Accept either a dict or an object passed in from Dify
data = res if isinstance(res, dict) else res.__dict__
clean_url = normalize_url(data.get('source_url'))
c_idx = data.get('chunk_index')
try:
# 1. Check whether this chunk already exists
find_q = select(self.db.chunks).where(
and_(
self.db.chunks.c.task_id == task_id,
self.db.chunks.c.source_url == clean_url,
self.db.chunks.c.chunk_index == c_idx
)
)
existing = conn.execute(find_q).fetchone()
if existing:
# 2. Update the existing row
upd = update(self.db.chunks).where(self.db.chunks.c.id == existing[0]).values(
title=data.get('title'),
content=data.get('content'),
embedding=data.get('embedding')
)
conn.execute(upd)
updated_urls.append(clean_url)
else:
# 3. Insert a new row
ins = insert(self.db.chunks).values(
task_id=task_id,
source_url=clean_url,
title=res.title,
content=res.content,
embedding=res.embedding
chunk_index=c_idx,
title=data.get('title'),
content=data.get('content'),
embedding=data.get('embedding')
)
)
# Update the status
conn.execute(
update(self.db.queue).where(
and_(self.db.queue.c.task_id == task_id, self.db.queue.c.url == clean_url)
).values(status='completed')
)
return {"inserted": len(results)}
conn.execute(ins)
inserted_urls.append(clean_url)
# 4. Mark the queue entry completed
conn.execute(update(self.db.queue).where(
and_(self.db.queue.c.task_id == task_id, self.db.queue.c.url == clean_url)
).values(status='completed'))
except Exception as e:
print(f"Error: {e}")
failed_urls.append(clean_url)
return {
"inserted_urls": list(set(inserted_urls)),
"updated_urls": list(set(updated_urls)),
"failed_urls": failed_urls
}
# Global singleton
crawler_service = CrawlerService()
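For orientation, a minimal usage sketch of how these service methods chain together; this is illustrative only (the FastAPI routes that actually wrap them are not part of this diff, the import path is assumed, and the chunk payloads below are placeholders):

# Hypothetical driver for the service layer above (not part of this commit).
from service import crawler_service  # adjust to the real package path

reg = crawler_service.register_task("https://docs.dify.ai/en")
task_id = reg["task_id"]

# Enqueue discovered links; the duplicate should come back in skipped_urls.
crawler_service.add_urls(task_id, [
    "https://docs.dify.ai/en/page1",
    "https://docs.dify.ai/en/page1",
])

# Lock a small batch of pending URLs, then save one placeholder chunk per URL.
batch = crawler_service.get_pending_urls(task_id, limit=5)
crawler_service.save_results(task_id, [
    {"source_url": u, "chunk_index": 0, "title": "t", "content": "c", "embedding": None}
    for u in batch["urls"]
])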

View File

@@ -0,0 +1,51 @@
{
"res_json": [
{
"data": {
"markdown": "[Skip to main content](https://docs.dify.ai/en/use-dify/knowledge/create-knowledge/import-text-data/readme#content-area)\n\n[Dify Docs home page![light logo](https://assets-docs.dify.ai/2025/05/d05cfc6ebe48f725d171dc71c64a5d16.svg)![dark logo](https://assets-docs.dify.ai/2025/05/c51f1cda47c1d9a4a162d7736f6e4c53.svg)](https://docs.dify.ai/)\n\nLatest\n![US](https://d3gk2c5xim1je2.cloudfront.net/flags/US.svg)\n\nEnglish\n\nSearch...\n\nCtrl K\n\nSearch...\n\nNavigation\n\n1\\. Import Text Data\n\n1\\. Import Text Data\n\nClick on Knowledge in the main navigation bar of Dify. On this page, you can see your existing knowledge bases. Click **Create Knowledge** to enter the setup wizard. The Knowledge supports the import of the following two online data:Click **Knowledge** in the top navigation bar of the Dify, then select **Create Knowledge**. You can upload documents to the knowledge or importing online data to it.\n\n## [](https://docs.dify.ai/en/use-dify/knowledge/create-knowledge/import-text-data/readme\\#upload-local-files) Upload Local Files\n\nDrag and drop or select files to upload. The number of files allowed for **batch upload** depends on your [subscription plan](https://dify.ai/pricing).**Limitations for uploading documents:**\n\n- The upload size limit for a single document is 15MB;\n- Different [subscription plans](https://dify.ai/pricing) for the SaaS version limit **batch upload numbers, total document uploads, and vector storage**\n\n![Create knowledge](https://assets-docs.dify.ai/2025/01/22064cb61356e4c005c4072d5d066cf6.png)\n\n## [](https://docs.dify.ai/en/use-dify/knowledge/create-knowledge/import-text-data/readme\\#import-from-online-data-source) Import From Online Data Source\n\nWhen creating a **Knowledge**, you can import data from online sources. The knowledge supports the following two types of online data: [**1.1 Import Data from Notion** \\\\\n\\\\\nLearn how to import data from Notion](https://docs.dify.ai/en/use-dify/knowledge/create-knowledge/import-text-data/sync-from-notion) [**1.2 Sync from Website** \\\\\n\\\\\nLearn how to sync data from websites](https://docs.dify.ai/en/use-dify/knowledge/create-knowledge/import-text-data/sync-from-website)\n\nIf a knowledge base is set up to use online data, you wont be able to add local documents later or switch it to a local file-based mode. This prevents a single knowledge base from mixing multiple data sources, avoiding management complications.\n\n## [](https://docs.dify.ai/en/use-dify/knowledge/create-knowledge/import-text-data/readme\\#adding-data-later) Adding Data Later\n\nIf you havent prepared your documents or other content yet, simply create an empty knowledge first. 
You can then upload local files or import online data whenever youre ready.\n\nWas this page helpful?\n\nYesNo\n\n[Previous](https://docs.dify.ai/en/use-dify/knowledge/create-knowledge/introduction) [1.1 Sync Data from Notion\\\\\n\\\\\nNext](https://docs.dify.ai/en/use-dify/knowledge/create-knowledge/import-text-data/sync-from-notion)\n\nCtrl+I\n\nOn this page\n\n- [Upload Local Files](https://docs.dify.ai/en/use-dify/knowledge/create-knowledge/import-text-data/readme#upload-local-files)\n- [Import From Online Data Source](https://docs.dify.ai/en/use-dify/knowledge/create-knowledge/import-text-data/readme#import-from-online-data-source)\n- [Adding Data Later](https://docs.dify.ai/en/use-dify/knowledge/create-knowledge/import-text-data/readme#adding-data-later)\n\nAssistant\n\nResponses are generated using AI and may contain mistakes.\n\n![Create knowledge](https://assets-docs.dify.ai/2025/01/22064cb61356e4c005c4072d5d066cf6.png)",
"metadata": {
"apple-mobile-web-app-title": "Dify Docs",
"application-name": "Dify Docs",
"cacheState": "hit",
"cachedAt": "2025-12-09T08:12:32.803Z",
"canonical": "https://docs.dify.ai/en/use-dify/knowledge/create-knowledge/import-text-data/readme",
"charset": "utf-8",
"concurrencyLimited": true,
"concurrencyQueueDurationMs": 371,
"contentType": "text/html; charset=utf-8",
"creditsUsed": 1,
"favicon": "https://docs.dify.ai/mintlify-assets/_mintlify/favicons/dify-6c0370d8/tWYYD8GkT0MUJV0z/_generated/favicon/favicon-16x16.png",
"generator": "Mintlify",
"language": "en",
"msapplication-TileColor": "#0060FF",
"msapplication-config": "/mintlify-assets/_mintlify/favicons/dify-6c0370d8/tWYYD8GkT0MUJV0z/_generated/favicon/browserconfig.xml",
"next-size-adjust": "",
"og:image": "https://dify-6c0370d8.mintlify.app/mintlify-assets/_next/image?url=%2F_mintlify%2Fapi%2Fog%3Fdivision%3D1.%2BImport%2BText%2BData%26title%3D1.%2BImport%2BText%2BData%26logoLight%3Dhttps%253A%252F%252Fassets-docs.dify.ai%252F2025%252F05%252Fd05cfc6ebe48f725d171dc71c64a5d16.svg%26logoDark%3Dhttps%253A%252F%252Fassets-docs.dify.ai%252F2025%252F05%252Fc51f1cda47c1d9a4a162d7736f6e4c53.svg%26primaryColor%3D%25230060FF%26lightColor%3D%2523688FE8%26darkColor%3D%25230034FF%26backgroundLight%3D%2523ffffff%26backgroundDark%3D%25230b0c0f&w=1200&q=100",
"og:image:height": "630",
"og:image:width": "1200",
"og:site_name": "Dify Docs",
"og:title": "1. Import Text Data - Dify Docs",
"og:type": "website",
"og:url": "https://docs.dify.ai/en/use-dify/knowledge/create-knowledge/import-text-data/readme",
"ogImage": "https://dify-6c0370d8.mintlify.app/mintlify-assets/_next/image?url=%2F_mintlify%2Fapi%2Fog%3Fdivision%3D1.%2BImport%2BText%2BData%26title%3D1.%2BImport%2BText%2BData%26logoLight%3Dhttps%253A%252F%252Fassets-docs.dify.ai%252F2025%252F05%252Fd05cfc6ebe48f725d171dc71c64a5d16.svg%26logoDark%3Dhttps%253A%252F%252Fassets-docs.dify.ai%252F2025%252F05%252Fc51f1cda47c1d9a4a162d7736f6e4c53.svg%26primaryColor%3D%25230060FF%26lightColor%3D%2523688FE8%26darkColor%3D%25230034FF%26backgroundLight%3D%2523ffffff%26backgroundDark%3D%25230b0c0f&w=1200&q=100",
"ogTitle": "1. Import Text Data - Dify Docs",
"ogUrl": "https://docs.dify.ai/en/use-dify/knowledge/create-knowledge/import-text-data/readme",
"proxyUsed": "basic",
"scrapeId": "019b024f-f76e-746b-b13c-6ca4884fdd64",
"sourceURL": "https://docs.dify.ai/en/use-dify/knowledge/create-knowledge/import-text-data/readme",
"statusCode": 200,
"title": "1. Import Text Data - Dify Docs",
"twitter:card": "summary_large_image",
"twitter:image": "https://dify-6c0370d8.mintlify.app/mintlify-assets/_next/image?url=%2F_mintlify%2Fapi%2Fog%3Fdivision%3D1.%2BImport%2BText%2BData%26title%3D1.%2BImport%2BText%2BData%26logoLight%3Dhttps%253A%252F%252Fassets-docs.dify.ai%252F2025%252F05%252Fd05cfc6ebe48f725d171dc71c64a5d16.svg%26logoDark%3Dhttps%253A%252F%252Fassets-docs.dify.ai%252F2025%252F05%252Fc51f1cda47c1d9a4a162d7736f6e4c53.svg%26primaryColor%3D%25230060FF%26lightColor%3D%2523688FE8%26darkColor%3D%25230034FF%26backgroundLight%3D%2523ffffff%26backgroundDark%3D%25230b0c0f&w=1200&q=100",
"twitter:image:height": "630",
"twitter:image:width": "1200",
"twitter:title": "1. Import Text Data - Dify Docs",
"url": "https://docs.dify.ai/en/use-dify/knowledge/create-knowledge/import-text-data/readme",
"viewport": "width=device-width, initial-scale=1"
},
"warning": "This scrape job was throttled at your current concurrency limit. If you'd like to scrape faster, you can upgrade your plan."
},
"success": true
}
]
}
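For reference, the chunking node below only consumes a small subset of this payload; a minimal sketch of that extraction, assuming the fixture has been loaded into a variable named res_json:

# Fields of the Firecrawl payload that the downstream node actually uses.
data = res_json[0]["data"]
text = data["markdown"]                      # page body, later cleaned and chunked
source_url = data["metadata"]["sourceURL"]   # becomes source_url on each chunk
title = data["metadata"]["title"]            # becomes title on each chunk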

View File

@@ -0,0 +1,125 @@
import json
import re
import requests
def text_cleaning(text: str) -> str:
"""
Clean the text by removing extra whitespace, newlines, etc.
"""
text = re.sub(r'\s+', ' ', text) # collapse runs of whitespace into a single space
text = text.strip() # strip leading/trailing whitespace
return text
def text_to_chunks(text: str):
chunk_size = 800
overlap = 100 # 100-character overlap: consecutive chunks share 100 characters
step = chunk_size - overlap
chunks = []
text_len = len(text)
if text_len < 50:
chunks.append(text)
else:
start = 0
while start < text_len:
end = min(start + chunk_size, text_len)
chunk_content = text[start:end]
# avoid emitting overly short fragments, except for the final chunk
if len(chunk_content) > 50 or start + step >= text_len:
chunks.append(chunk_content)
start += step
return chunks
def chunks_embedding(texts: list[str], api_key: str) -> list[list[float]]:
if not texts:
return []
MODEL_NAME = "text-embedding-v4"
url = "https://dashscope.aliyuncs.com/api/v1/services/embeddings/text-embedding/text-embedding"
headers = {
"Authorization": f"Bearer {api_key}",
"Content-Type": "application/json"
}
payload = {
"model": MODEL_NAME,
"input": {"texts": texts},
"parameters": {"text_type": "document", "dimension": 1536}
}
try:
response = requests.post(url, headers=headers, json=payload, timeout=60)
response.raise_for_status()
result = response.json()
if "output" in result and "embeddings" in result["output"]:
embeddings_list = result["output"]["embeddings"]
embeddings_list.sort(key=lambda x: x["text_index"])
# --- Key fix: keep 8 decimal places per float to avoid excessive-precision errors ---
final_vectors = []
for item in embeddings_list:
# limit each float to 8 decimal places
rounded_vector = [round(float(val), 8) for val in item["embedding"]]
final_vectors.append(rounded_vector)
return final_vectors
else:
return [None] * len(texts)
except Exception as e:
print(f"Alibaba Embedding Error: {e}")
return [None] * len(texts)
def main(scrape_json: list, DASHSCOPE_API_KEY: str) -> dict:
"""
Inputs: res_json (Firecrawl result), DASHSCOPE_API_KEY (Alibaba DashScope API key)
"""
# --- 1. Parse the Firecrawl JSON (generic, fault-tolerant parsing) ---
scrape_obj = scrape_json[0]
if not scrape_obj["success"]:
return {"results": []}
data = scrape_obj.get("data", {})
# Extract the raw content
text = data.get("markdown", "")
metadata = data.get("metadata", {})
warning = data.get("warning", "")
# =======================================================
# --- 2. Generic Markdown cleaning ---
# =======================================================
text = text_cleaning(text)
# --- 3. Safe chunking ---
# 800-character chunks with 100-character overlap
chunks = text_to_chunks(text)
# --- 4. Embedding (call the Alibaba API) ---
vectors = []
if chunks:
# DASHSCOPE_API_KEY is passed through here
vectors = chunks_embedding(chunks, DASHSCOPE_API_KEY)
# extra safeguard: make sure the vector list matches the chunk count
if len(vectors) != len(chunks):
vectors = [None] * len(chunks)
# --- 5. Build the rows to persist ---
result_list = []
for idx, content in enumerate(chunks):
clean_content = content.strip() # strip leading/trailing whitespace
if not clean_content: continue
result_list.append({
"source_url": metadata.get("sourceURL", ""),
"title": metadata.get("title", ""),
"content": clean_content,
"chunk_index": idx,
"embedding": vectors[idx]
})
return {
"results": result_list
}
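A rough way to exercise this node locally; the payload below is illustrative and only mimics the shape of the Firecrawl fixture above, and the DashScope key is read from the environment (with an empty key the embedding call fails and the node falls back to None vectors):

import os

# Illustrative Firecrawl-shaped input (not real scrape output).
fake_scrape = [{
    "success": True,
    "data": {
        "markdown": "Dify knowledge bases support uploading local files. " * 40,
        "metadata": {"sourceURL": "https://docs.dify.ai/example", "title": "Example"},
    },
}]

out = main(fake_scrape, os.environ.get("DASHSCOPE_API_KEY", ""))
for row in out["results"]:
    print(row["chunk_index"], row["source_url"], len(row["content"]))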

View File

@@ -0,0 +1,22 @@
def main(map_json: list[dict]):
"""
Convert the output of the Firecrawl Map node into a clean result, filtering out extraneous data.
Input: the output of the Firecrawl Map node, structured as follows:
"map_json": [
{
"links": [
"http://example.com/page1",
"http://example.com/page2"
],
"success": true,
},
]
Since this is simple and tightly coupled to the firecrawl component, everything is done directly in main.
"""
map_obj = map_json[0]
return {
"urls": map_obj["links"],
"code": int(map_obj["success"]),
}
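A quick illustrative call, mirroring the input structure documented in the docstring:

# Example invocation of the Map-node parser above.
print(main([{"links": ["http://example.com/page1", "http://example.com/page2"], "success": True}]))
# -> {'urls': ['http://example.com/page1', 'http://example.com/page2'], 'code': 1}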

View File

@@ -0,0 +1,20 @@
def check_status(status_code: float, body: dict):
'''
Check the status code and the agreed-upon response payload
'''
if status_code != 200:
raise Exception(f"Failed to fetch pending URLs, status code: {status_code}")
if "code" not in body or body["code"] != 1:
raise Exception(f"Failed to fetch pending URLs, response: {body}")
def main(status_code: float, body: dict):
try:
check_status(status_code, body)
except Exception as e:
raise e
urls = body["data"]["urls"]
return {
"urls": urls
}

23
nodes/parse_register.py Normal file
View File

@@ -0,0 +1,23 @@
def check_status(status_code: float, body: dict):
'''
Check the status code and the agreed-upon response payload
'''
if status_code != 200:
raise Exception(f"Task registration failed, status code: {status_code}")
if "code" not in body or body["code"] != 1:
raise Exception(f"Task registration failed, response: {body}")
def main(status_code: float, body: dict):
try:
check_status(status_code, body)
except Exception as e:
raise e
task_id = body["data"]["task_id"]
is_new_task = body["data"]["is_new_task"]
return {
"task_id": task_id,
"is_new_task": is_new_task
}
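For context, a sketch of the response envelope this parser expects from the /register endpoint; the exact shape comes from the backend API, which is not shown in this diff, so the values here are illustrative:

# Illustrative /register response body consumed by main() above.
sample_body = {"code": 1, "data": {"task_id": 42, "is_new_task": True}}
print(main(200, sample_body))  # -> {'task_id': 42, 'is_new_task': True}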

View File

@@ -1,4 +1,4 @@
[project]
[project]
name = "scripts"
version = "0.1.0"
description = "Add your description here"

87
scripts/test_apis.py Normal file
View File

@@ -0,0 +1,87 @@
import requests
import json
import random
# Backend base URL
BASE_URL = "http://127.0.0.1:8000"
def log_res(name, response):
print(f"\n=== 测试接口: {name} ===")
if response.status_code == 200:
res_json = response.json()
print(f"状态: 成功 (HTTP 200)")
print(f"返回数据: {json.dumps(res_json, indent=2, ensure_ascii=False)}")
return res_json
else:
print(f"状态: 失败 (HTTP {response.status_code})")
print(f"错误信息: {response.text}")
return None
def run_tests():
# Prepare test data
test_root_url = f"https://example.com/wiki_{random.randint(1000, 9999)}"
# 1. Test /register
print("Step 1: register a new task...")
res = requests.post(f"{BASE_URL}/register", json={"url": test_root_url})
data = log_res("Register task", res)
if not data or data['code'] != 1: return
task_id = data['data']['task_id']
# 2. Test /add_urls
print("\nStep 2: simulate the crawler discovering new links and enqueueing them...")
sub_urls = [
f"{test_root_url}/page1",
f"{test_root_url}/page2",
f"{test_root_url}/page1" # intentionally duplicated to test backend deduplication
]
res = requests.post(f"{BASE_URL}/add_urls", json={
"task_id": task_id,
"urls": sub_urls
})
log_res("存入新链接", res)
# 3. 测试 /pending_urls
print("\n步骤 3: 模拟爬虫节点获取待处理任务...")
res = requests.post(f"{BASE_URL}/pending_urls", json={
"task_id": task_id,
"limit": 2
})
data = log_res("获取待处理URL", res)
if not data or not data['data']['urls']:
print("没有获取到待处理URL停止后续测试")
return
target_url = data['data']['urls'][0]
# 4. Test /save_results
print("\nStep 4: simulate a completed crawl and save knowledge chunks with vectors...")
# Mock a 1536-dimensional vector (precision already rounded)
mock_embedding = [round(random.uniform(-1, 1), 8) for _ in range(1536)]
payload = {
"task_id": task_id,
"results": [
{
"source_url": target_url,
"chunk_index": 0,
"title": "测试页面标题 - 切片1",
"content": "这是模拟抓取到的第一段网页内容...",
"embedding": mock_embedding
},
{
"source_url": target_url,
"chunk_index": 1,
"title": "测试页面标题 - 切片2",
"content": "这是模拟抓取到的第二段网页内容...",
"embedding": mock_embedding
}
]
}
res = requests.post(f"{BASE_URL}/save_results", json=payload)
log_res("保存结果", res)
print("\n✅ 所有 API 流程测试完成!")
if __name__ == "__main__":
run_tests()