Files
wiki_crawler/nodes/parse_and_add_urls.py

106 lines
3.4 KiB
Python
Raw Normal View History

2025-12-28 10:33:47 +08:00
import requests
import json
import math
from concurrent.futures import ThreadPoolExecutor, as_completed
def parse_urls(map_json: list[dict]) -> list:
    """Parse the payload returned by the Firecrawl Map node.

    Args:
        map_json: List whose first element is the Firecrawl response dict,
            expected to carry ``success`` (bool) and ``links`` (list of URLs).

    Returns:
        The extracted URL list, or an empty list when the payload is missing,
        malformed, or reports failure.
    """
    if not map_json:
        return []
    map_obj = map_json[0]
    # Tolerate structural variations: a non-dict first element or a missing /
    # falsy "success" flag is logged and treated as "no data" instead of raising.
    if not isinstance(map_obj, dict) or not map_obj.get("success", False):
        print(f"Firecrawl Map节点返回失败或无数据{map_obj}")
        return []
    links = map_obj.get("links", [])
    # Guard against a non-list "links" value (e.g. None) in an unexpected shape.
    return links if isinstance(links, list) else []
def send_batch_request(urls_batch: list[str], task_id: int, BASE_URL: str):
    """Send one batch of URLs to the backend's ``/add_urls`` endpoint.

    This is fire-and-forget: we only care whether the batch was accepted,
    not about the response body.

    Returns:
        True on HTTP 200, False on any other status or on a request error.
    """
    payload = {"task_id": task_id, "urls": urls_batch}
    try:
        # A short timeout keeps a slow/hung backend from stalling the caller.
        response = requests.post(
            f"{BASE_URL}/add_urls",
            json=payload,
            timeout=10,
        )
        if response.status_code == 200:
            return True
        print(f"Batch failed with status {response.status_code}: {response.text[:100]}")
        return False
    except Exception as e:
        # Best-effort: log and report failure, never propagate.
        print(f"Batch request error: {e}")
        return False
def main(map_json: list[dict], BASE_URL: str, task_id: float,
         batch_size: int = 50, max_workers: int = 10):
    """Parse Firecrawl output and push the URLs to the backend in parallel batches.

    Args:
        map_json: Raw Firecrawl Map-node payload (see ``parse_urls``).
        BASE_URL: Backend base URL, e.g. ``http://host``.
        task_id: Task identifier; coerced to ``int`` before sending.
        batch_size: URLs per request — tune to backend capacity (default 50).
        max_workers: Number of concurrent sender threads (default 10).

    Returns:
        A summary dict with URL/batch counts, or a message dict when no URLs
        were parsed.
    """
    # 1. Extract the URL list.
    all_urls = parse_urls(map_json)
    total_count = len(all_urls)
    if total_count == 0:
        return {"msg": "没有解析到URL"}
    # 2. Slice into fixed-size batches (e.g. 1000 URLs -> 20 batches of 50).
    batches = [all_urls[i:i + batch_size] for i in range(0, total_count, batch_size)]
    print(f"总共 {total_count} 个URL分为 {len(batches)} 批发送,并发数: {max_workers}")
    # 3. Concurrent send. In Dify/Lambda-style environments the process can be
    # killed as soon as main() returns, so block (via the context manager)
    # until every batch has actually been sent.
    success_batches = 0
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(send_batch_request, batch, int(task_id), BASE_URL)
            for batch in batches
        ]
        for future in as_completed(futures):
            if future.result():
                success_batches += 1
    return {
        "status": "done",
        "total_urls": total_count,
        "batches_sent": len(batches),
        "success_batches": success_batches,
        "msg": "已使用多线程并发发送数据,忽略详细返回值"
    }
def test():
    """Smoke-test main() against a saved Firecrawl payload and a live backend.

    Requires ``nodes\\parse_and_add_urls.json`` on disk and network access to
    the hard-coded backend; intended for manual runs only.
    """
    import json
    from time import time

    # Raw string: "\p" is an invalid escape (SyntaxWarning on Python 3.12+);
    # the raw form preserves the identical Windows-style path bytes.
    with open(r"nodes\parse_and_add_urls.json", "r", encoding="utf-8") as f:
        data = json.load(f)
    map_json = data["json"]
    BASE_URL = "http://47.122.127.178"
    task_id = 6
    start_time = time()
    res = main(map_json, BASE_URL, task_id)
    end_time = time()
    print(f"添加URL耗时{end_time - start_time}")
    print(res)


if __name__ == "__main__":
    # Guard the entry point so merely importing this module does not
    # trigger file reads and network I/O.
    test()