Files
wiki_crawler/nodes/parse_and_add_urls.py
2025-12-28 10:33:47 +08:00

106 lines
3.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import requests
import json
import math
from concurrent.futures import ThreadPoolExecutor, as_completed
def parse_urls(map_json: list[dict]) -> list[str]:
    """Parse the payload returned by Firecrawl's Map node and extract the URLs.

    Args:
        map_json: Firecrawl response list; the first element is expected to be
            a dict carrying "success" and "links" keys.

    Returns:
        The list of URLs, or an empty list when the payload is missing,
        malformed, or flagged as unsuccessful.
    """
    if not map_json:
        return []
    map_obj = map_json[0]
    # Be tolerant of schema variations: log and return empty instead of raising.
    # The isinstance guard also covers payloads whose first element is not a dict.
    if not isinstance(map_obj, dict) or not map_obj.get("success", False):
        print(f"Firecrawl Map节点返回失败或无数据{map_obj}")
        return []
    return map_obj.get("links", [])
def send_batch_request(urls_batch: list[str], task_id: int, BASE_URL: str) -> bool:
    """POST a single batch of URLs to the backend's /add_urls endpoint.

    Args:
        urls_batch: URLs to submit in this batch.
        task_id: Backend task identifier the URLs belong to.
        BASE_URL: Backend base URL (no trailing slash).

    Returns:
        True when the backend answered 200, False on any network error or
        non-200 status.
    """
    try:
        # Fire-and-forget: we don't need the response body, so a short
        # timeout keeps one slow batch from stalling the whole pool.
        res = requests.post(
            f"{BASE_URL}/add_urls",
            json={
                "task_id": task_id,
                "urls": urls_batch,
            },
            timeout=10,
        )
    except requests.RequestException as e:
        # Only catch transport-level failures (timeout, DNS, refused, ...);
        # a bare `except Exception` would also hide programming errors.
        print(f"Batch request error: {e}")
        return False
    if res.status_code == 200:
        return True
    print(f"Batch failed with status {res.status_code}: {res.text[:100]}")
    return False
def main(map_json: list[dict], BASE_URL: str, task_id: float):
    """Parse URLs out of a Firecrawl map payload and fan them out to the backend.

    Args:
        map_json: Raw Firecrawl Map-node response.
        BASE_URL: Backend base URL (no trailing slash).
        task_id: Backend task identifier; coerced to int before sending.

    Returns:
        A summary dict with send statistics, or an early message dict when
        no URLs could be parsed.
    """
    all_urls = parse_urls(map_json)
    total_count = len(all_urls)
    if not all_urls:
        return {"msg": "没有解析到URL"}

    # ---------------- tuning knobs ----------------
    BATCH_SIZE = 50    # URLs per request; adjust to backend capacity
    MAX_WORKERS = 10   # concurrent sender threads
    # ----------------------------------------------

    # Slice the flat URL list into BATCH_SIZE-sized chunks
    # (e.g. 1000 URLs -> 20 batches).
    batches = []
    for start in range(0, total_count, BATCH_SIZE):
        batches.append(all_urls[start:start + BATCH_SIZE])

    print(f"总共 {total_count} 个URL分为 {len(batches)} 批发送,并发数: {MAX_WORKERS}")

    # Staying inside the `with` block until every future resolves matters in
    # Dify/Lambda-style hosts: the process may be killed the moment main()
    # returns, so all requests must be fully sent first. Concurrency keeps
    # this fast regardless.
    success_batches = 0
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
        pending = [
            pool.submit(send_batch_request, chunk, int(task_id), BASE_URL)
            for chunk in batches
        ]
        for done in as_completed(pending):
            success_batches += 1 if done.result() else 0

    return {
        "status": "done",
        "total_urls": total_count,
        "batches_sent": len(batches),
        "success_batches": success_batches,
        "msg": "已使用多线程并发发送数据,忽略详细返回值",
    }
def test():
    """Smoke-test main() against a locally saved Firecrawl payload.

    Loads a captured Map-node response from nodes/parse_and_add_urls.json,
    runs the full send pipeline against the hard-coded backend, and prints
    the elapsed time and the result summary.
    """
    import os
    from time import time

    # os.path.join instead of a hard-coded backslash path, so the fixture
    # also loads on non-Windows systems. (Top-level `import json` is reused;
    # the previous local re-import shadowed it redundantly.)
    fixture_path = os.path.join("nodes", "parse_and_add_urls.json")
    with open(fixture_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    map_json = data["json"]
    BASE_URL = "http://47.122.127.178"
    task_id = 6

    start_time = time()
    res = main(map_json, BASE_URL, task_id)
    end_time = time()
    print(f"添加URL耗时{end_time - start_time}")
    print(res)


if __name__ == "__main__":
    # Guard the entry point so merely importing this module does not fire
    # real HTTP requests at the backend.
    test()