新增业务原子化;新增 firecrawl 的 base URL 修改
This commit is contained in:
@@ -23,7 +23,11 @@ class CrawlerService:
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
self.firecrawl = FirecrawlApp(api_key=settings.FIRECRAWL_API_KEY)
|
||||
# 实例化 FirecrawlApp
|
||||
if settings.FIRECRAWL_API_KEY_EXSIST:
|
||||
self.firecrawl = FirecrawlApp(api_key=settings.FIRECRAWL_API_KEY, api_url=settings.FIRECRAWL_API_URL)
|
||||
else:
|
||||
self.firecrawl = FirecrawlApp(api_url=settings.FIRECRAWL_API_URL)
|
||||
self.max_workers = 5 # 线程池最大并发数
|
||||
|
||||
# 内存状态追踪: { task_id: set([url1, url2]) }
|
||||
@@ -85,59 +89,74 @@ class CrawlerService:
|
||||
"active_thread_count": len(active_urls)
|
||||
}
|
||||
|
||||
def map_site(self, start_url: str) -> Dict[str, Any]:
|
||||
def map_site(self, start_url: str, persist: bool = True) -> Dict[str, Any]:
|
||||
"""
|
||||
第一阶段:站点地图扫描 (Map)
|
||||
|
||||
|
||||
改动要点:
|
||||
- 先执行外部 map,确认能成功抓取到链接后再进行数据库注册与写入,避免出现“已注册但 map 未完成”的半成品任务。
|
||||
- 增加参数 persist(默认 True)。当 persist=False 时仅返回发现的链接列表,不进行任何数据库写入(用于假性/暂存流程)。
|
||||
- 使用 data_service.create_task_with_urls 在单个事务中创建任务并批量插入 URL(去重),提高原子性。
|
||||
|
||||
Args:
|
||||
start_url (str): 目标网站的根 URL
|
||||
|
||||
persist (bool): 是否将发现的 URL 持久化到数据库。用于先做假性扫描,后续统一持久化或回滚。
|
||||
|
||||
Returns:
|
||||
dict: 包含任务 ID 和发现链接数的字典。
|
||||
{
|
||||
"task_id": 123,
|
||||
"task_id": 123 | None,
|
||||
"count": 50,
|
||||
"is_new": True
|
||||
"is_new": True | False | None,
|
||||
"urls": [ ... ],
|
||||
"persisted": True | False
|
||||
}
|
||||
"""
|
||||
logger.info(f"Mapping: {start_url}")
|
||||
logger.info(f"Mapping (persist={persist}): {start_url}")
|
||||
try:
|
||||
task_res = data_service.register_task(start_url)
|
||||
urls_to_add = [start_url]
|
||||
|
||||
# 如果任务已存在,不再重新 Map,直接返回
|
||||
if not task_res['is_new_task']:
|
||||
logger.info(f"Task {task_res['task_id']} exists, skipping map.")
|
||||
return {
|
||||
"task_id": task_res['task_id'],
|
||||
"count": 0,
|
||||
"is_new": False
|
||||
}
|
||||
|
||||
# 新任务执行 Map
|
||||
# 0. 先尝试执行外部 map(不进行任何数据库动作)
|
||||
try:
|
||||
map_res = self.firecrawl.map(start_url)
|
||||
# 兼容不同版本的 SDK 返回结构
|
||||
found_links = map_res.get('links', []) if isinstance(map_res, dict) else getattr(map_res, 'links', [])
|
||||
|
||||
|
||||
urls_to_add = [start_url]
|
||||
for link in found_links:
|
||||
u = link if isinstance(link, str) else getattr(link, 'url', str(link))
|
||||
urls_to_add.append(u)
|
||||
logger.info(f"Map found {len(found_links)} links")
|
||||
logger.info(f"Map found {len(found_links)} links for {start_url}")
|
||||
except Exception as e:
|
||||
logger.warning(f"Map failed, proceeding with seed only: {e}")
|
||||
# map 失败时不创建任务,直接抛出异常或返回失败信息,由上层决定回滚策略
|
||||
logger.error(f"Map failed for {start_url}, aborting register: {e}", exc_info=True)
|
||||
raise
|
||||
|
||||
# 1. 如果仅做假性扫描(不持久化),直接返回发现的链接,供上层统一持久化或回滚
|
||||
if not persist:
|
||||
return {
|
||||
"task_id": None,
|
||||
"count": len(urls_to_add),
|
||||
"is_new": None,
|
||||
"urls": urls_to_add,
|
||||
"persisted": False
|
||||
}
|
||||
|
||||
# 2. map 成功且需要持久化:使用原子化接口在单个事务中创建任务并写入队列
|
||||
try:
|
||||
create_res = data_service.create_task_with_urls(start_url, urls_to_add)
|
||||
return {
|
||||
"task_id": create_res.get('task_id'),
|
||||
"count": create_res.get('added', 0),
|
||||
"is_new": create_res.get('is_new_task', False),
|
||||
"urls": urls_to_add,
|
||||
"persisted": True
|
||||
}
|
||||
except Exception as e:
|
||||
logger.error(f"Atomic create_task_with_urls failed for {start_url}: {e}", exc_info=True)
|
||||
raise
|
||||
|
||||
if urls_to_add:
|
||||
data_service.add_urls(task_res['task_id'], urls_to_add)
|
||||
|
||||
return {
|
||||
"task_id": task_res['task_id'],
|
||||
"count": len(urls_to_add),
|
||||
"is_new": True
|
||||
}
|
||||
except Exception as e:
|
||||
logger.error(f"Map failed: {e}")
|
||||
raise e
|
||||
logger.error(f"Map+Register failed for {start_url}: {e}")
|
||||
raise
|
||||
|
||||
def _process_single_url(self, task_id: int, url: str):
|
||||
"""[Internal Worker] 单个 URL 处理线程逻辑"""
|
||||
|
||||
Reference in New Issue
Block a user