diff --git a/backend/main.py b/backend/main.py index 7666d91..b8cec64 100644 --- a/backend/main.py +++ b/backend/main.py @@ -16,7 +16,7 @@ async def register(req: RegisterRequest): @app.post("/add_urls") async def add_urls(req: AddUrlsRequest): try: - data = crawler_service.add_urls(req.task_id, req.urls) + data = crawler_service.add_urls(req.task_id, req.urls_obj) return make_response(1, "Success", data) except Exception as e: return make_response(0, str(e)) diff --git a/backend/schemas.py b/backend/schemas.py index bc26b21..c1c4d92 100644 --- a/backend/schemas.py +++ b/backend/schemas.py @@ -10,7 +10,7 @@ class PendingRequest(BaseModel): class AddUrlsRequest(BaseModel): task_id: int - urls: List[str] + urls_obj: dict # schemas.py class CrawlResult(BaseModel): diff --git a/backend/service.py b/backend/service.py index 281b003..4c535fa 100644 --- a/backend/service.py +++ b/backend/service.py @@ -23,15 +23,17 @@ class CrawlerService: new_task = conn.execute(stmt).fetchone() return {"task_id": new_task[0], "is_new_task": True} - def add_urls(self, task_id: int, urls: list): + def add_urls(self, task_id: int, urls_obj: dict): """通用 API 实现的批量添加(含详细返回)""" success_urls, skipped_urls, failed_urls = [], [], [] - + # 从 urls_obj 中提取 urls 列表 + urls = urls_obj.get("urls", []) + with self.db.engine.begin() as conn: for url in urls: clean_url = normalize_url(url) try: - # 检查是否存在 (通用写法) + # 检查队列中是否已存在该 URL (通用写法) check_q = select(self.db.queue).where( and_(self.db.queue.c.task_id == task_id, self.db.queue.c.url == clean_url) ) diff --git a/nodes/parse_firecrawl_map.py b/nodes/parse_firecrawl_map.py index f0d101a..527dc4a 100644 --- a/nodes/parse_firecrawl_map.py +++ b/nodes/parse_firecrawl_map.py @@ -1,4 +1,4 @@ -def main(map_json: list[dict], BASE_URL: str): +def main(map_json: list[dict]): """ 将Firecrawl Map节点的输出转换为干净的输出,避免杂七杂八的数据干扰 输入: Firecrawl Map节点的输出,结构如下 @@ -17,6 +17,21 @@ def main(map_json: list[dict], BASE_URL: str): map_obj = map_json[0] return { - "urls": map_obj["links"], - "code": int(map_obj["success"]), + # "urls": map_obj["links"], + # "code": int(map_obj["success"]), + "urls_obj": { + "urls": map_obj["links"] + } } + +''' +返回值示例 +{ + "urls_obj": { + "urls": [ + "http://example.com/page1", + ] + } +} + +''' \ No newline at end of file