修改地图爬取urls,应对dify的列表上限和http超时

This commit is contained in:
2025-12-29 11:29:52 +08:00
parent 79b3f79c15
commit 20317e6788
4 changed files with 25 additions and 8 deletions

View File

@@ -16,7 +16,7 @@ async def register(req: RegisterRequest):
@app.post("/add_urls") @app.post("/add_urls")
async def add_urls(req: AddUrlsRequest): async def add_urls(req: AddUrlsRequest):
try: try:
data = crawler_service.add_urls(req.task_id, req.urls) data = crawler_service.add_urls(req.task_id, req.urls_obj)
return make_response(1, "Success", data) return make_response(1, "Success", data)
except Exception as e: except Exception as e:
return make_response(0, str(e)) return make_response(0, str(e))

View File

@@ -10,7 +10,7 @@ class PendingRequest(BaseModel):
class AddUrlsRequest(BaseModel): class AddUrlsRequest(BaseModel):
task_id: int task_id: int
urls: List[str] urls_obj: dict
# schemas.py # schemas.py
class CrawlResult(BaseModel): class CrawlResult(BaseModel):

View File

@@ -23,15 +23,17 @@ class CrawlerService:
new_task = conn.execute(stmt).fetchone() new_task = conn.execute(stmt).fetchone()
return {"task_id": new_task[0], "is_new_task": True} return {"task_id": new_task[0], "is_new_task": True}
def add_urls(self, task_id: int, urls: list): def add_urls(self, task_id: int, urls_obj: dict):
"""通用 API 实现的批量添加(含详细返回)""" """通用 API 实现的批量添加(含详细返回)"""
success_urls, skipped_urls, failed_urls = [], [], [] success_urls, skipped_urls, failed_urls = [], [], []
# 从 urls_obj 中提取 urls 列表
urls = urls_obj.get("urls", [])
with self.db.engine.begin() as conn: with self.db.engine.begin() as conn:
for url in urls: for url in urls:
clean_url = normalize_url(url) clean_url = normalize_url(url)
try: try:
# 检查是否存在 (通用写法) # 检查队列中是否存在该 URL (通用写法)
check_q = select(self.db.queue).where( check_q = select(self.db.queue).where(
and_(self.db.queue.c.task_id == task_id, self.db.queue.c.url == clean_url) and_(self.db.queue.c.task_id == task_id, self.db.queue.c.url == clean_url)
) )

View File

@@ -1,4 +1,4 @@
def main(map_json: list[dict], BASE_URL: str): def main(map_json: list[dict]):
""" """
将Firecrawl Map节点的输出转换为干净的输出,避免杂七杂八的数据干扰 将Firecrawl Map节点的输出转换为干净的输出,避免杂七杂八的数据干扰
输入: Firecrawl Map节点的输出结构如下 输入: Firecrawl Map节点的输出结构如下
@@ -17,6 +17,21 @@ def main(map_json: list[dict], BASE_URL: str):
map_obj = map_json[0] map_obj = map_json[0]
return { return {
"urls": map_obj["links"], # "urls": map_obj["links"],
"code": int(map_obj["success"]), # "code": int(map_obj["success"]),
"urls_obj": {
"urls": map_obj["links"]
}
} }
'''
返回值示例
{
"urls_obj": {
"urls": [
"http://example.com/page1",
]
}
}
'''