修改地图爬取 URL 的传参方式(将 urls 列表包装为 urls_obj 对象),以应对 Dify 的列表长度上限和 HTTP 超时
This commit is contained in:
@@ -16,7 +16,7 @@ async def register(req: RegisterRequest):
|
|||||||
@app.post("/add_urls")
|
@app.post("/add_urls")
|
||||||
async def add_urls(req: AddUrlsRequest):
|
async def add_urls(req: AddUrlsRequest):
|
||||||
try:
|
try:
|
||||||
data = crawler_service.add_urls(req.task_id, req.urls)
|
data = crawler_service.add_urls(req.task_id, req.urls_obj)
|
||||||
return make_response(1, "Success", data)
|
return make_response(1, "Success", data)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return make_response(0, str(e))
|
return make_response(0, str(e))
|
||||||
|
|||||||
@@ -10,7 +10,7 @@ class PendingRequest(BaseModel):
|
|||||||
|
|
||||||
class AddUrlsRequest(BaseModel):
|
class AddUrlsRequest(BaseModel):
|
||||||
task_id: int
|
task_id: int
|
||||||
urls: List[str]
|
urls_obj: dict
|
||||||
|
|
||||||
# schemas.py
|
# schemas.py
|
||||||
class CrawlResult(BaseModel):
|
class CrawlResult(BaseModel):
|
||||||
|
|||||||
@@ -23,15 +23,17 @@ class CrawlerService:
|
|||||||
new_task = conn.execute(stmt).fetchone()
|
new_task = conn.execute(stmt).fetchone()
|
||||||
return {"task_id": new_task[0], "is_new_task": True}
|
return {"task_id": new_task[0], "is_new_task": True}
|
||||||
|
|
||||||
def add_urls(self, task_id: int, urls: list):
|
def add_urls(self, task_id: int, urls_obj: dict):
|
||||||
"""通用 API 实现的批量添加(含详细返回)"""
|
"""通用 API 实现的批量添加(含详细返回)"""
|
||||||
success_urls, skipped_urls, failed_urls = [], [], []
|
success_urls, skipped_urls, failed_urls = [], [], []
|
||||||
|
# 从 urls_obj 中提取 urls 列表
|
||||||
|
urls = urls_obj.get("urls", [])
|
||||||
|
|
||||||
with self.db.engine.begin() as conn:
|
with self.db.engine.begin() as conn:
|
||||||
for url in urls:
|
for url in urls:
|
||||||
clean_url = normalize_url(url)
|
clean_url = normalize_url(url)
|
||||||
try:
|
try:
|
||||||
# 检查是否存在 (通用写法)
|
# 检查队列中是否已存在该 URL (通用写法)
|
||||||
check_q = select(self.db.queue).where(
|
check_q = select(self.db.queue).where(
|
||||||
and_(self.db.queue.c.task_id == task_id, self.db.queue.c.url == clean_url)
|
and_(self.db.queue.c.task_id == task_id, self.db.queue.c.url == clean_url)
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
def main(map_json: list[dict], BASE_URL: str):
|
def main(map_json: list[dict]):
|
||||||
"""
|
"""
|
||||||
将Firecrawl Map节点的输出转换为干净的输出,避免杂七杂八的数据干扰
|
将Firecrawl Map节点的输出转换为干净的输出,避免杂七杂八的数据干扰
|
||||||
输入: Firecrawl Map节点的输出,结构如下
|
输入: Firecrawl Map节点的输出,结构如下
|
||||||
@@ -17,6 +17,21 @@ def main(map_json: list[dict], BASE_URL: str):
|
|||||||
map_obj = map_json[0]
|
map_obj = map_json[0]
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"urls": map_obj["links"],
|
# "urls": map_obj["links"],
|
||||||
"code": int(map_obj["success"]),
|
# "code": int(map_obj["success"]),
|
||||||
|
"urls_obj": {
|
||||||
|
"urls": map_obj["links"]
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
'''
|
||||||
|
返回值示例
|
||||||
|
{
|
||||||
|
"urls_obj": {
|
||||||
|
"urls": [
|
||||||
|
"http://example.com/page1",
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
'''
|
||||||
Reference in New Issue
Block a user