Files
wiki_crawler/backend/main.py

131 lines
4.8 KiB
Python
Raw Normal View History

# backend/main.py
from fastapi import FastAPI, APIRouter, BackgroundTasks
2025-12-30 16:57:31 +08:00
# 确保导入路径与你的文件名一致,如果文件名是 workflow.py 则用 workflow
from .services.crawler_sql_service import crawler_sql_service
from .services.automated_crawler import workflow
from .schemas import (
RegisterRequest, PendingRequest, SaveResultsRequest, AddUrlsRequest, SearchRequest,
AutoMapRequest, AutoProcessRequest, TextSearchRequest
)
2025-12-20 17:08:54 +08:00
from .utils import make_response
app = FastAPI(title="Wiki Crawler API")
2025-12-30 16:57:31 +08:00
# ==========================================
# 工具函数
# ==========================================
# ==========================================
# V1 Router: 原始的底层接口 (Manual Control)
# ==========================================
router_v1 = APIRouter()
@router_v1.post("/register")
2025-12-20 17:08:54 +08:00
async def register(req: RegisterRequest):
try:
2025-12-30 16:57:31 +08:00
# Service 返回: {'task_id': 1, 'is_new_task': True, 'msg': '...'}
res = crawler_sql_service.register_task(req.url)
# 使用 pop 将 msg 提取出来作为响应的 msg剩下的作为 data
return make_response(1, res.pop("msg", "Success"), res)
2025-12-20 17:08:54 +08:00
except Exception as e:
return make_response(0, str(e))
@router_v1.post("/add_urls")
2025-12-20 17:08:54 +08:00
async def add_urls(req: AddUrlsRequest):
try:
2025-12-29 14:42:33 +08:00
urls = req.urls_obj["urls"]
2025-12-30 16:57:31 +08:00
res = crawler_sql_service.add_urls(req.task_id, urls=urls)
return make_response(1, res.pop("msg", "Success"), res)
2025-12-20 17:08:54 +08:00
except Exception as e:
return make_response(0, str(e))
@router_v1.post("/pending_urls")
2025-12-20 17:08:54 +08:00
async def pending_urls(req: PendingRequest):
try:
2025-12-30 16:57:31 +08:00
res = crawler_sql_service.get_pending_urls(req.task_id, req.limit)
# 即使队列为空Service 也会返回 msg="Queue is empty"
return make_response(1, res.pop("msg", "Success"), res)
2025-12-20 17:08:54 +08:00
except Exception as e:
return make_response(0, str(e))
@router_v1.post("/save_results")
2025-12-20 17:08:54 +08:00
async def save_results(req: SaveResultsRequest):
try:
2025-12-30 16:57:31 +08:00
res = crawler_sql_service.save_results(req.task_id, req.results)
return make_response(1, res.pop("msg", "Success"), res)
2025-12-20 17:08:54 +08:00
except Exception as e:
2025-12-22 22:50:07 +08:00
return make_response(0, str(e))
2025-12-23 00:36:49 +08:00
@router_v1.post("/search")
async def search_v1(req: SearchRequest):
2025-12-30 16:57:31 +08:00
"""V1 搜索:客户端手动传向量"""
2025-12-23 00:36:49 +08:00
try:
2025-12-29 14:42:33 +08:00
vector = req.query_embedding['vector']
if not vector:
return make_response(2, "Vector is empty", None)
2025-12-29 14:42:33 +08:00
2025-12-30 16:57:31 +08:00
# Service 现在返回 {'results': [...], 'msg': 'Found ...'}
res = crawler_sql_service.search_knowledge(
2025-12-29 14:42:33 +08:00
query_embedding=vector,
2025-12-23 00:36:49 +08:00
task_id=req.task_id,
limit=req.limit
)
2025-12-30 16:57:31 +08:00
return make_response(1, res.pop("msg", "Search Done"), res)
except Exception as e:
return make_response(0, str(e))
2025-12-23 00:36:49 +08:00
# ==========================================
# V2 Router: 自动化工作流 (Automated Workflow)
# ==========================================
router_v2 = APIRouter()
@router_v2.post("/auto/map")
2025-12-30 16:57:31 +08:00
async def auto_map(req: AutoMapRequest):
"""
2025-12-30 16:57:31 +08:00
[同步] 输入首页 URL自动调用 Firecrawl Map 并入库
"""
try:
2025-12-30 16:57:31 +08:00
# Workflow 返回: {'task_id':..., 'msg': 'Task mapped...', ...}
res = workflow.map_and_ingest(req.url)
return make_response(1, res.pop("msg", "Mapping Started"), res)
except Exception as e:
return make_response(0, str(e))
@router_v2.post("/auto/process")
async def auto_process(req: AutoProcessRequest, background_tasks: BackgroundTasks):
"""
[异步] 触发后台任务消费队列 -> 抓取 -> Embedding -> 入库
"""
2025-12-30 16:57:31 +08:00
try:
# 将耗时操作放入后台任务
background_tasks.add_task(workflow.process_task_queue, req.task_id, req.batch_size)
# 因为是后台任务,无法立即获取 Service 的返回值 msg只能返回通用消息
return make_response(1, "Background processing started", {"task_id": req.task_id})
except Exception as e:
return make_response(0, str(e))
@router_v2.post("/search")
async def search_v2(req: TextSearchRequest):
"""
[智能] 输入自然语言文本 -> 后端转向量 -> 搜索
"""
try:
2025-12-30 16:57:31 +08:00
# Workflow 返回 {'results': [...], 'msg': '...'}
res = workflow.search_with_embedding(req.query, req.task_id, req.limit)
return make_response(1, res.pop("msg", "Search Success"), res)
2025-12-23 00:36:49 +08:00
except Exception as e:
return make_response(0, f"Search Failed: {str(e)}")
# ==========================================
# Mount routers
# ==========================================
app.include_router(router_v1, prefix="/api/v1", tags=["V1 Manual API"])
app.include_router(router_v2, prefix="/api/v2", tags=["V2 Automated Workflow"])


if __name__ == "__main__":
    import uvicorn

    # BUGFIX: uvicorn only supports reload=True when the application is given
    # as an import string (e.g. uvicorn.run("backend.main:app", reload=True));
    # with an app *object* it logs an error and exits. Drop the flag so
    # running this module directly works. Use the CLI
    # (`uvicorn backend.main:app --reload`) for auto-reload in development.
    uvicorn.run(app, host="0.0.0.0", port=8000)