完成后端配置,用于部署

This commit is contained in:
QG
2025-12-20 17:08:54 +08:00
parent 5d93f9bfab
commit bab6be6790
28 changed files with 2552 additions and 4006 deletions

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

16
backend/config.py Normal file
View File

@@ -0,0 +1,16 @@
import os
class Settings:
    """Backend configuration for the crawler service.

    Every value is read from the environment first (so deployments can be
    configured without code changes — the point of this module); the literals
    below are fallbacks only.
    NOTE(review): shipping real credentials as in-source defaults is a
    security risk — move them into deployment secrets and drop the fallbacks.
    """

    # Database connection parameters; env var of the same name overrides.
    DB_USER: str = os.getenv("DB_USER", "postgres")
    DB_PASS: str = os.getenv("DB_PASS", "DXC_welcome001")
    DB_HOST: str = os.getenv("DB_HOST", "8.155.144.6")
    DB_PORT: str = os.getenv("DB_PORT", "25432")
    DB_NAME: str = os.getenv("DB_NAME", "wiki_crawler")

    @property
    def DATABASE_URL(self) -> str:
        """SQLAlchemy URL for the psycopg2 PostgreSQL driver."""
        return (
            f"postgresql+psycopg2://{self.DB_USER}:{self.DB_PASS}"
            f"@{self.DB_HOST}:{self.DB_PORT}/{self.DB_NAME}"
        )


# Module-level singleton imported by the rest of the backend.
settings = Settings()

22
backend/database.py Normal file
View File

@@ -0,0 +1,22 @@
from sqlalchemy import create_engine, MetaData, Table
from .config import settings
class Database:
    """Shared SQLAlchemy engine plus table objects reflected from the live DB.

    The schema is owned by the database: table definitions are loaded via
    reflection at construction time instead of being duplicated in code.
    """

    def __init__(self):
        # pool_pre_ping guards against stale pooled connections being reused.
        self.engine = create_engine(settings.DATABASE_URL, pool_pre_ping=True)
        self.metadata = MetaData()
        # Filled in by _reflect_tables(); remain None if reflection fails.
        self.tasks = None
        self.queue = None
        self.chunks = None
        self._reflect_tables()

    def _reflect_tables(self):
        """Reflect the crawler tables from the database.

        Failures are logged (with traceback) rather than raised so that merely
        importing this module does not crash when the database is unreachable;
        callers will instead see the table attributes left as None.
        """
        import logging  # local import: keeps this file's import block untouched

        try:
            self.tasks = Table('crawl_tasks', self.metadata, autoload_with=self.engine)
            self.queue = Table('crawl_queue', self.metadata, autoload_with=self.engine)
            self.chunks = Table('knowledge_chunks', self.metadata, autoload_with=self.engine)
        except Exception:
            # Was a bare print() that discarded the traceback; logging.exception
            # records the full stack so the failure is diagnosable in deployment.
            logging.getLogger(__name__).exception("❌ 数据库表加载失败")


# Import-time singleton; constructing it attempts a DB connection immediately.
db_instance = Database()

39
backend/main.py Normal file
View File

@@ -0,0 +1,39 @@
from fastapi import FastAPI
from .service import crawler_service
from .schemas import RegisterRequest, PendingRequest, SaveResultsRequest, AddUrlsRequest
from .utils import make_response
app = FastAPI(title="Wiki Crawler API")


@app.post("/register")
async def register(req: RegisterRequest):
    """Register a crawl task for the given root URL.

    Returns the uniform envelope: code 1 with the service payload on success,
    code 0 with the error text on any failure.
    """
    try:
        result = crawler_service.register_task(req.url)
    except Exception as exc:  # surface any failure as a code-0 envelope
        return make_response(0, str(exc))
    return make_response(1, "Success", result)
@app.post("/add_urls")
async def add_urls(req: AddUrlsRequest):
    """Enqueue newly discovered URLs for an existing task.

    Duplicate handling lives in the service layer; this endpoint only wraps
    the result in the uniform code/msg/data envelope.
    """
    try:
        result = crawler_service.add_urls(req.task_id, req.urls)
    except Exception as exc:
        return make_response(0, str(exc))
    return make_response(1, "Success", result)
@app.post("/pending_urls")
async def pending_urls(req: PendingRequest):
    """Hand a batch of pending URLs to a worker.

    An empty batch is still code 1, but with msg "Queue Empty" so workers can
    distinguish "nothing to do" from an actual error (code 0).
    """
    try:
        result = crawler_service.get_pending_urls(req.task_id, req.limit)
    except Exception as exc:
        return make_response(0, str(exc))
    status_msg = "Success" if result["urls"] else "Queue Empty"
    return make_response(1, status_msg, result)
@app.post("/save_results")
async def save_results(req: SaveResultsRequest):
    """Persist a worker's crawl results and close out their queue entries."""
    try:
        result = crawler_service.save_results(req.task_id, req.results)
    except Exception as exc:
        return make_response(0, str(exc))
    return make_response(1, "Success", result)

23
backend/schemas.py Normal file
View File

@@ -0,0 +1,23 @@
from pydantic import BaseModel
from typing import List, Optional
class RegisterRequest(BaseModel):
    """Body of POST /register: the root URL of the site to crawl."""

    url: str
class PendingRequest(BaseModel):
    """Body of POST /pending_urls: which task to pull from and the batch size."""

    task_id: int
    # Maximum number of URLs to claim in one call; defaults to 10.
    limit: Optional[int] = 10
class AddUrlsRequest(BaseModel):
    """Body of POST /add_urls: newly discovered URLs for an existing task."""

    task_id: int
    urls: List[str]
class CrawlResult(BaseModel):
    """One crawled page: source URL plus the extracted title/content/embedding."""

    url: str
    title: Optional[str] = None
    content: Optional[str] = None
    # Presumably a dense vector over `content`; dimensionality is not enforced
    # here — TODO confirm against the knowledge_chunks column definition.
    embedding: Optional[List[float]] = None
class SaveResultsRequest(BaseModel):
    """Body of POST /save_results: a batch of crawl results for one task."""

    task_id: int
    results: List[CrawlResult]

91
backend/service.py Normal file
View File

@@ -0,0 +1,91 @@
from sqlalchemy import select, update, and_
from sqlalchemy.dialects.postgresql import insert as pg_insert
from .database import db_instance
from .utils import normalize_url
class CrawlerService:
    """Service layer over the crawl task / queue / chunk tables."""

    def __init__(self):
        self.db = db_instance

    def register_task(self, url: str):
        """Register a crawl task rooted at `url` and seed its queue.

        Returns {"task_id": int, "is_new_task": bool}; a root URL that is
        already registered is returned as-is instead of being duplicated.
        """
        clean_url = normalize_url(url)
        with self.db.engine.begin() as conn:
            # Reuse an existing task for the same (normalized) root URL.
            find_stmt = select(self.db.tasks.c.id).where(self.db.tasks.c.root_url == clean_url)
            existing = conn.execute(find_stmt).fetchone()
            if existing:
                return {"task_id": existing[0], "is_new_task": False}
            # Create the task, then seed the queue with the root URL.
            new_task = conn.execute(
                pg_insert(self.db.tasks).values(root_url=clean_url).returning(self.db.tasks.c.id)
            ).fetchone()
            task_id = new_task[0]
            conn.execute(
                pg_insert(self.db.queue).values(task_id=task_id, url=clean_url, status='pending')
            )
            return {"task_id": task_id, "is_new_task": True}

    def add_urls(self, task_id: int, urls: list):
        """Queue newly discovered URLs for `task_id`, skipping duplicates.

        Returns {"added_count": n}, counting only rows actually inserted
        (ON CONFLICT DO NOTHING makes re-submissions a no-op).
        """
        added_count = 0
        with self.db.engine.begin() as conn:
            for url in urls:
                clean_url = normalize_url(url)
                stmt = pg_insert(self.db.queue).values(
                    task_id=task_id,
                    url=clean_url,
                    status='pending'
                ).on_conflict_do_nothing(index_elements=['task_id', 'url'])
                # rowcount is 1 for a real insert, 0 when the row already existed.
                if conn.execute(stmt).rowcount > 0:
                    added_count += 1
        return {"added_count": added_count}

    def get_pending_urls(self, task_id: int, limit: int):
        """Atomically claim up to `limit` pending URLs and mark them processing.

        FOR UPDATE SKIP LOCKED locks the selected rows for this transaction
        and lets concurrent workers skip past them, so two workers can no
        longer claim the same URLs (the original plain SELECT raced between
        the read and the status UPDATE).
        """
        with self.db.engine.begin() as conn:
            claim_stmt = (
                select(self.db.queue.c.url)
                .where(and_(self.db.queue.c.task_id == task_id,
                            self.db.queue.c.status == 'pending'))
                .limit(limit)
                .with_for_update(skip_locked=True)
            )
            urls = [row[0] for row in conn.execute(claim_stmt).fetchall()]
            if urls:
                conn.execute(
                    update(self.db.queue).where(
                        and_(self.db.queue.c.task_id == task_id, self.db.queue.c.url.in_(urls))
                    ).values(status='processing')
                )
            return {"urls": urls}

    def save_results(self, task_id: int, results: list):
        """Persist crawl results and mark their queue rows completed.

        Each result is inserted into knowledge_chunks and its queue row is
        flipped to 'completed'; the whole batch commits in one transaction.
        """
        with self.db.engine.begin() as conn:
            for res in results:
                clean_url = normalize_url(res.url)
                conn.execute(
                    pg_insert(self.db.chunks).values(
                        task_id=task_id,
                        source_url=clean_url,
                        title=res.title,
                        content=res.content,
                        embedding=res.embedding
                    )
                )
                conn.execute(
                    update(self.db.queue).where(
                        and_(self.db.queue.c.task_id == task_id, self.db.queue.c.url == clean_url)
                    ).values(status='completed')
                )
        return {"inserted": len(results)}


# Module-level singleton shared by the API layer.
crawler_service = CrawlerService()

15
backend/utils.py Normal file
View File

@@ -0,0 +1,15 @@
from typing import Any
from urllib.parse import urlparse, urlunparse

from sqlalchemy import create_engine, MetaData, Table, select, update, and_
def normalize_url(url: str) -> str:
    """Canonicalize a URL for dedup/comparison.

    Lowercases the scheme and host, strips trailing slashes from the path,
    preserves params and query, and drops the fragment (it never changes the
    fetched content). Empty or missing input returns "".
    """
    if not url:
        return ""
    parsed = urlparse(url.strip())
    return urlunparse((
        parsed.scheme.lower(),
        parsed.netloc.lower(),
        # "/" alone and "" both normalize to "" (the original's extra
        # `if not path: path = ""` was a dead no-op and has been removed).
        parsed.path.rstrip('/'),
        parsed.params,
        parsed.query,
        "",
    ))
def make_response(code: int, msg: str, data: Any = None) -> dict:
    """Build the uniform API envelope: {"code": ..., "msg": ..., "data": ...}.

    Fixes the original annotation `data: any`, which referenced the builtin
    any() function rather than typing.Any.
    """
    return {"code": code, "msg": msg, "data": data}