完成后端配置,用于部署
This commit is contained in:
BIN
backend/__pycache__/config.cpython-313.pyc
Normal file
BIN
backend/__pycache__/config.cpython-313.pyc
Normal file
Binary file not shown.
BIN
backend/__pycache__/database.cpython-313.pyc
Normal file
BIN
backend/__pycache__/database.cpython-313.pyc
Normal file
Binary file not shown.
BIN
backend/__pycache__/main.cpython-313.pyc
Normal file
BIN
backend/__pycache__/main.cpython-313.pyc
Normal file
Binary file not shown.
BIN
backend/__pycache__/schemas.cpython-313.pyc
Normal file
BIN
backend/__pycache__/schemas.cpython-313.pyc
Normal file
Binary file not shown.
BIN
backend/__pycache__/service.cpython-313.pyc
Normal file
BIN
backend/__pycache__/service.cpython-313.pyc
Normal file
Binary file not shown.
BIN
backend/__pycache__/utils.cpython-313.pyc
Normal file
BIN
backend/__pycache__/utils.cpython-313.pyc
Normal file
Binary file not shown.
16
backend/config.py
Normal file
16
backend/config.py
Normal file
@@ -0,0 +1,16 @@
|
||||
import os
|
||||
|
||||
class Settings:
    """Application configuration.

    Database connection parameters are read from the environment at import
    time so deployments can override them, falling back to the hard-coded
    development defaults below.
    """

    # Database connection settings (environment variables override defaults).
    # SECURITY NOTE(review): the fallback credentials below are committed in
    # source control — rotate them and rely on the environment in production.
    DB_USER: str = os.getenv("DB_USER", "postgres")
    DB_PASS: str = os.getenv("DB_PASS", "DXC_welcome001")
    DB_HOST: str = os.getenv("DB_HOST", "8.155.144.6")
    DB_PORT: str = os.getenv("DB_PORT", "25432")
    DB_NAME: str = os.getenv("DB_NAME", "wiki_crawler")

    @property
    def DATABASE_URL(self) -> str:
        """SQLAlchemy/psycopg2 connection URL assembled from the parts above."""
        return f"postgresql+psycopg2://{self.DB_USER}:{self.DB_PASS}@{self.DB_HOST}:{self.DB_PORT}/{self.DB_NAME}"


# Module-level singleton consumed by database.py.
settings = Settings()
|
||||
22
backend/database.py
Normal file
22
backend/database.py
Normal file
@@ -0,0 +1,22 @@
|
||||
from sqlalchemy import create_engine, MetaData, Table
|
||||
from .config import settings
|
||||
|
||||
class Database:
    """Holds the shared SQLAlchemy engine and the reflected table objects."""

    def __init__(self):
        # pool_pre_ping: validate pooled connections before each checkout so
        # stale/dropped connections are transparently replaced.
        self.engine = create_engine(settings.DATABASE_URL, pool_pre_ping=True)
        self.metadata = MetaData()
        # Reflected Table objects; remain None if reflection fails below.
        self.tasks = None
        self.queue = None
        self.chunks = None
        self._reflect_tables()

    def _reflect_tables(self):
        """Load table definitions from the live database via reflection."""
        try:
            # Auto-load the table structure from the database (the tables
            # must already exist on the server).
            self.tasks = Table('crawl_tasks', self.metadata, autoload_with=self.engine)
            self.queue = Table('crawl_queue', self.metadata, autoload_with=self.engine)
            self.chunks = Table('knowledge_chunks', self.metadata, autoload_with=self.engine)
        except Exception as e:
            # NOTE(review): failure is only printed, not re-raised — the app
            # still starts with tasks/queue/chunks as None and will fail
            # later at first use. Consider failing fast here.
            print(f"❌ 数据库表加载失败: {e}")


# Module-level singleton shared by service.py.
db_instance = Database()
|
||||
39
backend/main.py
Normal file
39
backend/main.py
Normal file
@@ -0,0 +1,39 @@
|
||||
from fastapi import FastAPI
|
||||
from .service import crawler_service
|
||||
from .schemas import RegisterRequest, PendingRequest, SaveResultsRequest, AddUrlsRequest
|
||||
from .utils import make_response
|
||||
|
||||
# FastAPI application exposing the crawler-coordination endpoints below.
app = FastAPI(title="Wiki Crawler API")
|
||||
|
||||
@app.post("/register")
async def register(req: RegisterRequest):
    """Register a crawl task for req.url; failures come back with code=0."""
    try:
        return make_response(1, "Success", crawler_service.register_task(req.url))
    except Exception as exc:
        return make_response(0, str(exc))
|
||||
|
||||
@app.post("/add_urls")
async def add_urls(req: AddUrlsRequest):
    """Enqueue newly discovered URLs for a task (deduplicated server-side)."""
    try:
        return make_response(1, "Success", crawler_service.add_urls(req.task_id, req.urls))
    except Exception as exc:
        return make_response(0, str(exc))
|
||||
|
||||
@app.post("/pending_urls")
async def pending_urls(req: PendingRequest):
    """Claim up to req.limit pending URLs; msg flags an empty queue."""
    try:
        payload = crawler_service.get_pending_urls(req.task_id, req.limit)
        note = "Success" if payload["urls"] else "Queue Empty"
        return make_response(1, note, payload)
    except Exception as exc:
        return make_response(0, str(exc))
|
||||
|
||||
@app.post("/save_results")
async def save_results(req: SaveResultsRequest):
    """Persist crawl results and close out their queue entries."""
    try:
        return make_response(1, "Success", crawler_service.save_results(req.task_id, req.results))
    except Exception as exc:
        return make_response(0, str(exc))
|
||||
23
backend/schemas.py
Normal file
23
backend/schemas.py
Normal file
@@ -0,0 +1,23 @@
|
||||
from pydantic import BaseModel
|
||||
from typing import List, Optional
|
||||
|
||||
class RegisterRequest(BaseModel):
    """Body of POST /register."""
    # Root URL of the site/tree to crawl.
    url: str
|
||||
|
||||
class PendingRequest(BaseModel):
    """Body of POST /pending_urls."""
    # Task whose queue is being polled.
    task_id: int
    # Maximum number of URLs to claim per call.
    limit: Optional[int] = 10
|
||||
|
||||
class AddUrlsRequest(BaseModel):
    """Body of POST /add_urls."""
    task_id: int
    # Newly discovered URLs to enqueue (server deduplicates them).
    urls: List[str]
|
||||
|
||||
class CrawlResult(BaseModel):
    """One crawled page: source URL plus extracted text and its embedding."""
    url: str
    title: Optional[str] = None
    content: Optional[str] = None
    # Vector representation of `content`; dimensionality is set by the
    # caller — presumably matching the pgvector column, TODO confirm.
    embedding: Optional[List[float]] = None
|
||||
|
||||
class SaveResultsRequest(BaseModel):
    """Body of POST /save_results."""
    task_id: int
    results: List[CrawlResult]
|
||||
91
backend/service.py
Normal file
91
backend/service.py
Normal file
@@ -0,0 +1,91 @@
|
||||
from sqlalchemy import select, update, and_
|
||||
from sqlalchemy.dialects.postgresql import insert as pg_insert
|
||||
from .database import db_instance
|
||||
from .utils import normalize_url
|
||||
|
||||
class CrawlerService:
    """Task/queue orchestration for the distributed wiki crawler.

    Every public method opens a single transaction via engine.begin(), so
    each operation commits or rolls back atomically.
    """

    def __init__(self):
        # Shared engine + reflected tables holder (see database.py).
        self.db = db_instance

    def register_task(self, url: str):
        """Register a new crawl task and seed its queue with the root URL.

        Returns {"task_id": int, "is_new_task": bool}; an existing task with
        the same normalized root URL is reused instead of duplicated.
        """
        clean_url = normalize_url(url)
        with self.db.engine.begin() as conn:
            # 1. Dedup: reuse an existing task for this root URL.
            find_stmt = select(self.db.tasks.c.id).where(self.db.tasks.c.root_url == clean_url)
            existing = conn.execute(find_stmt).fetchone()

            if existing:
                return {"task_id": existing[0], "is_new_task": False}

            # 2. Insert the new task row and fetch its generated id.
            new_task = conn.execute(
                pg_insert(self.db.tasks).values(root_url=clean_url).returning(self.db.tasks.c.id)
            ).fetchone()
            task_id = new_task[0]

            # 3. Seed the queue with the root URL so workers have a start point.
            conn.execute(
                pg_insert(self.db.queue).values(task_id=task_id, url=clean_url, status='pending')
            )
            return {"task_id": task_id, "is_new_task": True}

    def add_urls(self, task_id: int, urls: list):
        """Bulk-insert newly discovered URLs as 'pending' (auto-deduplicated).

        Relies on a unique index over (task_id, url); duplicates are skipped
        via ON CONFLICT DO NOTHING. Returns {"added_count": rows inserted}.
        """
        added_count = 0
        with self.db.engine.begin() as conn:
            for url in urls:
                clean_url = normalize_url(url)
                stmt = pg_insert(self.db.queue).values(
                    task_id=task_id,
                    url=clean_url,
                    status='pending'
                ).on_conflict_do_nothing(index_elements=['task_id', 'url'])

                res = conn.execute(stmt)
                # rowcount is 0 when the row already existed (conflict skipped).
                if res.rowcount > 0:
                    added_count += 1
        return {"added_count": added_count}

    def get_pending_urls(self, task_id: int, limit: int):
        """Atomically claim up to `limit` pending URLs and mark them 'processing'.

        FOR UPDATE SKIP LOCKED makes the claim safe under concurrency: without
        it, two workers running SELECT-then-UPDATE could both read the same
        'pending' rows before either UPDATE commits and crawl them twice.
        """
        with self.db.engine.begin() as conn:
            stmt = select(self.db.queue.c.url).where(
                and_(self.db.queue.c.task_id == task_id, self.db.queue.c.status == 'pending')
            ).limit(limit).with_for_update(skip_locked=True)

            urls = [r[0] for r in conn.execute(stmt).fetchall()]

            if urls:
                conn.execute(
                    update(self.db.queue).where(
                        and_(self.db.queue.c.task_id == task_id, self.db.queue.c.url.in_(urls))
                    ).values(status='processing')
                )
            return {"urls": urls}

    def save_results(self, task_id: int, results: list):
        """Persist page content/embeddings and mark their queue rows 'completed'.

        `results` is a list of CrawlResult models (see schemas.py). Returns
        {"inserted": number of results processed}.
        """
        with self.db.engine.begin() as conn:
            for res in results:
                clean_url = normalize_url(res.url)
                # Store the extracted data.
                conn.execute(
                    pg_insert(self.db.chunks).values(
                        task_id=task_id,
                        source_url=clean_url,
                        title=res.title,
                        content=res.content,
                        embedding=res.embedding
                    )
                )
                # Close the loop: mark the queue entry as done.
                conn.execute(
                    update(self.db.queue).where(
                        and_(self.db.queue.c.task_id == task_id, self.db.queue.c.url == clean_url)
                    ).values(status='completed')
                )
        return {"inserted": len(results)}
|
||||
|
||||
# Global singleton shared by the FastAPI route handlers in main.py.
crawler_service = CrawlerService()
|
||||
15
backend/utils.py
Normal file
15
backend/utils.py
Normal file
@@ -0,0 +1,15 @@
|
||||
from urllib.parse import urlparse, urlunparse
|
||||
from sqlalchemy import create_engine, MetaData, Table, select, update, and_
|
||||
|
||||
def normalize_url(url: str) -> str:
    """Canonicalize a URL so equivalent spellings deduplicate to one key.

    Lower-cases the scheme and host, strips surrounding whitespace and any
    trailing slash on the path, and drops the fragment. Path case and the
    query string are preserved. Returns "" for empty/None input.
    """
    if not url:
        return ""
    parsed = urlparse(url.strip())
    scheme = parsed.scheme.lower()
    netloc = parsed.netloc.lower()
    # rstrip('/') already yields "" for a bare "/" (or empty) path, so the
    # previous `if not path: path = ""` special case was dead code.
    path = parsed.path.rstrip('/')
    # Fragment is intentionally discarded: it never reaches the server.
    return urlunparse((scheme, netloc, path, parsed.params, parsed.query, ""))
|
||||
|
||||
def make_response(code: int, msg: str, data: object = None):
    """Build the uniform API envelope: {"code", "msg", "data"}.

    By caller convention, code is 1 for success and 0 for failure.
    (Fixes the previous `any` annotation, which referenced the builtin
    function `any` rather than a type.)
    """
    return {"code": code, "msg": msg, "data": data}
|
||||
Reference in New Issue
Block a user