变更项目架构,提高扩展性
This commit is contained in:
26
backend/utils/common.py
Normal file
26
backend/utils/common.py
Normal file
@@ -0,0 +1,26 @@
|
||||
from typing import Any
from urllib.parse import urlparse, urlunparse
|
||||
|
||||
def make_response(code: int, msg: str = "Success", data: Any = None):
    """Build the unified API response envelope.

    Args:
        code: Business/status code of the response.
        msg: Human-readable message; defaults to "Success".
        data: Optional payload (any JSON-serializable value).

    Returns:
        A dict with the fixed keys "code", "msg" and "data".
    """
    # Fix: the original annotated `data: any` — `any` is the builtin
    # function, not a type. `typing.Any` is the correct annotation.
    return {"code": code, "msg": msg, "data": data}
|
||||
|
||||
def normalize_url(url: str) -> str:
    """Normalize a URL to a canonical form.

    Steps:
    1. Strip leading/trailing whitespace.
    2. Drop the fragment (everything after '#').
    3. Drop the query string (business decision: different queries are
       treated as the same page).
    4. Drop any trailing slash from the path.

    Returns an empty string for empty/None input.
    """
    if not url:
        return ""

    parts = urlparse(url.strip())
    # Keep only scheme + netloc + path; blank out params/query/fragment
    # and trim the trailing slash so equivalent pages compare equal.
    canonical = parts._replace(
        path=parts.path.rstrip('/'),
        params='',
        query='',
        fragment='',
    )
    return urlunparse(canonical)
|
||||
61
backend/utils/text_process.py
Normal file
61
backend/utils/text_process.py
Normal file
@@ -0,0 +1,61 @@
|
||||
import re
|
||||
from langchain_text_splitters import RecursiveCharacterTextSplitter, MarkdownHeaderTextSplitter
|
||||
|
||||
class TextProcessor:
    """Text-processing utility: Markdown cleaning and chunking."""

    def __init__(self):
        # Header-aware splitter: h1-h3 titles become chunk metadata.
        self.md_splitter = MarkdownHeaderTextSplitter(
            headers_to_split_on=[
                ("#", "h1"),
                ("##", "h2"),
                ("###", "h3"),
            ],
            strip_headers=False,
        )

        # Fallback character-based splitter for oversized sections.
        self.char_splitter = RecursiveCharacterTextSplitter(
            chunk_size=800,
            chunk_overlap=100,
            separators=["\n\n", "\n", "。", "!", "?", " ", ""],
        )

    def clean_markdown(self, text: str) -> str:
        """Strip common web-page noise from crawled Markdown."""
        if not text:
            return ""

        # Remove the accessibility "Skip to main content" link line.
        cleaned = re.sub(
            r'\[Skip to main content\].*?\n', '', text, flags=re.IGNORECASE
        )
        # Remove the footer pager (Previous / Next) and everything after it.
        cleaned = re.sub(
            r'\[Previous\].*?\[Next\].*', '', cleaned,
            flags=re.DOTALL | re.IGNORECASE,
        )
        return cleaned.strip()

    def split_markdown(self, text: str):
        """Split by headers first; re-split oversized sections by characters.

        Returns a list of {"content": str, "metadata": dict} records, where
        metadata carries the h1/h2/h3 header hierarchy of each section.
        """
        results = []
        for section in self.md_splitter.split_text(text):
            body = section.page_content
            # Sections above 1000 chars fall back to the character splitter;
            # each sub-piece inherits the section's header metadata.
            pieces = (
                self.char_splitter.split_text(body)
                if len(body) > 1000
                else [body]
            )
            results.extend(
                {"content": piece, "metadata": section.metadata}
                for piece in pieces
            )
        return results
|
||||
|
||||
# Module-level singleton: callers share one pre-configured TextProcessor
# so the splitters are constructed only once at import time.
text_processor = TextProcessor()
|
||||
Reference in New Issue
Block a user