变更项目架构,提高扩展性
This commit is contained in:
26
backend/utils/common.py
Normal file
26
backend/utils/common.py
Normal file
@@ -0,0 +1,26 @@
|
||||
from typing import Any
from urllib.parse import urlparse, urlunparse
|
||||
|
||||
def make_response(code: int, msg: str = "Success", data: Any = None):
    """Build the unified API response envelope.

    Args:
        code: Business/status code of the response.
        msg: Human-readable message; defaults to "Success".
        data: Optional payload (any JSON-serializable value).

    Returns:
        A dict with the fixed keys "code", "msg" and "data".
    """
    # Fix: the original annotated `data: any` — `any` is the builtin
    # function, not a type. `typing.Any` is the correct annotation.
    return {"code": code, "msg": msg, "data": data}
|
||||
|
||||
def normalize_url(url: str) -> str:
    """Normalize a URL to a canonical form.

    Steps:
    1. Strip leading/trailing whitespace.
    2. Drop the fragment (everything after '#').
    3. Drop the query string (business decision: different queries are
       treated as the same page).
    4. Drop any trailing slash from the path.

    Returns an empty string for empty/None input.
    """
    if not url:
        return ""

    parts = urlparse(url.strip())
    # Keep only scheme + netloc + path; blank out params/query/fragment
    # and trim the trailing slash so equivalent pages compare equal.
    canonical = parts._replace(
        path=parts.path.rstrip('/'),
        params='',
        query='',
        fragment='',
    )
    return urlunparse(canonical)
|
||||
61
backend/utils/text_process.py
Normal file
61
backend/utils/text_process.py
Normal file
@@ -0,0 +1,61 @@
|
||||
import re
|
||||
from langchain_text_splitters import RecursiveCharacterTextSplitter, MarkdownHeaderTextSplitter
|
||||
|
||||
class TextProcessor:
    """Text-processing utility: Markdown cleaning and chunking."""

    def __init__(self):
        # Header-aware splitter: h1-h3 titles become chunk metadata.
        self.md_splitter = MarkdownHeaderTextSplitter(
            headers_to_split_on=[
                ("#", "h1"),
                ("##", "h2"),
                ("###", "h3"),
            ],
            strip_headers=False,
        )

        # Fallback character-based splitter for oversized sections.
        self.char_splitter = RecursiveCharacterTextSplitter(
            chunk_size=800,
            chunk_overlap=100,
            separators=["\n\n", "\n", "。", "!", "?", " ", ""],
        )

    def clean_markdown(self, text: str) -> str:
        """Strip common web-page noise from crawled Markdown."""
        if not text:
            return ""

        # Remove the accessibility "Skip to main content" link line.
        cleaned = re.sub(
            r'\[Skip to main content\].*?\n', '', text, flags=re.IGNORECASE
        )
        # Remove the footer pager (Previous / Next) and everything after it.
        cleaned = re.sub(
            r'\[Previous\].*?\[Next\].*', '', cleaned,
            flags=re.DOTALL | re.IGNORECASE,
        )
        return cleaned.strip()

    def split_markdown(self, text: str):
        """Split by headers first; re-split oversized sections by characters.

        Returns a list of {"content": str, "metadata": dict} records, where
        metadata carries the h1/h2/h3 header hierarchy of each section.
        """
        results = []
        for section in self.md_splitter.split_text(text):
            body = section.page_content
            # Sections above 1000 chars fall back to the character splitter;
            # each sub-piece inherits the section's header metadata.
            pieces = (
                self.char_splitter.split_text(body)
                if len(body) > 1000
                else [body]
            )
            results.extend(
                {"content": piece, "metadata": section.metadata}
                for piece in pieces
            )
        return results
|
||||
|
||||
# Module-level singleton: callers share one pre-configured TextProcessor
# so the splitters are constructed only once at import time.
text_processor = TextProcessor()
|
||||
Reference in New Issue
Block a user