修改配置和response的细节
This commit is contained in:
201
backend/services/automated_crawler.py
Normal file
201
backend/services/automated_crawler.py
Normal file
@@ -0,0 +1,201 @@
|
||||
import dashscope
|
||||
from http import HTTPStatus
|
||||
from firecrawl import FirecrawlApp
|
||||
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
||||
from ..config import settings
|
||||
from .crawler_sql_service import crawler_sql_service
|
||||
|
||||
# 初始化配置
|
||||
dashscope.api_key = settings.DASHSCOPE_API_KEY
|
||||
|
||||
class AutomatedCrawler:
    """Crawl workflow: map a site, scrape pages, chunk, embed, and store."""

    def __init__(self):
        """Set up the Firecrawl client and the text splitter used for chunking."""
        self.firecrawl = FirecrawlApp(api_key=settings.FIRECRAWL_API_KEY)
        # Separator order matters: paragraph breaks first, then lines, then
        # CJK sentence punctuation, then spaces, finally single characters.
        sentence_separators = ["\n\n", "\n", "。", "!", "?", " ", ""]
        self.splitter = RecursiveCharacterTextSplitter(
            chunk_size=500,
            chunk_overlap=100,
            separators=sentence_separators,
        )
|
||||
|
||||
def _get_embedding(self, text: str):
    """Internal helper: embed *text* via Dashscope.

    Returns the embedding vector on success, or None on any failure;
    the caller is responsible for handling the None case.
    """
    try:
        resp = dashscope.TextEmbedding.call(
            model=dashscope.TextEmbedding.Models.text_embedding_v3,  # confirm your model version
            input=text,
            dimension=1536,
        )
        if resp.status_code == HTTPStatus.OK:
            return resp.output['embeddings'][0]['embedding']
        print(f"Embedding API Error: {resp}")
    except Exception as e:
        print(f"Embedding Exception: {e}")
    return None
|
||||
|
||||
def map_and_ingest(self, start_url: str):
    """
    V2 step 1: map-style scan of a site and ingestion of discovered URLs.

    Registers the task in the database; for a new task, runs Firecrawl's
    map endpoint, normalizes the discovered links, and bulk-inserts them
    into the crawl queue. For an existing task, mapping is skipped.

    Returns a summary dict (msg, task_id, is_new_task, url_count, map_detail).
    Raises: any mapping/DB exception is re-raised so the caller (main.py)
    can build the error response.
    """
    print(f"[WorkFlow] Start mapping: {start_url}")

    try:
        # 1. Register the task in the database.
        task_info = crawler_sql_service.register_task(start_url)
        task_id = task_info['task_id']
        is_new_task = task_info['is_new_task']

        if not is_new_task:
            return {
                "msg": "Task already exists, skipped mapping",
                "task_id": task_id,
                "is_new_task": False,
                "url_count": 0,
                "map_detail": {},
            }

        # 2. Call Firecrawl Map. SDK versions differ: the result may be an
        # object exposing .links or a plain dict with a 'links' key.
        map_result = self.firecrawl.map(start_url)
        if hasattr(map_result, 'links'):
            raw_links = map_result.links
        elif isinstance(map_result, dict):
            raw_links = map_result.get('links', [])
        else:
            raw_links = []

        # Normalize every entry to a URL string regardless of which branch
        # produced it (the original only coerced the object-branch entries).
        urls = [
            link if isinstance(link, str) else getattr(link, 'url', str(link))
            for link in raw_links
        ]

        print(f"[WorkFlow] Found {len(urls)} links")

        # 3. Bulk insert into the queue.
        res = {"msg": "No urls found to add"}
        if urls:
            res = crawler_sql_service.add_urls(task_id, urls)

        return {
            "msg": "Task successfully mapped and URLs added",
            "task_id": task_id,
            "is_new_task": is_new_task,
            "url_count": len(urls),
            "map_detail": res,
        }
    except Exception as e:
        print(f"[WorkFlow] Map Error: {e}")
        # Re-raise so main.py can catch it and return an error Response.
        # Bare `raise` preserves the original traceback (vs `raise e`).
        raise
|
||||
|
||||
def process_task_queue(self, task_id: int, limit: int = 10):
    """
    V2 step 2: consume queue -> scrape -> chunk -> embed -> store.

    Processes up to `limit` pending URLs for `task_id`. A failure on one
    URL is logged and skipped so a single bad page does not abort the
    whole batch.

    Returns a summary dict with the number of URLs processed and the
    total chunks saved.
    """
    processed_count = 0
    total_chunks_saved = 0

    # 1. Fetch pending URLs from the queue.
    pending = crawler_sql_service.get_pending_urls(task_id, limit)
    urls = pending['urls']

    if not urls:
        return {"msg": "Queue is empty, no processing needed", "processed_count": 0}

    for url in urls:
        try:
            print(f"[WorkFlow] Processing: {url}")
            # 2. Scrape a single page as markdown, main content only.
            scrape_res = self.firecrawl.scrape(
                url,
                params={'formats': ['markdown'], 'onlyMainContent': True}
            )

            # Normalize across SDK return types (object or dict).
            if isinstance(scrape_res, dict):
                content = scrape_res.get('markdown', '')
                metadata = scrape_res.get('metadata', {})
            else:
                content = getattr(scrape_res, 'markdown', '')
                metadata = getattr(scrape_res, 'metadata', {})
                if not metadata and hasattr(scrape_res, 'metadata_dict'):
                    metadata = scrape_res.metadata_dict

            # BUGFIX: some SDK versions expose metadata as a model object,
            # not a dict; previously metadata.get() then raised
            # AttributeError and the page was skipped. Coerce to dict.
            if not isinstance(metadata, dict):
                metadata = getattr(metadata, '__dict__', None) or {}

            title = metadata.get('title', url)

            if not content:
                print(f"[WorkFlow] Skip empty content: {url}")
                continue

            # 3. Split into overlapping chunks.
            chunks = self.splitter.split_text(content)
            results_to_save = []

            # 4. Embed each chunk; chunks whose embedding failed (None)
            # are dropped rather than saved with a null vector.
            for idx, chunk_text in enumerate(chunks):
                vector = self._get_embedding(chunk_text)
                if vector is not None:
                    results_to_save.append({
                        "source_url": url,
                        "chunk_index": idx,
                        "title": title,
                        "content": chunk_text,
                        "embedding": vector
                    })

            # 5. Persist the chunk records.
            if results_to_save:
                save_res = crawler_sql_service.save_results(task_id, results_to_save)
                processed_count += 1
                total_chunks_saved += save_res['counts']['inserted'] + save_res['counts']['updated']

        except Exception as e:
            print(f"[WorkFlow] Error processing {url}: {e}")
            # Deliberately swallowed so one failure doesn't break the batch.
            # TODO(review): in production, mark this url as failed via the service.

    return {
        "msg": f"Batch processing complete. URLs processed: {processed_count}",
        "processed_urls": processed_count,
        "total_chunks_saved": total_chunks_saved
    }
|
||||
|
||||
def search_with_embedding(self, query_text: str, task_id: int = None, limit: int = 5):
    """
    V2 search: text in -> embedded automatically -> vector search in the DB.
    """
    # 1. Embed the query; bail out early if embedding generation failed.
    vector = self._get_embedding(query_text)
    if not vector:
        return {
            "msg": "Failed to generate embedding for query",
            "results": []
        }

    # 2. Run the search. search_knowledge already returns a dict that
    # carries its own `msg` field, so it is passed through unchanged.
    return crawler_sql_service.search_knowledge(vector, task_id, limit)
|
||||
|
||||
# Module-level singleton instance shared by importers of this module.
workflow = AutomatedCrawler()
|
||||
Reference in New Issue
Block a user