import dashscope
from http import HTTPStatus
from firecrawl import FirecrawlApp
from langchain_text_splitters import RecursiveCharacterTextSplitter

from ..config import settings
from .crawler_sql_service import crawler_sql_service

# Initialize configuration
dashscope.api_key = settings.DASHSCOPE_API_KEY


class AutomatedCrawler:
    def __init__(self):
        self.firecrawl = FirecrawlApp(api_key=settings.FIRECRAWL_API_KEY)
        self.splitter = RecursiveCharacterTextSplitter(
            chunk_size=500,
            chunk_overlap=100,
            separators=["\n\n", "\n", "。", "!", "?", " ", ""]
        )

    def _get_embedding(self, text: str):
        """Internal helper: call Dashscope to generate an embedding vector."""
        # Note: internal helper only. On error it returns None; the caller is
        # responsible for handling that state.
        embedding = None
        try:
            resp = dashscope.TextEmbedding.call(
                model=dashscope.TextEmbedding.Models.text_embedding_v3,  # confirm your model version
                input=text,
                dimension=1536
            )
            if resp.status_code == HTTPStatus.OK:
                embedding = resp.output['embeddings'][0]['embedding']
            else:
                print(f"Embedding API Error: {resp}")
        except Exception as e:
            print(f"Embedding Exception: {e}")
        return embedding

    def map_and_ingest(self, start_url: str):
        """
        V2 Step 1: map-style scan of the site and store the discovered URLs.
        """
        print(f"[WorkFlow] Start mapping: {start_url}")
        result = {}
        try:
            # 1. Register the task in the database
            task_info = crawler_sql_service.register_task(start_url)
            task_id = task_info['task_id']
            is_new_task = task_info['is_new_task']

            # 2. Call Firecrawl Map
            if is_new_task:
                map_result = self.firecrawl.map(start_url)
                urls = []
                # Handle the return shapes of different firecrawl SDK versions.
                # Case 1: map_result is an object with a `links` attribute.
                if hasattr(map_result, 'links'):
                    for link in map_result.links:
                        # A link may be a plain string or an object/dict,
                        # depending on the SDK version.
                        if isinstance(link, str):
                            urls.append(link)
                        else:
                            urls.append(getattr(link, 'url', str(link)))
                # Case 2: map_result is a dict.
                elif isinstance(map_result, dict):
                    urls = map_result.get('links', [])

                print(f"[WorkFlow] Found {len(urls)} links")

                # 3. Bulk-insert the URLs
                res = {"msg": "No urls found to add"}
                if urls:
                    res = crawler_sql_service.add_urls(task_id, urls)

                result = {
                    "msg": "Task successfully mapped and URLs added",
                    "task_id": task_id,
                    "is_new_task": is_new_task,
                    "url_count": len(urls),
                    "map_detail": res
                }
            else:
                result = {
                    "msg": "Task already exists, skipped mapping",
                    "task_id": task_id,
                    "is_new_task": False,
                    "url_count": 0,
                    "map_detail": {}
                }
        except Exception as e:
            print(f"[WorkFlow] Map Error: {e}")
            # Re-raise so main.py can catch it and return an error response
            raise
        return result

    def process_task_queue(self, task_id: int, limit: int = 10):
        """
        V2 Step 2: consume the queue -> scrape -> split -> embed -> store.
        """
        processed_count = 0
        total_chunks_saved = 0
        result = {}

        # 1. Fetch pending URLs
        pending = crawler_sql_service.get_pending_urls(task_id, limit)
        urls = pending['urls']

        if not urls:
            result = {"msg": "Queue is empty, no processing needed", "processed_count": 0}
        else:
            for url in urls:
                try:
                    print(f"[WorkFlow] Processing: {url}")
                    # 2. Scrape a single page
                    scrape_res = self.firecrawl.scrape(
                        url,
                        params={'formats': ['markdown'], 'onlyMainContent': True}
                    )

                    # Handle both SDK return types (object or dict)
                    content = ""
                    metadata = {}
                    if isinstance(scrape_res, dict):
                        content = scrape_res.get('markdown', '')
                        metadata = scrape_res.get('metadata', {})
                    else:
                        content = getattr(scrape_res, 'markdown', '')
                        metadata = getattr(scrape_res, 'metadata', {})
                        if not metadata and hasattr(scrape_res, 'metadata_dict'):
                            metadata = scrape_res.metadata_dict

                    title = metadata.get('title', url)

                    if not content:
                        print(f"[WorkFlow] Skip empty content: {url}")
                        continue

                    # 3. Split into chunks
                    chunks = self.splitter.split_text(content)
                    results_to_save = []

                    # 4. Embed each chunk
                    for idx, chunk_text in enumerate(chunks):
                        vector = self._get_embedding(chunk_text)
                        if vector:
                            results_to_save.append({
                                "source_url": url,
                                "chunk_index": idx,
                                "title": title,
                                "content": chunk_text,
                                "embedding": vector
                            })

                    # 5. Save
                    if results_to_save:
                        save_res = crawler_sql_service.save_results(task_id, results_to_save)
                        processed_count += 1
                        total_chunks_saved += save_res['counts']['inserted'] + save_res['counts']['updated']

                except Exception as e:
                    print(f"[WorkFlow] Error processing {url}: {e}")
                    # Do not re-raise here, so one failure does not abort the whole batch.
                    # In production, this is where the service should mark the URL as failed.

            result = {
                "msg": f"Batch processing complete. URLs processed: {processed_count}",
                "processed_urls": processed_count,
                "total_chunks_saved": total_chunks_saved
            }
        return result

    def search_with_embedding(self, query_text: str, task_id: int = None, limit: int = 5):
        """
        V2 search: take query text -> embed it automatically -> search the database.
        """
        result = {}
        # 1. Generate the query embedding
        vector = self._get_embedding(query_text)
        if not vector:
            result = {
                "msg": "Failed to generate embedding for query",
                "results": []
            }
        else:
            # 2. Run the search
            # search_knowledge already returns a dict that includes a msg field
            result = crawler_sql_service.search_knowledge(vector, task_id, limit)
        return result


# Module-level singleton
workflow = AutomatedCrawler()
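

# A minimal usage sketch, not part of the module's public API. It assumes the
# FastAPI app in main.py is what normally drives these calls; the URL, query
# text, and batch size below are illustrative only, and running this requires
# valid Dashscope/Firecrawl credentials plus a reachable database.
if __name__ == "__main__":
    # Step 1: map the site and enqueue its URLs (creates or reuses a task).
    map_info = workflow.map_and_ingest("https://example.com/docs")
    print(map_info)

    # Step 2: drain a small batch of the queue -> scrape, chunk, embed, store.
    batch_info = workflow.process_task_queue(map_info["task_id"], limit=5)
    print(batch_info)

    # Step 3: semantic search against the stored chunks.
    hits = workflow.search_with_embedding("How do I configure the crawler?", task_id=map_info["task_id"])
    print(hits)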