# backend/services/crawler_service.py
"""Crawler orchestration service.

Coordinates Firecrawl (fetching/mapping), the LLM service (embeddings,
reranking) and the DataService (task/queue/chunk persistence).
"""

from firecrawl import FirecrawlApp

from backend.core.config import settings
from backend.services.data_service import data_service
from backend.services.llm_service import llm_service
from backend.utils.text_process import text_processor


class CrawlerService:
    """Orchestrates the crawl pipeline: map -> scrape -> clean -> split -> embed -> store -> search."""

    def __init__(self):
        # Single Firecrawl client, configured from application settings.
        self.firecrawl = FirecrawlApp(api_key=settings.FIRECRAWL_API_KEY)

    def map_site(self, start_url: str):
        """Register a crawl task for ``start_url`` and enqueue the seed URL plus
        any sub-links discovered via Firecrawl Map.

        Returns a dict with ``msg``, ``count`` (number of distinct URLs enqueued)
        and ``task_id``. Re-raises any error from task registration / enqueueing.
        """
        print(f"[INFO] Mapping: {start_url}")
        try:
            # 1. Register the task (presumably idempotent for a known URL — TODO confirm
            #    against data_service.register_task).
            task_res = data_service.register_task(start_url)

            # 2. Always seed the queue with the start URL itself, whether or not
            #    the task is new.
            urls_to_add = [start_url]

            # 3. Ask Firecrawl Map for sub-links. A failure here degrades
            #    gracefully to crawling just the seed URL.
            try:
                map_res = self.firecrawl.map(start_url)
                # The SDK returns either a plain dict or a response object
                # depending on version; support both.
                if isinstance(map_res, dict):
                    found_links = map_res.get('links', [])
                else:
                    found_links = getattr(map_res, 'links', [])
                for link in found_links:
                    # Each link may be a bare string or a link object with a .url.
                    u = link if isinstance(link, str) else getattr(link, 'url', str(link))
                    urls_to_add.append(u)
                print(f"[INFO] Firecrawl Map found {len(found_links)} sub-links.")
            except Exception as e:
                print(f"[WARN] Firecrawl Map warning (proceeding with seed only): {e}")

            # 4. De-duplicate while preserving order (Map usually echoes the seed
            #    URL back), then enqueue the batch.
            urls_to_add = list(dict.fromkeys(urls_to_add))
            if urls_to_add:
                data_service.add_urls(task_res['task_id'], urls_to_add)

            return {
                "msg": "Mapped successfully",
                "count": len(urls_to_add),
                "task_id": task_res['task_id']
            }
        except Exception as e:
            print(f"[ERROR] Map failed: {e}")
            raise  # bare re-raise preserves the original traceback

    def process_queue(self, task_id: int, batch_size: int = 5):
        """Process up to ``batch_size`` queued URLs for ``task_id``:
        scrape -> clean -> split -> embed -> store.

        Per-URL failures are logged and skipped so one bad page does not abort
        the batch. Returns a dict with ``msg`` and ``count`` of URLs whose
        chunks were stored.
        """
        pending = data_service.get_pending_urls(task_id, batch_size)
        urls = pending['urls']
        if not urls:
            return {"msg": "Queue empty"}

        processed = 0
        for url in urls:
            try:
                print(f"[INFO] Scraping: {url}")
                scrape_res = self.firecrawl.scrape(
                    url,
                    formats=['markdown'],
                    only_main_content=True
                )

                # The SDK may return a dict or a ScrapeResponse object; extract
                # the markdown body and metadata from either shape.
                if isinstance(scrape_res, dict):
                    raw_md = scrape_res.get('markdown', '')
                    metadata = scrape_res.get('metadata', {})
                else:
                    raw_md = getattr(scrape_res, 'markdown', '')
                    metadata = getattr(scrape_res, 'metadata', {})

                # Safely extract a page title; fall back to the URL itself.
                title = url
                if metadata:
                    if isinstance(metadata, dict):
                        title = metadata.get('title', url)
                    else:
                        # metadata may be a DocumentMetadata-style object.
                        title = getattr(metadata, 'title', url)

                if not raw_md:
                    print(f"[WARN] Empty content for {url}")
                    continue

                # 1. Clean the raw markdown.
                clean_md = text_processor.clean_markdown(raw_md)
                if not clean_md:
                    continue

                # 2. Split into header-aware chunks.
                chunks = text_processor.split_markdown(clean_md)

                chunks_data = []
                for i, chunk in enumerate(chunks):
                    content = chunk['content']
                    headers = chunk['metadata']
                    header_path = " > ".join(headers.values())

                    # 3. Embed title + header path + content together so the
                    #    vector carries document context, not just the chunk text.
                    emb_input = f"{title}\n{header_path}\n{content}"
                    vector = llm_service.get_embedding(emb_input)

                    if vector:
                        chunks_data.append({
                            "index": i,
                            "content": content,
                            "embedding": vector,
                            "meta_info": {"header_path": header_path, "headers": headers}
                        })

                # 4. Persist the embedded chunks.
                if chunks_data:
                    data_service.save_chunks(task_id, url, title, chunks_data)
                    processed += 1
            except Exception as e:
                # Log with detail and continue with the remaining URLs.
                print(f"[ERROR] Process URL {url} failed: {e}")

        return {"msg": "Batch processed", "count": processed}

    def search(self, query: str, task_id, return_num: int):
        """Full-pipeline search: embed the query -> hybrid retrieval (coarse
        ranking) -> rerank (fine ranking).

        Returns a dict with ``results`` (the reranked top ``return_num``) and a
        human-readable ``msg``.
        """
        # 1. Generate the query embedding.
        vector = llm_service.get_embedding(query)
        if not vector:
            return {"msg": "Embedding failed", "results": []}

        # 2. Coarse recall size: at least 50 candidates, or 10x the requested
        #    result count when the caller asks for many.
        coarse_limit = max(return_num * 10, 50)

        # 3. Hybrid retrieval (coarse ranking).
        coarse_results = data_service.search(
            query_text=query,
            query_vector=vector,
            task_id=task_id,
            candidates_num=coarse_limit
        )

        candidates = coarse_results.get('results', [])
        if not candidates:
            return {"msg": "No documents found", "results": []}

        # 4. Rerank (fine ranking) down to the requested count.
        final_results = llm_service.rerank(
            query=query,
            documents=candidates,
            top_n=return_num
        )

        return {
            "results": final_results,
            "msg": f"Reranked {len(final_results)} from {len(candidates)} candidates"
        }


# Module-level singleton used by the API layer.
crawler_service = CrawlerService()