修改配置和response的细节
This commit is contained in:
201
backend/services/automated_crawler.py
Normal file
201
backend/services/automated_crawler.py
Normal file
@@ -0,0 +1,201 @@
|
||||
import dashscope
|
||||
from http import HTTPStatus
|
||||
from firecrawl import FirecrawlApp
|
||||
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
||||
from ..config import settings
|
||||
from .crawler_sql_service import crawler_sql_service
|
||||
|
||||
# 初始化配置
|
||||
dashscope.api_key = settings.DASHSCOPE_API_KEY
|
||||
|
||||
class AutomatedCrawler:
    """Crawl workflow: map a site, scrape pages, chunk, embed, and store."""

    def __init__(self):
        """Set up the Firecrawl client and the text splitter used for chunking."""
        self.firecrawl = FirecrawlApp(api_key=settings.FIRECRAWL_API_KEY)
        # Separator order matters: paragraph breaks first, then lines, then
        # CJK sentence punctuation, then spaces, finally single characters.
        sentence_separators = ["\n\n", "\n", "。", "!", "?", " ", ""]
        self.splitter = RecursiveCharacterTextSplitter(
            chunk_size=500,
            chunk_overlap=100,
            separators=sentence_separators,
        )
|
||||
|
||||
def _get_embedding(self, text: str):
    """Internal helper: embed *text* via Dashscope.

    Returns the embedding vector on success, or None on any failure;
    the caller is responsible for handling the None case.
    """
    try:
        resp = dashscope.TextEmbedding.call(
            model=dashscope.TextEmbedding.Models.text_embedding_v3,  # confirm your model version
            input=text,
            dimension=1536,
        )
        if resp.status_code == HTTPStatus.OK:
            return resp.output['embeddings'][0]['embedding']
        print(f"Embedding API Error: {resp}")
    except Exception as e:
        print(f"Embedding Exception: {e}")
    return None
|
||||
|
||||
def map_and_ingest(self, start_url: str):
    """
    V2 step 1: map-style scan of a site and ingestion of discovered URLs.

    Registers the task in the database; for a new task, runs Firecrawl's
    map endpoint, normalizes the discovered links, and bulk-inserts them
    into the crawl queue. For an existing task, mapping is skipped.

    Returns a summary dict (msg, task_id, is_new_task, url_count, map_detail).
    Raises: any mapping/DB exception is re-raised so the caller (main.py)
    can build the error response.
    """
    print(f"[WorkFlow] Start mapping: {start_url}")

    try:
        # 1. Register the task in the database.
        task_info = crawler_sql_service.register_task(start_url)
        task_id = task_info['task_id']
        is_new_task = task_info['is_new_task']

        if not is_new_task:
            return {
                "msg": "Task already exists, skipped mapping",
                "task_id": task_id,
                "is_new_task": False,
                "url_count": 0,
                "map_detail": {},
            }

        # 2. Call Firecrawl Map. SDK versions differ: the result may be an
        # object exposing .links or a plain dict with a 'links' key.
        map_result = self.firecrawl.map(start_url)
        if hasattr(map_result, 'links'):
            raw_links = map_result.links
        elif isinstance(map_result, dict):
            raw_links = map_result.get('links', [])
        else:
            raw_links = []

        # Normalize every entry to a URL string regardless of which branch
        # produced it (the original only coerced the object-branch entries).
        urls = [
            link if isinstance(link, str) else getattr(link, 'url', str(link))
            for link in raw_links
        ]

        print(f"[WorkFlow] Found {len(urls)} links")

        # 3. Bulk insert into the queue.
        res = {"msg": "No urls found to add"}
        if urls:
            res = crawler_sql_service.add_urls(task_id, urls)

        return {
            "msg": "Task successfully mapped and URLs added",
            "task_id": task_id,
            "is_new_task": is_new_task,
            "url_count": len(urls),
            "map_detail": res,
        }
    except Exception as e:
        print(f"[WorkFlow] Map Error: {e}")
        # Re-raise so main.py can catch it and return an error Response.
        # Bare `raise` preserves the original traceback (vs `raise e`).
        raise
|
||||
|
||||
def process_task_queue(self, task_id: int, limit: int = 10):
    """
    V2 step 2: consume queue -> scrape -> chunk -> embed -> store.

    Processes up to `limit` pending URLs for `task_id`. A failure on one
    URL is logged and skipped so a single bad page does not abort the
    whole batch.

    Returns a summary dict with the number of URLs processed and the
    total chunks saved.
    """
    processed_count = 0
    total_chunks_saved = 0

    # 1. Fetch pending URLs from the queue.
    pending = crawler_sql_service.get_pending_urls(task_id, limit)
    urls = pending['urls']

    if not urls:
        return {"msg": "Queue is empty, no processing needed", "processed_count": 0}

    for url in urls:
        try:
            print(f"[WorkFlow] Processing: {url}")
            # 2. Scrape a single page as markdown, main content only.
            scrape_res = self.firecrawl.scrape(
                url,
                params={'formats': ['markdown'], 'onlyMainContent': True}
            )

            # Normalize across SDK return types (object or dict).
            if isinstance(scrape_res, dict):
                content = scrape_res.get('markdown', '')
                metadata = scrape_res.get('metadata', {})
            else:
                content = getattr(scrape_res, 'markdown', '')
                metadata = getattr(scrape_res, 'metadata', {})
                if not metadata and hasattr(scrape_res, 'metadata_dict'):
                    metadata = scrape_res.metadata_dict

            # BUGFIX: some SDK versions expose metadata as a model object,
            # not a dict; previously metadata.get() then raised
            # AttributeError and the page was skipped. Coerce to dict.
            if not isinstance(metadata, dict):
                metadata = getattr(metadata, '__dict__', None) or {}

            title = metadata.get('title', url)

            if not content:
                print(f"[WorkFlow] Skip empty content: {url}")
                continue

            # 3. Split into overlapping chunks.
            chunks = self.splitter.split_text(content)
            results_to_save = []

            # 4. Embed each chunk; chunks whose embedding failed (None)
            # are dropped rather than saved with a null vector.
            for idx, chunk_text in enumerate(chunks):
                vector = self._get_embedding(chunk_text)
                if vector is not None:
                    results_to_save.append({
                        "source_url": url,
                        "chunk_index": idx,
                        "title": title,
                        "content": chunk_text,
                        "embedding": vector
                    })

            # 5. Persist the chunk records.
            if results_to_save:
                save_res = crawler_sql_service.save_results(task_id, results_to_save)
                processed_count += 1
                total_chunks_saved += save_res['counts']['inserted'] + save_res['counts']['updated']

        except Exception as e:
            print(f"[WorkFlow] Error processing {url}: {e}")
            # Deliberately swallowed so one failure doesn't break the batch.
            # TODO(review): in production, mark this url as failed via the service.

    return {
        "msg": f"Batch processing complete. URLs processed: {processed_count}",
        "processed_urls": processed_count,
        "total_chunks_saved": total_chunks_saved
    }
|
||||
|
||||
def search_with_embedding(self, query_text: str, task_id: int = None, limit: int = 5):
    """
    V2 search: text in -> embedded automatically -> vector search in the DB.
    """
    # 1. Embed the query; bail out early if embedding generation failed.
    vector = self._get_embedding(query_text)
    if not vector:
        return {
            "msg": "Failed to generate embedding for query",
            "results": []
        }

    # 2. Run the search. search_knowledge already returns a dict that
    # carries its own `msg` field, so it is passed through unchanged.
    return crawler_sql_service.search_knowledge(vector, task_id, limit)
|
||||
|
||||
# Module-level singleton instance shared by importers of this module.
workflow = AutomatedCrawler()
|
||||
Reference in New Issue
Block a user