From 75b32b4bcd1abdeda3e11a35bd883ba2f5add3cd Mon Sep 17 00:00:00 2001 From: QingGang Date: Mon, 22 Dec 2025 22:08:51 +0800 Subject: [PATCH] =?UTF-8?q?=E5=AE=8C=E6=88=90=E8=8A=82=E7=82=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/schemas.py | 4 +- backend/service.py | 152 ++++++++++++++++++++------------- nodes/chunk_and_embedding.json | 51 +++++++++++ nodes/chunk_and_embedding.py | 125 +++++++++++++++++++++++++++ nodes/parse_firecrawl_map.py | 22 +++++ nodes/parse_pending_urls.py | 20 +++++ nodes/parse_register.py | 23 +++++ pyproject.toml | 2 +- scripts/test_apis.py | 87 +++++++++++++++++++ 9 files changed, 426 insertions(+), 60 deletions(-) create mode 100644 nodes/chunk_and_embedding.json create mode 100644 nodes/chunk_and_embedding.py create mode 100644 nodes/parse_firecrawl_map.py create mode 100644 nodes/parse_pending_urls.py create mode 100644 nodes/parse_register.py create mode 100644 scripts/test_apis.py diff --git a/backend/schemas.py b/backend/schemas.py index 9bfc064..041abd5 100644 --- a/backend/schemas.py +++ b/backend/schemas.py @@ -12,8 +12,10 @@ class AddUrlsRequest(BaseModel): task_id: int urls: List[str] +# schemas.py class CrawlResult(BaseModel): - url: str + source_url: str + chunk_index: int # 新增字段 title: Optional[str] = None content: Optional[str] = None embedding: Optional[List[float]] = None diff --git a/backend/service.py b/backend/service.py index 771fa8a..a7271b3 100644 --- a/backend/service.py +++ b/backend/service.py @@ -1,5 +1,5 @@ -from sqlalchemy import select, update, and_ -from sqlalchemy.dialects.postgresql import insert as pg_insert +# service.py +from sqlalchemy import select, insert, update, delete, and_ from .database import db_instance from .utils import normalize_url @@ -8,84 +8,120 @@ class CrawlerService: self.db = db_instance def register_task(self, url: str): - """注册新任务并初始化队列""" + """完全使用库 API 实现的注册""" clean_url = normalize_url(url) with 
self.db.engine.begin() as conn: - # 1. 查重 - find_stmt = select(self.db.tasks.c.id).where(self.db.tasks.c.root_url == clean_url) - existing = conn.execute(find_stmt).fetchone() + # 使用 select() API + query = select(self.db.tasks.c.id).where(self.db.tasks.c.root_url == clean_url) + existing = conn.execute(query).fetchone() if existing: return {"task_id": existing[0], "is_new_task": False} - # 2. 插入新任务 - new_task = conn.execute( - pg_insert(self.db.tasks).values(root_url=clean_url).returning(self.db.tasks.c.id) - ).fetchone() - task_id = new_task[0] - - # 3. 初始化首个 URL 到队列 - conn.execute( - pg_insert(self.db.queue).values(task_id=task_id, url=clean_url, status='pending') - ) - return {"task_id": task_id, "is_new_task": True} + # 使用 insert() API + stmt = insert(self.db.tasks).values(root_url=clean_url).returning(self.db.tasks.c.id) + new_task = conn.execute(stmt).fetchone() + return {"task_id": new_task[0], "is_new_task": True} def add_urls(self, task_id: int, urls: list): - """批量存入新发现的待处理 URL(自动去重)""" - added_count = 0 + """通用 API 实现的批量添加(含详细返回)""" + success_urls, skipped_urls, failed_urls = [], [], [] + with self.db.engine.begin() as conn: for url in urls: clean_url = normalize_url(url) - stmt = pg_insert(self.db.queue).values( - task_id=task_id, - url=clean_url, - status='pending' - ).on_conflict_do_nothing(index_elements=['task_id', 'url']) - - res = conn.execute(stmt) - if res.rowcount > 0: - added_count += 1 - return {"added_count": added_count} + try: + # 检查是否存在 (通用写法) + check_q = select(self.db.queue).where( + and_(self.db.queue.c.task_id == task_id, self.db.queue.c.url == clean_url) + ) + if conn.execute(check_q).fetchone(): + skipped_urls.append(clean_url) + continue + + # 插入新 URL + conn.execute(insert(self.db.queue).values( + task_id=task_id, url=clean_url, status='pending' + )) + success_urls.append(clean_url) + except Exception: + failed_urls.append(clean_url) + + return {"success_urls": success_urls, "skipped_urls": skipped_urls, "failed_urls": failed_urls} 
def get_pending_urls(self, task_id: int, limit: int): - """原子化获取待处理 URL 并锁定""" + """原子锁定 API 实现""" with self.db.engine.begin() as conn: - stmt = select(self.db.queue.c.url).where( + query = select(self.db.queue.c.url).where( and_(self.db.queue.c.task_id == task_id, self.db.queue.c.status == 'pending') ).limit(limit) - urls = [r[0] for r in conn.execute(stmt).fetchall()] + urls = [r[0] for r in conn.execute(query).fetchall()] if urls: - conn.execute( - update(self.db.queue).where( - and_(self.db.queue.c.task_id == task_id, self.db.queue.c.url.in_(urls)) - ).values(status='processing') - ) - return {"urls": urls} + upd = update(self.db.queue).where( + and_(self.db.queue.c.task_id == task_id, self.db.queue.c.url.in_(urls)) + ).values(status='processing') + conn.execute(upd) + return {"urls": urls} def save_results(self, task_id: int, results: list): - """保存正文、向量并闭环队列状态""" + """通用 API 实现的 UPSERT 逻辑:区分插入、更新、失败""" + inserted_urls, updated_urls, failed_urls = [], [], [] + with self.db.engine.begin() as conn: for res in results: - clean_url = normalize_url(res.url) - # 存入数据 - conn.execute( - pg_insert(self.db.chunks).values( - task_id=task_id, - source_url=clean_url, - title=res.title, - content=res.content, - embedding=res.embedding - ) - ) - # 更新状态 - conn.execute( - update(self.db.queue).where( - and_(self.db.queue.c.task_id == task_id, self.db.queue.c.url == clean_url) - ).values(status='completed') - ) - return {"inserted": len(results)} + # 适配 Dify 传来的字典或对象 + data = res if isinstance(res, dict) else res.__dict__ + clean_url = normalize_url(data.get('source_url')) + c_idx = data.get('chunk_index') + + try: + # 1. 检查是否存在该切片 + find_q = select(self.db.chunks).where( + and_( + self.db.chunks.c.task_id == task_id, + self.db.chunks.c.source_url == clean_url, + self.db.chunks.c.chunk_index == c_idx + ) + ) + existing = conn.execute(find_q).fetchone() + + if existing: + # 2. 
执行更新 API + upd = update(self.db.chunks).where(self.db.chunks.c.id == existing[0]).values( + title=data.get('title'), + content=data.get('content'), + embedding=data.get('embedding') + ) + conn.execute(upd) + updated_urls.append(clean_url) + else: + # 3. 执行插入 API + ins = insert(self.db.chunks).values( + task_id=task_id, + source_url=clean_url, + chunk_index=c_idx, + title=data.get('title'), + content=data.get('content'), + embedding=data.get('embedding') + ) + conn.execute(ins) + inserted_urls.append(clean_url) + + # 4. 更新队列状态 + conn.execute(update(self.db.queue).where( + and_(self.db.queue.c.task_id == task_id, self.db.queue.c.url == clean_url) + ).values(status='completed')) + + except Exception as e: + print(f"Error: {e}") + failed_urls.append(clean_url) + + return { + "inserted_urls": list(set(inserted_urls)), + "updated_urls": list(set(updated_urls)), + "failed_urls": failed_urls + } -# 全局单例 crawler_service = CrawlerService() \ No newline at end of file diff --git a/nodes/chunk_and_embedding.json b/nodes/chunk_and_embedding.json new file mode 100644 index 0000000..548deaf --- /dev/null +++ b/nodes/chunk_and_embedding.json @@ -0,0 +1,51 @@ +{ + "res_json": [ + { + "data": { + "markdown": "[Skip to main content](https://docs.dify.ai/en/use-dify/knowledge/create-knowledge/import-text-data/readme#content-area)\n\n[Dify Docs home page![light logo](https://assets-docs.dify.ai/2025/05/d05cfc6ebe48f725d171dc71c64a5d16.svg)![dark logo](https://assets-docs.dify.ai/2025/05/c51f1cda47c1d9a4a162d7736f6e4c53.svg)](https://docs.dify.ai/)\n\nLatest\n![US](https://d3gk2c5xim1je2.cloudfront.net/flags/US.svg)\n\nEnglish\n\nSearch...\n\nCtrl K\n\nSearch...\n\nNavigation\n\n1\\. Import Text Data\n\n1\\. Import Text Data\n\nClick on Knowledge in the main navigation bar of Dify. On this page, you can see your existing knowledge bases. Click **Create Knowledge** to enter the setup wizard. 
The Knowledge supports the import of the following two online data:Click **Knowledge** in the top navigation bar of the Dify, then select **Create Knowledge**. You can upload documents to the knowledge or importing online data to it.\n\n## [​](https://docs.dify.ai/en/use-dify/knowledge/create-knowledge/import-text-data/readme\\#upload-local-files) Upload Local Files\n\nDrag and drop or select files to upload. The number of files allowed for **batch upload** depends on your [subscription plan](https://dify.ai/pricing).**Limitations for uploading documents:**\n\n- The upload size limit for a single document is 15MB;\n- Different [subscription plans](https://dify.ai/pricing) for the SaaS version limit **batch upload numbers, total document uploads, and vector storage**\n\n![Create knowledge](https://assets-docs.dify.ai/2025/01/22064cb61356e4c005c4072d5d066cf6.png)\n\n## [​](https://docs.dify.ai/en/use-dify/knowledge/create-knowledge/import-text-data/readme\\#import-from-online-data-source) Import From Online Data Source\n\nWhen creating a **Knowledge**, you can import data from online sources. The knowledge supports the following two types of online data: [**1.1 Import Data from Notion** \\\\\n\\\\\nLearn how to import data from Notion](https://docs.dify.ai/en/use-dify/knowledge/create-knowledge/import-text-data/sync-from-notion) [**1.2 Sync from Website** \\\\\n\\\\\nLearn how to sync data from websites](https://docs.dify.ai/en/use-dify/knowledge/create-knowledge/import-text-data/sync-from-website)\n\nIf a knowledge base is set up to use online data, you won’t be able to add local documents later or switch it to a local file-based mode. 
This prevents a single knowledge base from mixing multiple data sources, avoiding management complications.\n\n## [​](https://docs.dify.ai/en/use-dify/knowledge/create-knowledge/import-text-data/readme\\#adding-data-later) Adding Data Later\n\nIf you haven’t prepared your documents or other content yet, simply create an empty knowledge first. You can then upload local files or import online data whenever you’re ready.\n\nWas this page helpful?\n\nYesNo\n\n[Previous](https://docs.dify.ai/en/use-dify/knowledge/create-knowledge/introduction) [1.1 Sync Data from Notion\\\\\n\\\\\nNext](https://docs.dify.ai/en/use-dify/knowledge/create-knowledge/import-text-data/sync-from-notion)\n\nCtrl+I\n\nOn this page\n\n- [Upload Local Files](https://docs.dify.ai/en/use-dify/knowledge/create-knowledge/import-text-data/readme#upload-local-files)\n- [Import From Online Data Source](https://docs.dify.ai/en/use-dify/knowledge/create-knowledge/import-text-data/readme#import-from-online-data-source)\n- [Adding Data Later](https://docs.dify.ai/en/use-dify/knowledge/create-knowledge/import-text-data/readme#adding-data-later)\n\nAssistant\n\nResponses are generated using AI and may contain mistakes.\n\n![Create knowledge](https://assets-docs.dify.ai/2025/01/22064cb61356e4c005c4072d5d066cf6.png)", + "metadata": { + "apple-mobile-web-app-title": "Dify Docs", + "application-name": "Dify Docs", + "cacheState": "hit", + "cachedAt": "2025-12-09T08:12:32.803Z", + "canonical": "https://docs.dify.ai/en/use-dify/knowledge/create-knowledge/import-text-data/readme", + "charset": "utf-8", + "concurrencyLimited": true, + "concurrencyQueueDurationMs": 371, + "contentType": "text/html; charset=utf-8", + "creditsUsed": 1, + "favicon": "https://docs.dify.ai/mintlify-assets/_mintlify/favicons/dify-6c0370d8/tWYYD8GkT0MUJV0z/_generated/favicon/favicon-16x16.png", + "generator": "Mintlify", + "language": "en", + "msapplication-TileColor": "#0060FF", + "msapplication-config": 
"/mintlify-assets/_mintlify/favicons/dify-6c0370d8/tWYYD8GkT0MUJV0z/_generated/favicon/browserconfig.xml", + "next-size-adjust": "", + "og:image": "https://dify-6c0370d8.mintlify.app/mintlify-assets/_next/image?url=%2F_mintlify%2Fapi%2Fog%3Fdivision%3D1.%2BImport%2BText%2BData%26title%3D1.%2BImport%2BText%2BData%26logoLight%3Dhttps%253A%252F%252Fassets-docs.dify.ai%252F2025%252F05%252Fd05cfc6ebe48f725d171dc71c64a5d16.svg%26logoDark%3Dhttps%253A%252F%252Fassets-docs.dify.ai%252F2025%252F05%252Fc51f1cda47c1d9a4a162d7736f6e4c53.svg%26primaryColor%3D%25230060FF%26lightColor%3D%2523688FE8%26darkColor%3D%25230034FF%26backgroundLight%3D%2523ffffff%26backgroundDark%3D%25230b0c0f&w=1200&q=100", + "og:image:height": "630", + "og:image:width": "1200", + "og:site_name": "Dify Docs", + "og:title": "1. Import Text Data - Dify Docs", + "og:type": "website", + "og:url": "https://docs.dify.ai/en/use-dify/knowledge/create-knowledge/import-text-data/readme", + "ogImage": "https://dify-6c0370d8.mintlify.app/mintlify-assets/_next/image?url=%2F_mintlify%2Fapi%2Fog%3Fdivision%3D1.%2BImport%2BText%2BData%26title%3D1.%2BImport%2BText%2BData%26logoLight%3Dhttps%253A%252F%252Fassets-docs.dify.ai%252F2025%252F05%252Fd05cfc6ebe48f725d171dc71c64a5d16.svg%26logoDark%3Dhttps%253A%252F%252Fassets-docs.dify.ai%252F2025%252F05%252Fc51f1cda47c1d9a4a162d7736f6e4c53.svg%26primaryColor%3D%25230060FF%26lightColor%3D%2523688FE8%26darkColor%3D%25230034FF%26backgroundLight%3D%2523ffffff%26backgroundDark%3D%25230b0c0f&w=1200&q=100", + "ogTitle": "1. Import Text Data - Dify Docs", + "ogUrl": "https://docs.dify.ai/en/use-dify/knowledge/create-knowledge/import-text-data/readme", + "proxyUsed": "basic", + "scrapeId": "019b024f-f76e-746b-b13c-6ca4884fdd64", + "sourceURL": "https://docs.dify.ai/en/use-dify/knowledge/create-knowledge/import-text-data/readme", + "statusCode": 200, + "title": "1. 
Import Text Data - Dify Docs", + "twitter:card": "summary_large_image", + "twitter:image": "https://dify-6c0370d8.mintlify.app/mintlify-assets/_next/image?url=%2F_mintlify%2Fapi%2Fog%3Fdivision%3D1.%2BImport%2BText%2BData%26title%3D1.%2BImport%2BText%2BData%26logoLight%3Dhttps%253A%252F%252Fassets-docs.dify.ai%252F2025%252F05%252Fd05cfc6ebe48f725d171dc71c64a5d16.svg%26logoDark%3Dhttps%253A%252F%252Fassets-docs.dify.ai%252F2025%252F05%252Fc51f1cda47c1d9a4a162d7736f6e4c53.svg%26primaryColor%3D%25230060FF%26lightColor%3D%2523688FE8%26darkColor%3D%25230034FF%26backgroundLight%3D%2523ffffff%26backgroundDark%3D%25230b0c0f&w=1200&q=100", + "twitter:image:height": "630", + "twitter:image:width": "1200", + "twitter:title": "1. Import Text Data - Dify Docs", + "url": "https://docs.dify.ai/en/use-dify/knowledge/create-knowledge/import-text-data/readme", + "viewport": "width=device-width, initial-scale=1" + }, + "warning": "This scrape job was throttled at your current concurrency limit. If you'd like to scrape faster, you can upgrade your plan." 
+ }, + "success": true + } + ] +} \ No newline at end of file diff --git a/nodes/chunk_and_embedding.py b/nodes/chunk_and_embedding.py new file mode 100644 index 0000000..f55ab1e --- /dev/null +++ b/nodes/chunk_and_embedding.py @@ -0,0 +1,125 @@ +import json +import re +import requests + +def text_cleaning(text: str) -> str: + """ + 对文本进行清洗,移除多余空格、换行符等 + """ + text = re.sub(r'\s+', ' ', text) # 替换多个空格为一个空格 + text = text.strip() # 移除首尾空格 + return text + +def text_to_chunks(text: str): + chunk_size = 800 + overlap = 100 # 100 字符重叠,意思是每块文本之间有100个字符的重叠 + step = chunk_size - overlap + + chunks = [] + text_len = len(text) + + if text_len < 50: + chunks.append(text) + else: + start = 0 + while start < text_len: + end = min(start + chunk_size, text_len) + chunk_content = text[start:end] + + # 防止切出过短的碎片,或者是最后一块 + if len(chunk_content) > 50 or start + step >= text_len: + chunks.append(chunk_content) + + start += step + return chunks + + +def chunks_embedding(texts: list[str], api_key: str) -> list[list[float]]: + if not texts: + return [] + + MODEL_NAME = "text-embedding-v4" + url = "https://dashscope.aliyuncs.com/api/v1/services/embeddings/text-embedding/text-embedding" + headers = { + "Authorization": f"Bearer {api_key}", + "Content-Type": "application/json" + } + payload = { + "model": MODEL_NAME, + "input": {"texts": texts}, + "parameters": {"text_type": "document", "dimension": 1536} + } + + try: + response = requests.post(url, headers=headers, json=payload, timeout=60) + response.raise_for_status() + result = response.json() + + if "output" in result and "embeddings" in result["output"]: + embeddings_list = result["output"]["embeddings"] + embeddings_list.sort(key=lambda x: x["text_index"]) + + # --- 核心修复:对每个浮点数保留 8 位小数,解决精度过高报错 --- + final_vectors = [] + for item in embeddings_list: + # 将每个 float 限制在 8 位精度以内 + rounded_vector = [round(float(val), 8) for val in item["embedding"]] + final_vectors.append(rounded_vector) + return final_vectors + else: + return [None] * 
len(texts) + except Exception as e: + print(f"Alibaba Embedding Error: {e}") + return [None] * len(texts) + +def main(scrape_json: list, DASHSCOPE_API_KEY: str) -> dict: + """ + 输入: res_json (Firecrawl结果), DASHSCOPE_API_KEY (阿里API Key) + """ + + # --- 1. 解析 Firecrawl JSON (通用容错解析) --- + scrape_obj = scrape_json[0] + if not scrape_obj["success"]: + return {"results": []} + data = scrape_obj.get("data", {}) + # 获取原始内容 + text = data.get("markdown", "") + metadata = data.get("metadata", {}) + warning = data.get("warning", "") + # ======================================================= + # --- 2. 通用 Markdown 清洗 (Generic Cleaning) --- + # ======================================================= + text = text_cleaning(text) + + # --- 3. 安全切片 (Safe Chunking) --- + # 800 字符切片,100 字符重叠 + chunks = text_to_chunks(text) + + # --- 4. 向量化 (Call Alibaba) --- + vectors = [] + if chunks: + # 这里传入 DASHSCOPE_API_KEY + vectors = chunks_embedding(chunks, DASHSCOPE_API_KEY) + + # 双重保险:确保向量列表长度一致 + if len(vectors) != len(chunks): + vectors = [None] * len(chunks) + + # --- 5. 
构造 SQL 数据 --- + result_list = [] + + for idx, content in enumerate(chunks): + clean_content = content.strip() # 清洗首尾空白 + if not clean_content: continue + + result_list.append({ + "source_url": metadata.get("sourceURL", ""), + "title": metadata.get("title", ""), + "content": clean_content, + "chunk_index": idx, + "embedding": vectors[idx] + }) + + return { + "results": result_list + } diff --git a/nodes/parse_firecrawl_map.py b/nodes/parse_firecrawl_map.py new file mode 100644 index 0000000..4749ac8 --- /dev/null +++ b/nodes/parse_firecrawl_map.py @@ -0,0 +1,22 @@ +def main(map_json: list[dict]): + """ + 将Firecrawl Map节点的输出转换为干净的输出,避免杂七杂八的数据干扰 + 输入: Firecrawl Map节点的输出,结构如下 + "map_json": [ + { + "links": [ + "http://example.com/page1", + "http://example.com/page2" + ], + "success": true, + }, + ] + 因为比较简单而且与firecrawl组件绑定比,所以就直接main里写完了 + """ + + map_obj = map_json[0] + + return { + "urls": map_obj["links"], + "code": int(map_obj["success"]), + } diff --git a/nodes/parse_pending_urls.py b/nodes/parse_pending_urls.py new file mode 100644 index 0000000..bcd6357 --- /dev/null +++ b/nodes/parse_pending_urls.py @@ -0,0 +1,20 @@ +def check_status(status_code: float, body: dict): + ''' + 检查状态码和约定的返回值 + ''' + if status_code != 200: + raise Exception(f"获取待处理URL失败,状态码:{status_code}") + if "code" not in body or body["code"] != 1: + raise Exception(f"获取待处理URL失败,返回值:{body}") + +def main(status_code: float, body: dict): + try: + check_status(status_code, body) + except Exception as e: + raise e + + urls = body["data"]["urls"] + + return { + "urls": urls + } diff --git a/nodes/parse_register.py b/nodes/parse_register.py new file mode 100644 index 0000000..a1eeec2 --- /dev/null +++ b/nodes/parse_register.py @@ -0,0 +1,23 @@ +def check_status(status_code: float, body: str): + ''' + 检查状态码和约定的返回值 + ''' + if status_code != 200: + raise Exception(f"注册任务失败,状态码:{status_code}") + if "code" not in body or body["code"] != 1: + raise Exception(f"注册任务失败,返回值:{body}") + + +def main(status_code: float, body: 
str): + try: + check_status(status_code, body) + except Exception as e: + raise e + + task_id = body["data"]["task_id"] + is_new_task = body["data"]["is_new_task"] + + return { + "task_id": task_id, + "is_new_task": is_new_task + } diff --git a/pyproject.toml b/pyproject.toml index 9c8cac2..7ca9424 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,4 +1,4 @@ - [project] +[project] name = "scripts" version = "0.1.0" description = "Add your description here" diff --git a/scripts/test_apis.py b/scripts/test_apis.py new file mode 100644 index 0000000..8da4a73 --- /dev/null +++ b/scripts/test_apis.py @@ -0,0 +1,87 @@ +import requests +import json +import random + +# 配置后端地址 +BASE_URL = "http://127.0.0.1:8000" + +def log_res(name, response): + print(f"\n=== 测试接口: {name} ===") + if response.status_code == 200: + res_json = response.json() + print(f"状态: 成功 (HTTP 200)") + print(f"返回数据: {json.dumps(res_json, indent=2, ensure_ascii=False)}") + return res_json + else: + print(f"状态: 失败 (HTTP {response.status_code})") + print(f"错误信息: {response.text}") + return None + +def run_tests(): + # 测试数据准备 + test_root_url = f"https://example.com/wiki_{random.randint(1000, 9999)}" + + # 1. 测试 /register + print("步骤 1: 注册新任务...") + res = requests.post(f"{BASE_URL}/register", json={"url": test_root_url}) + data = log_res("注册任务", res) + if not data or data['code'] != 1: return + task_id = data['data']['task_id'] + + # 2. 测试 /add_urls + print("\n步骤 2: 模拟爬虫发现了新链接,存入队列...") + sub_urls = [ + f"{test_root_url}/page1", + f"{test_root_url}/page2", + f"{test_root_url}/page1" # 故意重复一个,测试后端去重 + ] + res = requests.post(f"{BASE_URL}/add_urls", json={ + "task_id": task_id, + "urls": sub_urls + }) + log_res("存入新链接", res) + + # 3. 
测试 /pending_urls + print("\n步骤 3: 模拟爬虫节点获取待处理任务...") + res = requests.post(f"{BASE_URL}/pending_urls", json={ + "task_id": task_id, + "limit": 2 + }) + data = log_res("获取待处理URL", res) + if not data or not data['data']['urls']: + print("没有获取到待处理URL,停止后续测试") + return + + target_url = data['data']['urls'][0] + + # 4. 测试 /save_results + print("\n步骤 4: 模拟爬虫抓取完成,存入知识片段和向量...") + # 模拟一个 1536 维的向量(已处理精度) + mock_embedding = [round(random.uniform(-1, 1), 8) for _ in range(1536)] + + payload = { + "task_id": task_id, + "results": [ + { + "source_url": target_url, + "chunk_index": 0, + "title": "测试页面标题 - 切片1", + "content": "这是模拟抓取到的第一段网页内容...", + "embedding": mock_embedding + }, + { + "source_url": target_url, + "chunk_index": 1, + "title": "测试页面标题 - 切片2", + "content": "这是模拟抓取到的第二段网页内容...", + "embedding": mock_embedding + } + ] + } + res = requests.post(f"{BASE_URL}/save_results", json=payload) + log_res("保存结果", res) + + print("\n✅ 所有 API 流程测试完成!") + +if __name__ == "__main__": + run_tests() \ No newline at end of file