76 lines
2.8 KiB
Python
76 lines
2.8 KiB
Python
|
|
import requests
|
|||
|
|
import json
|
|||
|
|
|
|||
|
|
def _print_map_summary(result):
    """Print the link count, the reported base URL and every mapped link."""
    # A JSON body that is not an object, or a missing/null "links" field, is
    # shown as an empty result instead of raising while printing.
    body = result if isinstance(result, dict) else {}
    links = body.get("links")
    if not isinstance(links, list):
        links = []

    print("✅ Map 功能调用成功!")
    print("\n📄 爬取结果概览:")
    print(f"总页面数: {len(links)}")
    print(f"基础URL: {body.get('baseUrl')}")

    print("\n🔗 爬取到的文档链接:")
    for idx, link in enumerate(links, 1):
        print(f"{idx}. {link}")


def _save_map_result(result, output_path):
    """Write *result* to *output_path* as indented UTF-8 JSON; return True on success."""
    try:
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(result, f, indent=2, ensure_ascii=False)
    except OSError as e:
        # A failed write must not discard a response that was fetched fine.
        print(f"⚠️ 无法保存结果到 {output_path}:{e}")
        return False
    print(f"\n💾 结果已保存到 {output_path} 文件")
    return True


def map_firecrawl_docs(
    *,
    base_url="http://localhost:3002",
    target_url="https://docs.firecrawl.dev",
    depth=2,
    limit=50,
    timeout=60,
    output_path="firecrawl_docs_map.json",
    api_key=None,
):
    """Call the map endpoint of a Firecrawl service and report the links found.

    Sends a POST request to ``{base_url}/api/v1/map``, prints a summary of the
    response, optionally saves the raw response as JSON, and returns it.
    Called with no arguments it targets the local service on port 3002 and
    maps the Firecrawl documentation site.

    Args:
        base_url: Root URL of the Firecrawl service (a trailing ``/`` is
            ignored).
        target_url: Site to map; sent as ``"url"`` in the request body.
        depth: Value sent as ``config.depth``.
        limit: Value sent as ``config.limit``.
        timeout: Timeout in seconds passed to ``requests.post``.
        output_path: File the JSON response is written to, or ``None`` to
            skip saving.
        api_key: When given, sent as an ``Authorization: Bearer`` header.

    Returns:
        The decoded JSON response, or ``None`` when the request failed, the
        server answered with an HTTP error status, or the body was not valid
        JSON. Request failures are printed, not raised.
    """
    map_endpoint = f"{base_url.rstrip('/')}/api/v1/map"

    # NOTE(review): the endpoint path, the nested "config" object and the
    # "baseUrl" response field are kept exactly as originally written; confirm
    # them against the Firecrawl API reference for the deployed version.
    payload = {
        "url": target_url,
        "config": {
            "depth": depth,
            "includeSubdomains": False,
            "onlyMainContent": True,
            "limit": limit,
        },
    }

    headers = {"Content-Type": "application/json"}
    if api_key:
        headers["Authorization"] = f"Bearer {api_key}"

    # Only the network round trip and JSON decoding live inside the try, so
    # the handlers below cannot mask failures from printing or saving.
    try:
        response = requests.post(
            map_endpoint,
            json=payload,
            headers=headers,
            timeout=timeout,
        )
        response.raise_for_status()
        result = response.json()
    except requests.exceptions.ConnectionError:
        print(f"❌ 连接失败!请检查本地 Firecrawl 服务是否正在运行({base_url})")
        return None
    except requests.exceptions.Timeout:
        print("❌ 请求超时!爬取文档可能需要更长时间,可调整 timeout 参数")
        return None
    except requests.exceptions.HTTPError as e:
        print(f"❌ HTTP 错误:{e}")
        # Read the body from the exception itself rather than a local that
        # is only bound if requests.post() returned.
        if e.response is not None:
            print(f"响应内容:{e.response.text}")
        return None
    except Exception as e:
        # Script-level boundary: keep the original "never raises" contract
        # for anything else the request or JSON decoding can throw.
        print(f"❌ 未知错误:{str(e)}")
        return None

    _print_map_summary(result)
    if output_path is not None:
        _save_map_result(result, output_path)
    return result
|
|||
|
|
|
|||
|
|
# Script entry point: run the mapping only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    map_firecrawl_docs()
|