Files
wiki_crawler/scripts/test_firecrawl.py
2026-01-27 01:41:45 +08:00

76 lines
2.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import requests
import json
def map_firecrawl_docs():
    """Call the ``map`` feature of a locally running Firecrawl service.

    Maps the Firecrawl documentation site, prints an overview and every
    discovered link, and saves the raw JSON response to
    ``firecrawl_docs_map.json``.

    Returns:
        dict | None: the parsed JSON response on success, ``None`` when a
        request error occurred (errors are reported on stdout).
    """
    # Local Firecrawl service address (default port 3002 — adjust to match
    # your deployment).
    base_url = "http://localhost:3002"
    # Endpoint path of the map feature.
    map_endpoint = f"{base_url}/api/v1/map"

    # Request payload.
    payload = {
        # Base URL of the Firecrawl documentation to map.
        "url": "https://docs.firecrawl.dev",
        # Optional crawl configuration: depth, subdomains, content filter.
        "config": {
            "depth": 2,  # two levels is enough to cover the main doc structure
            "includeSubdomains": False,
            "onlyMainContent": True,  # skip navigation / ads, keep main content
            "limit": 50,  # cap page count to avoid over-crawling
        },
    }

    # Request headers (add your API key if the local service requires auth).
    headers = {
        "Content-Type": "application/json",
        # Uncomment and replace with your key if authentication is needed:
        # "Authorization": "Bearer YOUR_FIRECRAWL_API_KEY"
    }

    # Pre-bind so the HTTPError handler below can safely inspect it even if
    # requests.post itself raised before assignment.
    response = None
    try:
        # Use the `json=` parameter: requests serializes the payload itself,
        # replacing the manual data=json.dumps(payload) of the original.
        response = requests.post(
            map_endpoint,
            json=payload,
            headers=headers,
            timeout=60,  # avoid waiting indefinitely on a slow crawl
        )
        # Raise HTTPError for 4xx/5xx responses.
        response.raise_for_status()
        result = response.json()

        print("✅ Map 功能调用成功!")
        print("\n📄 爬取结果概览:")
        print(f"总页面数: {len(result.get('links', []))}")
        print(f"基础URL: {result.get('baseUrl')}")

        # Print every crawled link, 1-indexed.
        print("\n🔗 爬取到的文档链接:")
        for idx, link in enumerate(result.get('links', []), 1):
            print(f"{idx}. {link}")

        # Persist the raw result locally for later inspection.
        with open("firecrawl_docs_map.json", "w", encoding="utf-8") as f:
            json.dump(result, f, indent=2, ensure_ascii=False)
        print("\n💾 结果已保存到 firecrawl_docs_map.json 文件")
        return result
    except requests.exceptions.ConnectionError:
        print("❌ 连接失败!请检查本地 Firecrawl 服务是否正在运行http://localhost:3002")
    except requests.exceptions.Timeout:
        print("❌ 请求超时!爬取文档可能需要更长时间,可调整 timeout 参数")
    except requests.exceptions.HTTPError as e:
        print(f"❌ HTTP 错误:{e}")
        # Guard: response may be None if the error was raised before the
        # POST completed (original could hit UnboundLocalError here).
        if response is not None:
            print(f"响应内容:{response.text}")
    except Exception as e:
        # Top-level script boundary: report anything unexpected instead of
        # letting the traceback escape.
        print(f"❌ 未知错误:{str(e)}")
# Entry point: run the map call when executed as a script.
if __name__ == "__main__":
    map_firecrawl_docs()