结构化数据提取
把 Tool Use 当作强制 JSON 通道。定义 input_schema,把 tool_choice 锁到那个 tool,Claude 就会返回结构化对象 — 没有 markdown 围栏、没有解释、没有解析技巧。
POST
https://buzzai.cc/v1/messages
核心套路。 你其实不打算"调用"这个 tool。你是把 input_schema 当 JSON Schema 约束、用 tool_choice 强制 Claude 填它。从 tool_use 块里把对象拎出来就完事了。
为什么这比"用 JSON 回答我"靠谱
Prompt 里说"用 JSON 回答"大多数时候管用。大多数。生产环境要的是每次都管用。Tool Use 给你:
- Schema 类型化字段。模型生成前就知道某字段是 integer、enum 还是 nested object。
- 没有 markdown 围栏、没有口水话。输出是 tool_use 块里的 JSON 对象,不是要你切片的字符串。
- 必填字段强制。模型遵守 required 标记的字段,远比遵守 prompt 里一句话靠谱。
- 对流式友好。可以流式拿 input_json_delta 增量,解析 partial JSON,做实时 UI。
提取请求的形状
{
"model": "claude-sonnet-4-6",
"max_tokens": 1024,
"tools": [
{
"name": "save_invoice",
"description": "把提取的发票存到数据库。",
"input_schema": { ... 你的 schema ... }
}
],
"tool_choice": {"type": "tool", "name": "save_invoice"},
"messages": [
{"role": "user", "content": "从下面文本提取结构化数据:\n\n" + raw_text}
]
}
关键字段是 tool_choice: {type: "tool", name: "save_invoice"},强制 Claude 必须调用这个 tool,不能调别的、也不能输出纯文本。正常情况下响应的 stop_reason 是 "tool_use";如果输出被 max_tokens 截断,stop_reason 会是 "max_tokens",此时 tool 输入可能不完整,所以拿到结果后仍然要校验。
实际会用到的字段类型
String
"name": {
"type": "string",
"description": "发票上写的客户全名。"
}
日期、ID 加上 format 提示 — Claude 会读:
"invoice_date": {"type": "string", "format": "date"},
"customer_email": {"type": "string", "format": "email"}
Integer / Number
"line_count": {"type": "integer", "minimum": 1, "maximum": 1000},
"total_usd": {"type": "number", "minimum": 0}
Enum
封闭集合是收益最高的类型化。别问"category 字符串",直接给四个合法值让模型选:
"status": {
"type": "string",
"enum": ["draft", "sent", "paid", "overdue"]
}
Boolean / 可空
"is_recurring": {"type": "boolean"},
"discount_pct": {"type": ["number", "null"], "minimum": 0, "maximum": 100}
嵌套 object
"customer": {
"type": "object",
"properties": {
"name": {"type": "string"},
"email": {"type": "string", "format": "email"},
"address": {
"type": "object",
"properties": {
"street": {"type": "string"},
"city": {"type": "string"},
"country_iso2": {"type": "string", "minLength": 2, "maxLength": 2}
},
"required": ["street", "city", "country_iso2"]
}
},
"required": ["name", "email"]
}
对象数组
"line_items": {
"type": "array",
"minItems": 1,
"items": {
"type": "object",
"properties": {
"sku": {"type": "string"},
"quantity": {"type": "integer", "minimum": 1},
"unit_price_usd": {"type": "number", "minimum": 0}
},
"required": ["sku", "quantity", "unit_price_usd"]
}
}
完整样例:从 OCR 文本提取发票字段
"""
从原始 OCR 文本提取结构化发票。
依赖:pip install anthropic jsonschema
"""
import os, json
from anthropic import Anthropic
from jsonschema import Draft202012Validator, ValidationError
client = Anthropic(
base_url="https://buzzai.cc",
api_key=os.environ["BUZZ_API_KEY"],
)
INVOICE_SCHEMA = {
"type": "object",
"properties": {
"invoice_number": {"type": "string"},
"invoice_date": {"type": "string", "format": "date"},
"status": {
"type": "string",
"enum": ["draft", "sent", "paid", "overdue"],
},
"is_recurring": {"type": "boolean"},
"customer": {
"type": "object",
"properties": {
"name": {"type": "string"},
"email": {"type": "string", "format": "email"},
"country_iso2": {
"type": "string", "minLength": 2, "maxLength": 2,
},
},
"required": ["name", "email", "country_iso2"],
},
"line_items": {
"type": "array",
"minItems": 1,
"items": {
"type": "object",
"properties": {
"sku": {"type": "string"},
"quantity": {"type": "integer", "minimum": 1},
"unit_price_usd": {"type": "number", "minimum": 0},
},
"required": ["sku", "quantity", "unit_price_usd"],
},
},
"total_usd": {"type": "number", "minimum": 0},
},
"required": [
"invoice_number", "invoice_date", "status",
"customer", "line_items", "total_usd",
],
}
EXTRACTOR_TOOL = {
"name": "save_invoice",
"description": "把提取的发票存到数据库。整张发票一次性调用。",
"input_schema": INVOICE_SCHEMA,
}
# Compiled once at import time and reused for every extraction.
# Passing format_checker makes the "format" keywords in INVOICE_SCHEMA
# ("date", "email") actually validated; without it jsonschema treats
# "format" as an annotation only and never reports a violation for it.
VALIDATOR = Draft202012Validator(
    INVOICE_SCHEMA,
    format_checker=Draft202012Validator.FORMAT_CHECKER,
)
def extract(raw_text: str) -> dict:
    """Extract one invoice from raw text through a forced ``save_invoice`` call.

    Sends ``raw_text`` (wrapped in <INVOICE> tags) to the model with
    ``tool_choice`` pinned to ``save_invoice``, takes the input of the first
    ``tool_use`` block in the response, and validates it with ``VALIDATOR``.

    Args:
        raw_text: The invoice text to extract from.

    Returns:
        The tool input of the first ``tool_use`` block, once it validates.

    Raises:
        RuntimeError: The response contains no ``tool_use`` block.
        ValidationError: The tool input fails validation; the message lists
            every violation, ordered by instance path.
    """
    prompt = (
        "从下面发票文本里提取所有字段。"
        "字段不存在就省略,不要瞎猜。"
        "日期用 ISO 8601,country 用 ISO-3166 alpha-2。\n\n"
        f"<INVOICE>\n{raw_text}\n</INVOICE>"
    )
    response = client.messages.create(
        model="claude-sonnet-4-6",
        max_tokens=2048,
        tools=[EXTRACTOR_TOOL],
        tool_choice={"type": "tool", "name": "save_invoice"},
        messages=[{"role": "user", "content": prompt}],
    )

    # Take the first tool_use block; the for/else fires only if none exists.
    for block in response.content:
        if block.type == "tool_use":
            payload = block.input
            break
    else:
        raise RuntimeError("模型没有调用 tool")

    # Collect every violation up front so the caller sees them all at once.
    violations = list(VALIDATOR.iter_errors(payload))
    violations.sort(key=lambda err: err.path)
    if not violations:
        return payload

    report = "\n".join(
        f" - {list(err.path)}: {err.message}" for err in violations
    )
    raise ValidationError(f"Schema 违规:\n{report}")
if __name__ == "__main__":
    # Demo input: a small plain-text invoice with two line items.
    sample = """
INVOICE #INV-2026-0042
Date: 2026-05-20
Bill to: Acme Robotics, hello@acme.example, US
Status: paid (auto-debit, monthly recurring)
SKU Qty Unit Price
HW-WIDGET 3 19.99
SW-LICENSE 1 99.00
Total: $158.97
"""
    # Run the extraction and pretty-print the validated result.
    invoice = extract(sample)
    print(json.dumps(invoice, indent=2))
// 从原始文本提取结构化发票。
// 依赖:npm i @anthropic-ai/sdk ajv ajv-formats
import Anthropic from "@anthropic-ai/sdk";
import Ajv from "ajv";
import addFormats from "ajv-formats";
const client = new Anthropic({
baseURL: "https://buzzai.cc",
apiKey: process.env.BUZZ_API_KEY,
});
const INVOICE_SCHEMA = {
type: "object",
properties: {
invoice_number: { type: "string" },
invoice_date: { type: "string", format: "date" },
status: { type: "string", enum: ["draft", "sent", "paid", "overdue"] },
is_recurring: { type: "boolean" },
customer: {
type: "object",
properties: {
name: { type: "string" },
email: { type: "string", format: "email" },
country_iso2: { type: "string", minLength: 2, maxLength: 2 },
},
required: ["name", "email", "country_iso2"],
},
line_items: {
type: "array",
minItems: 1,
items: {
type: "object",
properties: {
sku: { type: "string" },
quantity: { type: "integer", minimum: 1 },
unit_price_usd: { type: "number", minimum: 0 },
},
required: ["sku", "quantity", "unit_price_usd"],
},
},
total_usd: { type: "number", minimum: 0 },
},
required: [
"invoice_number", "invoice_date", "status",
"customer", "line_items", "total_usd",
],
};
const EXTRACTOR_TOOL = {
name: "save_invoice",
description:
"把提取的发票存到数据库。整张发票一次性调用。",
input_schema: INVOICE_SCHEMA,
};
const ajv = new Ajv({ allErrors: true, strict: false });
addFormats(ajv);
const validate = ajv.compile(INVOICE_SCHEMA);
/**
 * Extract one invoice from raw text through a forced `save_invoice` call.
 *
 * Sends `rawText` to the model with `tool_choice` pinned to `save_invoice`,
 * takes the input of the first `tool_use` block in the response, and checks
 * it with the compiled ajv validator `validate`.
 *
 * @param {string} rawText - The invoice text to extract from.
 * @returns {Promise<object>} The tool input, once it validates.
 * @throws {Error} If the response has no `tool_use` block, or if the tool
 *   input fails validation (the message lists every violation).
 */
export async function extract(rawText) {
  const resp = await client.messages.create({
    model: "claude-sonnet-4-6",
    max_tokens: 2048,
    tools: [EXTRACTOR_TOOL],
    tool_choice: { type: "tool", name: "save_invoice" },
    messages: [
      {
        role: "user",
        content:
          "从下面发票文本里提取所有字段。" +
          "字段不存在就省略,不要瞎猜。" +
          "日期用 ISO 8601,country 用 ISO-3166 alpha-2。\n\n" +
          // Delimit the document with explicit tags so the model can tell
          // the instructions apart from the invoice text.
          `<INVOICE>\n${rawText}\n</INVOICE>`,
      },
    ],
  });

  const toolBlock = resp.content.find((b) => b.type === "tool_use");
  if (!toolBlock) throw new Error("模型没有调用 tool");

  const data = toolBlock.input;
  if (!validate(data)) {
    // ajv was created with allErrors, so validate.errors holds every violation.
    const msg = validate.errors
      .map((e) => ` - ${e.instancePath || "/"}: ${e.message}`)
      .join("\n");
    throw new Error(`Schema 违规:\n${msg}`);
  }
  return data;
}
// Demo input: a small plain-text invoice with two line items.
const sample = `
INVOICE #INV-2026-0042
Date: 2026-05-20
Bill to: Acme Robotics, hello@acme.example, US
Status: paid (auto-debit, monthly recurring)
SKU Qty Unit Price
HW-WIDGET 3 19.99
SW-LICENSE 1 99.00
Total: $158.97
`;
// Run the extraction and pretty-print the validated result.
const invoice = await extract(sample);
console.log(JSON.stringify(invoice, null, 2));
样例输出
{
"invoice_number": "INV-2026-0042",
"invoice_date": "2026-05-20",
"status": "paid",
"is_recurring": true,
"customer": {
"name": "Acme Robotics",
"email": "hello@acme.example",
"country_iso2": "US"
},
"line_items": [
{"sku": "HW-WIDGET", "quantity": 3, "unit_price_usd": 19.99},
{"sku": "SW-LICENSE", "quantity": 1, "unit_price_usd": 99.00}
],
"total_usd": 158.97
}
校验模型输出
模型大概率守规矩。大概率。信任之前先验:
- Python: jsonschema 配 Draft202012Validator。
- Node: ajv 配 ajv-formats。
- 已经在用类型化 model 层(Pydantic、Zod)的话,直接从它生成 JSON Schema,在同一个边界处校验。
校验失败怎么办
三种实用模式,从轻到重:
- 带错误反馈重试一次。 把模型上一次的 tool_use 追加到对话,再加一条 user 消息把校验错和"再调一次 save_invoice 修正这些问题"塞进去。能解决约 90% 的偶发失败。
- 把坏记录扔进隔离表。 批处理任务里,不能为了单条记录卡住整批。
- 升级模型。 Haiku 校验失败 → Sonnet 重试;Sonnet 校验失败 → Opus 重试。成本上去,合规率也上去。
def extract_with_retry(raw_text: str, max_retries: int = 2) -> dict:
    """Extract an invoice, feeding validation errors back to the model on failure.

    Each attempt forces a ``save_invoice`` call and validates the tool input
    with ``VALIDATOR``. On failure, the assistant turn plus a ``tool_result``
    (flagged ``is_error``) listing the violations is appended to the
    conversation and the request is repeated.

    Args:
        raw_text: The text to extract from.
        max_retries: Extra attempts after the first; ``max_retries + 1``
            requests are made at most.

    Returns:
        The first tool input that passes validation.

    Raises:
        RuntimeError: A response contains no ``tool_use`` block.
        ValueError: Every attempt produced an invalid tool input.
    """
    messages = [{"role": "user", "content": f"提取...\n\n{raw_text}"}]
    for attempt in range(max_retries + 1):
        resp = client.messages.create(
            model="claude-sonnet-4-6",
            max_tokens=2048,
            tools=[EXTRACTOR_TOOL],
            tool_choice={"type": "tool", "name": "save_invoice"},
            messages=messages,
        )
        # Use a default so a missing block surfaces as an explicit error
        # rather than a bare StopIteration from next().
        tool_block = next(
            (b for b in resp.content if b.type == "tool_use"), None
        )
        if tool_block is None:
            raise RuntimeError("模型没有调用 tool")

        errors = list(VALIDATOR.iter_errors(tool_block.input))
        if not errors:
            return tool_block.input

        # No request follows the last attempt, so skip building feedback.
        if attempt == max_retries:
            break

        # Feed the violations back: replay the assistant turn, then answer
        # its tool_use with an error tool_result asking for a corrected call.
        messages.append({"role": "assistant", "content": resp.content})
        messages.append({"role": "user", "content": [{
            "type": "tool_result",
            "tool_use_id": tool_block.id,
            "content": "Schema 违规:\n" + "\n".join(
                f"- {list(e.path)}: {e.message}" for e in errors
            ) + "\n\n用修正后的值再调一次 save_invoice。",
            "is_error": True,
        }]})
    raise ValueError("重试次数已用完")
流式读取 partial JSON
交互 UI 用流式提取。每个 input_json_delta 事件带一段 partial_json;拼起来,用容错 JSON 解析器边解边渲:
# Stream a forced save_invoice call and accumulate the tool input as it arrives.
with client.messages.stream(
    model="claude-sonnet-4-6",
    max_tokens=2048,
    tools=[EXTRACTOR_TOOL],
    tool_choice={"type": "tool", "name": "save_invoice"},
    messages=[{"role": "user", "content": prompt}],
) as stream:
    buf = ""
    for event in stream:
        # Only input_json_delta events carry fragments of the tool input.
        if event.type != "content_block_delta":
            continue
        if event.delta.type != "input_json_delta":
            continue
        buf += event.delta.partial_json
        # Render in the UI whatever prefix of buf can be parsed so far.
# Once the stream ends, the concatenated fragments are parsed as one document.
final = json.loads(buf)
用流式容错的解析器(json5、partial-json,或者自己写一个"补到最近的右括号"的小工具),让 UI 在最终换行到达前就能更新。
多记录提取
从一份文档里提取多条记录,两种做法:
单 tool + 数组字段
定义一个 extractor,顶层是数组:
"input_schema": {
"type": "object",
"properties": {
"records": {"type": "array", "items": { ... 单条记录 schema ... }}
},
"required": ["records"]
}
最简单。适合记录 schema 一致、文档已知上限(比如最多 50 行的 5 页发票)。
循环 + tool_choice "any"
变形提取(不同记录类型、数量未知)用 tool_choice: {"type": "any"},让模型连续输出多个 tool_use 块。循环把每次保存的 tool_result 反馈回去,直到模型输出 end_turn。注意:"any" 会强制每一轮都调用某个 tool,所以后续轮次要切回 {"type": "auto"}(或者额外提供一个表示"提取完成"的 tool),模型才有机会以 end_turn 收尾。
选模型
| 模型 | 适用 |
|---|---|
| claude-haiku-4-5-20251001 | schema 简单 + 高 QPS 的默认。最快最便宜。每条都校验。 |
| claude-sonnet-4-6 | 复杂 schema、深嵌套、源文本歧义大。Haiku 之上的重试层。 |
| claude-opus-4-7 | 合规高风险场景(法务、医疗),漏字段或填错都不可接受。值得开 thinking。 |
相关链接
- Tool Use 概念
- POST /v1/messages 参考
- 实战:Agent 循环 — 校验 + 重试的迭代流
- JSON Schema 规范