第十章:Agent 评估与 Multi-Agent 系统
Agent 是 AI 应用的高级形态,其评估比单一 LLM/RAG 更复杂。本章深入探讨 Agent 评估的特殊性和方法论。
Agent 评估的特殊挑战
与传统 LLM 评估的差异
graph TB
subgraph "传统 LLM 评估"
A1[输入] --> A2[模型]
A2 --> A3[输出]
A3 --> A4[评估]
end
subgraph "Agent 评估"
B1[任务] --> B2[Agent]
B2 --> B3[规划]
B3 --> B4[执行]
B4 --> B5[工具调用]
B5 --> B6[中间结果]
B6 --> B7[最终输出]
B7 --> B8[多维度评估]
B4 --> B4a[可能循环]
B4a --> B3
end
| 维度 | 传统 LLM | Agent |
|---|---|---|
| 输入 | 单一 Prompt | 任务描述 + 上下文 |
| 输出 | 文本/结构化数据 | 多步结果 + 状态变化 |
| 执行 | 一次性 | 多步迭代 |
| 工具使用 | 无 | 核心能力 |
| 状态管理 | 无状态 | 有状态 |
| 评估重点 | 输出质量 | 任务完成度 + 过程质量 |
Agent 评估的核心维度
graph TB
A[Agent 评估维度] --> B[结果质量]
A --> C[过程质量]
A --> D[效率指标]
A --> E[安全合规]
B --> B1[任务完成率]
B --> B2[结果正确性]
B --> B3[输出质量]
C --> C1[规划合理性]
C --> C2[工具使用正确性]
C --> C3[错误恢复能力]
D --> D1[步骤效率]
D --> D2[时间成本]
D --> D3[Token消耗]
E --> E1[权限边界]
E --> E2[数据安全]
E --> E3[操作审计]
Agent 评估指标体系
结果层指标
任务完成率 (Task Success Rate)
# agent_eval/metrics.py
class TaskSuccessMetric:
    """Evaluate whether an agent run satisfied a task's success criteria.

    Each criterion is a dict carrying a ``type`` (``output_match``,
    ``state_change``, ``tool_invocation`` or ``custom_validator``) and a
    ``name`` that becomes the key of its entry in ``criteria_results``.

    Args:
        strict: When true, the task succeeds only if every criterion is met.
        threshold: Minimum fraction of met criteria required for success
            when ``strict`` is false.
    """

    def __init__(self, strict: bool = False, threshold: float = 0.8):
        self.strict = strict  # strict mode: all criteria must be met
        self.threshold = threshold

    def evaluate(
        self,
        task: AgentTask,
        execution_result: ExecutionResult
    ) -> Dict:
        """Check every success criterion of ``task`` against the run.

        Args:
            task: Object exposing ``success_criteria`` (a list of dicts).
            execution_result: Object exposing ``output``, ``state_changes``
                and ``tool_calls``.

        Returns:
            A dict with ``success``, ``completion_rate``,
            ``criteria_results``, ``met_criteria`` and ``total_criteria``.
        """
        success_criteria = task.success_criteria or []
        results = {}
        met_criteria = 0

        for index, criterion in enumerate(success_criteria):
            # Fall back to a positional name so one unnamed criterion does
            # not abort the whole evaluation.
            name = criterion.get("name", f"criterion_{index}")
            result = self._evaluate_criterion(criterion, execution_result)
            results[name] = result
            if result.get("met", False):
                met_criteria += 1

        total_criteria = len(success_criteria)
        completion_rate = met_criteria / total_criteria if total_criteria > 0 else 0

        if self.strict:
            success = completion_rate == 1.0
        else:
            # The threshold is evaluator-level configuration; it must not be
            # read from the loop variable, which is undefined when there are
            # no criteria and otherwise refers to an arbitrary last entry.
            success = completion_rate >= self.threshold

        return {
            "success": success,
            "completion_rate": completion_rate,
            "criteria_results": results,
            "met_criteria": met_criteria,
            "total_criteria": total_criteria,
        }

    def _evaluate_criterion(self, criterion, execution_result):
        """Dispatch one criterion to its checker and return the result dict."""
        criterion_type = criterion.get("type")

        if criterion_type == "output_match":
            return self._check_output_match(
                execution_result.output,
                criterion.get("expected"),
                criterion.get("match_type", "exact"),
            )
        if criterion_type == "state_change":
            return self._check_state_change(
                execution_result.state_changes,
                criterion.get("expected_changes"),
            )
        if criterion_type == "tool_invocation":
            return self._check_tool_invocation(
                execution_result.tool_calls,
                criterion.get("expected_tools"),
            )
        if criterion_type == "custom_validator":
            validator = criterion.get("validator")
            if not callable(validator):
                # E.g. a validator given by name in a data file that was
                # never resolved to a function.
                return {"met": False, "error": "validator is not callable"}
            outcome = validator(execution_result)
            return outcome if isinstance(outcome, dict) else {"met": bool(outcome)}

        # Unknown types still count towards the total; record why they failed.
        return {"met": False, "error": f"unknown criterion type: {criterion_type!r}"}

    def _check_output_match(self, output, expected, match_type):
        """Compare ``output`` with ``expected`` using ``match_type``.

        Supported match types are ``exact``, ``contains``, ``regex`` and
        ``semantic``; anything else is reported as not met.
        """
        if output is None or expected is None:
            # Nothing to compare against; treat as unmet instead of crashing.
            met = False
        elif match_type == "exact":
            if isinstance(output, str) and isinstance(expected, str):
                met = output.strip() == expected.strip()
            else:
                met = output == expected
        elif match_type == "contains":
            met = expected in output
        elif match_type == "regex":
            import re
            met = bool(re.search(expected, output))
        elif match_type == "semantic":
            # NOTE(review): _semantic_similarity is not defined in this
            # class; it has to be supplied by a subclass or mixin.
            met = self._semantic_similarity(output, expected) > 0.8
        else:
            met = False
        return {"met": met, "type": match_type}

    def _check_state_change(self, actual_changes, expected_changes):
        """Check that every expected key/value change appears in the run."""
        actual_changes = actual_changes or []
        expected_changes = expected_changes or []
        met_changes = [
            expected
            for expected in expected_changes
            if any(
                actual.get("key") == expected.get("key")
                and actual.get("value") == expected.get("value")
                for actual in actual_changes
            )
        ]
        return {
            "met": len(met_changes) == len(expected_changes),
            "met_count": len(met_changes),
            "expected_count": len(expected_changes),
        }

    def _check_tool_invocation(self, tool_calls, expected_tools):
        """Check that every expected tool was called at least once.

        With no expected tools the criterion is trivially met.
        """
        called_tools = [call.get("tool") for call in (tool_calls or [])]
        expected = list(expected_tools or [])
        all_called = all(tool in called_tools for tool in expected)
        return {
            "met": all_called,
            "called": called_tools,
            "expected": expected,
        }
结果正确性评分
class ResultCorrectnessMetric:
    """Score the correctness of a run against a reference answer.

    The scoring method is chosen from ``task.type``: ``qa``,
    ``code_generation``, ``data_extraction`` and ``api_operation`` have
    dedicated scorers; every other type falls back to an exact comparison.
    """

    def evaluate(
        self,
        task: AgentTask,
        execution_result: ExecutionResult,
        ground_truth: Any = None
    ) -> Dict:
        """Score ``execution_result`` against the reference answer.

        Args:
            task: Object exposing ``type`` and ``ground_truth``.
            execution_result: Object exposing ``output`` and
                ``state_changes``.
            ground_truth: Reference answer; defaults to
                ``task.ground_truth`` when omitted.

        Returns:
            A dict containing at least ``score``. ``score`` is ``None`` when
            no reference answer is available.
        """
        if ground_truth is None:
            # Fall back to the reference answer stored on the task.
            ground_truth = task.ground_truth
        if ground_truth is None:
            return {"score": None, "reason": "无标准答案"}

        task_type = task.type
        if task_type == "qa":
            return self._eval_qa(execution_result.output, ground_truth)
        if task_type == "code_generation":
            return self._eval_code(execution_result.output, ground_truth)
        if task_type == "data_extraction":
            return self._eval_extraction(execution_result.output, ground_truth)
        if task_type == "api_operation":
            # The scorer is named _eval_api_operation; the previous call to
            # a non-existent _eval_api raised AttributeError.
            return self._eval_api_operation(execution_result.state_changes, ground_truth)
        return self._eval_generic(execution_result.output, ground_truth)

    def _eval_qa(self, output, ground_truth):
        """Score a QA answer with the project's semantic-similarity scorer."""
        from evaluation.semantic_similarity import SemanticSimilarity

        evaluator = SemanticSimilarity()
        score = evaluator.evaluate(output, ground_truth)
        return {
            "score": score,
            "method": "semantic_similarity",
        }

    def _eval_code(self, output, ground_truth):
        """Score generated code by running the reference test cases.

        ``ground_truth`` supplies ``function_name`` and ``test_cases``, each
        test case being a dict with ``input`` (positional arguments) and
        ``expected``.

        SECURITY: ``output`` is model-generated code and is executed with
        ``exec`` in this process. Only call this inside a sandbox.
        """
        test_cases = ground_truth.get("test_cases", [])

        # 1. Syntax check; compiling once also avoids re-parsing per test.
        try:
            compiled = compile(output, "<agent_output>", "exec")
        except (SyntaxError, ValueError, TypeError):
            return {
                "score": 0,
                "syntax_valid": False,
                "tests_passed": 0,
                "tests_total": len(test_cases),
            }

        # 2. Run the test cases, each in a fresh namespace so state left
        #    behind by one case cannot influence the next.
        function_name = ground_truth.get("function_name")
        passed = 0
        for test in test_cases:
            try:
                exec_globals = {}
                exec(compiled, exec_globals)
                candidate = exec_globals.get(function_name)
                if callable(candidate) and candidate(*test["input"]) == test["expected"]:
                    passed += 1
            except (Exception, SystemExit):
                # A failing or exiting candidate simply fails this test case.
                continue

        return {
            "score": passed / len(test_cases) if test_cases else 0,
            "syntax_valid": True,
            "tests_passed": passed,
            "tests_total": len(test_cases),
        }

    def _eval_extraction(self, output, ground_truth):
        """Score extracted data as the fraction of reference fields matched.

        Minimal baseline: when both values are dicts, each reference field
        must be present in ``output`` with an equal value. Any other shape
        falls back to the generic exact comparison.
        """
        if not (isinstance(output, dict) and isinstance(ground_truth, dict)) or not ground_truth:
            return self._eval_generic(output, ground_truth)
        correct = sum(
            1
            for field, expected in ground_truth.items()
            if field in output and output[field] == expected
        )
        return {
            "score": correct / len(ground_truth),
            "method": "field_match",
            "fields_correct": correct,
            "fields_total": len(ground_truth),
        }

    def _eval_generic(self, output, ground_truth):
        """Fallback scorer: 1.0 on exact equality, otherwise 0.0."""
        return {
            "score": 1.0 if output == ground_truth else 0.0,
            "method": "exact_match",
        }

    def _eval_api_operation(self, state_changes, ground_truth):
        """Score API operations by matching expected (action, target) pairs."""
        expected_ops = ground_truth.get("operations", [])
        observed = state_changes or []
        correct_ops = sum(
            1
            for expected in expected_ops
            if any(
                actual.get("action") == expected.get("action")
                and actual.get("target") == expected.get("target")
                for actual in observed
            )
        )
        return {
            "score": correct_ops / len(expected_ops) if expected_ops else 0,
        }
过程层指标
步骤合理性评估
class StepRationalityMetric:
    """Score how reasonable an agent's sequence of execution steps was.

    Four equally weighted aspects are combined: redundancy, ordering,
    validity (unrecovered failures) and step-count efficiency. Steps are
    expected to be dicts with ``id``, ``action``, ``target``, ``status`` and
    ``recovered`` keys.
    """

    def evaluate(
        self,
        task: AgentTask,
        execution_trace: ExecutionTrace
    ) -> Dict:
        """Score the steps recorded in ``execution_trace``.

        Args:
            task: Object exposing ``step_dependencies`` and
                ``optimal_step_count``.
            execution_trace: Object exposing ``steps``.

        Returns:
            A dict with ``overall_score`` plus the four component results.
        """
        steps = execution_trace.steps

        # 1. Repeated operations.
        redundancy = self._check_redundancy(steps)
        # 2. Dependency ordering.
        order_score = self._check_step_order(steps, task)
        # 3. Failed steps that were never recovered.
        invalid_steps = self._check_invalid_steps(steps)
        # 4. Step count versus the optimal reference.
        efficiency = self._check_efficiency(steps, task)

        weights = {
            "redundancy": 0.25,
            "order": 0.25,
            "validity": 0.25,
            "efficiency": 0.25,
        }
        overall = (
            (1 - redundancy["rate"]) * weights["redundancy"] +
            order_score * weights["order"] +
            (1 - invalid_steps["rate"]) * weights["validity"] +
            efficiency["score"] * weights["efficiency"]
        )
        return {
            "overall_score": overall,
            "redundancy": redundancy,
            "order_score": order_score,
            "invalid_steps": invalid_steps,
            "efficiency": efficiency,
        }

    def _check_redundancy(self, steps):
        """Return the share of steps that repeat an earlier operation.

        Two steps are the same operation when their ``action`` and
        ``target`` match. Every repetition beyond the first occurrence
        counts as one redundant step, so an agent stuck in a loop is
        penalized in proportion to how often it repeated itself.
        """
        operation_counts = {}
        for step in steps:
            op_key = f"{step.get('action')}:{step.get('target')}"
            operation_counts[op_key] = operation_counts.get(op_key, 0) + 1

        # Count surplus executions, not merely the number of repeated keys.
        redundant = sum(count - 1 for count in operation_counts.values())
        return {
            "rate": redundant / len(steps) if steps else 0,
            "redundant_operations": [k for k, v in operation_counts.items() if v > 1],
        }

    def _check_step_order(self, steps, task):
        """Return 1 minus the fraction of violated ordering dependencies.

        A dependency ``{"before": X, "after": Y}`` is violated when step X
        appears later in the trace than step Y. Dependencies that mention a
        step id absent from the trace are not counted as violations.
        """
        dependencies = task.step_dependencies or []
        if not dependencies:
            return 1

        step_positions = {step.get("id"): i for i, step in enumerate(steps)}
        violations = 0
        for dep in dependencies:
            before_id = dep.get("before")
            after_id = dep.get("after")
            if before_id in step_positions and after_id in step_positions:
                if step_positions[before_id] > step_positions[after_id]:
                    violations += 1
        return 1 - violations / len(dependencies)

    def _check_invalid_steps(self, steps):
        """Return the share of steps that failed and were never recovered."""
        invalid = [
            step.get("id")
            for step in steps
            if step.get("status") == "failed" and not step.get("recovered")
        ]
        return {
            "rate": len(invalid) / len(steps) if steps else 0,
            "invalid_step_ids": invalid,
        }

    def _check_efficiency(self, steps, task):
        """Score the step count against ``task.optimal_step_count``.

        The score is 1.0 at or below the optimum and falls linearly to 0.0
        at twice the optimum. Without a reference the score is 1.0.
        """
        optimal_steps = task.optimal_step_count
        if optimal_steps is None:
            return {"score": 1.0, "reason": "无最优步数参考"}

        actual = len(steps)
        optimal = optimal_steps
        if actual <= optimal:
            score = 1.0
        elif optimal <= 0:
            # Any step is excess when the reference is zero; this also
            # avoids dividing by zero below.
            score = 0.0
        else:
            # The further above the optimum, the lower the score.
            score = max(0.0, 1 - (actual - optimal) / optimal)
        return {
            "score": score,
            "actual_steps": actual,
            "optimal_steps": optimal,
        }
工具使用评估
class ToolUsageMetric:
    """Score the correctness and efficiency of an agent's tool calls.

    Tool calls are expected to be dicts with ``tool``, ``parameters`` and
    ``status`` keys.

    Args:
        tool_specs: Optional mapping of tool name to a JSON-Schema-like
            spec with ``required`` (list of parameter names) and
            ``properties`` (parameter name -> ``{"type": ...}``). Calls to
            tools without a spec are excluded from parameter scoring.
    """

    # JSON-Schema type names mapped to the Python types accepted for them.
    _JSON_TYPES = {
        "string": str,
        "integer": int,
        "number": (int, float),
        "boolean": bool,
        "array": (list, tuple),
        "object": dict,
        "null": type(None),
    }

    def __init__(self, tool_specs=None):
        self.tool_specs = dict(tool_specs or {})

    def evaluate(
        self,
        task: AgentTask,
        execution_trace: ExecutionTrace
    ) -> Dict:
        """Score the tool calls recorded in ``execution_trace``.

        Args:
            task: Object exposing ``required_tools``.
            execution_trace: Object exposing ``tool_calls``.

        Returns:
            A dict with the four component scores and their unweighted
            mean under ``overall``.
        """
        tool_calls = execution_trace.tool_calls

        # 1. Were the right tools chosen?
        selection_score = self._eval_tool_selection(tool_calls, task)
        # 2. Were the parameters well-formed?
        param_score = self._eval_tool_parameters(tool_calls, task)
        # 3. How many calls succeeded?
        success_rate = self._eval_call_success(tool_calls)
        # 4. Were calls needlessly repeated?
        efficiency = self._eval_tool_efficiency(tool_calls, task)

        return {
            "selection_score": selection_score,
            "parameter_score": param_score,
            "success_rate": success_rate,
            "efficiency": efficiency,
            "overall": (selection_score + param_score + success_rate + efficiency) / 4,
        }

    def _get_tool_spec(self, tool_name):
        """Return the registered spec for ``tool_name``, or ``None``."""
        return self.tool_specs.get(tool_name)

    def _check_type(self, value, expected_type):
        """Return whether ``value`` matches the JSON-Schema type name.

        Unknown type names are accepted so that a spec using an unsupported
        type does not penalize the agent.
        """
        python_type = self._JSON_TYPES.get(expected_type)
        if python_type is None:
            return True
        # bool is a subclass of int in Python but is not a JSON number.
        if isinstance(value, bool) and expected_type in ("integer", "number"):
            return False
        return isinstance(value, python_type)

    def _eval_tool_selection(self, tool_calls, task):
        """Score coverage of required tools, minus a penalty for extras."""
        expected_tools = set(task.required_tools or [])
        used_tools = set(call.get("tool") for call in tool_calls)
        if not expected_tools:
            return 1.0

        # Share of the required tools that were actually used.
        coverage = len(expected_tools & used_tools) / len(expected_tools)
        # Share of the used tools that were not required.
        extra_tools = used_tools - expected_tools
        penalty = len(extra_tools) / (len(used_tools) or 1)
        return max(0, coverage - penalty * 0.2)

    def _eval_tool_parameters(self, tool_calls, task):
        """Return the share of verifiable calls with well-formed parameters.

        A call is well-formed when every required parameter is present and
        every parameter with a declared type has a value of that type.
        Calls to tools without a spec cannot be verified and are left out
        of both numerator and denominator.
        """
        verifiable_calls = 0
        correct_calls = 0
        for call in tool_calls:
            spec = self._get_tool_spec(call.get("tool"))
            if spec is None:
                continue
            verifiable_calls += 1
            params = call.get("parameters", {})

            all_present = all(p in params for p in spec.get("required", []))
            properties = spec.get("properties", {})
            types_correct = all(
                self._check_type(value, properties.get(param, {}).get("type"))
                for param, value in params.items()
                if properties.get(param, {}).get("type")
            )
            if all_present and types_correct:
                correct_calls += 1
        return correct_calls / verifiable_calls if verifiable_calls else 1.0

    def _eval_call_success(self, tool_calls):
        """Return the share of calls whose ``status`` is ``"success"``."""
        successful = sum(1 for call in tool_calls if call.get("status") == "success")
        return successful / len(tool_calls) if tool_calls else 1.0

    def _eval_tool_efficiency(self, tool_calls, task):
        """Return 1 minus the share of calls that repeat an earlier call.

        Two calls are duplicates when they use the same tool with equal
        parameters, regardless of the order in which the parameters were
        given.
        """
        if not tool_calls:
            return 1.0
        import json

        call_counts = {}
        for call in tool_calls:
            params = call.get("parameters", {})
            try:
                # Sorted keys make the signature independent of dict order.
                signature = json.dumps(params, sort_keys=True, default=str)
            except (TypeError, ValueError):
                signature = repr(params)
            key = f"{call.get('tool')}:{signature}"
            call_counts[key] = call_counts.get(key, 0) + 1

        # Count every surplus call, not just the number of repeated keys.
        duplicates = sum(count - 1 for count in call_counts.values())
        return max(0, 1 - duplicates / len(tool_calls))
效率指标
class EfficiencyMetrics:
    """Report raw cost figures for a run plus a baseline-relative score."""

    def evaluate(
        self,
        task: AgentTask,
        execution_trace: ExecutionTrace
    ) -> Dict:
        """Collect step, tool-call, token and time figures for one run.

        Per-step averages are 0 when the trace has no steps.
        """
        steps = execution_trace.steps
        step_total = len(steps)

        def per_step(total):
            # Average a run-level total over the number of steps.
            if not steps:
                return 0
            return total / step_total

        report = {}
        report["step_count"] = step_total
        report["tool_call_count"] = len(execution_trace.tool_calls)
        report["total_tokens"] = execution_trace.total_tokens
        report["total_time_seconds"] = execution_trace.total_time
        report["time_per_step"] = per_step(execution_trace.total_time)
        report["tokens_per_step"] = per_step(execution_trace.total_tokens)
        report["efficiency_score"] = self._compute_efficiency_score(execution_trace, task)
        return report

    def _compute_efficiency_score(self, trace, task):
        """Average the baseline/actual ratios for the baselined dimensions.

        For each of ``step_count``, ``time_seconds`` and ``tokens`` present
        in ``task.baseline_metrics``, the dimension scores 1.0 when the run
        stayed within the baseline and ``baseline / actual`` otherwise.
        Returns 1.0 when no baseline is available.
        """
        reference = task.baseline_metrics or {}
        if not reference:
            return 1.0

        # Readers are lazy so a dimension is only measured when baselined.
        readers = (
            ("step_count", lambda: len(trace.steps)),
            ("time_seconds", lambda: trace.total_time),
            ("tokens", lambda: trace.total_tokens),
        )

        ratios = []
        for key, read_actual in readers:
            if key not in reference:
                continue
            budget = reference[key]
            spent = read_actual()
            ratios.append(1.0 if spent <= budget else max(0, budget / spent))

        if not ratios:
            return 1.0
        return sum(ratios) / len(ratios)
Multi-Agent 系统评估
Multi-Agent 评估的特殊性
graph TB
subgraph "Multi-Agent 系统"
A[任务分解] --> B[Agent 1]
A --> C[Agent 2]
A --> D[Agent 3]
B --> E[结果聚合]
C --> E
D --> E
B -.->|协作| C
C -.->|协作| D
end
subgraph "评估维度"
F[个体性能]
G[协作效率]
H[整体结果]
end
Multi-Agent 评估框架
# agent_eval/multi_agent.py
class MultiAgentEvaluator:
    """Combine per-agent, collaboration and system-level evaluations."""

    def __init__(self):
        self.individual_evaluator = IndividualAgentEvaluator()
        self.collaboration_evaluator = CollaborationEvaluator()
        self.system_evaluator = SystemEvaluator()

    async def evaluate(
        self,
        task: MultiAgentTask,
        execution_result: MultiAgentExecutionResult
    ) -> Dict:
        """Evaluate a multi-agent run and return a weighted overall score.

        The overall score weights the mean individual score at 0.3, the
        collaboration score at 0.3 and the system score at 0.4.
        """
        # 1. Each agent is scored against its own sub-task, one at a time.
        per_agent = {
            agent_id: await self.individual_evaluator.evaluate(
                task.agent_tasks.get(agent_id), traces
            )
            for agent_id, traces in execution_result.agent_traces.items()
        }

        # 2. How well the agents worked together.
        teamwork = await self.collaboration_evaluator.evaluate(
            execution_result.collaboration_trace,
            task.collaboration_requirements,
        )

        # 3. The outcome of the system as a whole.
        whole_system = await self.system_evaluator.evaluate(task, execution_result)

        weights = {"individual": 0.3, "collaboration": 0.3, "system": 0.4}
        individual_part = self._average_score(per_agent) * weights["individual"]
        teamwork_part = teamwork["overall_score"] * weights["collaboration"]
        system_part = whole_system["overall_score"] * weights["system"]

        return {
            "overall_score": individual_part + teamwork_part + system_part,
            "individual_results": per_agent,
            "collaboration_result": teamwork,
            "system_result": whole_system,
        }

    def _average_score(self, results: Dict) -> float:
        """Return the mean ``overall_score`` of ``results`` (0 when empty)."""
        collected = []
        for entry in results.values():
            collected.append(entry.get("overall_score", 0))
        if not collected:
            return 0
        return sum(collected) / len(collected)
class CollaborationEvaluator:
    """Score how well the agents of a multi-agent system cooperated.

    Four aspects are scored in the range [0, 1] and averaged:
    communication, task allocation, conflict handling and information
    sharing.
    """

    async def evaluate(
        self,
        collaboration_trace: CollaborationTrace,
        requirements: Dict
    ) -> Dict:
        """Score the collaboration recorded in ``collaboration_trace``.

        Args:
            collaboration_trace: Object exposing ``messages``,
                ``task_assignments``, ``conflicts`` and ``shared_context``.
            requirements: Optional dict; ``expected_allocation`` lists the
                expected ``{"agent": ..., "task": ...}`` assignments.

        Returns:
            A dict with the four component scores and ``overall_score``.
        """
        requirements = requirements or {}

        # 1. Communication efficiency.
        communication_score = self._eval_communication(
            collaboration_trace.messages
        )
        # 2. Task allocation.
        allocation_score = self._eval_task_allocation(
            collaboration_trace.task_assignments,
            requirements.get("expected_allocation")
        )
        # 3. Conflict handling.
        conflict_score = self._eval_conflict_handling(
            collaboration_trace.conflicts
        )
        # 4. Information sharing.
        info_sharing_score = self._eval_info_sharing(
            collaboration_trace.shared_context
        )
        return {
            "communication_score": communication_score,
            "allocation_score": allocation_score,
            "conflict_score": conflict_score,
            "info_sharing_score": info_sharing_score,
            "overall_score": (communication_score + allocation_score +
                              conflict_score + info_sharing_score) / 4,
        }

    def _eval_communication(self, messages):
        """Score message brevity; returns 1.0 when there are no messages.

        The brevity score is 1.0 up to an average of 200 characters per
        message and falls linearly to 0.0 at 700 characters.
        """
        if not messages:
            return 1.0

        total_len = sum(len(m.get("content") or "") for m in messages)
        avg_len = total_len / len(messages)
        # Clamp to [0, 1]: without the upper bound, short messages pushed
        # the score above 1.0 and inflated the overall average.
        brevity_score = min(1.0, max(0.0, 1 - (avg_len - 200) / 500))

        # Participation coverage is not measured yet; it is a fixed
        # placeholder so that only brevity influences the result.
        coverage_score = 1.0
        return (brevity_score + coverage_score) / 2

    def _eval_task_allocation(self, assignments, expected):
        """Return the share of expected assignments that were made.

        Returns 1.0 when no expectation is given.
        """
        if not expected:
            return 1.0
        assignments = assignments or []
        correct = sum(
            1
            for exp in expected
            if any(
                actual.get("agent") == exp.get("agent")
                and actual.get("task") == exp.get("task")
                for actual in assignments
            )
        )
        return correct / len(expected)

    def _eval_conflict_handling(self, conflicts):
        """Return the share of conflicts resolved (1.0 when none occurred)."""
        if not conflicts:
            return 1.0
        resolved = sum(1 for c in conflicts if c.get("resolved"))
        return resolved / len(conflicts)

    def _eval_info_sharing(self, shared_context):
        """Return the share of shared-context updates that were timely.

        A neutral 0.5 is returned when nothing was shared. Updates without
        a ``timely`` flag count as timely.
        """
        if not shared_context:
            return 0.5  # nothing was shared
        updates = shared_context.get("updates", [])
        if not updates:
            return 0.5
        timely = sum(1 for u in updates if u.get("timely", True))
        return timely / len(updates)
Agent 测试数据集构建
任务案例设计
# datasets/agent_tasks_v1.0.yaml
metadata:
  name: "agent_task_benchmark"
  version: "1.0"
  total_tasks: 50

categories:
  - id: "api_operations"
    name: "API 操作任务"
    tasks:
      - id: "api_001"
        name: "查询用户订单"
        description: "根据用户ID查询其所有订单"
        input:
          user_id: "user_12345"
        success_criteria:
          - type: "tool_invocation"
            name: "调用查询API"
            expected_tools: ["get_user_orders"]
          # NOTE(review): TaskSuccessMetric reads "expected" and "match_type"
          # for output_match criteria; "expected_type" is not consumed by it.
          # Confirm which component enforces this check.
          - type: "output_match"
            name: "返回订单列表"
            expected_type: "list"
        ground_truth:
          operations:
            - action: "api_call"
              target: "get_user_orders"
              params: {"user_id": "user_12345"}
        optimal_step_count: 2
        baseline_metrics:
          time_seconds: 5
          tokens: 500

  - id: "multi_step_reasoning"
    name: "多步推理任务"
    tasks:
      - id: "reason_001"
        name: "分析销售数据并生成报告"
        description: "获取上月销售数据,分析趋势,生成报告"
        input:
          month: "2024-01"
        success_criteria:
          - type: "tool_invocation"
            name: "获取数据"
            expected_tools: ["get_sales_data"]
          - type: "tool_invocation"
            name: "分析数据"
            expected_tools: ["analyze_trends"]
          - type: "tool_invocation"
            name: "生成报告"
            expected_tools: ["generate_report"]
          # NOTE(review): "contains" is not a key TaskSuccessMetric reads for
          # output_match (it reads "expected" / "match_type") - confirm.
          - type: "output_match"
            name: "报告包含关键指标"
            contains: ["销售额", "增长率"]
        # The step named in "before" must run earlier than the one in "after"
        # (StepRationalityMetric counts a violation when "before" comes later).
        step_dependencies:
          - before: "get_sales_data"
            after: "analyze_trends"
          - before: "analyze_trends"
            after: "generate_report"
        optimal_step_count: 4

  - id: "error_recovery"
    name: "错误恢复任务"
    tasks:
      - id: "error_001"
        name: "处理API限流"
        description: "当遇到API限流时,正确处理并重试"
        input:
          request: "fetch_user_data"
        simulation:
          inject_error:
            type: "rate_limit"
            at_step: 1
        success_criteria:
          # NOTE(review): the evaluator calls "validator" as a function, so
          # this name presumably has to be resolved to a callable when the
          # dataset is loaded - TODO confirm.
          - type: "custom_validator"
            name: "正确处理错误"
            validator: "check_error_handling"
          - type: "tool_invocation"
            name: "最终成功"
            expected_tools: ["fetch_user_data"]
            require_success: true
        optimal_step_count: 3  # includes the retry
评估执行器
# agent_eval/executor.py
class AgentEvaluationRunner:
    """Run an agent over a task dataset and aggregate the metric results."""

    # Keys under which the metrics report their headline score, in lookup
    # order. The metrics do not share one key name, so all are consulted.
    _SCORE_KEYS = ("overall_score", "overall", "score", "efficiency_score", "completion_rate")

    def __init__(self, config: Dict):
        self.config = config
        self.metrics = {
            "task_success": TaskSuccessMetric(),
            "correctness": ResultCorrectnessMetric(),
            "step_rationality": StepRationalityMetric(),
            "tool_usage": ToolUsageMetric(),
            "efficiency": EfficiencyMetrics(),
        }

    async def run_evaluation(
        self,
        agent: Agent,
        task_dataset: List[AgentTask]
    ) -> Dict:
        """Execute every task with ``agent`` and evaluate each run.

        Args:
            agent: Object exposing an awaitable ``execute(task=, context=,
                tools=)`` method.
            task_dataset: Tasks to run, in order.

        Returns:
            A dict with the per-task ``results`` and an aggregated
            ``summary``.
        """
        results = []
        for task in task_dataset:
            execution_result = await self._execute_task(agent, task)

            # Every metric receives the same execution result object; the
            # trace-based metrics read its steps, tool_calls, total_tokens
            # and total_time attributes.
            task_metrics = {
                metric_name: metric.evaluate(task, execution_result)
                for metric_name, metric in self.metrics.items()
            }
            results.append({
                "task_id": task.id,
                "execution": execution_result,
                "metrics": task_metrics,
            })

        summary = self._aggregate_results(results)
        return {
            "results": results,
            "summary": summary,
        }

    async def _execute_task(self, agent: Agent, task: AgentTask) -> ExecutionResult:
        """Run one task and wrap the agent's result with the elapsed time."""
        import time

        # perf_counter is monotonic, so the measured duration cannot be
        # distorted by wall-clock adjustments during the run.
        started = time.perf_counter()
        result = await agent.execute(
            task=task.description,
            context=task.input,
            tools=task.available_tools,
        )
        elapsed = time.perf_counter() - started

        return ExecutionResult(
            task_id=task.id,
            output=result.output,
            steps=result.steps,
            tool_calls=result.tool_calls,
            state_changes=result.state_changes,
            total_tokens=result.total_tokens,
            total_time=elapsed,
        )

    def _extract_score(self, metric_result):
        """Return the headline score of one metric result, or ``None``."""
        if not isinstance(metric_result, dict):
            return None
        for key in self._SCORE_KEYS:
            value = metric_result.get(key)
            if value is not None:
                return value
        return None

    def _aggregate_results(self, results: List) -> Dict:
        """Summarize per-task results into success rate and metric averages.

        A metric's average covers only the tasks for which it produced a
        score (for example, correctness yields no score without a reference
        answer) and is ``None`` when no task produced one. An empty result
        list yields a success rate of 0.
        """
        total = len(results)
        success_count = sum(
            1 for r in results
            if r["metrics"]["task_success"]["success"]
        )

        metric_averages = {}
        for metric_name in self.metrics.keys():
            scores = [
                score
                for score in (
                    self._extract_score(r["metrics"][metric_name]) for r in results
                )
                if score is not None
            ]
            metric_averages[metric_name] = sum(scores) / len(scores) if scores else None

        return {
            "total_tasks": total,
            "successful_tasks": success_count,
            "success_rate": success_count / total if total else 0,
            "metric_averages": metric_averages,
        }
Agent 评估报告
报告模板
┌────────────────────────────────────────────────────────────────────┐
│ Agent 评估报告 │
│ 评估时间: 2024-01-20 10:30 │
├────────────────────────────────────────────────────────────────────┤
│ 总体概览 │
│ ────────────────────────────────────────────────────── │
│ 总任务数: 50 成功: 42 失败: 8 成功率: 84% │
│ 综合得分: 0.82 │
├────────────────────────────────────────────────────────────────────┤
│ 指标详情 │
│ ────────────────────────────────────────────────────── │
│ │ 指标 │ 得分 │ 状态 │ 说明 │ │
│ │ 任务完成率 │ 0.84 │ ✅ │ 42/50 任务成功 │ │
│ │ 结果正确性 │ 0.78 │ ⚠️ │ 部分结果存在偏差 │ │
│ │ 步骤合理性 │ 0.85 │ ✅ │ 步骤规划合理 │ │
│ │ 工具使用 │ 0.80 │ ✅ │ 工具选择基本正确 │ │
│ │ 效率 │ 0.72 │ ⚠️ │ 步骤数略多于最优 │ │
├────────────────────────────────────────────────────────────────────┤
│ 分类表现 │
│ ────────────────────────────────────────────────────── │
│ │ 任务类型 │ 数量 │ 成功率 │ 平均得分 │ 问题 │ │
│ │ API操作 │ 20 │ 95% │ 0.88 │ 无 │ │
│ │ 多步推理 │ 15 │ 80% │ 0.78 │ 步骤冗余 │ │
│ │ 错误恢复 │ 10 │ 60% │ 0.65 │ 恢复失败 │ │
│ │ 数据处理 │ 5 │ 100% │ 0.92 │ 无 │ │
├────────────────────────────────────────────────────────────────────┤
│ 失败任务分析 │
│ ────────────────────────────────────────────────────── │
│ 1. error_003: API限流处理失败 │
│ - 失败原因: 未正确实现重试逻辑 │
│ - 建议: 增加错误处理和重试机制 │
│ │
│ 2. reason_007: 数据分析报告生成失败 │
│ - 失败原因: 工具调用顺序错误 │
│ - 建议: 优化任务规划逻辑 │
├────────────────────────────────────────────────────────────────────┤
│ 优化建议 │
│ ────────────────────────────────────────────────────── │
│ 1. 【高优先级】加强错误恢复能力,增加重试和备用方案 │
│ 2. 【中优先级】优化步骤规划,减少冗余操作 │
│ 3. 【低优先级】提升结果验证,确保输出正确性 │
└────────────────────────────────────────────────────────────────────┘
小结
Agent 评估的核心要点:
| 维度 | 关键指标 | 评估方法 |
|---|---|---|
| 结果 | 任务完成率、正确性 | 成功条件检查、标准答案对比 |
| 过程 | 步骤合理性、工具使用 | 执行轨迹分析、依赖检查 |
| 效率 | 时间、Token、步骤数 | 基线对比、最优参考 |
| 安全 | 权限、审计、合规 | 规则检查、边界测试 |
评估检查清单:
- ✅ 是否定义了明确的成功条件?
- ✅ 是否评估了执行过程而不仅是结果?
- ✅ 是否考虑了错误恢复能力?
- ✅ 是否有效率基线对比?
- ✅ Multi-Agent 是否评估了协作效率?
- ✅ 是否有安全边界测试?
在接下来的附录中,我们将为读者提供练习和实践作业。