附录 D:读者练习与实践
本章提供动手练习,帮助读者巩固所学知识。
练习一:构建评估指标
任务描述
实现一个自定义评估指标,用于评估客服回复的"友好度"。
要求
- 实现 `FriendlinessMetric` 类
- 支持以下评估维度:
  - 礼貌用语使用(请、谢谢、抱歉等)
  - 语气积极性(正面词汇比例)
  - 回应及时性假设
- 返回 0-1 分数和详细分析
参考框架
class FriendlinessMetric:
    """Friendliness metric for customer-service replies (exercise skeleton).

    Every method marked TODO raises ``NotImplementedError`` until it is
    implemented, so an unfinished exercise fails with a clear message
    instead of silently returning ``None``.
    """

    def __init__(self):
        # TODO: populate the polite-phrase and positive-word lexicons.
        self.polite_words = []
        self.positive_words = []

    def evaluate(self, response: str, context: Dict = None) -> Dict:
        """Evaluate how friendly a reply is.

        Args:
            response: The customer-service reply text.
            context: Optional context information.

        Returns:
            A dict of the form::

                {
                    "score": float,  # score in the 0-1 range
                    "polite_score": float,
                    "positive_score": float,
                    "analysis": str
                }

        Raises:
            NotImplementedError: Until the exercise has been implemented.
        """
        # TODO: implement the scoring logic.
        raise NotImplementedError("TODO: implement FriendlinessMetric.evaluate")

    def _count_polite_words(self, text: str) -> int:
        """Count the polite expressions found in ``text``."""
        # TODO
        raise NotImplementedError(
            "TODO: implement FriendlinessMetric._count_polite_words"
        )

    def _analyze_sentiment(self, text: str) -> float:
        """Analyze the sentiment polarity of ``text``."""
        # TODO
        raise NotImplementedError(
            "TODO: implement FriendlinessMetric._analyze_sentiment"
        )
测试用例
def test_friendliness_metric():
    """Check that friendly, curt and neutral replies land in their score bands."""
    metric = FriendlinessMetric()

    # Case 1: a friendly reply must score above 0.7.
    friendly = metric.evaluate("非常抱歉给您带来不便,我马上为您处理,请稍等。")
    friendly_score = friendly["score"]
    assert friendly_score > 0.7, "友好回复得分应大于0.7"

    # Case 2: a curt reply must score below 0.4.
    curt = metric.evaluate("知道了,等一下。")
    curt_score = curt["score"]
    assert curt_score < 0.4, "冷淡回复得分应小于0.4"

    # Case 3: a neutral reply must fall inside the closed band [0.4, 0.7].
    neutral = metric.evaluate("您的订单正在处理中。")
    neutral_score = neutral["score"]
    assert neutral_score >= 0.4 and neutral_score <= 0.7, "中性回复得分应在0.4-0.7之间"
参考答案
点击查看参考实现
import re
from typing import Dict, List
class FriendlinessMetric:
    """Rule-based friendliness metric for Chinese customer-service replies.

    The overall score is a weighted average of three sub-scores, each in
    the range [0, 1]: polite-phrase usage (weight 0.4), sentiment (0.4)
    and tone (0.2).
    """

    def __init__(self):
        self.polite_words = [
            "请", "谢谢", "感谢", "抱歉", "对不起", "不好意思",
            "您", "为您", "帮助您", "请稍等", "马上", "竭诚"
        ]
        self.positive_words = [
            "高兴", "满意", "乐意", "感谢", "好的", "没问题",
            "理解", "明白", "放心", "安心", "顺利", "成功"
        ]
        # "抱歉" is intentionally absent: an apology is a courtesy (it is a
        # polite word above), so it must not also drag the sentiment down.
        self.negative_words = [
            "不行", "不能", "无法", "错误", "失败", "问题",
            "遗憾", "麻烦", "困难"
        ]
        # Phrases that promise prompt handling; they feed the timeliness
        # part of the tone score.
        self.prompt_words = ["马上", "立即", "立刻", "尽快"]

    def evaluate(self, response: str, context: Dict = None) -> Dict:
        """Evaluate how friendly a reply is.

        Args:
            response: The customer-service reply text. ``None`` is treated
                as an empty reply.
            context: Optional context information (currently unused).

        Returns:
            A dict with the keys ``score`` (weighted overall score, 0-1),
            ``polite_score``, ``sentiment_score``, ``positive_score``
            (same value as ``sentiment_score``), ``tone_score`` and
            ``analysis`` (a short textual summary).
        """
        text = response or ""

        # 1. Polite-phrase score: three distinct expressions earn full marks.
        polite_count = self._count_polite_words(text)
        polite_score = min(1.0, polite_count / 3)

        # 2. Sentiment score.
        sentiment_score = self._analyze_sentiment(text)

        # 3. Tone score (punctuation and promptness).
        tone_score = self._analyze_tone(text)

        # Weighted average of the three sub-scores.
        weights = {
            "polite": 0.4,
            "sentiment": 0.4,
            "tone": 0.2
        }
        overall = (
            polite_score * weights["polite"] +
            sentiment_score * weights["sentiment"] +
            tone_score * weights["tone"]
        )

        analysis = self._generate_analysis(polite_score, sentiment_score, tone_score)
        return {
            "score": overall,
            "polite_score": polite_score,
            "sentiment_score": sentiment_score,
            # Alias kept so the result also carries the key named in the
            # exercise framework's documented return value.
            "positive_score": sentiment_score,
            "tone_score": tone_score,
            "analysis": analysis
        }

    def _count_polite_words(self, text: str) -> int:
        """Count the distinct polite expressions present in ``text``.

        Longer entries are matched first and masked out, so a phrase such
        as "请稍等" counts once instead of also matching its prefix "请".
        """
        remaining = text
        count = 0
        for word in sorted(self.polite_words, key=len, reverse=True):
            if word in remaining:
                count += 1
                # Mask with a space so neighbours cannot fuse into a new match.
                remaining = remaining.replace(word, " ")
        return count

    def _analyze_sentiment(self, text: str) -> float:
        """Return the share of positive words among all sentiment words.

        Returns 0.5 (neutral) when no sentiment word is found. Positive
        phrases are masked before negatives are searched, so "没问题" is
        not additionally counted as the negative word "问题".
        """
        positive_count = 0
        remaining = text
        for word in self.positive_words:
            if word in text:
                positive_count += 1
                remaining = remaining.replace(word, " ")
        negative_count = sum(1 for w in self.negative_words if w in remaining)
        total = positive_count + negative_count
        if total == 0:
            return 0.5  # neutral
        return positive_count / total

    def _analyze_tone(self, text: str) -> float:
        """Score the tone from punctuation and promptness cues.

        Starts at 0.5; more than two exclamation marks subtract 0.2, a
        closing question mark adds 0.2, and a promise of prompt handling
        adds 0.2. Both ASCII and full-width punctuation are recognized.
        The result is clamped to [0, 1].
        """
        # Many exclamation marks may signal impatience.
        exclamation_count = text.count("!") + text.count("!")
        # Ending on a question suggests the agent is checking on the customer.
        ends_with_question = text.strip().endswith(("?", "?"))
        # Timeliness assumption: committing to act right away reads as attentive.
        promises_promptness = any(w in text for w in self.prompt_words)

        score = 0.5
        if exclamation_count > 2:
            score -= 0.2
        if ends_with_question:
            score += 0.2
        if promises_promptness:
            score += 0.2
        return max(0.0, min(1.0, score))

    def _generate_analysis(self, polite, sentiment, tone):
        """Build a short summary string from the three sub-scores."""
        parts = []
        if polite > 0.7:
            parts.append("礼貌用语使用充分")
        elif polite > 0.4:
            parts.append("礼貌用语使用适中")
        else:
            parts.append("礼貌用语使用不足")

        if sentiment > 0.7:
            parts.append("情感倾向积极")
        elif sentiment < 0.3:
            parts.append("情感倾向偏消极")

        # Only mention the tone when it departs from the 0.5 baseline.
        if tone > 0.6:
            parts.append("语气亲切主动")
        elif tone < 0.4:
            parts.append("语气略显急躁")
        return ";".join(parts)
练习二:构建 Golden Set
任务描述
为一个"智能翻译助手"构建评估数据集(Golden Set)。
要求
- 设计 20 个测试案例
- 覆盖以下场景:
  - 日常对话(5个)
  - 商务邮件(5个)
  - 技术文档(5个)
  - 文学翻译(5个)
- 每个案例包含:输入、参考翻译、评估标准
参考模板
# golden_set_translation_v1.yaml
# Template for the translation golden set: one `metadata` mapping plus a
# list of `cases`, each with an input, reference translations and weighted
# evaluation criteria.
metadata:
  name: "translation_golden_set"
  version: "1.0"
  language_pair: "zh-en"
  total_cases: 20
cases:
  - id: "daily_001"
    category: "daily_conversation"
    difficulty: "easy"
    input:
      source_text: "你好,请问最近的地铁站在哪里?"
      source_lang: "zh"
      target_lang: "en"
    reference:
      translations:
        - "Hello, where is the nearest subway station?"
        - "Hi, could you tell me where the nearest metro station is?"
    # The weights of the criteria in this case sum to 1.0.
    criteria:
      - name: "accuracy"
        description: "翻译准确,意思完整"
        weight: 0.5
      - name: "fluency"
        description: "英语表达自然流畅"
        weight: 0.3
      - name: "tone"
        description: "语气恰当(礼貌询问)"
        weight: 0.2
提交要求
完成 YAML 文件,确保:
- 每个案例有明确的评估标准
- 参考翻译质量高
- 覆盖不同难度级别
练习三:实现评估流水线
任务描述
实现一个完整的评估流水线,对 LLM 输出进行批量评估。
要求
class EvaluationPipeline:
    """Batch evaluation pipeline for LLM outputs (exercise skeleton).

    ``TestCase`` and ``EvalResult`` are types the reader is expected to
    define; they appear as string forward references so this skeleton can
    be imported before they exist. Every unimplemented step raises
    ``NotImplementedError`` rather than silently returning ``None``.
    """

    def __init__(self, config: Dict):
        """Initialize the pipeline.

        Args:
            config: Configuration containing evaluators, dataset, output, etc.
        """
        # TODO: build evaluators and resolve paths from ``config``.
        self.config = config

    async def run(self) -> Dict:
        """Execute the evaluation pipeline.

        Returns:
            The evaluation results and the report.

        Raises:
            NotImplementedError: Until the exercise has been implemented.
        """
        raise NotImplementedError("TODO: implement EvaluationPipeline.run")

    def _load_dataset(self, path: str) -> "List[TestCase]":
        """Load the dataset found at ``path``."""
        raise NotImplementedError("TODO: implement EvaluationPipeline._load_dataset")

    async def _evaluate_single(self, case: "TestCase") -> "EvalResult":
        """Evaluate a single test case."""
        raise NotImplementedError(
            "TODO: implement EvaluationPipeline._evaluate_single"
        )

    def _aggregate_results(self, results: "List[EvalResult]") -> Dict:
        """Aggregate the per-case results into a summary."""
        raise NotImplementedError(
            "TODO: implement EvaluationPipeline._aggregate_results"
        )

    def _generate_report(self, results: List, summary: Dict) -> str:
        """Generate the report from the results and their summary."""
        raise NotImplementedError(
            "TODO: implement EvaluationPipeline._generate_report"
        )
测试验证
async def test_pipeline():
    """Run the pipeline end to end and sanity-check the summary and report."""
    evaluators = [
        {"type": "semantic_similarity", "weight": 0.5},
        {"type": "g_eval", "weight": 0.5},
    ]
    config = {
        "dataset": "datasets/test_set.yaml",
        "evaluators": evaluators,
        "output": "results/",
    }

    pipeline = EvaluationPipeline(config)
    result = await pipeline.run()

    # At least one case was evaluated and the average score is non-negative.
    assert result["summary"]["total"] > 0
    assert result["summary"]["avg_score"] >= 0
    # The generated report is part of the returned payload.
    assert "report" in result
练习四:设计监控仪表板
任务描述
设计一个 AI 系统监控仪表板配置。
要求
使用 Grafana Dashboard JSON 格式,设计包含以下组件的仪表板:
- 总体质量概览(Gauge)
- 请求量趋势(时间序列图)
- 错误分布(饼图)
- 延迟热力图(Heatmap)
- 告警列表(Alert List)
参考结构
{
  "dashboard": {
    "title": "AI System Monitoring",
    "panels": [
      {
        "title": "Overall Quality Score",
        "type": "gauge",
        "targets": [...],
        "options": {...}
      },
      {
        "title": "Request Rate",
        "type": "timeseries",
        "targets": [...]
      }
    ]
  }
}
练习五:Agent 任务测试
任务描述
设计一个 Agent 任务测试案例集,测试"智能订餐助手"。
要求
- 定义 10 个测试任务
- 包含以下类型:
  - 简单查询(菜单、价格)
  - 复杂操作(下单、修改订单)
  - 异常处理(缺货、网络错误)
  - 多轮对话(推荐、比价)
模板
# Template for agent task tests: each task has an input message, a list of
# success criteria and the optimal number of steps.
agent_tasks:
  - id: "order_001"
    name: "查询今日菜单"
    type: "query"
    input:
      user_message: "今天有什么菜?"
    success_criteria:
      - type: "output_match"
        contains: ["菜单", "菜品"]
      - type: "tool_invocation"
        expected_tools: ["get_menu"]
    optimal_steps: 2
  - id: "order_002"
    name: "下单购买"
    type: "action"
    input:
      user_message: "我要一份宫保鸡丁和一碗米饭"
    success_criteria:
      - type: "state_change"
        expected:
          - action: "create_order"
            items: ["宫保鸡丁", "米饭"]
      - type: "output_match"
        contains: ["下单成功", "订单号"]
    optimal_steps: 3
练习六:实现 A/B 测试框架
任务描述
实现一个简单的 A/B 测试框架,用于对比两个模型版本。
要求
class ABTestFramework:
    """A/B testing framework (exercise skeleton).

    Every method raises ``NotImplementedError`` until it is implemented,
    so an unfinished exercise fails with a clear message instead of
    silently returning ``None``.
    """

    def setup_test(self, name: str, variant_a, variant_b, split: float = 0.5):
        """Set up a test.

        Args:
            name: Name of the test.
            variant_a: Configuration of variant A.
            variant_b: Configuration of variant B.
            split: Traffic split between the variants; defaults to 0.5.
        """
        raise NotImplementedError("TODO: implement ABTestFramework.setup_test")

    def route_request(self) -> str:
        """Route a request to a variant and return the variant's label."""
        raise NotImplementedError("TODO: implement ABTestFramework.route_request")

    def record_result(self, variant: str, metrics: Dict):
        """Record the metrics observed for one request of ``variant``."""
        raise NotImplementedError("TODO: implement ABTestFramework.record_result")

    def analyze(self) -> Dict:
        """Analyze the recorded results."""
        raise NotImplementedError("TODO: implement ABTestFramework.analyze")

    def statistical_significance(self) -> bool:
        """Check whether the observed difference is statistically significant."""
        raise NotImplementedError(
            "TODO: implement ABTestFramework.statistical_significance"
        )
测试场景
def test_ab_framework():
    """Simulate 100 requests and expect variant B to win significantly.

    Scores are drawn around a per-variant mean (0.75 for A, 0.85 for B)
    with seeded noise. Identical scores inside each variant would give
    zero within-group variance, which leaves a significance test with a
    zero standard error.
    """
    import random

    rng = random.Random(42)  # fixed seed keeps the simulated scores reproducible

    ab = ABTestFramework()
    ab.setup_test(
        name="model_comparison",
        variant_a={"model": "gpt-3.5-turbo"},
        variant_b={"model": "gpt-4"},
        split=0.5
    )

    # Simulate requests.
    for _ in range(100):
        variant = ab.route_request()
        # Simulate the evaluation: noisy score around the variant's mean,
        # clamped to the valid [0, 1] range.
        mean_score = 0.85 if variant == "B" else 0.75
        score = min(1.0, max(0.0, rng.gauss(mean_score, 0.05)))
        ab.record_result(variant, {"score": score})

    result = ab.analyze()
    assert result["significant"]
    assert result["recommendation"] == "adopt_b"
综合项目
项目:构建完整的客服 AI Harness
目标
构建一个完整的客服 AI 评估系统,包含:
- 评估指标体系(至少 5 个指标)
- Golden Set(至少 50 个案例)
- 评估流水线(批量执行)
- 监控系统(Prometheus + Grafana 配置)
- 报告系统(自动生成报告)
交付物
project/
├── config/
│ ├── eval_config.yaml
│ └── monitoring_config.yaml
├── datasets/
│ ├── golden_set.yaml
│ ├── boundary_set.yaml
│ └── adversarial_set.yaml
├── src/
│ ├── evaluators/
│ │ ├── accuracy.py
│ │ ├── friendliness.py
│ │ └── safety.py
│ ├── pipeline.py
│ └── report_generator.py
├── monitoring/
│ ├── prometheus.yml
│ └── grafana_dashboard.json
└── README.md
评估标准
| 组件 | 分值 | 评分标准 |
|---|---|---|
| 指标设计 | 20分 | 完整性、合理性、可实现性 |
| 数据集 | 20分 | 覆盖度、质量、多样性 |
| 流水线 | 20分 | 功能完整、代码质量 |
| 监控 | 20分 | 配置正确、可视化清晰 |
| 文档 | 20分 | 说明完整、可复现 |
参考答案获取
完整的参考答案请访问:answers/ 目录