Skip to content

Commit 72a9641

Browse files
committed
优化了6个计算指标,忽略预测长度为0、gt长度为0的数据,只计算有效数据
2 parents 5e39ad3 + b90b73d commit 72a9641

7 files changed

Lines changed: 890 additions & 44 deletions

File tree

examples/basic_usage.py

Lines changed: 260 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -944,13 +944,272 @@ def demo_multi_extraction():
944944
print(f" - {name}_time: 单样本抽取耗时(秒)")
945945
print(f" - {name}_*_score: 各指标得分(如{name}_text_edit)")
946946

947+
948+
def demo_llm_webkit_with_preprocessed_html_evaluation(
    jsonl_file_path="/home/lulindong/Pycharm_projects/cc/WebMainBench_llm-webkit_v1_WebMainBench_dataset_merge_with_llm_webkit.jsonl",
):
    """Demonstrate evaluating LLM-WebKit in preprocessed-HTML mode.

    The demo loads a JSONL dataset, builds an ``llm-webkit`` extractor with
    ``use_preprocessed_html`` enabled, runs the evaluator over the whole
    dataset, prints the overall metrics and a short per-sample preview, and
    saves a JSON result file plus a CSV summary under ``results/``.

    Args:
        jsonl_file_path: Path of the JSONL dataset handed to
            ``DataLoader.load_jsonl``. Defaults to the location used by the
            original demo, so existing zero-argument calls keep working.
    """
    print("\n=== LLM-WebKit 预处理HTML功能演示 ===\n")

    # Configure logging.
    setup_logging(level="INFO")

    # 1. Build the test dataset that carries preprocessed HTML.
    print("1. 创建包含预处理HTML的测试数据集...")

    samples = []

    # NOTE: the two records below only illustrate the expected record layout
    # (raw `html`, simplified `llm_webkit_html` with `_item_id` markers, and
    # ground truth). They are not added to `samples`: the
    # `samples.append(DataSample.from_dict(...))` calls are disabled, and the
    # dataset is loaded from the JSONL file instead.

    # Sample 1: preprocessed HTML (simulating the output of the first-stage
    # LLM simplification).
    sample_1_data = {
        "id": "preprocessed_sample_1",
        "html": """<html><body><h1>原始复杂HTML</h1><p>这里是原始的复杂HTML内容...</p></body></html>""",
        # Key field: llm_webkit_html holds the simplified, preprocessed HTML.
        "llm_webkit_html": """
        <div _item_id="1">
            <h1>深度学习基础教程</h1>
            <p>深度学习是机器学习的一个重要分支,通过多层神经网络来学习数据的表征。</p>
        </div>
        <div _item_id="2">
            <h2>核心概念</h2>
            <p>神经网络由多个层组成,每层包含多个神经元。</p>
        </div>
        <div _item_id="3">
            <pre><code class="language-python">
import torch
import torch.nn as nn

class SimpleNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(784, 10)

    def forward(self, x):
        return self.fc(x)
            </code></pre>
        </div>
        """,
        "groundtruth_content": """# 深度学习基础教程

深度学习是机器学习的一个重要分支,通过多层神经网络来学习数据的表征。

## 核心概念

神经网络由多个层组成,每层包含多个神经元。

```python
import torch
import torch.nn as nn

class SimpleNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(784, 10)

    def forward(self, x):
        return self.fc(x)
```""",
        "groundtruth_content_list": [
            {"type": "heading", "content": "深度学习基础教程", "level": 1},
            {"type": "paragraph", "content": "深度学习是机器学习的一个重要分支,通过多层神经网络来学习数据的表征。"},
            {"type": "heading", "content": "核心概念", "level": 2},
            {"type": "paragraph", "content": "神经网络由多个层组成,每层包含多个神经元。"},
            {"type": "code", "content": "import torch\nimport torch.nn as nn\n\nclass SimpleNet(nn.Module):\n    def __init__(self):\n        super().__init__()\n        self.fc = nn.Linear(784, 10)\n    \n    def forward(self, x):\n        return self.fc(x)", "language": "python"},
        ],
    }

    # Sample 2: preprocessed HTML that contains a table.
    sample_2_data = {
        "id": "preprocessed_sample_2",
        "html": """<html><body><h1>原始表格页面</h1><table>...</table></body></html>""",
        "llm_webkit_html": """
        <div _item_id="1">
            <h1>模型性能对比</h1>
            <p>以下是不同深度学习模型在CIFAR-10数据集上的表现:</p>
        </div>
        <div _item_id="2">
            <table>
                <thead>
                    <tr>
                        <th>模型</th>
                        <th>准确率</th>
                        <th>参数量</th>
                    </tr>
                </thead>
                <tbody>
                    <tr>
                        <td>ResNet-18</td>
                        <td>95.3%</td>
                        <td>11.7M</td>
                    </tr>
                    <tr>
                        <td>VGG-16</td>
                        <td>92.7%</td>
                        <td>138M</td>
                    </tr>
                </tbody>
            </table>
        </div>
        """,
        "groundtruth_content": """# 模型性能对比

以下是不同深度学习模型在CIFAR-10数据集上的表现:

| 模型 | 准确率 | 参数量 |
|------|--------|--------|
| ResNet-18 | 95.3% | 11.7M |
| VGG-16 | 92.7% | 138M |""",
        "groundtruth_content_list": [
            {"type": "heading", "content": "模型性能对比", "level": 1},
            {"type": "paragraph", "content": "以下是不同深度学习模型在CIFAR-10数据集上的表现:"},
            {"type": "table", "content": "| 模型 | 准确率 | 参数量 |\n|------|--------|---------|\n| ResNet-18 | 95.3% | 11.7M |\n| VGG-16 | 92.7% | 138M |"},
        ],
    }

    # Load the dataset from the local JSONL file.
    dataset = DataLoader.load_jsonl(jsonl_file_path)
    # `samples` is empty unless the inline records above are re-enabled.
    for sample in samples:
        dataset.add_sample(sample)

    print(f"✅ 测试数据集包含 {len(dataset)} 个样本")
    print("📋 每个样本都包含:")
    print(" - html: 原始复杂HTML")
    print(" - llm_webkit_html: 预处理后的简化HTML(包含_item_id标记)")
    print(" - groundtruth_content: 标准答案")
    print()

    # 2. Create the LLM-WebKit extractor in preprocessed-HTML mode.
    print("2. 创建预处理HTML模式的LLM-WebKit抽取器...")

    config = {
        "use_preprocessed_html": True,  # key switch: enable preprocessed-HTML mode
        "preprocessed_html_field": "llm_webkit_html",  # field holding the preprocessed HTML
    }

    extractor = ExtractorFactory.create("llm-webkit", config=config)
    print("✅ 抽取器创建成功")
    print("📋 配置信息:")
    print(f" - use_preprocessed_html: {extractor.inference_config.use_preprocessed_html}")
    print(f" - preprocessed_html_field: {extractor.inference_config.preprocessed_html_field}")
    print(" - 跳过LLM推理: 是(直接处理预处理HTML)")
    print()

    # 3. Describe the advantages of the preprocessed-HTML mode.
    print("3. 性能优势演示...")
    print("🚀 预处理HTML模式的优势:")
    print(" ✅ 无需加载大型LLM模型(节省内存)")
    print(" ✅ 跳过HTML简化推理步骤(节省时间)")
    print(" ✅ 只需要基础的llm_web_kit依赖")
    print(" ✅ 适合批量处理已预处理的数据")
    print()

    # 4. Run the evaluation over every sample (max_samples=None).
    print("4. 开始评测...")
    print("=" * 50)

    evaluator = Evaluator()
    result = evaluator.evaluate(
        dataset=dataset,
        extractor=extractor,
        max_samples=None,
    )

    # 5. Show the evaluation results.
    print("\n5. 📊 预处理HTML模式评测结果:")
    print("=" * 50)

    results_dict = result.to_dict()
    metrics = results_dict.get('overall_metrics', {})

    # Headline metrics; a missing metric is displayed as 0.
    print("\n🏆 综合指标:")
    print(f" overall: {metrics.get('overall', 0):.4f}")

    print("\n📝 内容提取质量:")
    print(f" text_edit: {metrics.get('text_edit', 0):.4f}")
    print(f" code_edit: {metrics.get('code_edit', 0):.4f}")
    print(f" table_edit: {metrics.get('table_edit', 0):.4f}")
    print(f" table_TEDS: {metrics.get('table_TEDS', 0):.4f}")

    print("\n⚡ 性能统计:")
    sample_results = results_dict.get('sample_results', [])
    if sample_results:
        # Filter the successful samples once; both the timing statistics and
        # the success count are derived from this list.
        successful = [s for s in sample_results if s.get('extraction_success')]
        extraction_times = [s.get('extraction_time', 0) for s in successful]
        if extraction_times:
            avg_time = sum(extraction_times) / len(extraction_times)
            print(f" 平均提取时间: {avg_time:.3f}秒")
            # Guard the throughput line: an average of 0 seconds would
            # otherwise raise ZeroDivisionError.
            if avg_time > 0:
                print(f" 处理速度: {1 / avg_time:.1f}样本/秒")

        success_count = len(successful)
        print(f" 成功样本数: {success_count}/{len(dataset)}")

    # 6. Preview the extraction output of the first samples.
    print("\n6. 📄 样本提取结果预览:")
    print("-" * 50)

    for i, sample_result in enumerate(sample_results[:2]):  # first two samples only
        print(f"\n样本 {i+1}: {sample_result.get('sample_id', 'Unknown')}")
        if sample_result.get('extraction_success'):
            content = sample_result.get('extracted_content', '')
            preview = content[:100].replace('\n', ' ') if content else '无内容'
            print(" ✅ 提取成功")
            print(f" 📝 内容预览: {preview}...")
            print(f" ⏱️ 提取时间: {sample_result.get('extraction_time', 0):.3f}秒")
        else:
            print(" ❌ 提取失败")

    # 7. Persist the results.
    print("\n7. 💾 保存评测结果...")

    results_dir = Path("results")
    results_dir.mkdir(exist_ok=True)

    results_path = results_dir / "preprocessed_html_evaluation_results.json"
    report_path = results_dir / "preprocessed_html_evaluation_report.csv"

    DataSaver.save_evaluation_results(result, results_path)
    DataSaver.save_summary_report(result, report_path)

    print(f"✅ 详细结果已保存到: {results_path}")
    print(f"✅ CSV报告已保存到: {report_path}")

    # 8. Usage advice.
    print("\n8. 💡 实际使用建议:")
    print("=" * 50)
    print("🔧 何时使用预处理HTML模式:")
    print(" 1. 已有LLM简化后的HTML数据")
    print(" 2. 需要批量处理大量数据")
    print(" 3. 部署环境内存有限")
    print(" 4. 对提取速度有较高要求")
    print()
    print("📝 数据准备要求:")
    print(" 1. 确保预处理HTML包含_item_id属性")
    print(" 2. 保持原始HTML作为备用")
    print(" 3. 验证预处理质量")
    print()
    print("⚙️ 配置参数说明:")
    print(" - use_preprocessed_html: True/False")
    print(" - preprocessed_html_field: 字段名(默认'llm_webkit_html')")

    print("\n✅ 预处理HTML功能演示完成!")
1203+
1204+
9471205
if __name__ == "__main__":
9481206
try:
9491207
# demo_basic_mock_evaluation()
9501208
# demo_llm_webkit_evaluation() # 使用LLM-WebKit评测示例
1209+
demo_llm_webkit_with_preprocessed_html_evaluation()
9511210
# demo_extractor_comparison()
9521211
# demo_dataset_with_extraction() # 演示保存带有抽取内容的数据集
953-
demo_multi_extraction() # 演示多个抽取器同时评测
1212+
# demo_multi_extraction() # 演示多个抽取器同时评测
9541213
# demo_lld_workers_extraction()
9551214
print("\n✅ 示例运行完成!")
9561215

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,4 +7,5 @@ torch
77
html2text
88
resiliparse
99
trafilatura
10+
# llm-web-kit==3.2.0
1011
https://github.com/opendatalab/magic-html/releases/download/magic_html-0.1.5-released/magic_html-0.1.5-py3-none-any.whl
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
extractor,dataset,total_samples,success_rate,overall,code_edit,formula_edit,table_TEDS,table_edit,text_edit
2+
llm-webkit,preprocessed_html_test,2,1.0,0.5029,0.5,1.0,0.5,0.5,0.0143

0 commit comments

Comments
 (0)