Skip to content

Commit a0f4d63

Browse files
committed
test demo_llm_webkit_with_preprocessed_html_evaluation()
1 parent 2460023 commit a0f4d63

1 file changed

Lines changed: 36 additions & 35 deletions

File tree

examples/basic_usage.py

Lines changed: 36 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -804,7 +804,7 @@ def demo_multi_extraction():
804804
# 配置文件路径
805805
data_dir = Path("../data")
806806
# dataset_path = data_dir / "sample_dataset.jsonl"
807-
dataset_path = "/home/lulindong/Pycharm_projects/cc/WebMainBench_llm-webkit_v1_WebMainBench_dataset_merge_with_llm_webkit.jsonl"
807+
dataset_path = "/home/lulindong/Pycharm_projects/cc/test_10.jsonl"
808808

809809
print(f"📂 数据集文件: {dataset_path}")
810810

@@ -947,17 +947,17 @@ def demo_multi_extraction():
947947

948948
def demo_llm_webkit_with_preprocessed_html_evaluation():
949949
"""演示LLM-WebKit预处理HTML功能的评测"""
950-
950+
951951
print("\n=== LLM-WebKit 预处理HTML功能演示 ===\n")
952-
952+
953953
# 设置日志
954954
setup_logging(level="INFO")
955-
955+
956956
# 1. 创建包含预处理HTML的测试数据集
957957
print("1. 创建包含预处理HTML的测试数据集...")
958-
958+
959959
samples = []
960-
960+
961961
# 样本1: 包含预处理的HTML(模拟第一阶段LLM简化后的结果)
962962
sample_1_data = {
963963
"id": "preprocessed_sample_1",
@@ -1016,10 +1016,10 @@ def forward(self, x):
10161016
]
10171017
}
10181018
# samples.append(DataSample.from_dict(sample_1_data))
1019-
1019+
10201020
# 样本2: 包含表格的预处理HTML
10211021
sample_2_data = {
1022-
"id": "preprocessed_sample_2",
1022+
"id": "preprocessed_sample_2",
10231023
"html": """<html><body><h1>原始表格页面</h1><table>...</table></body></html>""",
10241024
"llm_webkit_html": """
10251025
<div _item_id="1">
@@ -1065,8 +1065,8 @@ def forward(self, x):
10651065
]
10661066
}
10671067
# samples.append(DataSample.from_dict(sample_2_data))
1068-
#
1069-
# # 创建数据集并添加样本
1068+
1069+
# 创建数据集并添加样本
10701070
# dataset = BenchmarkDataset(name="preprocessed_html_test", description="预处理HTML功能测试数据集")
10711071

10721072

@@ -1076,32 +1076,34 @@ def forward(self, x):
10761076

10771077
# 使用DataLoader加载本地JSONL数据
10781078
dataset = DataLoader.load_jsonl(jsonl_file_path)
1079-
for sample in samples:
1080-
dataset.add_sample(sample)
1081-
1079+
# for sample in samples:
1080+
# dataset.add_sample(sample)
1081+
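# TODO: before running the evaluation, add a check that the extractor actually reads the intended HTML field (no such check is implemented here yet)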
1082+
1083+
10821084
print(f"✅ 测试数据集包含 {len(dataset)} 个样本")
10831085
print("📋 每个样本都包含:")
10841086
print(" - html: 原始复杂HTML")
10851087
print(" - llm_webkit_html: 预处理后的简化HTML(包含_item_id标记)")
10861088
print(" - groundtruth_content: 标准答案")
10871089
print()
1088-
1090+
10891091
# 2. 创建预处理HTML模式的LLM-WebKit抽取器
10901092
print("2. 创建预处理HTML模式的LLM-WebKit抽取器...")
1091-
1093+
10921094
config = {
10931095
"use_preprocessed_html": True, # 🔑 关键配置:启用预处理HTML模式
10941096
"preprocessed_html_field": "llm_webkit_html" # 指定预处理HTML字段名
10951097
}
1096-
1098+
10971099
extractor = ExtractorFactory.create("llm-webkit", config=config)
10981100
print(f"✅ 抽取器创建成功")
10991101
print(f"📋 配置信息:")
11001102
print(f" - use_preprocessed_html: {extractor.inference_config.use_preprocessed_html}")
11011103
print(f" - preprocessed_html_field: {extractor.inference_config.preprocessed_html_field}")
11021104
print(f" - 跳过LLM推理: 是(直接处理预处理HTML)")
11031105
print()
1104-
1106+
11051107
# 3. 性能对比:展示预处理HTML模式的优势
11061108
print("3. 性能优势演示...")
11071109
print("🚀 预处理HTML模式的优势:")
@@ -1110,35 +1112,36 @@ def forward(self, x):
11101112
print(" ✅ 只需要基础的llm_web_kit依赖")
11111113
print(" ✅ 适合批量处理已预处理的数据")
11121114
print()
1113-
1115+
11141116
# 4. 运行评测
11151117
print("4. 开始评测...")
11161118
print("=" * 50)
1117-
1119+
11181120
evaluator = Evaluator()
11191121
result = evaluator.evaluate(
11201122
dataset=dataset,
11211123
extractor=extractor,
11221124
max_samples=None
11231125
)
1124-
1126+
11251127
# 5. 显示评测结果
11261128
print("\n5. 📊 预处理HTML模式评测结果:")
11271129
print("=" * 50)
1128-
1130+
11291131
results_dict = result.to_dict()
11301132
metrics = results_dict.get('overall_metrics', {})
1131-
1133+
11321134
# 显示关键指标
11331135
print(f"\n🏆 综合指标:")
11341136
print(f" overall: {metrics.get('overall', 0):.4f}")
1135-
1137+
11361138
print(f"\n📝 内容提取质量:")
1139+
print(f" formula_edit: {metrics.get('formula_edit', 0):.4f}")
11371140
print(f" text_edit: {metrics.get('text_edit', 0):.4f}")
11381141
print(f" code_edit: {metrics.get('code_edit', 0):.4f}")
11391142
print(f" table_edit: {metrics.get('table_edit', 0):.4f}")
11401143
print(f" table_TEDS: {metrics.get('table_TEDS', 0):.4f}")
1141-
1144+
11421145
print(f"\n⚡ 性能统计:")
11431146
sample_results = results_dict.get('sample_results', [])
11441147
if sample_results:
@@ -1147,14 +1150,14 @@ def forward(self, x):
11471150
avg_time = sum(extraction_times) / len(extraction_times)
11481151
print(f" 平均提取时间: {avg_time:.3f}秒")
11491152
print(f" 处理速度: {1/avg_time:.1f}样本/秒")
1150-
1153+
11511154
success_count = len([s for s in sample_results if s.get('extraction_success', False)])
11521155
print(f" 成功样本数: {success_count}/{len(dataset)}")
1153-
1156+
11541157
# 6. 展示样本提取结果
11551158
print(f"\n6. 📄 样本提取结果预览:")
11561159
print("-" * 50)
1157-
1160+
11581161
for i, sample_result in enumerate(sample_results[:2]): # 只显示前2个样本
11591162
print(f"\n样本 {i+1}: {sample_result.get('sample_id', 'Unknown')}")
11601163
if sample_result.get('extraction_success'):
@@ -1165,22 +1168,21 @@ def forward(self, x):
11651168
print(f" ⏱️ 提取时间: {sample_result.get('extraction_time', 0):.3f}秒")
11661169
else:
11671170
print(f" ❌ 提取失败")
1168-
11691171
# 7. 保存结果
11701172
print(f"\n7. 💾 保存评测结果...")
1171-
1173+
11721174
results_dir = Path("results")
11731175
results_dir.mkdir(exist_ok=True)
1174-
1176+
11751177
results_path = results_dir / "preprocessed_html_evaluation_results.json"
11761178
report_path = results_dir / "preprocessed_html_evaluation_report.csv"
1177-
1179+
11781180
DataSaver.save_evaluation_results(result, results_path)
11791181
DataSaver.save_summary_report(result, report_path)
1180-
1182+
11811183
print(f"✅ 详细结果已保存到: {results_path}")
11821184
print(f"✅ CSV报告已保存到: {report_path}")
1183-
1185+
11841186
# 8. 使用建议
11851187
print(f"\n8. 💡 实际使用建议:")
11861188
print("=" * 50)
@@ -1198,9 +1200,8 @@ def forward(self, x):
11981200
print("⚙️ 配置参数说明:")
11991201
print(" - use_preprocessed_html: True/False")
12001202
print(" - preprocessed_html_field: 字段名(默认'llm_webkit_html')")
1201-
1202-
print("\n✅ 预处理HTML功能演示完成!")
12031203

1204+
print("\n✅ 预处理HTML功能演示完成!")
12041205

12051206
if __name__ == "__main__":
12061207
try:

0 commit comments

Comments
 (0)