Skip to content

Commit a575621

Browse files
committed
add gt and pre of code formula table text in result jsonl
1 parent be911c8 commit a575621

2 files changed

Lines changed: 32 additions & 10 deletions

File tree

examples/basic_usage.py

Lines changed: 15 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -804,7 +804,7 @@ def demo_multi_extraction():
804804
# 配置文件路径
805805
data_dir = Path("../data")
806806
# dataset_path = data_dir / "sample_dataset.jsonl"
807-
dataset_path = "/home/lulindong/Pycharm_projects/cc/WebMainBench_llm-webkit_v1_WebMainBench_dataset_merge_with_llm_webkit.jsonl"
807+
dataset_path = "/home/lulindong/Pycharm_projects/cc/WebMainBench_2456_v1_WebMainBench_dataset_merge_with_llm_webkit.jsonl"
808808

809809
print(f"📂 数据集文件: {dataset_path}")
810810

@@ -957,7 +957,7 @@ def demo_llm_webkit_with_preprocessed_html_evaluation():
957957
print("1. 从真实数据集加载预处理HTML数据...")
958958

959959
# 使用DataLoader加载真实的样本数据
960-
dataset_path = Path("data/WebMainBench_dataset_sample2.jsonl")
960+
dataset_path = Path("/home/lulindong/Pycharm_projects/cc/WebMainBench_2456_v1_WebMainBench_dataset_merge_with_llm_webkit.jsonl")
961961
print(f"📂 数据集文件: {dataset_path}")
962962

963963
if not dataset_path.exists():
@@ -969,7 +969,6 @@ def demo_llm_webkit_with_preprocessed_html_evaluation():
969969
dataset = DataLoader.load_jsonl(dataset_path, include_results=False)
970970
dataset.name = "real_preprocessed_html_test"
971971
dataset.description = "基于真实数据的预处理HTML功能测试"
972-
973972

974973
print(f"✅ 真实数据集加载成功,包含 {len(dataset)} 个样本")
975974
print("📋 真实数据样本包含:")
@@ -1078,15 +1077,22 @@ def demo_llm_webkit_with_preprocessed_html_evaluation():
10781077
print(f" ⏱️ 提取时间: {sample_result.get('extraction_time', 0):.3f}秒")
10791078
else:
10801079
print(f" ❌ 提取失败")
1081-
10821080
# 7. 保存结果
10831081
print(f"\n7. 💾 保存评测结果...")
10841082

10851083
results_dir = Path("results")
10861084
results_dir.mkdir(exist_ok=True)
1087-
1088-
results_path = results_dir / "preprocessed_html_evaluation_results.json"
1089-
report_path = results_dir / "preprocessed_html_evaluation_report.csv"
1085+
# 新增:保存带抽取结果的增强数据集(JSONL格式)
1086+
jsonl_dataset_path = results_dir / f"{extractor.name}_preprocessed_html_dataset_with_results.jsonl"
1087+
DataSaver.save_dataset_with_extraction(
1088+
results=result,
1089+
dataset=dataset, # 原始数据集对象
1090+
file_path=jsonl_dataset_path,
1091+
extractor_name="llm-webkit" # 抽取器名称前缀
1092+
)
1093+
print(f"✅ 带抽取结果的JSONL数据集已保存到: {jsonl_dataset_path}")
1094+
results_path = results_dir / f"{extractor.name}_preprocessed_html_evaluation_results.json"
1095+
report_path = results_dir / f"{extractor.name}_preprocessed_html_evaluation_report.csv"
10901096

10911097
DataSaver.save_evaluation_results(result, results_path)
10921098
DataSaver.save_summary_report(result, report_path)
@@ -1119,10 +1125,10 @@ def demo_llm_webkit_with_preprocessed_html_evaluation():
11191125
try:
11201126
# demo_basic_mock_evaluation()
11211127
# demo_llm_webkit_evaluation() # 使用LLM-WebKit评测示例
1122-
demo_llm_webkit_with_preprocessed_html_evaluation()
1128+
# demo_llm_webkit_with_preprocessed_html_evaluation()
11231129
# demo_extractor_comparison()
11241130
# demo_dataset_with_extraction() # 演示保存带有抽取内容的数据集
1125-
# demo_multi_extraction() # 演示多个抽取器同时评测
1131+
demo_multi_extraction() # 演示多个抽取器同时评测
11261132
# demo_lld_workers_extraction()
11271133
print("\n✅ 示例运行完成!")
11281134

webmainbench/data/saver.py

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -239,6 +239,7 @@ def save_dataset_with_extraction(results: Union["EvaluationResult", Dict[str, An
239239

240240
# Add extraction results if available
241241
extraction_result = extraction_map.get(sample.id)
242+
from webmainbench.metrics.base import BaseMetric
242243
if extraction_result:
243244
# Add extracted content with extractor name prefix
244245
sample_dict[f'{extractor_name}_content'] = extraction_result.get('extracted_content', '')
@@ -251,7 +252,22 @@ def save_dataset_with_extraction(results: Union["EvaluationResult", Dict[str, An
251252
for metric_name, metric_data in metrics.items():
252253
if isinstance(metric_data, dict) and metric_data.get('success', False):
253254
sample_dict[f'{extractor_name}_{metric_name}_score'] = metric_data.get('score', 0)
254-
255+
256+
# 解析预测值(predicted)
257+
predicted_content = extraction_result.get('extracted_content', '')
258+
predicted_parts = BaseMetric._extract_from_markdown(predicted_content) # 关键:解析预测内容
259+
for part_type in ['code', 'formula', 'table', 'text']:
260+
sample_dict[f'{extractor_name}_predicted_{part_type}'] = predicted_parts.get(part_type, '')
261+
262+
# 解析真实值(groundtruth)
263+
groundtruth_content = sample_dict.get('groundtruth_content', '')
264+
groundtruth_parts = BaseMetric._extract_from_markdown(groundtruth_content) # 关键:解析真实内容
265+
for part_type in ['code', 'formula', 'table', 'text']:
266+
sample_dict[f'{extractor_name}_groundtruth_{part_type}'] = groundtruth_parts.get(part_type,
267+
'')
268+
269+
270+
255271
enriched_samples.append(sample_dict)
256272

257273
# Save as JSONL

0 commit comments

Comments (0)