@@ -804,7 +804,7 @@ def demo_multi_extraction():
804804 # 配置文件路径
805805 data_dir = Path ("../data" )
806806 # dataset_path = data_dir / "sample_dataset.jsonl"
807- dataset_path = "/home/lulindong/Pycharm_projects/cc/WebMainBench_llm-webkit_v1_WebMainBench_dataset_merge_with_llm_webkit .jsonl"
807+ dataset_path = "/home/lulindong/Pycharm_projects/cc/WebMainBench_1904_v1_WebMainBench_dataset_merge_with_llm_webkit .jsonl"
808808
809809 print (f"📂 数据集文件: { dataset_path } " )
810810
@@ -957,7 +957,7 @@ def demo_llm_webkit_with_preprocessed_html_evaluation():
957957 print ("1. 从真实数据集加载预处理HTML数据..." )
958958
959959 # 使用DataLoader加载真实的样本数据
960- dataset_path = Path ("data/WebMainBench_dataset_sample2 .jsonl" )
960+ dataset_path = Path ("/home/lulindong/Pycharm_projects/cc/WebMainBench_1904_v1_WebMainBench_dataset_merge_with_llm_webkit .jsonl" )
961961 print (f"📂 数据集文件: { dataset_path } " )
962962
963963 if not dataset_path .exists ():
@@ -969,7 +969,6 @@ def demo_llm_webkit_with_preprocessed_html_evaluation():
969969 dataset = DataLoader .load_jsonl (dataset_path , include_results = False )
970970 dataset .name = "real_preprocessed_html_test"
971971 dataset .description = "基于真实数据的预处理HTML功能测试"
972-
973972
974973 print (f"✅ 真实数据集加载成功,包含 { len (dataset )} 个样本" )
975974 print ("📋 真实数据样本包含:" )
@@ -1069,15 +1068,22 @@ def demo_llm_webkit_with_preprocessed_html_evaluation():
10691068 print (f" ⏱️ 提取时间: { sample_result .get ('extraction_time' , 0 ):.3f} 秒" )
10701069 else :
10711070 print (f" ❌ 提取失败" )
1072-
10731071 # 7. 保存结果
10741072 print (f"\n 7. 💾 保存评测结果..." )
10751073
10761074 results_dir = Path ("results" )
10771075 results_dir .mkdir (exist_ok = True )
1078-
1079- results_path = results_dir / "preprocessed_html_evaluation_results.json"
1080- report_path = results_dir / "preprocessed_html_evaluation_report.csv"
1076+ # 新增:保存带抽取结果的增强数据集(JSONL格式)
1077+ jsonl_dataset_path = results_dir / f"{ extractor .name } _preprocessed_html_dataset_with_results.jsonl"
1078+ DataSaver .save_dataset_with_extraction (
1079+ results = result ,
1080+ dataset = dataset , # 原始数据集对象
1081+ file_path = jsonl_dataset_path ,
1082+ extractor_name = "llm-webkit" # 抽取器名称前缀
1083+ )
1084+ print (f"✅ 带抽取结果的JSONL数据集已保存到: { jsonl_dataset_path } " )
1085+ results_path = results_dir / f"{ extractor .name } _preprocessed_html_evaluation_results.json"
1086+ report_path = results_dir / f"{ extractor .name } _preprocessed_html_evaluation_report.csv"
10811087
10821088 DataSaver .save_evaluation_results (result , results_path )
10831089 DataSaver .save_summary_report (result , report_path )
0 commit comments