@@ -804,7 +804,7 @@ def demo_multi_extraction():
804804 # 配置文件路径
805805 data_dir = Path ("../data" )
806806 # dataset_path = data_dir / "sample_dataset.jsonl"
807- dataset_path = "/home/lulindong/Pycharm_projects/cc/WebMainBench_llm-webkit_v1_WebMainBench_dataset_merge_with_llm_webkit .jsonl"
807+ dataset_path = "/home/lulindong/Pycharm_projects/cc/test_10 .jsonl"
808808
809809 print (f"📂 数据集文件: { dataset_path } " )
810810
@@ -947,17 +947,17 @@ def demo_multi_extraction():
947947
948948def demo_llm_webkit_with_preprocessed_html_evaluation ():
949949 """演示LLM-WebKit预处理HTML功能的评测"""
950-
950+
951951 print ("\n === LLM-WebKit 预处理HTML功能演示 ===\n " )
952-
952+
953953 # 设置日志
954954 setup_logging (level = "INFO" )
955-
955+
956956 # 1. 创建包含预处理HTML的测试数据集
957957 print ("1. 创建包含预处理HTML的测试数据集..." )
958-
958+
959959 samples = []
960-
960+
961961 # 样本1: 包含预处理的HTML(模拟第一阶段LLM简化后的结果)
962962 sample_1_data = {
963963 "id" : "preprocessed_sample_1" ,
@@ -1016,10 +1016,10 @@ def forward(self, x):
10161016 ]
10171017 }
10181018 # samples.append(DataSample.from_dict(sample_1_data))
1019-
1019+
10201020 # 样本2: 包含表格的预处理HTML
10211021 sample_2_data = {
1022- "id" : "preprocessed_sample_2" ,
1022+ "id" : "preprocessed_sample_2" ,
10231023 "html" : """<html><body><h1>原始表格页面</h1><table>...</table></body></html>""" ,
10241024 "llm_webkit_html" : """
10251025 <div _item_id="1">
@@ -1065,8 +1065,8 @@ def forward(self, x):
10651065 ]
10661066 }
10671067 # samples.append(DataSample.from_dict(sample_2_data))
1068- #
1069- # # 创建数据集并添加样本
1068+
1069+ # 创建数据集并添加样本
10701070 # dataset = BenchmarkDataset(name="preprocessed_html_test", description="预处理HTML功能测试数据集")
10711071
10721072
@@ -1076,32 +1076,34 @@ def forward(self, x):
10761076
10771077 # 使用DataLoader加载本地JSONL数据
10781078 dataset = DataLoader .load_jsonl (jsonl_file_path )
1079- for sample in samples :
1080- dataset .add_sample (sample )
1081-
1079+ # for sample in samples:
1080+ # dataset.add_sample(sample)
1081+ # 在评测前添加,验证抽取器是否使用了正确的HTML字段
1082+
1083+
10821084 print (f"✅ 测试数据集包含 { len (dataset )} 个样本" )
10831085 print ("📋 每个样本都包含:" )
10841086 print (" - html: 原始复杂HTML" )
10851087 print (" - llm_webkit_html: 预处理后的简化HTML(包含_item_id标记)" )
10861088 print (" - groundtruth_content: 标准答案" )
10871089 print ()
1088-
1090+
10891091 # 2. 创建预处理HTML模式的LLM-WebKit抽取器
10901092 print ("2. 创建预处理HTML模式的LLM-WebKit抽取器..." )
1091-
1093+
10921094 config = {
10931095 "use_preprocessed_html" : True , # 🔑 关键配置:启用预处理HTML模式
10941096 "preprocessed_html_field" : "llm_webkit_html" # 指定预处理HTML字段名
10951097 }
1096-
1098+
10971099 extractor = ExtractorFactory .create ("llm-webkit" , config = config )
10981100 print (f"✅ 抽取器创建成功" )
10991101 print (f"📋 配置信息:" )
11001102 print (f" - use_preprocessed_html: { extractor .inference_config .use_preprocessed_html } " )
11011103 print (f" - preprocessed_html_field: { extractor .inference_config .preprocessed_html_field } " )
11021104 print (f" - 跳过LLM推理: 是(直接处理预处理HTML)" )
11031105 print ()
1104-
1106+
11051107 # 3. 性能对比:展示预处理HTML模式的优势
11061108 print ("3. 性能优势演示..." )
11071109 print ("🚀 预处理HTML模式的优势:" )
@@ -1110,35 +1112,36 @@ def forward(self, x):
11101112 print (" ✅ 只需要基础的llm_web_kit依赖" )
11111113 print (" ✅ 适合批量处理已预处理的数据" )
11121114 print ()
1113-
1115+
11141116 # 4. 运行评测
11151117 print ("4. 开始评测..." )
11161118 print ("=" * 50 )
1117-
1119+
11181120 evaluator = Evaluator ()
11191121 result = evaluator .evaluate (
11201122 dataset = dataset ,
11211123 extractor = extractor ,
11221124 max_samples = None
11231125 )
1124-
1126+
11251127 # 5. 显示评测结果
11261128 print ("\n 5. 📊 预处理HTML模式评测结果:" )
11271129 print ("=" * 50 )
1128-
1130+
11291131 results_dict = result .to_dict ()
11301132 metrics = results_dict .get ('overall_metrics' , {})
1131-
1133+
11321134 # 显示关键指标
11331135 print (f"\n 🏆 综合指标:" )
11341136 print (f" overall: { metrics .get ('overall' , 0 ):.4f} " )
1135-
1137+
11361138 print (f"\n 📝 内容提取质量:" )
1139+ print (f" formula_edit: { metrics .get ('formula_edit' , 0 ):.4f} " )
11371140 print (f" text_edit: { metrics .get ('text_edit' , 0 ):.4f} " )
11381141 print (f" code_edit: { metrics .get ('code_edit' , 0 ):.4f} " )
11391142 print (f" table_edit: { metrics .get ('table_edit' , 0 ):.4f} " )
11401143 print (f" table_TEDS: { metrics .get ('table_TEDS' , 0 ):.4f} " )
1141-
1144+
11421145 print (f"\n ⚡ 性能统计:" )
11431146 sample_results = results_dict .get ('sample_results' , [])
11441147 if sample_results :
@@ -1147,14 +1150,14 @@ def forward(self, x):
11471150 avg_time = sum (extraction_times ) / len (extraction_times )
11481151 print (f" 平均提取时间: { avg_time :.3f} 秒" )
11491152 print (f" 处理速度: { 1 / avg_time :.1f} 样本/秒" )
1150-
1153+
11511154 success_count = len ([s for s in sample_results if s .get ('extraction_success' , False )])
11521155 print (f" 成功样本数: { success_count } /{ len (dataset )} " )
1153-
1156+
11541157 # 6. 展示样本提取结果
11551158 print (f"\n 6. 📄 样本提取结果预览:" )
11561159 print ("-" * 50 )
1157-
1160+
11581161 for i , sample_result in enumerate (sample_results [:2 ]): # 只显示前2个样本
11591162 print (f"\n 样本 { i + 1 } : { sample_result .get ('sample_id' , 'Unknown' )} " )
11601163 if sample_result .get ('extraction_success' ):
@@ -1165,22 +1168,21 @@ def forward(self, x):
11651168 print (f" ⏱️ 提取时间: { sample_result .get ('extraction_time' , 0 ):.3f} 秒" )
11661169 else :
11671170 print (f" ❌ 提取失败" )
1168-
11691171 # 7. 保存结果
11701172 print (f"\n 7. 💾 保存评测结果..." )
1171-
1173+
11721174 results_dir = Path ("results" )
11731175 results_dir .mkdir (exist_ok = True )
1174-
1176+
11751177 results_path = results_dir / "preprocessed_html_evaluation_results.json"
11761178 report_path = results_dir / "preprocessed_html_evaluation_report.csv"
1177-
1179+
11781180 DataSaver .save_evaluation_results (result , results_path )
11791181 DataSaver .save_summary_report (result , report_path )
1180-
1182+
11811183 print (f"✅ 详细结果已保存到: { results_path } " )
11821184 print (f"✅ CSV报告已保存到: { report_path } " )
1183-
1185+
11841186 # 8. 使用建议
11851187 print (f"\n 8. 💡 实际使用建议:" )
11861188 print ("=" * 50 )
@@ -1198,9 +1200,8 @@ def forward(self, x):
11981200 print ("⚙️ 配置参数说明:" )
11991201 print (" - use_preprocessed_html: True/False" )
12001202 print (" - preprocessed_html_field: 字段名(默认'llm_webkit_html')" )
1201-
1202- print ("\n ✅ 预处理HTML功能演示完成!" )
12031203
1204+ print ("\n ✅ 预处理HTML功能演示完成!" )
12041205
12051206if __name__ == "__main__" :
12061207 try :
0 commit comments