@@ -944,13 +944,272 @@ def demo_multi_extraction():
944944 print (f" - { name } _time: 单样本抽取耗时(秒)" )
945945 print (f" - { name } _*_score: 各指标得分(如{ name } _text_edit)" )
946946
947+
def demo_llm_webkit_with_preprocessed_html_evaluation(jsonl_file_path=None):
    """Demonstrate evaluating LLM-WebKit in preprocessed-HTML mode.

    The demo loads a JSONL dataset, creates the ``llm-webkit`` extractor with
    ``use_preprocessed_html`` enabled, evaluates it with ``max_samples=None``,
    prints the overall metrics plus a preview of the first two samples, and
    writes a JSON result file and a CSV report under ``results/``.

    According to the messages this demo prints, each dataset record carries:

    * ``html`` - the original, complex HTML;
    * ``llm_webkit_html`` - the simplified HTML carrying ``_item_id``
      markers (the field name is passed as ``preprocessed_html_field``);
    * ``groundtruth_content`` - the reference answer.

    Args:
        jsonl_file_path: Path of the JSONL dataset to load. When omitted, the
            developer-local file this demo was originally written against is
            used, so a call without arguments behaves as before.
    """
    print("\n === LLM-WebKit 预处理HTML功能演示 ===\n ")

    # Configure logging.
    setup_logging(level="INFO")

    # 1. Load the dataset that already contains preprocessed HTML.
    print("1. 创建包含预处理HTML的测试数据集...")

    if jsonl_file_path is None:
        # Historical default; pass ``jsonl_file_path`` to run on another machine.
        jsonl_file_path = "/home/lulindong/Pycharm_projects/cc/WebMainBench_llm-webkit_v1_WebMainBench_dataset_merge_with_llm_webkit.jsonl"

    dataset = DataLoader.load_jsonl(jsonl_file_path)

    print(f"✅ 测试数据集包含 {len(dataset)} 个样本")
    print("📋 每个样本都包含:")
    print(" - html: 原始复杂HTML")
    print(" - llm_webkit_html: 预处理后的简化HTML(包含_item_id标记)")
    print(" - groundtruth_content: 标准答案")
    print()

    # 2. Create the LLM-WebKit extractor in preprocessed-HTML mode.
    print("2. 创建预处理HTML模式的LLM-WebKit抽取器...")

    config = {
        "use_preprocessed_html": True,  # key switch: enable preprocessed-HTML mode
        "preprocessed_html_field": "llm_webkit_html",  # field holding the preprocessed HTML
    }

    extractor = ExtractorFactory.create("llm-webkit", config=config)
    print("✅ 抽取器创建成功")
    print("📋 配置信息:")
    print(f" - use_preprocessed_html: {extractor.inference_config.use_preprocessed_html} ")
    print(f" - preprocessed_html_field: {extractor.inference_config.preprocessed_html_field} ")
    print(" - 跳过LLM推理: 是(直接处理预处理HTML)")
    print()

    # 3. Describe the advantages of preprocessed-HTML mode.
    _print_lines((
        "3. 性能优势演示...",
        "🚀 预处理HTML模式的优势:",
        " ✅ 无需加载大型LLM模型(节省内存)",
        " ✅ 跳过HTML简化推理步骤(节省时间)",
        " ✅ 只需要基础的llm_web_kit依赖",
        " ✅ 适合批量处理已预处理的数据",
        "",
    ))

    # 4. Run the evaluation.
    print("4. 开始评测...")
    print("=" * 50)

    evaluator = Evaluator()
    result = evaluator.evaluate(
        dataset=dataset,
        extractor=extractor,
        max_samples=None,
    )

    # 5. Show the evaluation results.
    print("\n 5. 📊 预处理HTML模式评测结果:")
    print("=" * 50)

    results_dict = result.to_dict()
    metrics = results_dict.get('overall_metrics', {})

    # Headline metric.
    print("\n 🏆 综合指标:")
    print(f" overall: {metrics.get('overall', 0):.4f} ")

    print("\n 📝 内容提取质量:")
    print(f" text_edit: {metrics.get('text_edit', 0):.4f} ")
    print(f" code_edit: {metrics.get('code_edit', 0):.4f} ")
    print(f" table_edit: {metrics.get('table_edit', 0):.4f} ")
    print(f" table_TEDS: {metrics.get('table_TEDS', 0):.4f} ")

    print("\n ⚡ 性能统计:")
    sample_results = results_dict.get('sample_results', [])
    if sample_results:
        avg_time = _mean_successful_extraction_time(sample_results)
        if avg_time is not None:
            print(f" 平均提取时间: {avg_time:.3f} 秒")
            # A mean of 0 seconds has no finite throughput; skip the line
            # instead of raising ZeroDivisionError.
            if avg_time > 0:
                print(f" 处理速度: {1 / avg_time:.1f} 样本/秒")

        # NOTE(review): indentation is lost in the reviewed source; the success
        # count is kept under the non-empty guard - confirm against the file.
        success_count = sum(
            1 for s in sample_results if s.get('extraction_success', False)
        )
        print(f" 成功样本数: {success_count} /{len(dataset)} ")

    # 6. Preview the extraction output of the first two samples.
    print("\n 6. 📄 样本提取结果预览:")
    print("-" * 50)

    for i, sample_result in enumerate(sample_results[:2]):
        print(f"\n 样本 {i + 1} : {sample_result.get('sample_id', 'Unknown')} ")
        if sample_result.get('extraction_success'):
            content = sample_result.get('extracted_content', '')
            preview = content[:100].replace('\n ', ' ') if content else '无内容'
            print(" ✅ 提取成功")
            print(f" 📝 内容预览: {preview} ...")
            print(f" ⏱️ 提取时间: {sample_result.get('extraction_time', 0):.3f} 秒")
        else:
            print(" ❌ 提取失败")

    # 7. Save the results.
    print("\n 7. 💾 保存评测结果...")

    results_dir = Path("results")
    results_dir.mkdir(exist_ok=True)

    results_path = results_dir / "preprocessed_html_evaluation_results.json"
    report_path = results_dir / "preprocessed_html_evaluation_report.csv"

    DataSaver.save_evaluation_results(result, results_path)
    DataSaver.save_summary_report(result, report_path)

    print(f"✅ 详细结果已保存到: {results_path} ")
    print(f"✅ CSV报告已保存到: {report_path} ")

    # 8. Usage advice.
    print("\n 8. 💡 实际使用建议:")
    print("=" * 50)
    _print_lines((
        "🔧 何时使用预处理HTML模式:",
        " 1. 已有LLM简化后的HTML数据",
        " 2. 需要批量处理大量数据",
        " 3. 部署环境内存有限",
        " 4. 对提取速度有较高要求",
        "",
        "📝 数据准备要求:",
        " 1. 确保预处理HTML包含_item_id属性",
        " 2. 保持原始HTML作为备用",
        " 3. 验证预处理质量",
        "",
        "⚙️ 配置参数说明:",
        " - use_preprocessed_html: True/False",
        " - preprocessed_html_field: 字段名(默认'llm_webkit_html')",
    ))

    print("\n ✅ 预处理HTML功能演示完成!")


def _mean_successful_extraction_time(sample_results):
    """Return the mean ``extraction_time`` of successful samples, or None if none succeeded."""
    timings = [
        s.get('extraction_time', 0)
        for s in sample_results
        if s.get('extraction_success')
    ]
    if not timings:
        return None
    return sum(timings) / len(timings)


def _print_lines(lines):
    """Print each string in *lines* on its own line (an empty string gives a blank line)."""
    for line in lines:
        print(line)
1203+
1204+
9471205if __name__ == "__main__" :
9481206 try :
9491207 # demo_basic_mock_evaluation()
9501208 # demo_llm_webkit_evaluation() # 使用LLM-WebKit评测示例
1209+ demo_llm_webkit_with_preprocessed_html_evaluation ()
9511210 # demo_extractor_comparison()
9521211 # demo_dataset_with_extraction() # 演示保存带有抽取内容的数据集
953- demo_multi_extraction () # 演示多个抽取器同时评测
1212+ # demo_multi_extraction() # 演示多个抽取器同时评测
9541213 # demo_lld_workers_extraction()
9551214 print ("\n ✅ 示例运行完成!" )
9561215
0 commit comments