@@ -943,13 +943,264 @@ def demo_multi_extraction():
943943 print (f" - { name } _time: 单样本抽取耗时(秒)" )
944944 print (f" - { name } _*_score: 各指标得分(如{ name } _text_edit)" )
945945
946+
def _preprocessed_html_sample_dicts():
    """Return the raw dicts of the two demo samples, in insertion order.

    Each dict carries the original page (``html``), the already-simplified
    page (``llm_webkit_html``, whose blocks are tagged with ``_item_id``) and
    the ground truth in two forms (``groundtruth_content`` and
    ``groundtruth_content_list``).

    NOTE(review): this block was reconstructed from a whitespace-collapsed
    diff view. Leading whitespace inside the multi-line literals below (for
    example the indentation of the embedded Python snippet) could not be
    recovered -- verify the literals against the repository before merging.
    """
    # Sample 1: pre-processed HTML with headings, paragraphs and a code block
    # (simulates the output of the first-stage LLM simplification).
    code_sample = {
        "id": "preprocessed_sample_1",
        "html": """<html><body><h1>原始复杂HTML</h1><p>这里是原始的复杂HTML内容...</p></body></html>""",
        # Key field: the simplified HTML that the extractor is configured to
        # read instead of "html".
        "llm_webkit_html": """
<div _item_id="1">
<h1>深度学习基础教程</h1>
<p>深度学习是机器学习的一个重要分支,通过多层神经网络来学习数据的表征。</p>
</div>
<div _item_id="2">
<h2>核心概念</h2>
<p>神经网络由多个层组成,每层包含多个神经元。</p>
</div>
<div _item_id="3">
<pre><code class="language-python">
import torch
import torch.nn as nn

class SimpleNet(nn.Module):
def __init__(self):
super().__init__()
self.fc = nn.Linear(784, 10)

def forward(self, x):
return self.fc(x)
</code></pre>
</div>
""",
        "groundtruth_content": """# 深度学习基础教程

深度学习是机器学习的一个重要分支,通过多层神经网络来学习数据的表征。

## 核心概念

神经网络由多个层组成,每层包含多个神经元。

```python
import torch
import torch.nn as nn

class SimpleNet(nn.Module):
def __init__(self):
super().__init__()
self.fc = nn.Linear(784, 10)

def forward(self, x):
return self.fc(x)
```""",
        "groundtruth_content_list": [
            {"type": "heading", "content": "深度学习基础教程", "level": 1},
            {"type": "paragraph", "content": "深度学习是机器学习的一个重要分支,通过多层神经网络来学习数据的表征。"},
            {"type": "heading", "content": "核心概念", "level": 2},
            {"type": "paragraph", "content": "神经网络由多个层组成,每层包含多个神经元。"},
            {"type": "code", "content": "import torch\n import torch.nn as nn\n \n class SimpleNet(nn.Module):\n def __init__(self):\n super().__init__()\n self.fc = nn.Linear(784, 10)\n \n def forward(self, x):\n return self.fc(x)", "language": "python"},
        ],
    }

    # Sample 2: pre-processed HTML that contains a table.
    table_sample = {
        "id": "preprocessed_sample_2",
        "html": """<html><body><h1>原始表格页面</h1><table>...</table></body></html>""",
        "llm_webkit_html": """
<div _item_id="1">
<h1>模型性能对比</h1>
<p>以下是不同深度学习模型在CIFAR-10数据集上的表现:</p>
</div>
<div _item_id="2">
<table>
<thead>
<tr>
<th>模型</th>
<th>准确率</th>
<th>参数量</th>
</tr>
</thead>
<tbody>
<tr>
<td>ResNet-18</td>
<td>95.3%</td>
<td>11.7M</td>
</tr>
<tr>
<td>VGG-16</td>
<td>92.7%</td>
<td>138M</td>
</tr>
</tbody>
</table>
</div>
""",
        "groundtruth_content": """# 模型性能对比

以下是不同深度学习模型在CIFAR-10数据集上的表现:

| 模型 | 准确率 | 参数量 |
|------|--------|--------|
| ResNet-18 | 95.3% | 11.7M |
| VGG-16 | 92.7% | 138M |""",
        "groundtruth_content_list": [
            {"type": "heading", "content": "模型性能对比", "level": 1},
            {"type": "paragraph", "content": "以下是不同深度学习模型在CIFAR-10数据集上的表现:"},
            # The separator row now has the same dash counts as the table in
            # "groundtruth_content"; the two previously disagreed by one dash.
            {"type": "table", "content": "| 模型 | 准确率 | 参数量 |\n |------|--------|--------|\n | ResNet-18 | 95.3% | 11.7M |\n | VGG-16 | 92.7% | 138M |"},
        ],
    }

    return [code_sample, table_sample]


def _successful_extraction_times(sample_results):
    """Return the ``extraction_time`` of every result flagged as successful.

    A result counts as successful when its ``extraction_success`` entry is
    truthy; a missing ``extraction_time`` is reported as 0. The length of the
    returned list is therefore the number of successful samples.
    """
    return [
        entry.get('extraction_time', 0)
        for entry in sample_results
        if entry.get('extraction_success')
    ]


def demo_llm_webkit_with_preprocessed_html_evaluation():
    """Demonstrate evaluating LLM-WebKit in pre-processed HTML mode.

    Steps, each announced on stdout:

    1. Build a two-sample ``BenchmarkDataset`` whose samples carry an
       ``llm_webkit_html`` field.
    2. Create the ``"llm-webkit"`` extractor with ``use_preprocessed_html``
       enabled and ``preprocessed_html_field`` pointing at that field.
    3. Print the advertised advantages of the mode.
    4. Run ``Evaluator().evaluate`` over the whole dataset.
    5. Print overall metrics and timing statistics.
    6. Preview the first two per-sample results.
    7. Save a JSON result file and a CSV report under ``results/``.
    8. Print usage advice.

    Returns:
        None. All output goes to stdout and to the two files under
        ``results/``.
    """
    print("\n === LLM-WebKit 预处理HTML功能演示 ===\n ")

    # Configure logging.
    setup_logging(level="INFO")

    # 1. Build the test dataset that contains pre-processed HTML.
    print("1. 创建包含预处理HTML的测试数据集...")

    samples = [DataSample.from_dict(raw) for raw in _preprocessed_html_sample_dicts()]

    dataset = BenchmarkDataset(name="preprocessed_html_test", description="预处理HTML功能测试数据集")
    for sample in samples:
        dataset.add_sample(sample)

    print(f"✅ 测试数据集包含 {len(dataset)} 个样本")
    print("📋 每个样本都包含:")
    print(" - html: 原始复杂HTML")
    print(" - llm_webkit_html: 预处理后的简化HTML(包含_item_id标记)")
    print(" - groundtruth_content: 标准答案")
    print()

    # 2. Create the LLM-WebKit extractor in pre-processed HTML mode.
    print("2. 创建预处理HTML模式的LLM-WebKit抽取器...")

    config = {
        "use_preprocessed_html": True,  # key switch: enable pre-processed HTML mode
        "preprocessed_html_field": "llm_webkit_html",  # sample field holding that HTML
    }

    extractor = ExtractorFactory.create("llm-webkit", config=config)
    print("✅ 抽取器创建成功")
    print("📋 配置信息:")
    print(f" - use_preprocessed_html: {extractor.inference_config.use_preprocessed_html} ")
    print(f" - preprocessed_html_field: {extractor.inference_config.preprocessed_html_field} ")
    print(" - 跳过LLM推理: 是(直接处理预处理HTML)")
    print()

    # 3. Describe the advantages of the mode (informational output only).
    print("3. 性能优势演示...")
    print("🚀 预处理HTML模式的优势:")
    print(" ✅ 无需加载大型LLM模型(节省内存)")
    print(" ✅ 跳过HTML简化推理步骤(节省时间)")
    print(" ✅ 只需要基础的llm_web_kit依赖")
    print(" ✅ 适合批量处理已预处理的数据")
    print()

    # 4. Run the evaluation over every sample (max_samples=None).
    print("4. 开始评测...")
    print("=" * 50)

    evaluator = Evaluator()
    result = evaluator.evaluate(
        dataset=dataset,
        extractor=extractor,
        max_samples=None,
    )

    # 5. Show the evaluation results.
    print("\n 5. 📊 预处理HTML模式评测结果:")
    print("=" * 50)

    results_dict = result.to_dict()
    metrics = results_dict.get('overall_metrics', {})

    # Headline metrics; a metric missing from the result is shown as 0.
    print("\n 🏆 综合指标:")
    print(f" overall: {metrics.get('overall', 0):.4f} ")

    print("\n 📝 内容提取质量:")
    print(f" text_edit: {metrics.get('text_edit', 0):.4f} ")
    print(f" code_edit: {metrics.get('code_edit', 0):.4f} ")
    print(f" table_edit: {metrics.get('table_edit', 0):.4f} ")
    print(f" table_TEDS: {metrics.get('table_TEDS', 0):.4f} ")

    print("\n ⚡ 性能统计:")
    sample_results = results_dict.get('sample_results', [])
    if sample_results:
        extraction_times = _successful_extraction_times(sample_results)
        if extraction_times:
            avg_time = sum(extraction_times) / len(extraction_times)
            print(f" 平均提取时间: {avg_time:.3f} 秒")
            # Throughput is undefined for a zero average (missing timings or a
            # timer too coarse to register), so skip it instead of dividing
            # by zero.
            if avg_time > 0:
                print(f" 处理速度: {1 / avg_time:.1f} 样本/秒")

        # One time is collected per successful sample, so the list length is
        # the success count.
        success_count = len(extraction_times)
        print(f" 成功样本数: {success_count} /{len(dataset)} ")

    # 6. Preview the extraction output of the first two samples only.
    print("\n 6. 📄 样本提取结果预览:")
    print("-" * 50)

    for position, sample_result in enumerate(sample_results[:2], start=1):
        print(f"\n 样本 {position} : {sample_result.get('sample_id', 'Unknown')} ")
        if sample_result.get('extraction_success'):
            content = sample_result.get('extracted_content', '')
            # First 100 characters, flattened for single-line display.
            preview = content[:100].replace('\n ', ' ') if content else '无内容'
            print(" ✅ 提取成功")
            print(f" 📝 内容预览: {preview} ...")
            print(f" ⏱️ 提取时间: {sample_result.get('extraction_time', 0):.3f} 秒")
        else:
            print(" ❌ 提取失败")

    # 7. Persist the results.
    print("\n 7. 💾 保存评测结果...")

    results_dir = Path("results")
    results_dir.mkdir(exist_ok=True)

    results_path = results_dir / "preprocessed_html_evaluation_results.json"
    report_path = results_dir / "preprocessed_html_evaluation_report.csv"

    DataSaver.save_evaluation_results(result, results_path)
    DataSaver.save_summary_report(result, report_path)

    print(f"✅ 详细结果已保存到: {results_path} ")
    print(f"✅ CSV报告已保存到: {report_path} ")

    # 8. Usage advice.
    print("\n 8. 💡 实际使用建议:")
    print("=" * 50)
    print("🔧 何时使用预处理HTML模式:")
    print(" 1. 已有LLM简化后的HTML数据")
    print(" 2. 需要批量处理大量数据")
    print(" 3. 部署环境内存有限")
    print(" 4. 对提取速度有较高要求")
    print()
    print("📝 数据准备要求:")
    print(" 1. 确保预处理HTML包含_item_id属性")
    print(" 2. 保持原始HTML作为备用")
    print(" 3. 验证预处理质量")
    print()
    print("⚙️ 配置参数说明:")
    print(" - use_preprocessed_html: True/False")
    print(" - preprocessed_html_field: 字段名(默认'llm_webkit_html')")

    print("\n ✅ 预处理HTML功能演示完成!")
1194+
1195+
9461196if __name__ == "__main__" :
9471197 try :
9481198 # demo_basic_mock_evaluation()
9491199 # demo_llm_webkit_evaluation() # 使用LLM-WebKit评测示例
1200+ demo_llm_webkit_with_preprocessed_html_evaluation ()
9501201 # demo_extractor_comparison()
9511202 # demo_dataset_with_extraction() # 演示保存带有抽取内容的数据集
952- demo_multi_extraction () # 演示多个抽取器同时评测
1203+ # demo_multi_extraction() # 演示多个抽取器同时评测
9531204 # demo_lld_workers_extraction()
9541205 print ("\n ✅ 示例运行完成!" )
9551206
0 commit comments