Skip to content

Commit b90b73d

Browse files
authored
Merge pull request #17 from e06084/main
feat: add llm_webkit_with_preprocessed_html
2 parents 3c5928d + d4d8a8f commit b90b73d

7 files changed

Lines changed: 882 additions & 44 deletions

File tree

examples/basic_usage.py

Lines changed: 252 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -943,13 +943,264 @@ def demo_multi_extraction():
943943
print(f" - {name}_time: 单样本抽取耗时(秒)")
944944
print(f" - {name}_*_score: 各指标得分(如{name}_text_edit)")
945945

946+
947+
def demo_llm_webkit_with_preprocessed_html_evaluation():
    """Demonstrate evaluating the LLM-WebKit extractor in preprocessed-HTML mode.

    Builds a two-sample in-memory dataset whose samples carry an
    ``llm_webkit_html`` field, creates the ``llm-webkit`` extractor with
    ``use_preprocessed_html`` enabled, runs the evaluator over the dataset,
    prints the overall metrics and a short preview of each sample, and saves
    the JSON results and the CSV summary under ``results/``.
    """
    print("\n=== LLM-WebKit 预处理HTML功能演示 ===\n")

    # Configure logging.
    setup_logging(level="INFO")

    # 1. Build a test dataset whose samples contain preprocessed HTML.
    print("1. 创建包含预处理HTML的测试数据集...")

    samples = []

    # Sample 1: preprocessed HTML (simulates the output of the first-stage
    # LLM simplification).
    sample_1_data = {
        "id": "preprocessed_sample_1",
        "html": """<html><body><h1>原始复杂HTML</h1><p>这里是原始的复杂HTML内容...</p></body></html>""",
        # Key point: the sample carries the llm_webkit_html field
        # (the simplified, preprocessed HTML).
        "llm_webkit_html": """
<div _item_id="1">
<h1>深度学习基础教程</h1>
<p>深度学习是机器学习的一个重要分支,通过多层神经网络来学习数据的表征。</p>
</div>
<div _item_id="2">
<h2>核心概念</h2>
<p>神经网络由多个层组成,每层包含多个神经元。</p>
</div>
<div _item_id="3">
<pre><code class="language-python">
import torch
import torch.nn as nn

class SimpleNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(784, 10)

    def forward(self, x):
        return self.fc(x)
</code></pre>
</div>
""",
        "groundtruth_content": """# 深度学习基础教程

深度学习是机器学习的一个重要分支,通过多层神经网络来学习数据的表征。

## 核心概念

神经网络由多个层组成,每层包含多个神经元。

```python
import torch
import torch.nn as nn

class SimpleNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(784, 10)

    def forward(self, x):
        return self.fc(x)
```""",
        "groundtruth_content_list": [
            {"type": "heading", "content": "深度学习基础教程", "level": 1},
            {"type": "paragraph", "content": "深度学习是机器学习的一个重要分支,通过多层神经网络来学习数据的表征。"},
            {"type": "heading", "content": "核心概念", "level": 2},
            {"type": "paragraph", "content": "神经网络由多个层组成,每层包含多个神经元。"},
            {"type": "code", "content": "import torch\nimport torch.nn as nn\n\nclass SimpleNet(nn.Module):\n    def __init__(self):\n        super().__init__()\n        self.fc = nn.Linear(784, 10)\n    \n    def forward(self, x):\n        return self.fc(x)", "language": "python"}
        ]
    }
    samples.append(DataSample.from_dict(sample_1_data))

    # Sample 2: preprocessed HTML that contains a table.
    sample_2_data = {
        "id": "preprocessed_sample_2",
        "html": """<html><body><h1>原始表格页面</h1><table>...</table></body></html>""",
        "llm_webkit_html": """
<div _item_id="1">
<h1>模型性能对比</h1>
<p>以下是不同深度学习模型在CIFAR-10数据集上的表现:</p>
</div>
<div _item_id="2">
<table>
<thead>
<tr>
<th>模型</th>
<th>准确率</th>
<th>参数量</th>
</tr>
</thead>
<tbody>
<tr>
<td>ResNet-18</td>
<td>95.3%</td>
<td>11.7M</td>
</tr>
<tr>
<td>VGG-16</td>
<td>92.7%</td>
<td>138M</td>
</tr>
</tbody>
</table>
</div>
""",
        "groundtruth_content": """# 模型性能对比

以下是不同深度学习模型在CIFAR-10数据集上的表现:

| 模型 | 准确率 | 参数量 |
|------|--------|--------|
| ResNet-18 | 95.3% | 11.7M |
| VGG-16 | 92.7% | 138M |""",
        "groundtruth_content_list": [
            {"type": "heading", "content": "模型性能对比", "level": 1},
            {"type": "paragraph", "content": "以下是不同深度学习模型在CIFAR-10数据集上的表现:"},
            {"type": "table", "content": "| 模型 | 准确率 | 参数量 |\n|------|--------|---------|\n| ResNet-18 | 95.3% | 11.7M |\n| VGG-16 | 92.7% | 138M |"}
        ]
    }
    samples.append(DataSample.from_dict(sample_2_data))

    # Create the dataset and add the samples to it.
    dataset = BenchmarkDataset(name="preprocessed_html_test", description="预处理HTML功能测试数据集")
    for sample in samples:
        dataset.add_sample(sample)

    print(f"✅ 测试数据集包含 {len(dataset)} 个样本")
    print("📋 每个样本都包含:")
    print(" - html: 原始复杂HTML")
    print(" - llm_webkit_html: 预处理后的简化HTML(包含_item_id标记)")
    print(" - groundtruth_content: 标准答案")
    print()

    # 2. Create the LLM-WebKit extractor in preprocessed-HTML mode.
    print("2. 创建预处理HTML模式的LLM-WebKit抽取器...")

    config = {
        "use_preprocessed_html": True,  # Key setting: enable preprocessed-HTML mode
        "preprocessed_html_field": "llm_webkit_html"  # Name of the field holding the preprocessed HTML
    }

    extractor = ExtractorFactory.create("llm-webkit", config=config)
    print("✅ 抽取器创建成功")
    print("📋 配置信息:")
    print(f" - use_preprocessed_html: {extractor.inference_config.use_preprocessed_html}")
    print(f" - preprocessed_html_field: {extractor.inference_config.preprocessed_html_field}")
    print(" - 跳过LLM推理: 是(直接处理预处理HTML)")
    print()

    # 3. Describe the advantages of preprocessed-HTML mode (informational only).
    print("3. 性能优势演示...")
    print("🚀 预处理HTML模式的优势:")
    print(" ✅ 无需加载大型LLM模型(节省内存)")
    print(" ✅ 跳过HTML简化推理步骤(节省时间)")
    print(" ✅ 只需要基础的llm_web_kit依赖")
    print(" ✅ 适合批量处理已预处理的数据")
    print()

    # 4. Run the evaluation.
    print("4. 开始评测...")
    print("=" * 50)

    evaluator = Evaluator()
    result = evaluator.evaluate(
        dataset=dataset,
        extractor=extractor,
        max_samples=None
    )

    # 5. Show the evaluation results.
    print("\n5. 📊 预处理HTML模式评测结果:")
    print("=" * 50)

    results_dict = result.to_dict()
    metrics = results_dict.get('overall_metrics', {})

    # Key metrics; a metric missing from the result is shown as 0.
    print("\n🏆 综合指标:")
    print(f" overall: {metrics.get('overall', 0):.4f}")

    print("\n📝 内容提取质量:")
    print(f" text_edit: {metrics.get('text_edit', 0):.4f}")
    print(f" code_edit: {metrics.get('code_edit', 0):.4f}")
    print(f" table_edit: {metrics.get('table_edit', 0):.4f}")
    print(f" table_TEDS: {metrics.get('table_TEDS', 0):.4f}")

    print("\n⚡ 性能统计:")
    sample_results = results_dict.get('sample_results', [])
    if sample_results:
        extraction_times = [
            s.get('extraction_time', 0)
            for s in sample_results
            if s.get('extraction_success')
        ]
        if extraction_times:
            avg_time = sum(extraction_times) / len(extraction_times)
            print(f" 平均提取时间: {avg_time:.3f}秒")
            # A zero average (missing or zero timings) would make the
            # throughput division raise ZeroDivisionError, so only report
            # the rate when it is well defined.
            if avg_time > 0:
                print(f" 处理速度: {1/avg_time:.1f}样本/秒")

        success_count = sum(1 for s in sample_results if s.get('extraction_success', False))
        print(f" 成功样本数: {success_count}/{len(dataset)}")

    # 6. Preview the extraction output of the samples.
    print("\n6. 📄 样本提取结果预览:")
    print("-" * 50)

    # Only the first two samples are shown.
    for index, sample_result in enumerate(sample_results[:2], start=1):
        print(f"\n样本 {index}: {sample_result.get('sample_id', 'Unknown')}")
        if sample_result.get('extraction_success'):
            content = sample_result.get('extracted_content', '')
            # Flatten newlines so the preview stays on a single line.
            preview = content[:100].replace('\n', ' ') if content else '无内容'
            print(" ✅ 提取成功")
            print(f" 📝 内容预览: {preview}...")
            print(f" ⏱️ 提取时间: {sample_result.get('extraction_time', 0):.3f}秒")
        else:
            print(" ❌ 提取失败")

    # 7. Save the results.
    print("\n7. 💾 保存评测结果...")

    results_dir = Path("results")
    results_dir.mkdir(exist_ok=True)

    results_path = results_dir / "preprocessed_html_evaluation_results.json"
    report_path = results_dir / "preprocessed_html_evaluation_report.csv"

    DataSaver.save_evaluation_results(result, results_path)
    DataSaver.save_summary_report(result, report_path)

    print(f"✅ 详细结果已保存到: {results_path}")
    print(f"✅ CSV报告已保存到: {report_path}")

    # 8. Usage recommendations.
    print("\n8. 💡 实际使用建议:")
    print("=" * 50)
    print("🔧 何时使用预处理HTML模式:")
    print(" 1. 已有LLM简化后的HTML数据")
    print(" 2. 需要批量处理大量数据")
    print(" 3. 部署环境内存有限")
    print(" 4. 对提取速度有较高要求")
    print()
    print("📝 数据准备要求:")
    print(" 1. 确保预处理HTML包含_item_id属性")
    print(" 2. 保持原始HTML作为备用")
    print(" 3. 验证预处理质量")
    print()
    print("⚙️ 配置参数说明:")
    print(" - use_preprocessed_html: True/False")
    print(" - preprocessed_html_field: 字段名(默认'llm_webkit_html')")

    print("\n✅ 预处理HTML功能演示完成!")
1194+
1195+
9461196
if __name__ == "__main__":
9471197
try:
9481198
# demo_basic_mock_evaluation()
9491199
# demo_llm_webkit_evaluation() # 使用LLM-WebKit评测示例
1200+
demo_llm_webkit_with_preprocessed_html_evaluation()
9501201
# demo_extractor_comparison()
9511202
# demo_dataset_with_extraction() # 演示保存带有抽取内容的数据集
952-
demo_multi_extraction() # 演示多个抽取器同时评测
1203+
# demo_multi_extraction() # 演示多个抽取器同时评测
9531204
# demo_lld_workers_extraction()
9541205
print("\n✅ 示例运行完成!")
9551206

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,4 +7,5 @@ torch
77
html2text
88
resiliparse
99
trafilatura
10+
# llm-web-kit==3.2.0
1011
https://github.com/opendatalab/magic-html/releases/download/magic_html-0.1.5-released/magic_html-0.1.5-py3-none-any.whl
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
extractor,dataset,total_samples,success_rate,overall,code_edit,formula_edit,table_TEDS,table_edit,text_edit
2+
llm-webkit,preprocessed_html_test,2,1.0,0.5029,0.5,1.0,0.5,0.5,0.0143

0 commit comments

Comments
 (0)