Skip to content

Commit a3428db

Browse files
committed
fix code match :delete inline code
1 parent b86036d commit a3428db

28 files changed

Lines changed: 3144 additions & 332 deletions

README.md

Lines changed: 112 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -74,23 +74,30 @@ print(f"Overall Score: {result.overall_metrics['overall']:.4f}")
7474
{
7575
"track_id": "0b7f2636-d35f-40bf-9b7f-94be4bcbb396",
7676
"html": "<html><body><h1 cc-select=\"true\">这是标题</h1></body></html>", # 人工标注带cc-select="true" 属性
77-
"groundtruth_content": "# 标题\n\n正文内容",
78-
"groundtruth_content_list": [
79-
{"type": "heading", "content": "标题", "level": 1},
80-
{"type": "paragraph", "content": "正文内容"}
81-
],
8277
"url": "https://orderyourbooks.com/product-category/college-books-p-u/?products-per-page=all",
83-
"layout_id": "orderyourbooks.com_4",
84-
"max_layer_n": 10,
85-
"url_host_name": "orderyourbooks.com",
86-
"raw_warc_path": "s3://cc-raw-huawei/crawl-data/CC-MAIN-2025-13/segments/1742004433093.21/warc/CC-MAIN-20250319080618-20250319110618-00909.warc.gz?bytes=461610805,172252",
87-
"language": "en",
88-
"__dom_depth": 19,
89-
"__dom_width": 10231,
90-
"__type": "__max_depth",
91-
"__tag": "DOM_WIDTH",
92-
"marked_type": "unwanted", # normal:正常标注的网页;unable:正文内容无法抉择;unwanted:无需标注的网页;
93-
"unwanted_reason": "list"
78+
"main_html": "<h1 cc-select=\"true\">这是标题</h1>", # 从html中剪枝得到的正文html
79+
"convert_main_content": "# 这是标题", # 从main_html+html2text转化来
80+
"groundtruth_content": "# 这是标题", # 人工校准的markdown(部分提供)
81+
"meta": {
82+
"language": "en", # 网页的语言
83+
"style": "artical", # 网页的文体
84+
"DOM_WIDTH": 176,
85+
"DOM_DEPTH": 27,
86+
"text_linktext_ratio": 0.12252270850536746,
87+
"table_text_ratio": 0,
88+
"table_dom_depth": -1,
89+
"text_distribution_dispersion": 0.2663,
90+
"table": [], # [], ["layout"], ["data"], ["layout", "data"]
91+
"equation": [], # [], ["inline"], ["interline"], ["inline", "interline"]
92+
"code": [], # [], ["inline"], ["interline"], ["inline", "interline"]
93+
"table_complexity_score": 0,
94+
"dom_complexity_score": 0.8442,
95+
"text_dispersion_score": 0.2663,
96+
"content_diversity_score": 0,
97+
"link_complexity_score": 0.1225,
98+
"overall_complexity_score": 0.3083,
99+
"level": "mid" # simple, mid, hard
100+
}
94101
}
95102
```
96103

@@ -197,6 +204,95 @@ class MyExtractor(BaseExtractor):
197204
ExtractorFactory.register("my-extractor", MyExtractor)
198205
```
199206

207+
### 数据集统计分析工具
208+
209+
WebMainBench 提供了强大的数据集统计分析工具 `scripts/statics.py`,用于分析数据集的各种特征并自动生成复杂度评分和难易程度分类。
210+
211+
#### 功能特性
212+
213+
- **DOM结构分析**:计算网页DOM树的深度和宽度
214+
- **文本链接比例分析**:统计文本与链接的比例关系
215+
- **表格复杂度分析**:评估表格内容的复杂程度
216+
- **内容类型检测**:自动识别公式、代码、表格等特殊内容
217+
- **复杂度评分**:基于多维度指标计算综合复杂度得分
218+
- **动态难易程度分类**:基于数据分布自动分类为 simple/mid/hard
219+
220+
#### 使用方法
221+
222+
```bash
223+
# 基本用法
224+
python scripts/statics.py data/input.jsonl --output data/output_with_stats.jsonl
225+
226+
# 使用默认数据集
227+
python scripts/statics.py
228+
```
229+
230+
#### 参数说明
231+
232+
```bash
233+
# 查看所有可用参数
234+
python scripts/statics.py --help
235+
236+
```
237+
238+
#### 输出结果
239+
240+
工具会在每条数据的 `meta` 字段中添加以下统计信息:
241+
242+
```json
243+
{
244+
"meta": {
245+
"DOM_DEPTH": 25, // DOM树深度
246+
"DOM_WIDTH": 1200, // DOM树宽度
247+
"text_linktext_ratio": 0.85, // 文本链接比例
248+
"table_complexity_score": 0.3, // 表格复杂度得分
249+
"dom_complexity_score": 0.6, // DOM复杂度得分
250+
"text_dispersion_score": 0.4, // 文本分布得分
251+
"content_diversity_score": 0.7, // 内容多样性得分
252+
"link_complexity_score": 0.5, // 链接复杂度得分
253+
"overall_complexity_score": 0.52, // 综合复杂度得分
254+
"level": "mid" // 难易程度 (simple/mid/hard)
255+
}
256+
}
257+
```
258+
259+
#### 复杂度评分算法
260+
261+
综合复杂度得分由以下维度加权计算:
262+
263+
- **DOM结构复杂度 (25%)**:基于DOM深度和宽度,使用动态归一化
264+
- **文本分布复杂度 (25%)**:基于文本在DOM中的分布离散程度
265+
- **内容多样性 (25%)**:基于公式、代码、表格等特殊内容的种类
266+
- **链接复杂度 (25%)**:基于文本与链接的比例关系
267+
268+
#### 运行示例
269+
270+
```bash
271+
# 分析数据集并生成统计报告
272+
python scripts/statics.py data/sample_dataset.jsonl --output data/analyzed_dataset.jsonl
273+
274+
# 输出示例:
275+
🔄 第一阶段: 计算基础统计和复杂度得分...
276+
📊 已处理 100 条数据...
277+
📊 已处理 200 条数据...
278+
279+
🔄 第二阶段: 计算动态阈值和难易程度分类...
280+
📊 复杂度分布阈值计算:
281+
总样本数: 1,827
282+
30%分位数 (simple/mid分界): 0.3245
283+
70%分位数 (mid/hard分界): 0.6789
284+
复杂度得分范围: 0.0944 - 1.0000
285+
286+
📊 难易程度分类结果:
287+
Simple: 548 (30.0%)
288+
Mid: 731 (40.0%)
289+
Hard: 548 (30.0%)
290+
291+
📝 正在写入数据到: data/analyzed_dataset.jsonl
292+
✅ 成功写入 1,827 条数据
293+
```
294+
295+
200296
## 项目架构
201297

202298
```

examples/basic_usage.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -738,7 +738,7 @@ def demo_multi_extraction():
738738
# 配置文件路径
739739
data_dir = Path("../data")
740740
# dataset_path = data_dir / "sample_dataset.jsonl"
741-
dataset_path = "/home/lulindong/Pycharm_projects/cc/1827_split_jsonl/1-200.jsonl"
741+
dataset_path = "/home/lulindong/Pycharm_projects/cc/WebMainBench_1904_v1_WebMainBench_dataset_merge_with_llm_webkit.jsonl"
742742

743743
print(f"📂 数据集文件: {dataset_path}")
744744

@@ -889,7 +889,7 @@ def demo_llm_webkit_with_preprocessed_html_evaluation():
889889

890890
# 1. 从真实数据集加载包含预处理HTML的数据
891891
print("1. 从真实数据集加载预处理HTML数据...")
892-
dataset_path = Path("/home/lulindong/Pycharm_projects/cc/1827_split_jsonl/1-200.jsonl")
892+
dataset_path = Path("data/track_id_diff_result_56.jsonl")
893893
print(f"📂 数据集文件: {dataset_path}")
894894

895895
# 加载数据集
@@ -939,7 +939,6 @@ def demo_llm_webkit_with_preprocessed_html_evaluation():
939939

940940
print(f"\n📝 内容提取质量:")
941941
print(f" text_edit: {metrics.get('text_edit', 0):.4f}")
942-
print(f" formula_edit: {metrics.get('formula_edit', 0):.4f}")
943942
print(f" code_edit: {metrics.get('code_edit', 0):.4f}")
944943
print(f" table_edit: {metrics.get('table_edit', 0):.4f}")
945944
print(f" table_TEDS: {metrics.get('table_TEDS', 0):.4f}")
@@ -985,10 +984,10 @@ def demo_llm_webkit_with_preprocessed_html_evaluation():
985984
try:
986985
# demo_basic_mock_evaluation()
987986
# demo_llm_webkit_evaluation() # 使用LLM-WebKit评测示例
988-
# demo_llm_webkit_with_preprocessed_html_evaluation()
987+
demo_llm_webkit_with_preprocessed_html_evaluation()
989988
# demo_extractor_comparison()
990989
# demo_dataset_with_extraction() # 演示保存带有抽取内容的数据集
991-
demo_multi_extraction() # 演示多个抽取器同时评测
990+
# demo_multi_extraction() # 演示多个抽取器同时评测
992991
print("\n✅ 示例运行完成!")
993992

994993
except Exception as e:

examples/main_html_eval.py

Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
#!/usr/bin/env python3
2+
"""
3+
WebMainBench 基本使用示例
4+
"""
5+
6+
import json
7+
from pathlib import Path
8+
9+
# 导入 WebMainBench 模块
10+
from webmainbench import (
11+
DataLoader, DataSaver, BenchmarkDataset, DataSample,
12+
ExtractorFactory, MainHTMLEvaluator,
13+
format_results, setup_logging
14+
)
15+
16+
17+
def load_benchdata(dataset_path: str) -> "BenchmarkDataset":
    """Load a benchmark dataset from a JSONL file.

    Args:
        dataset_path: Path to the JSONL dataset file.

    Returns:
        The loaded ``BenchmarkDataset`` with the demo name/description set.

    Raises:
        FileNotFoundError: If ``dataset_path`` does not exist.  (The
        original code silently fell through with a bare ``return`` here,
        returning ``None`` despite the annotation, which made the caller
        fail later with a confusing ``len(None)`` TypeError.)
    """
    dataset_path = Path(dataset_path)
    print(f"📂 数据集文件: {dataset_path}")

    if not dataset_path.exists():
        print(f"❌ 数据文件不存在: {dataset_path}")
        print("请确保已运行数据提取命令创建样本数据集")
        raise FileNotFoundError(f"dataset file not found: {dataset_path}")

    # Load the dataset without any previously saved extraction results.
    dataset = DataLoader.load_jsonl(dataset_path, include_results=False)
    dataset.name = "real_preprocessed_html_test"
    dataset.description = "基于真实数据的预处理HTML功能测试"
    return dataset
31+
32+
33+
def load_extractor(model_path: str):
    """Create the "dripper" extractor configured with the given model path."""
    return ExtractorFactory.create("dripper", config={"model_path": model_path})
36+
37+
38+
def save_results(result_file: Path, results: list[dict]):
    """Write *results* to *result_file* as UTF-8 JSONL, one object per line."""
    serialized = (json.dumps(record, ensure_ascii=False) + "\n" for record in results)
    with result_file.open("w", encoding="utf-8") as out:
        out.writelines(serialized)
42+
43+
44+
45+
def demo_llm_webkit_with_preprocessed_html_evaluation(model_path: str) -> None:
    """Run the LLM-WebKit preprocessed-HTML evaluation demo end to end.

    Loads the benchmark dataset, creates the "dripper" extractor with the
    given model path, evaluates it with ``MainHTMLEvaluator``, prints the
    overall metrics and timing statistics, and saves the results as JSONL,
    JSON and CSV files under ``results/``.

    Args:
        model_path: Filesystem path of the LLM model handed to the extractor.
    """

    print("\n=== LLM-WebKit 预处理HTML功能演示 ===\n")

    # Configure logging before any work is done.
    setup_logging(level="INFO")

    # 1. Load data containing preprocessed HTML from the real dataset.
    print("1. 从真实数据集加载预处理HTML数据...")

    # load_benchdata() wraps DataLoader.load_jsonl for the real sample data.

    dataset = load_benchdata("data/WebMainBench_llm-webkit_v1_WebMainBench_1827_v1_WebMainBench_dataset_merge_with_llm_webkit.jsonl")
    print(f"✅ 真实数据集加载成功,包含 {len(dataset)} 个样本")



    # 2. Create the LLM-WebKit extractor in preprocessed-HTML mode.
    print("2. 创建预处理HTML模式的LLM-WebKit抽取器...")

    extractor = load_extractor(model_path)
    print(f"✅ 抽取器创建成功")
    print(f"📋 配置信息:")
    print(f" - 跳过LLM推理: 是(直接处理预处理HTML)")
    print()

    # 4. Run the evaluation (step numbers mirror the printed messages;
    # NOTE(review): the numbering 1,2,4,5,6 skips 3 in the original).
    print("4. 开始评测...")
    print("=" * 50)

    evaluator = MainHTMLEvaluator()
    # max_samples=None evaluates every sample in the dataset.
    result = evaluator.evaluate(
        dataset=dataset,
        extractor=extractor,
        max_samples=None
    )

    # 5. Display the evaluation results.
    print("\n5. 📊 预处理HTML模式评测结果:")
    print("=" * 50)

    results_dict = result.to_dict()
    metrics = results_dict.get('overall_metrics', {})

    # Print every overall metric with four decimal places.
    print(f"\n🏆 综合指标:")
    for key in metrics.keys():
        print(f" {key}: {metrics[key]:.4f}")

    print(f"\n⚡ 性能统计:")
    sample_results = results_dict.get('sample_results', [])
    if sample_results:
        # Average only over samples whose extraction actually succeeded.
        extraction_times = [s.get('extraction_time', 0) for s in sample_results if s.get('extraction_success')]
        if extraction_times:
            avg_time = sum(extraction_times) / len(extraction_times)
            print(f" 平均提取时间: {avg_time:.3f}秒")
            print(f" 处理速度: {1/avg_time:.1f}样本/秒")

        success_count = len([s for s in sample_results if s.get('extraction_success', False)])
        print(f" 成功样本数: {success_count}/{len(dataset)}")

    # 6. Persist the results.
    print(f"\n6. 💾 保存评测结果...")

    results_dir = Path("results")
    results_dir.mkdir(exist_ok=True)
    # Save the enhanced dataset (with per-sample extraction results) as JSONL.
    jsonl_dataset_path = results_dir / f"{extractor.name}_preprocessed_html_dataset_with_results.jsonl"
    save_results(jsonl_dataset_path, result.sample_results)
    print(f"✅ 结果已保存到: {jsonl_dataset_path}")


    print(f"✅ 带抽取结果的JSONL数据集已保存到: {jsonl_dataset_path}")
    results_path = results_dir / f"{extractor.name}_preprocessed_html_evaluation_results.json"
    report_path = results_dir / f"{extractor.name}_preprocessed_html_evaluation_report.csv"

    # Full machine-readable results plus a CSV summary report.
    DataSaver.save_evaluation_results(result, results_path)
    DataSaver.save_summary_report(result, report_path)

    print(f"✅ 详细结果已保存到: {results_path}")
    print(f"✅ CSV报告已保存到: {report_path}")
127+
128+
129+
130+
if __name__ == "__main__":
    import argparse

    # Parse the single required CLI argument: the model path for the demo.
    cli = argparse.ArgumentParser(description="WebMainBench 基本使用示例")
    cli.add_argument("--model_path", required=True, help="LLM model路径")
    options = cli.parse_args()
    try:
        demo_llm_webkit_with_preprocessed_html_evaluation(options.model_path)
        print("\n✅ 示例运行完成!")
    except Exception as err:
        # Top-level boundary: report the failure with a full traceback.
        print(f"\n❌ 运行出错: {err}")
        import traceback
        traceback.print_exc()

requirements.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,4 +10,5 @@ trafilatura
1010
# llm-web-kit==3.2.0
1111
https://github.com/opendatalab/magic-html/releases/download/magic_html-0.1.5-released/magic_html-0.1.5-py3-none-any.whl
1212
streamlit
13-
markdown
13+
markdown
14+
jieba

results/dataset_with_results.jsonl

Lines changed: 4 additions & 4 deletions
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)