Skip to content

Commit a3428db

Browse files
committed
fix code match :delete inline code
1 parent b86036d commit a3428db

28 files changed

Lines changed: 3144 additions & 332 deletions

README.md

Lines changed: 112 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -74,23 +74,30 @@ print(f"Overall Score: {result.overall_metrics['overall']:.4f}")
7474
{
7575
"track_id": "0b7f2636-d35f-40bf-9b7f-94be4bcbb396",
7676
"html": "<html><body><h1 cc-select=\"true\">这是标题</h1></body></html>", # 人工标注带cc-select="true" 属性
77-
"groundtruth_content": "# 标题\n\n正文内容",
78-
"groundtruth_content_list": [
79-
{"type": "heading", "content": "标题", "level": 1},
80-
{"type": "paragraph", "content": "正文内容"}
81-
],
8277
"url": "https://orderyourbooks.com/product-category/college-books-p-u/?products-per-page=all",
83-
"layout_id": "orderyourbooks.com_4",
84-
"max_layer_n": 10,
85-
"url_host_name": "orderyourbooks.com",
86-
"raw_warc_path": "s3://cc-raw-huawei/crawl-data/CC-MAIN-2025-13/segments/1742004433093.21/warc/CC-MAIN-20250319080618-20250319110618-00909.warc.gz?bytes=461610805,172252",
87-
"language": "en",
88-
"__dom_depth": 19,
89-
"__dom_width": 10231,
90-
"__type": "__max_depth",
91-
"__tag": "DOM_WIDTH",
92-
"marked_type": "unwanted", # normal:正常标注的网页;unable:正文内容无法抉择;unwanted:无需标注的网页;
93-
"unwanted_reason": "list"
78+
"main_html": "<h1 cc-select=\"true\">这是标题</h1>", # 从html中剪枝得到的正文html
79+
"convert_main_content": "# 这是标题", # 从main_html+html2text转化来
80+
"groundtruth_content": "# 这是标题", # 人工校准的markdown(部分提供)
81+
"meta": {
82+
"language": "en", # 网页的语言
83+
"style": "artical", # 网页的文体
84+
"DOM_WIDTH": 176,
85+
"DOM_DEPTH": 27,
86+
"text_linktext_ratio": 0.12252270850536746,
87+
"table_text_ratio": 0,
88+
"table_dom_depth": -1,
89+
"text_distribution_dispersion": 0.2663,
90+
"table": [], # [], ["layout"], ["data"], ["layout", "data"]
91+
"equation": [], # [], ["inline"], ["interline"], ["inline", "interline"]
92+
"code": [], # [], ["inline"], ["interline"], ["inline", "interline"]
93+
"table_complexity_score": 0,
94+
"dom_complexity_score": 0.8442,
95+
"text_dispersion_score": 0.2663,
96+
"content_diversity_score": 0,
97+
"link_complexity_score": 0.1225,
98+
"overall_complexity_score": 0.3083,
99+
"level": "mid" # simple, mid, hard
100+
}
94101
}
95102
```
96103

@@ -197,6 +204,95 @@ class MyExtractor(BaseExtractor):
197204
ExtractorFactory.register("my-extractor", MyExtractor)
198205
```
199206

207+
### 数据集统计分析工具
208+
209+
WebMainBench 提供了强大的数据集统计分析工具 `scripts/statics.py`,用于分析数据集的各种特征并自动生成复杂度评分和难易程度分类。
210+
211+
#### 功能特性
212+
213+
- **DOM结构分析**:计算网页DOM树的深度和宽度
214+
- **文本链接比例分析**:统计文本与链接的比例关系
215+
- **表格复杂度分析**:评估表格内容的复杂程度
216+
- **内容类型检测**:自动识别公式、代码、表格等特殊内容
217+
- **复杂度评分**:基于多维度指标计算综合复杂度得分
218+
- **动态难易程度分类**:基于数据分布自动分类为 simple/mid/hard
219+
220+
#### 使用方法
221+
222+
```bash
223+
# 基本用法
224+
python scripts/statics.py data/input.jsonl --output data/output_with_stats.jsonl
225+
226+
# 使用默认数据集
227+
python scripts/statics.py
228+
```
229+
230+
#### 参数说明
231+
232+
```bash
233+
# 查看所有可用参数
234+
python scripts/statics.py --help
235+
236+
```
237+
238+
#### 输出结果
239+
240+
工具会在每条数据的 `meta` 字段中添加以下统计信息:
241+
242+
```json
243+
{
244+
"meta": {
245+
"DOM_DEPTH": 25, // DOM树深度
246+
"DOM_WIDTH": 1200, // DOM树宽度
247+
"text_linktext_ratio": 0.85, // 文本链接比例
248+
"table_complexity_score": 0.3, // 表格复杂度得分
249+
"dom_complexity_score": 0.6, // DOM复杂度得分
250+
"text_dispersion_score": 0.4, // 文本分布得分
251+
"content_diversity_score": 0.7, // 内容多样性得分
252+
"link_complexity_score": 0.5, // 链接复杂度得分
253+
"overall_complexity_score": 0.52, // 综合复杂度得分
254+
"level": "mid" // 难易程度 (simple/mid/hard)
255+
}
256+
}
257+
```
258+
259+
#### 复杂度评分算法
260+
261+
综合复杂度得分由以下维度加权计算:
262+
263+
- **DOM结构复杂度 (25%)**:基于DOM深度和宽度,使用动态归一化
264+
- **文本分布复杂度 (25%)**:基于文本在DOM中的分布离散程度
265+
- **内容多样性 (25%)**:基于公式、代码、表格等特殊内容的种类
266+
- **链接复杂度 (25%)**:基于文本与链接的比例关系
267+
268+
#### 运行示例
269+
270+
```bash
271+
# 分析数据集并生成统计报告
272+
python scripts/statics.py data/sample_dataset.jsonl --output data/analyzed_dataset.jsonl
273+
274+
# 输出示例:
275+
🔄 第一阶段: 计算基础统计和复杂度得分...
276+
📊 已处理 100 条数据...
277+
📊 已处理 200 条数据...
278+
279+
🔄 第二阶段: 计算动态阈值和难易程度分类...
280+
📊 复杂度分布阈值计算:
281+
总样本数: 1,827
282+
30%分位数 (simple/mid分界): 0.3245
283+
70%分位数 (mid/hard分界): 0.6789
284+
复杂度得分范围: 0.0944 - 1.0000
285+
286+
📊 难易程度分类结果:
287+
Simple: 548 (30.0%)
288+
Mid: 731 (40.0%)
289+
Hard: 548 (30.0%)
290+
291+
📝 正在写入数据到: data/analyzed_dataset.jsonl
292+
✅ 成功写入 1,827 条数据
293+
```
294+
295+
200296
## 项目架构
201297

202298
```

examples/basic_usage.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -738,7 +738,7 @@ def demo_multi_extraction():
738738
# 配置文件路径
739739
data_dir = Path("../data")
740740
# dataset_path = data_dir / "sample_dataset.jsonl"
741-
dataset_path = "/home/lulindong/Pycharm_projects/cc/1827_split_jsonl/1-200.jsonl"
741+
dataset_path = "/home/lulindong/Pycharm_projects/cc/WebMainBench_1904_v1_WebMainBench_dataset_merge_with_llm_webkit.jsonl"
742742

743743
print(f"📂 数据集文件: {dataset_path}")
744744

@@ -889,7 +889,7 @@ def demo_llm_webkit_with_preprocessed_html_evaluation():
889889

890890
# 1. 从真实数据集加载包含预处理HTML的数据
891891
print("1. 从真实数据集加载预处理HTML数据...")
892-
dataset_path = Path("/home/lulindong/Pycharm_projects/cc/1827_split_jsonl/1-200.jsonl")
892+
dataset_path = Path("data/track_id_diff_result_56.jsonl")
893893
print(f"📂 数据集文件: {dataset_path}")
894894

895895
# 加载数据集
@@ -939,7 +939,6 @@ def demo_llm_webkit_with_preprocessed_html_evaluation():
939939

940940
print(f"\n📝 内容提取质量:")
941941
print(f" text_edit: {metrics.get('text_edit', 0):.4f}")
942-
print(f" formula_edit: {metrics.get('formula_edit', 0):.4f}")
943942
print(f" code_edit: {metrics.get('code_edit', 0):.4f}")
944943
print(f" table_edit: {metrics.get('table_edit', 0):.4f}")
945944
print(f" table_TEDS: {metrics.get('table_TEDS', 0):.4f}")
@@ -985,10 +984,10 @@ def demo_llm_webkit_with_preprocessed_html_evaluation():
985984
try:
986985
# demo_basic_mock_evaluation()
987986
# demo_llm_webkit_evaluation() # 使用LLM-WebKit评测示例
988-
# demo_llm_webkit_with_preprocessed_html_evaluation()
987+
demo_llm_webkit_with_preprocessed_html_evaluation()
989988
# demo_extractor_comparison()
990989
# demo_dataset_with_extraction() # 演示保存带有抽取内容的数据集
991-
demo_multi_extraction() # 演示多个抽取器同时评测
990+
# demo_multi_extraction() # 演示多个抽取器同时评测
992991
print("\n✅ 示例运行完成!")
993992

994993
except Exception as e:

examples/main_html_eval.py

Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
#!/usr/bin/env python3
2+
"""
3+
WebMainBench 基本使用示例
4+
"""
5+
6+
import json
7+
from pathlib import Path
8+
9+
# 导入 WebMainBench 模块
10+
from webmainbench import (
11+
DataLoader, DataSaver, BenchmarkDataset, DataSample,
12+
ExtractorFactory, MainHTMLEvaluator,
13+
format_results, setup_logging
14+
)
15+
16+
17+
def load_benchdata(dataset_path: str) -> "BenchmarkDataset":
    """Load a benchmark dataset from a JSONL file.

    Args:
        dataset_path: Path to the JSONL dataset file.

    Returns:
        The loaded ``BenchmarkDataset`` with the demo name/description set.

    Raises:
        FileNotFoundError: If ``dataset_path`` does not exist.  (The
        original code silently fell through with a bare ``return`` here,
        returning ``None`` despite the annotation, which made the caller
        fail later with a confusing ``len(None)`` TypeError.)
    """
    dataset_path = Path(dataset_path)
    print(f"📂 数据集文件: {dataset_path}")

    if not dataset_path.exists():
        print(f"❌ 数据文件不存在: {dataset_path}")
        print("请确保已运行数据提取命令创建样本数据集")
        raise FileNotFoundError(f"dataset file not found: {dataset_path}")

    # Load the dataset without any previously saved extraction results.
    dataset = DataLoader.load_jsonl(dataset_path, include_results=False)
    dataset.name = "real_preprocessed_html_test"
    dataset.description = "基于真实数据的预处理HTML功能测试"
    return dataset
31+
32+
33+
def load_extractor(model_path: str):
    """Create the "dripper" extractor configured with the given model path."""
    return ExtractorFactory.create("dripper", config={"model_path": model_path})
36+
37+
38+
def save_results(result_file: Path, results: list[dict]):
    """Write *results* to *result_file* as UTF-8 JSONL, one object per line."""
    serialized = (json.dumps(record, ensure_ascii=False) + "\n" for record in results)
    with result_file.open("w", encoding="utf-8") as out:
        out.writelines(serialized)
42+
43+
44+
45+
def demo_llm_webkit_with_preprocessed_html_evaluation(model_path: str) -> None:
    """Run the LLM-WebKit preprocessed-HTML evaluation demo end to end.

    Loads the benchmark dataset, creates the "dripper" extractor with the
    given model path, evaluates it with ``MainHTMLEvaluator``, prints the
    overall metrics and timing statistics, and saves the results as JSONL,
    JSON and CSV files under ``results/``.

    Args:
        model_path: Filesystem path of the LLM model handed to the extractor.
    """

    print("\n=== LLM-WebKit 预处理HTML功能演示 ===\n")

    # Configure logging before any work is done.
    setup_logging(level="INFO")

    # 1. Load data containing preprocessed HTML from the real dataset.
    print("1. 从真实数据集加载预处理HTML数据...")

    # load_benchdata() wraps DataLoader.load_jsonl for the real sample data.

    dataset = load_benchdata("data/WebMainBench_llm-webkit_v1_WebMainBench_1827_v1_WebMainBench_dataset_merge_with_llm_webkit.jsonl")
    print(f"✅ 真实数据集加载成功,包含 {len(dataset)} 个样本")



    # 2. Create the LLM-WebKit extractor in preprocessed-HTML mode.
    print("2. 创建预处理HTML模式的LLM-WebKit抽取器...")

    extractor = load_extractor(model_path)
    print(f"✅ 抽取器创建成功")
    print(f"📋 配置信息:")
    print(f" - 跳过LLM推理: 是(直接处理预处理HTML)")
    print()

    # 4. Run the evaluation (step numbers mirror the printed messages;
    # NOTE(review): the numbering 1,2,4,5,6 skips 3 in the original).
    print("4. 开始评测...")
    print("=" * 50)

    evaluator = MainHTMLEvaluator()
    # max_samples=None evaluates every sample in the dataset.
    result = evaluator.evaluate(
        dataset=dataset,
        extractor=extractor,
        max_samples=None
    )

    # 5. Display the evaluation results.
    print("\n5. 📊 预处理HTML模式评测结果:")
    print("=" * 50)

    results_dict = result.to_dict()
    metrics = results_dict.get('overall_metrics', {})

    # Print every overall metric with four decimal places.
    print(f"\n🏆 综合指标:")
    for key in metrics.keys():
        print(f" {key}: {metrics[key]:.4f}")

    print(f"\n⚡ 性能统计:")
    sample_results = results_dict.get('sample_results', [])
    if sample_results:
        # Average only over samples whose extraction actually succeeded.
        extraction_times = [s.get('extraction_time', 0) for s in sample_results if s.get('extraction_success')]
        if extraction_times:
            avg_time = sum(extraction_times) / len(extraction_times)
            print(f" 平均提取时间: {avg_time:.3f}秒")
            print(f" 处理速度: {1/avg_time:.1f}样本/秒")

        success_count = len([s for s in sample_results if s.get('extraction_success', False)])
        print(f" 成功样本数: {success_count}/{len(dataset)}")

    # 6. Persist the results.
    print(f"\n6. 💾 保存评测结果...")

    results_dir = Path("results")
    results_dir.mkdir(exist_ok=True)
    # Save the enhanced dataset (with per-sample extraction results) as JSONL.
    jsonl_dataset_path = results_dir / f"{extractor.name}_preprocessed_html_dataset_with_results.jsonl"
    save_results(jsonl_dataset_path, result.sample_results)
    print(f"✅ 结果已保存到: {jsonl_dataset_path}")


    print(f"✅ 带抽取结果的JSONL数据集已保存到: {jsonl_dataset_path}")
    results_path = results_dir / f"{extractor.name}_preprocessed_html_evaluation_results.json"
    report_path = results_dir / f"{extractor.name}_preprocessed_html_evaluation_report.csv"

    # Full machine-readable results plus a CSV summary report.
    DataSaver.save_evaluation_results(result, results_path)
    DataSaver.save_summary_report(result, report_path)

    print(f"✅ 详细结果已保存到: {results_path}")
    print(f"✅ CSV报告已保存到: {report_path}")
127+
128+
129+
130+
if __name__ == "__main__":
    import argparse

    # Parse the single required CLI argument: the model path for the demo.
    cli = argparse.ArgumentParser(description="WebMainBench 基本使用示例")
    cli.add_argument("--model_path", required=True, help="LLM model路径")
    options = cli.parse_args()
    try:
        demo_llm_webkit_with_preprocessed_html_evaluation(options.model_path)
        print("\n✅ 示例运行完成!")
    except Exception as err:
        # Top-level boundary: report the failure with a full traceback.
        print(f"\n❌ 运行出错: {err}")
        import traceback
        traceback.print_exc()

requirements.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,4 +10,5 @@ trafilatura
1010
# llm-web-kit==3.2.0
1111
https://github.com/opendatalab/magic-html/releases/download/magic_html-0.1.5-released/magic_html-0.1.5-py3-none-any.whl
1212
streamlit
13-
markdown
13+
markdown
14+
jieba

results/dataset_with_results.jsonl

Lines changed: 4 additions & 4 deletions
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)