Commit 5e39ad3

committed
Optimized the 6 computed metrics: samples with a zero-length prediction or zero-length ground truth are now ignored, so only valid data enters the calculation.
1 parent 931e6dd commit 5e39ad3

6 files changed

Lines changed: 242 additions & 78 deletions
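
In short, a sample now contributes to a metric's average only when both the prediction and the ground truth are non-empty. A minimal sketch of that rule (the helper name `is_valid_pair` is hypothetical, for illustration only, not part of the codebase):

    # Hypothetical helper illustrating the validity rule from the commit message.
    def is_valid_pair(predicted: str, groundtruth: str) -> bool:
        # Samples with an empty prediction or empty ground truth are skipped.
        return len(predicted or "") > 0 and len(groundtruth or "") > 0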


examples/basic_usage.py

Lines changed: 5 additions & 4 deletions
@@ -796,13 +796,15 @@ def demo_multi_extraction():
     from webmainbench import DataLoader, DataSaver, Evaluator, ExtractorFactory
     from pathlib import Path
     import time
+
+
     # Set up logging
     setup_logging(level="INFO")

     # Configure file paths
     data_dir = Path("../data")
-    dataset_path = data_dir / "sample_dataset.jsonl"
-    # dataset_path = "/home/lulindong/Pycharm_projects/cc/test.jsonl"
+    # dataset_path = data_dir / "sample_dataset.jsonl"
+    dataset_path = "/home/lulindong/Pycharm_projects/cc/WebMainBench_llm-webkit_v1_WebMainBench_dataset_merge_with_llm_webkit.jsonl"

     print(f"📂 Dataset file: {dataset_path}")

@@ -815,7 +817,6 @@ def demo_multi_extraction():
             "list_bullets": True,
             "preserve_formatting": True
         }},
-
         {"name": "trafilatura", "config": {}},
         {"name": "magic-html", "config": {}},
     ]
@@ -902,7 +903,7 @@ def demo_multi_extraction():
             all_results.append(result)

             # Save the dataset enriched with the current extractor's content
-            enriched_dataset_path = results_dir / f"{dataset.name}_with_{extractor.name}_extraction.jsonl"
+            enriched_dataset_path = results_dir / f"{dataset.name}_{extractor.name}_extraction_infer.jsonl"
             DataSaver.save_dataset_with_extraction(
                 results=result,
                 dataset=dataset,

webmainbench/evaluator/evaluator.py

Lines changed: 65 additions & 25 deletions
@@ -361,32 +361,72 @@ def _evaluate_sample(self, sample: DataSample, extractor: BaseExtractor) -> Dict

     def _aggregate_metrics(self, sample_results: List[Dict[str, Any]]) -> Dict[str, float]:
         """Aggregate metrics across all samples."""
-        # Collect metric results by metric name
-        metric_groups = {}
-
-        for sample_result in sample_results:
-            if not sample_result.get('extraction_success', True):
-                continue
-
-            metrics = sample_result.get('metrics', {})
-            for metric_name, metric_data in metrics.items():
-                if metric_data.get('success', False):
-                    if metric_name not in metric_groups:
-                        metric_groups[metric_name] = []
-                    metric_groups[metric_name].append(metric_data['score'])
-
-        # Calculate aggregated scores
-        aggregated_metrics = {}
-        for metric_name, scores in metric_groups.items():
-            if scores:
-                aggregated_metrics[metric_name] = sum(scores) / len(scores)
+        # # Collect metric results by metric name
+        # metric_groups = {}
+        #
+        # for sample_result in sample_results:
+        #     if not sample_result.get('extraction_success', True):
+        #         continue
+        #
+        #     metrics = sample_result.get('metrics', {})
+        #     for metric_name, metric_data in metrics.items():
+        #         if metric_data.get('success', False):
+        #             if metric_name not in metric_groups:
+        #                 metric_groups[metric_name] = []
+        #             metric_groups[metric_name].append(metric_data['score'])
+        #
+        # # Calculate aggregated scores
+        # aggregated_metrics = {}
+        # for metric_name, scores in metric_groups.items():
+        #     if scores:
+        #         aggregated_metrics[metric_name] = sum(scores) / len(scores)
+        #     else:
+        #         aggregated_metrics[metric_name] = 0.0
+        #
+        # # overall score is already calculated by MetricCalculator
+        # # No need to override it here
+        #
+        # return aggregated_metrics
+        """
+        Aggregate the metrics across all samples and compute global averages (each metric aggregated separately).
+        """
+        if not sample_results:
+            return {}
+
+        # Initialize the running total and valid-sample count for each metric
+        metric_totals = {
+            "text_edit": 0.0,
+            "code_edit": 0.0,
+            "table_edit": 0.0,
+            "table_TEDS": 0.0,
+            "formula_edit": 0.0,
+            "overall": 0.0  # the global overall is computed separately
+        }
+        metric_counts = {k: 0 for k in metric_totals.keys()}  # number of valid samples per metric
+
+        # Accumulate the metric scores over all samples
+        for sample in sample_results:
+            metrics = sample.get("metrics", {})
+            for metric_name in metric_totals.keys():
+                if metric_name in metrics and metrics[metric_name].get("success", False):
+                    metric_totals[metric_name] += metrics[metric_name]["score"]
+                    metric_counts[metric_name] += 1
+
+        # Average each metric (the global overall is the mean of the 5 individual metrics)
+        overall_metrics = {}
+        for metric_name in metric_totals.keys():
+            if metric_counts[metric_name] > 0:
+                overall_metrics[metric_name] = metric_totals[metric_name] / metric_counts[metric_name]
             else:
-                aggregated_metrics[metric_name] = 0.0
-
-        # overall score is already calculated by MetricCalculator
-        # No need to override it here
-
-        return aggregated_metrics
+                overall_metrics[metric_name] = 0.0  # default to 0 when there are no valid samples
+
+        # Special-case the global overall: always the mean of the 5 individual metrics
+        # (the sample-level overall is excluded; only the 5 core metrics are used)
+        core_metrics = ["text_edit", "code_edit", "table_edit", "table_TEDS", "formula_edit"]
+        core_scores = [overall_metrics[metric] for metric in core_metrics]
+        overall_metrics["overall"] = sum(core_scores) / len(core_metrics)
+
+        return overall_metrics

     def _calculate_category_metrics(self, sample_results: List[Dict[str, Any]],
                                     samples: List[DataSample]) -> Optional[Dict[str, Dict[str, float]]]:

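For orientation, here is the shape of the data `_aggregate_metrics` consumes and what it now returns, as a minimal sketch with invented scores (the structure mirrors the per-sample `metrics` dict used above):

    # Invented input: two samples, each with per-metric results.
    sample_results = [
        {"metrics": {"text_edit": {"success": True, "score": 0.9},
                     "table_edit": {"success": False, "score": 0.0}}},
        {"metrics": {"text_edit": {"success": True, "score": 0.7},
                     "table_edit": {"success": True, "score": 0.8}}},
    ]
    # text_edit: (0.9 + 0.7) / 2 = 0.8 over its 2 valid samples
    # table_edit: 0.8 over its 1 valid sample (the failed one is ignored)
    # overall: mean of the 5 core metrics, with no-valid-sample metrics counted as 0.0
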
webmainbench/extractors/trafilatura_extractor.py

Lines changed: 4 additions & 1 deletion
@@ -20,6 +20,8 @@ class TrafilaturaInferenceConfig:
     # More trafilatura-supported parameters can be added as needed
     include_images: bool = False
     include_links: bool = False
+    # New: output format to produce (txt/markdown/json/xml, etc.)
+    output_format: str = "markdown"  # defaults to markdown


 @extractor("trafilatura")
@@ -65,7 +67,8 @@ def _extract_content(self, html: str, url: str = None) -> ExtractionResult:
             include_comments=self.inference_config.include_comments,
             include_tables=self.inference_config.include_tables,
             include_images=self.inference_config.include_images,
-            include_links=self.inference_config.include_links
+            include_links=self.inference_config.include_links,
+            output_format=self.inference_config.output_format  # pass the output format through
         )

         # Build the content_list (simple paragraph splitting)

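The new `output_format` field maps directly onto trafilatura's own `extract()` parameter. A minimal standalone sketch of the pass-through (assumes a recent trafilatura release, since markdown output is not available in older versions):

    import trafilatura

    html = "<html><body><article><h1>Title</h1><p>Body text.</p></article></body></html>"
    text = trafilatura.extract(
        html,
        include_links=False,
        include_images=False,
        output_format="markdown",  # "txt", "json", "xml", ... are also accepted
    )
    print(text)
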
webmainbench/metrics/calculator.py

Lines changed: 68 additions & 28 deletions
@@ -70,36 +70,76 @@ def calculate_all(self, predicted_content: str,
         Returns:
             Dictionary mapping metric names to MetricResult instances
         """
-        results = {}
-
-        for metric_name, metric in self.metrics.items():
-            try:
-                if metric_name in ["edit_distance", "bleu", "rouge"]:
-                    # Text-based metrics
-                    result = metric.calculate(predicted_content, groundtruth_content, **kwargs)
-                elif metric_name in ["code_edit", "formula_edit",
-                                     "table_edit", "table_TEDS", "text_edit"]:
-                    # New content-type metrics that need the content_list passed through
-                    result = metric.calculate(
-                        predicted_content,
-                        groundtruth_content,
-                        predicted_content_list=predicted_content_list,
-                        groundtruth_content_list=groundtruth_content_list,
-                        **kwargs
-                    )
-                else:
-                    # Generic calculation
-                    result = metric.calculate(predicted_content, groundtruth_content, **kwargs)
-
-                results[metric_name] = result
-
-            except Exception as e:
-                # Create error result for failed metrics
-                results[metric_name] = MetricResult.create_error_result(
-                    metric_name, f"Metric calculation failed: {str(e)}"
+        # results = {}
+        #
+        # for metric_name, metric in self.metrics.items():
+        #     try:
+        #         if metric_name in ["edit_distance", "bleu", "rouge"]:
+        #             # Text-based metrics
+        #             result = metric.calculate(predicted_content, groundtruth_content, **kwargs)
+        #         elif metric_name in ["code_edit", "formula_edit",
+        #                              "table_edit", "table_TEDS", "text_edit"]:
+        #             # New content-type metrics that need the content_list passed through
+        #             result = metric.calculate(
+        #                 predicted_content,
+        #                 groundtruth_content,
+        #                 predicted_content_list=predicted_content_list,
+        #                 groundtruth_content_list=groundtruth_content_list,
+        #                 **kwargs
+        #             )
+        #         else:
+        #             # Generic calculation
+        #             result = metric.calculate(predicted_content, groundtruth_content, **kwargs)
+        #
+        #         results[metric_name] = result
+        #
+        #     except Exception as e:
+        #         # Create error result for failed metrics
+        #         results[metric_name] = MetricResult.create_error_result(
+        #             metric_name, f"Metric calculation failed: {str(e)}"
+        #         )
+
+        results: Dict[str, MetricResult] = {}
+
+        # 1. Compute the non-table metrics first (no dependencies)
+        for metric_name in list(self.metrics.keys()):
+            if metric_name in ["table_edit", "table_TEDS"]:
+                continue  # table-related metrics are handled separately
+
+            metric = self.metrics[metric_name]
+            result = metric.calculate(
+                predicted=predicted_content,
+                groundtruth=groundtruth_content,
+                predicted_content_list=predicted_content_list,
+                groundtruth_content_list=groundtruth_content_list, **kwargs
+            )
+            results[metric_name] = result
+
+        # 2. Handle the table-related metrics (which have a dependency)
+        # 2.1 Compute table_edit
+        if "table_edit" in self.metrics:
+            table_edit_result = self.metrics["table_edit"].calculate(
+                predicted=predicted_content,
+                groundtruth=groundtruth_content,
+                predicted_content_list=predicted_content_list,
+                groundtruth_content_list=groundtruth_content_list,
+                **kwargs
+            )
+            results["table_edit"] = table_edit_result
+
+        # 2.2 Compute table_TEDS (depends on the table_edit result)
+        if "table_TEDS" in self.metrics:
+            teds_result = self.metrics["table_TEDS"].calculate(
+                predicted=predicted_content,
+                groundtruth=groundtruth_content,
+                predicted_content_list=predicted_content_list,
+                groundtruth_content_list=groundtruth_content_list,
+                table_edit_result=table_edit_result,  # forward the dependency result
+                **kwargs
                 )
+            results["table_TEDS"] = teds_result

-        # Add overall score as average of all metrics
+        # 3. Compute the overall score (average of all successful metrics)
         successful_scores = []
         failed_metrics = []

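Seen from the caller's side, the ordering above means `table_TEDS` is only attempted once `table_edit` has produced its `MetricResult`. A minimal usage sketch (the calculator instance and the input variables are assumed to exist; names are illustrative):

    results = calculator.calculate_all(
        predicted_content=pred_md,                 # extracted markdown (assumed)
        groundtruth_content=gt_md,                 # reference markdown (assumed)
        predicted_content_list=pred_content_list,  # structured blocks, if available
        groundtruth_content_list=gt_content_list,
    )
    teds = results["table_TEDS"]
    if teds.success:
        print(f"TEDS: {teds.score:.3f}")
    else:
        # e.g. "Skipped due to table_edit failure: ..."
        print(teds.details)
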
webmainbench/metrics/teds_metrics.py

Lines changed: 74 additions & 15 deletions
@@ -39,37 +39,45 @@ def _calculate_score(self, predicted: Any, groundtruth: Any, **kwargs) -> Metric
             MetricResult with TEDS score
         """
         try:
-            # Convert inputs to HTML format
+            # New: check the result of the table_edit computation
+            table_edit_result = kwargs.get('table_edit_result')
+            if table_edit_result is None:
+                return MetricResult.create_error_result(
+                    self.name, "Missing table_edit result in kwargs"
+                )
+            if not table_edit_result.success:
+                # If table_edit failed, TEDS returns a failure immediately
+                return MetricResult.create_error_result(
+                    self.name,
+                    f"Skipped due to table_edit failure: {table_edit_result.details.get('error', 'unknown reason')}"
+                )
+
+            # Original logic: convert to HTML and parse the tree structures
             pred_html = self._normalize_to_html(predicted)
             gt_html = self._normalize_to_html(groundtruth)
-
-            # Parse HTML to tree structures
+
             pred_tree = self._parse_html_table(pred_html)
             gt_tree = self._parse_html_table(gt_html)
-
+
+            # The logic below is unchanged...
             if pred_tree is None and gt_tree is None:
-                # Both are empty/invalid tables
                 return MetricResult(
                     metric_name=self.name,
                     score=1.0,
                     details={"note": "Both tables are empty or invalid"}
                 )
-
+
             if pred_tree is None or gt_tree is None:
-                # One is empty/invalid
                 return MetricResult(
                     metric_name=self.name,
                     score=0.0,
                     details={"note": "One table is empty or invalid"}
                 )
-
-            # Calculate tree edit distance
+
             edit_distance = self._tree_edit_distance(pred_tree, gt_tree)
-
-            # Calculate TEDS score
             max_nodes = max(self._count_nodes(pred_tree), self._count_nodes(gt_tree))
             teds_score = 1.0 - (edit_distance / max_nodes) if max_nodes > 0 else 1.0
-
+
             details = {
                 "edit_distance": edit_distance,
                 "predicted_nodes": self._count_nodes(pred_tree),
@@ -78,17 +86,68 @@ def _calculate_score(self, predicted: Any, groundtruth: Any, **kwargs) -> Metric
                 "structure_only": self.structure_only,
                 "algorithm": "TEDS"
             }
-
+
             return MetricResult(
                 metric_name=self.name,
-                score=max(0.0, min(1.0, teds_score)),  # Clamp to [0, 1]
+                score=max(0.0, min(1.0, teds_score)),
                 details=details
             )
-
+
         except Exception as e:
             return MetricResult.create_error_result(
                 self.name, f"TEDS calculation failed: {str(e)}"
             )
+        # try:
+        #     # Convert inputs to HTML format
+        #     pred_html = self._normalize_to_html(predicted)
+        #     gt_html = self._normalize_to_html(groundtruth)
+        #
+        #     # Parse HTML to tree structures
+        #     pred_tree = self._parse_html_table(pred_html)
+        #     gt_tree = self._parse_html_table(gt_html)
+        #
+        #     if pred_tree is None and gt_tree is None:
+        #         # Both are empty/invalid tables
+        #         return MetricResult(
+        #             metric_name=self.name,
+        #             score=1.0,
+        #             details={"note": "Both tables are empty or invalid"}
+        #         )
+        #
+        #     if pred_tree is None or gt_tree is None:
+        #         # One is empty/invalid
+        #         return MetricResult(
+        #             metric_name=self.name,
+        #             score=0.0,
+        #             details={"note": "One table is empty or invalid"}
+        #         )
+        #
+        #     # Calculate tree edit distance
+        #     edit_distance = self._tree_edit_distance(pred_tree, gt_tree)
+        #
+        #     # Calculate TEDS score
+        #     max_nodes = max(self._count_nodes(pred_tree), self._count_nodes(gt_tree))
+        #     teds_score = 1.0 - (edit_distance / max_nodes) if max_nodes > 0 else 1.0
+        #
+        #     details = {
+        #         "edit_distance": edit_distance,
+        #         "predicted_nodes": self._count_nodes(pred_tree),
+        #         "groundtruth_nodes": self._count_nodes(gt_tree),
+        #         "max_nodes": max_nodes,
+        #         "structure_only": self.structure_only,
+        #         "algorithm": "TEDS"
+        #     }
+        #
+        #     return MetricResult(
+        #         metric_name=self.name,
+        #         score=max(0.0, min(1.0, teds_score)),  # Clamp to [0, 1]
+        #         details=details
+        #     )
+        #
+        # except Exception as e:
+        #     return MetricResult.create_error_result(
+        #         self.name, f"TEDS calculation failed: {str(e)}"
+        #     )

     def _normalize_to_html(self, table_data: Any) -> str:
         """Convert various table formats to HTML."""

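As a worked instance of the scoring line in the diff above (numbers invented for illustration):

    # Invented numbers, just to trace the TEDS formula above.
    edit_distance = 3
    max_nodes = max(10, 12)   # node counts of the predicted and ground-truth trees
    teds_score = 1.0 - (edit_distance / max_nodes) if max_nodes > 0 else 1.0
    print(max(0.0, min(1.0, teds_score)))  # 0.75, clamped to [0, 1]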