Optimize table edit distance calculation by using normalize

pekopoke · pekopoke · commit 34f517ddb9af · 2025-09-09T15:28:23.000+08:00
diff --git a/tests/test_metrics.py b/tests/test_metrics.py
@@ -129,8 +129,8 @@ def test_table_edit_metric(self):
         self.assertTrue(table_result.success)
         self.assertIsInstance(table_result.score, float)
         # 验证固定内容的确定分数
-        self.assertAlmostEqual(table_result.score, 0.868852, places=5,
-                               msg=f"table_edit分数应该是0.868852，实际: {table_result.score}")
+        self.assertAlmostEqual(table_result.score, 0.7241379310344828, places=5,
+                               msg=f"table_edit分数应该是0.7241379310344828，实际: {table_result.score}")
 
         # 验证详细信息
         self.assertEqual(table_result.details['content_type'], 'table')
@@ -874,47 +874,62 @@ def test_html_table_edit_distance(self):
         # 验证表格编辑距离（分隔符长度差异导致的固定分数）
         self.assertIn("table_edit", results)
         self.assertTrue(results["table_edit"].success)
-        self.assertAlmostEqual(results["table_edit"].score, 0.593573, places=5,
-                               msg=f"table_edit分数应该是0.593573，实际: {results['table_edit'].score}")
+        self.assertAlmostEqual(results["table_edit"].score, 0.6237714987714988, places=5,
+                               msg=f"table_edit分数应该是0.6237714987714988，实际: {results['table_edit'].score}")
 
         # 验证TEDS指标（表格结构完全相同，满分）
         self.assertIn("table_TEDS", results)
         self.assertTrue(results["table_TEDS"].success)
         self.assertAlmostEqual(results["table_TEDS"].score, 0.9984520490180891, places=5,
                                msg=f"table_TEDS分数应该是0.0.9984520490180891，实际: {results['table_TEDS'].score}")
 
-        def test_table_sample_edit_distance(self):
-            """测试表格样本的编辑距离"""
-            groundtruth = """## 销售数据统计
-
-    | 产品 | 销量 | 收入 |
-    |------|------|------|
-    | 产品A | 100 | 1000 |
-    | 产品B | 200 | 3000 |"""
-
-            predicted = """## 销售数据统计
-
-    | 产品 | 销量 | 收入 |
-    |---|---|---|
-    | 产品A | 100 | 1000 |
-    | 产品B | 200 | 3000 |"""
-
-            results = self.calculator.calculate_all(
-                predicted_content=predicted,
-                groundtruth_content=groundtruth
-            )
-
-            # 验证表格编辑距离（分隔符长度差异导致的固定分数）
-            self.assertIn("table_edit", results)
-            self.assertTrue(results["table_edit"].success)
-            self.assertAlmostEqual(results["table_edit"].score, 0.888889, places=5,
-                                   msg=f"table_edit分数应该是0.888889，实际: {results['table_edit'].score}")
-
-            # 验证TEDS指标（表格结构完全相同，满分）
-            self.assertIn("table_TEDS", results)
-            self.assertTrue(results["table_TEDS"].success)
-            self.assertAlmostEqual(results["table_TEDS"].score, 1.000000, places=5,
-                                   msg=f"table_TEDS分数应该是1.000000，实际: {results['table_TEDS'].score}")
+    def test_table_sample_edit_distance(self):
+        """测试渲染一致,表格样式不一致的编辑距离"""
+        groundtruth = """## 销售数据统计
+
+| 产品 | 销量 | 收入 |
+|------|------|------|
+| 产品A | 100 | 1000 |
+| 产品B | 200 | 3000 |"""
+
+        predicted = """## <table>
+  <thead>
+    <tr>
+      <th>产品</th>
+      <th>销量</th>
+      <th>收入</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td>产品A</td>
+      <td>100</td>
+      <td>1000</td>
+    </tr>
+    <tr>
+      <td>产品B</td>
+      <td>200</td>
+      <td>3000</td>
+    </tr>
+  </tbody>
+</table>"""
+
+        results = self.calculator.calculate_all(
+            predicted_content=predicted,
+            groundtruth_content=groundtruth
+        )
+
+        # 验证表格编辑距离（分隔符长度差异导致的固定分数）
+        self.assertIn("table_edit", results)
+        self.assertTrue(results["table_edit"].success)
+        self.assertAlmostEqual(results["table_edit"].score, 1.0, places=5,
+                               msg=f"table_edit分数应该是1.0，实际: {results['table_edit'].score}")
+
+        # 验证TEDS指标（表格结构完全相同，满分）
+        self.assertIn("table_TEDS", results)
+        self.assertTrue(results["table_TEDS"].success)
+        self.assertAlmostEqual(results["table_TEDS"].score, 0.9806224310041104, places=5,
+                               msg=f"table_TEDS分数应该是0.9806224310041104，实际: {results['table_TEDS'].score}")
 
     def test_formula_sample_edit_distance(self):
         """测试公式样本的编辑距离"""
diff --git a/webmainbench/metrics/table_metrics.py b/webmainbench/metrics/table_metrics.py
@@ -7,7 +7,7 @@
 from .base import BaseMetric, MetricResult
 from .teds_metrics import TEDSMetric, StructureTEDSMetric
 from .text_metrics import EditDistanceMetric
-
+from bs4 import BeautifulSoup
 
 class TableEditMetric(EditDistanceMetric):
     """表格编辑距离指标"""
@@ -24,13 +24,21 @@ def _calculate_score(self, predicted: str, groundtruth: str,
         # 从content_list中提取表格内容
         pred_table = self._extract_table_content(predicted, predicted_content_list)
         gt_table = self._extract_table_content(groundtruth, groundtruth_content_list)
+
+        # 统一转换为HTML格式（复用TEDSMetric的归一化逻辑）
+        pred_html = self._normalize_to_html(pred_table)
+        gt_html = self._normalize_to_html(gt_table)
+
+        # 从HTML中提取纯文本内容（忽略标签，仅保留表格数据）
+        pred_text = self._extract_text_from_html(pred_html)
+        gt_text = self._extract_text_from_html(gt_html)
         
         # 计算编辑距离
-        result = super()._calculate_score(pred_table, gt_table, **kwargs)
+        result = super()._calculate_score(pred_text, gt_text, **kwargs)
         result.metric_name = self.name
         result.details.update({
-            "predicted_table_length": len(pred_table),
-            "groundtruth_table_length": len(gt_table),
+            "predicted_table_length": len(pred_text),
+            "groundtruth_table_length": len(gt_text),
             "content_type": "table"
         })
         
@@ -41,7 +49,56 @@ def _extract_table_content(self, text: str, content_list: List[Dict[str, Any]] =
         # 使用统一的内容分割方法
         content_parts = self.split_content(text, content_list)
         return content_parts.get('table', '')
-    
+
+    def _normalize_to_html(self, table_data: str) -> str:
+        """复用TEDSMetric的表格格式归一化逻辑，统一转换为HTML"""
+        # 若输入为空，直接返回空字符串
+        if not table_data.strip():
+            return ""
+        # 若已为HTML表格，直接返回
+        if '<table' in table_data.lower():
+            return table_data
+        # 若为Markdown表格，转换为HTML
+        if '|' in table_data:
+            return self._markdown_to_html(table_data)
+        # 其他格式视为纯文本表格，简单包裹为HTML
+        return f"<table><tr><td>{table_data}</td></tr></table>"
+
+    def _markdown_to_html(self, markdown: str) -> str:
+        """将Markdown表格转换为HTML（复用TEDSMetric逻辑）"""
+        lines = [line.strip() for line in markdown.split('\n') if line.strip()]
+        table_lines = [line for line in lines if '|' in line]
+        if not table_lines:
+            return ""
+        html_parts = ["<table>"]
+        # 过滤分隔线（如 |---|）
+        data_lines = [line for line in table_lines if not re.match(r'^[\s\|\-:]+$', line)]
+        for i, line in enumerate(data_lines):
+            cells = [cell.strip() for cell in line.split('|') if cell.strip()]
+            if cells:
+                # 首行视为表头（th），其余为单元格（td）
+                tag = "th" if i == 0 else "td"
+                html_parts.append(f"<tr>{''.join(f'<{tag}>{cell}</{tag}>' for cell in cells)}</tr>")
+        html_parts.append("</table>")
+        return ''.join(html_parts)
+
+    def _extract_text_from_html(self, html: str) -> str:
+        """从HTML表格中提取纯文本内容（忽略标签，保留数据结构）"""
+        if not html.strip():
+            return ""
+        soup = BeautifulSoup(html, 'html.parser')
+        table = soup.find('table')
+        if not table:
+            return ""
+        # 按行提取文本，用换行分隔行，用空格分隔单元格
+        text_parts = []
+        for row in table.find_all('tr'):
+            cells = row.find_all(['th', 'td'])
+            row_text = ' '.join([cell.get_text(strip=True) for cell in cells])
+            if row_text:
+                text_parts.append(row_text)
+        return '\n'.join(text_parts)
+
     def _extract_tables_from_content_list(self, content_list: List[Dict[str, Any]]) -> List[str]:
         """递归从content_list中提取表格内容"""
         tables = []