Optimize table edit distance calculation by comparing the normalized HTML directly

pekopoke · pekopoke · commit 772d72c2eafe · 2025-09-10T10:06:53.000+08:00
diff --git a/tests/test_metrics.py b/tests/test_metrics.py
@@ -129,8 +129,8 @@ def test_table_edit_metric(self):
         self.assertTrue(table_result.success)
         self.assertIsInstance(table_result.score, float)
         # 验证固定内容的确定分数
-        self.assertAlmostEqual(table_result.score, 0.7241379310344828, places=5,
-                               msg=f"table_edit分数应该是0.7241379310344828，实际: {table_result.score}")
+        self.assertAlmostEqual(table_result.score, 0.9333333333333333, places=5,
+                               msg=f"table_edit分数应该是0.9333333333333333，实际: {table_result.score}")
 
         # 验证详细信息
         self.assertEqual(table_result.details['content_type'], 'table')
@@ -874,8 +874,8 @@ def test_html_table_edit_distance(self):
         # 验证表格编辑距离（分隔符长度差异导致的固定分数）
         self.assertIn("table_edit", results)
         self.assertTrue(results["table_edit"].success)
-        self.assertAlmostEqual(results["table_edit"].score, 0.6237714987714988, places=5,
-                               msg=f"table_edit分数应该是0.6237714987714988，实际: {results['table_edit'].score}")
+        self.assertAlmostEqual(results["table_edit"].score, 0.5935733724094621, places=5,
+                               msg=f"table_edit分数应该是0.5935733724094621，实际: {results['table_edit'].score}")
 
         # 验证TEDS指标（表格结构完全相同，满分）
         self.assertIn("table_TEDS", results)
@@ -885,34 +885,15 @@ def test_html_table_edit_distance(self):
 
     def test_table_sample_edit_distance(self):
         """测试渲染一致,表格样式不一致的编辑距离"""
-        groundtruth = """## 销售数据统计
-
+        groundtruth = """
 | 产品 | 销量 | 收入 |
 |------|------|------|
 | 产品A | 100 | 1000 |
-| 产品B | 200 | 3000 |"""
-
-        predicted = """## <table>
-  <thead>
-    <tr>
-      <th>产品</th>
-      <th>销量</th>
-      <th>收入</th>
-    </tr>
-  </thead>
-  <tbody>
-    <tr>
-      <td>产品A</td>
-      <td>100</td>
-      <td>1000</td>
-    </tr>
-    <tr>
-      <td>产品B</td>
-      <td>200</td>
-      <td>3000</td>
-    </tr>
-  </tbody>
-</table>"""
+| 产品B | 200 | 3000 |
+"""
+
+        predicted = """ 
+<table><tr><th>产品</th><th>销量</th><th>收入</th></tr><tr><td>产品A</td><td>100</td><td>1000</td></tr><tr><td>产品B</td><td>200</td><td>3000</td></tr></table>"""
 
         results = self.calculator.calculate_all(
             predicted_content=predicted,
@@ -928,8 +909,8 @@ def test_table_sample_edit_distance(self):
         # 验证TEDS指标（表格结构完全相同，满分）
         self.assertIn("table_TEDS", results)
         self.assertTrue(results["table_TEDS"].success)
-        self.assertAlmostEqual(results["table_TEDS"].score, 0.9806224310041104, places=5,
-                               msg=f"table_TEDS分数应该是0.9806224310041104，实际: {results['table_TEDS'].score}")
+        self.assertAlmostEqual(results["table_TEDS"].score, 1.0, places=5,
+                               msg=f"table_TEDS分数应该是1.0，实际: {results['table_TEDS'].score}")
 
     def test_formula_sample_edit_distance(self):
         """测试公式样本的编辑距离"""
diff --git a/webmainbench/metrics/table_metrics.py b/webmainbench/metrics/table_metrics.py
@@ -30,16 +30,12 @@ def _calculate_score(self, predicted: str, groundtruth: str,
         pred_html = teds._normalize_to_html(pred_raw)  # 调用TEDS的归一化方法
         gt_html = teds._normalize_to_html(gt_raw)
 
-        # 3. 从归一化后的HTML中提取纯文本内容（保留表格结构）
-        pred_text = self._extract_text_from_html(pred_html)
-        gt_text = self._extract_text_from_html(gt_html)
-
-        # 4. 基于归一化后的文本计算编辑距离
-        result = super()._calculate_score(pred_text, gt_text, **kwargs)
+        # 3. Compute the edit distance directly on the normalized HTML
+        result = super()._calculate_score(pred_html, gt_html, **kwargs)
         result.metric_name = self.name
         result.details.update({
-            "predicted_table_length": len(pred_text),
-            "groundtruth_table_length": len(gt_text),
+            "predicted_table_length": len(pred_html),
+            "groundtruth_table_length": len(gt_html),
             "content_type": "table",
             "normalization": "teds_based"  # 标记使用TEDS的归一化方法
         })
@@ -52,23 +48,6 @@ def _extract_table_content(self, text: str, content_list: List[Dict[str, Any]] =
         content_parts = self.split_content(text, content_list)
         return content_parts.get('table', '')
 
-    def _extract_text_from_html(self, html: str) -> str:
-        """从HTML表格中提取纯文本内容（忽略标签，保留数据结构）"""
-        if not html.strip():
-            return ""
-        soup = BeautifulSoup(html, 'html.parser')
-        table = soup.find('table')
-        if not table:
-            return ""
-        # 按行提取文本，用换行分隔行，用空格分隔单元格
-        text_parts = []
-        for row in table.find_all('tr'):
-            cells = row.find_all(['th', 'td'])
-            row_text = ' '.join([cell.get_text(strip=True) for cell in cells])
-            if row_text:
-                text_parts.append(row_text)
-        return '\n'.join(text_parts)
-
     def _extract_tables_from_content_list(self, content_list: List[Dict[str, Any]]) -> List[str]:
         """递归从content_list中提取表格内容"""
         tables = []