Skip to content

Commit 772d72c

Browse files
committed
Optimize table edit distance calculation by comparing the TEDS-normalized HTML directly
1 parent 2995e2f commit 772d72c

2 files changed

Lines changed: 16 additions & 56 deletions

File tree

tests/test_metrics.py

Lines changed: 12 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -129,8 +129,8 @@ def test_table_edit_metric(self):
129129
self.assertTrue(table_result.success)
130130
self.assertIsInstance(table_result.score, float)
131131
# 验证固定内容的确定分数
132-
self.assertAlmostEqual(table_result.score, 0.7241379310344828, places=5,
133-
msg=f"table_edit分数应该是0.7241379310344828,实际: {table_result.score}")
132+
self.assertAlmostEqual(table_result.score, 0.9333333333333333, places=5,
133+
msg=f"table_edit分数应该是0.9333333333333333,实际: {table_result.score}")
134134

135135
# 验证详细信息
136136
self.assertEqual(table_result.details['content_type'], 'table')
@@ -874,8 +874,8 @@ def test_html_table_edit_distance(self):
874874
# 验证表格编辑距离(分隔符长度差异导致的固定分数)
875875
self.assertIn("table_edit", results)
876876
self.assertTrue(results["table_edit"].success)
877-
self.assertAlmostEqual(results["table_edit"].score, 0.6237714987714988, places=5,
878-
msg=f"table_edit分数应该是0.6237714987714988,实际: {results['table_edit'].score}")
877+
self.assertAlmostEqual(results["table_edit"].score, 0.5935733724094621, places=5,
878+
msg=f"table_edit分数应该是0.5935733724094621,实际: {results['table_edit'].score}")
879879

880880
# 验证TEDS指标(表格结构完全相同,满分)
881881
self.assertIn("table_TEDS", results)
@@ -885,34 +885,15 @@ def test_html_table_edit_distance(self):
885885

886886
def test_table_sample_edit_distance(self):
887887
"""测试渲染一致,表格样式不一致的编辑距离"""
888-
groundtruth = """## 销售数据统计
889-
888+
groundtruth = """
890889
| 产品 | 销量 | 收入 |
891890
|------|------|------|
892891
| 产品A | 100 | 1000 |
893-
| 产品B | 200 | 3000 |"""
894-
895-
predicted = """## <table>
896-
<thead>
897-
<tr>
898-
<th>产品</th>
899-
<th>销量</th>
900-
<th>收入</th>
901-
</tr>
902-
</thead>
903-
<tbody>
904-
<tr>
905-
<td>产品A</td>
906-
<td>100</td>
907-
<td>1000</td>
908-
</tr>
909-
<tr>
910-
<td>产品B</td>
911-
<td>200</td>
912-
<td>3000</td>
913-
</tr>
914-
</tbody>
915-
</table>"""
892+
| 产品B | 200 | 3000 |
893+
"""
894+
895+
predicted = """
896+
<table><tr><th>产品</th><th>销量</th><th>收入</th></tr><tr><td>产品A</td><td>100</td><td>1000</td></tr><tr><td>产品B</td><td>200</td><td>3000</td></tr></table>"""
916897

917898
results = self.calculator.calculate_all(
918899
predicted_content=predicted,
@@ -928,8 +909,8 @@ def test_table_sample_edit_distance(self):
928909
# 验证TEDS指标(表格结构完全相同,满分)
929910
self.assertIn("table_TEDS", results)
930911
self.assertTrue(results["table_TEDS"].success)
931-
self.assertAlmostEqual(results["table_TEDS"].score, 0.9806224310041104, places=5,
932-
msg=f"table_TEDS分数应该是0.9806224310041104,实际: {results['table_TEDS'].score}")
912+
self.assertAlmostEqual(results["table_TEDS"].score, 1.0, places=5,
913+
msg=f"table_TEDS分数应该是1.0,实际: {results['table_TEDS'].score}")
933914

934915
def test_formula_sample_edit_distance(self):
935916
"""测试公式样本的编辑距离"""

webmainbench/metrics/table_metrics.py

Lines changed: 4 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -30,16 +30,12 @@ def _calculate_score(self, predicted: str, groundtruth: str,
3030
pred_html = teds._normalize_to_html(pred_raw) # 调用TEDS的归一化方法
3131
gt_html = teds._normalize_to_html(gt_raw)
3232

33-
# 3. 从归一化后的HTML中提取纯文本内容(保留表格结构)
34-
pred_text = self._extract_text_from_html(pred_html)
35-
gt_text = self._extract_text_from_html(gt_html)
36-
37-
# 4. 基于归一化后的文本计算编辑距离
38-
result = super()._calculate_score(pred_text, gt_text, **kwargs)
33+
# 3. 直接基于归一化后的HTML计算编辑距离(不再提取纯文本)
34+
result = super()._calculate_score(pred_html, gt_html, **kwargs)
3935
result.metric_name = self.name
4036
result.details.update({
41-
"predicted_table_length": len(pred_text),
42-
"groundtruth_table_length": len(gt_text),
37+
"predicted_table_length": len(pred_html),
38+
"groundtruth_table_length": len(gt_html),
4339
"content_type": "table",
4440
"normalization": "teds_based" # 标记使用TEDS的归一化方法
4541
})
@@ -52,23 +48,6 @@ def _extract_table_content(self, text: str, content_list: List[Dict[str, Any]] =
5248
content_parts = self.split_content(text, content_list)
5349
return content_parts.get('table', '')
5450

55-
def _extract_text_from_html(self, html: str) -> str:
56-
"""从HTML表格中提取纯文本内容(忽略标签,保留数据结构)"""
57-
if not html.strip():
58-
return ""
59-
soup = BeautifulSoup(html, 'html.parser')
60-
table = soup.find('table')
61-
if not table:
62-
return ""
63-
# 按行提取文本,用换行分隔行,用空格分隔单元格
64-
text_parts = []
65-
for row in table.find_all('tr'):
66-
cells = row.find_all(['th', 'td'])
67-
row_text = ' '.join([cell.get_text(strip=True) for cell in cells])
68-
if row_text:
69-
text_parts.append(row_text)
70-
return '\n'.join(text_parts)
71-
7251
def _extract_tables_from_content_list(self, content_list: List[Dict[str, Any]]) -> List[str]:
7352
"""递归从content_list中提取表格内容"""
7453
tables = []

0 commit comments

Comments
 (0)