Skip to content

Commit 34f517d

Browse files
committed
Optimize table edit distance calculation by using normalize
1 parent 5bb6dd4 commit 34f517d

2 files changed

Lines changed: 113 additions & 41 deletions

File tree

tests/test_metrics.py

Lines changed: 51 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -129,8 +129,8 @@ def test_table_edit_metric(self):
129129
self.assertTrue(table_result.success)
130130
self.assertIsInstance(table_result.score, float)
131131
# 验证固定内容的确定分数
132-
self.assertAlmostEqual(table_result.score, 0.868852, places=5,
133-
msg=f"table_edit分数应该是0.868852,实际: {table_result.score}")
132+
self.assertAlmostEqual(table_result.score, 0.7241379310344828, places=5,
133+
msg=f"table_edit分数应该是0.7241379310344828,实际: {table_result.score}")
134134

135135
# 验证详细信息
136136
self.assertEqual(table_result.details['content_type'], 'table')
@@ -874,47 +874,62 @@ def test_html_table_edit_distance(self):
874874
# 验证表格编辑距离(分隔符长度差异导致的固定分数)
875875
self.assertIn("table_edit", results)
876876
self.assertTrue(results["table_edit"].success)
877-
self.assertAlmostEqual(results["table_edit"].score, 0.593573, places=5,
878-
msg=f"table_edit分数应该是0.593573,实际: {results['table_edit'].score}")
877+
self.assertAlmostEqual(results["table_edit"].score, 0.6237714987714988, places=5,
878+
msg=f"table_edit分数应该是0.6237714987714988,实际: {results['table_edit'].score}")
879879

880880
# 验证TEDS指标(表格结构完全相同,满分)
881881
self.assertIn("table_TEDS", results)
882882
self.assertTrue(results["table_TEDS"].success)
883883
self.assertAlmostEqual(results["table_TEDS"].score, 0.9984520490180891, places=5,
884884
msg=f"table_TEDS分数应该是0.0.9984520490180891,实际: {results['table_TEDS'].score}")
885885

886-
def test_table_sample_edit_distance(self):
887-
"""测试表格样本的编辑距离"""
888-
groundtruth = """## 销售数据统计
889-
890-
| 产品 | 销量 | 收入 |
891-
|------|------|------|
892-
| 产品A | 100 | 1000 |
893-
| 产品B | 200 | 3000 |"""
894-
895-
predicted = """## 销售数据统计
896-
897-
| 产品 | 销量 | 收入 |
898-
|---|---|---|
899-
| 产品A | 100 | 1000 |
900-
| 产品B | 200 | 3000 |"""
901-
902-
results = self.calculator.calculate_all(
903-
predicted_content=predicted,
904-
groundtruth_content=groundtruth
905-
)
906-
907-
# 验证表格编辑距离(分隔符长度差异导致的固定分数)
908-
self.assertIn("table_edit", results)
909-
self.assertTrue(results["table_edit"].success)
910-
self.assertAlmostEqual(results["table_edit"].score, 0.888889, places=5,
911-
msg=f"table_edit分数应该是0.888889,实际: {results['table_edit'].score}")
912-
913-
# 验证TEDS指标(表格结构完全相同,满分)
914-
self.assertIn("table_TEDS", results)
915-
self.assertTrue(results["table_TEDS"].success)
916-
self.assertAlmostEqual(results["table_TEDS"].score, 1.000000, places=5,
917-
msg=f"table_TEDS分数应该是1.000000,实际: {results['table_TEDS'].score}")
886+
def test_table_sample_edit_distance(self):
887+
"""测试渲染一致,表格样式不一致的编辑距离"""
888+
groundtruth = """## 销售数据统计
889+
890+
| 产品 | 销量 | 收入 |
891+
|------|------|------|
892+
| 产品A | 100 | 1000 |
893+
| 产品B | 200 | 3000 |"""
894+
895+
predicted = """## <table>
896+
<thead>
897+
<tr>
898+
<th>产品</th>
899+
<th>销量</th>
900+
<th>收入</th>
901+
</tr>
902+
</thead>
903+
<tbody>
904+
<tr>
905+
<td>产品A</td>
906+
<td>100</td>
907+
<td>1000</td>
908+
</tr>
909+
<tr>
910+
<td>产品B</td>
911+
<td>200</td>
912+
<td>3000</td>
913+
</tr>
914+
</tbody>
915+
</table>"""
916+
917+
results = self.calculator.calculate_all(
918+
predicted_content=predicted,
919+
groundtruth_content=groundtruth
920+
)
921+
922+
# Verify the table edit distance: both tables normalize to the same cell text, so the expected score is 1.0
923+
self.assertIn("table_edit", results)
924+
self.assertTrue(results["table_edit"].success)
925+
self.assertAlmostEqual(results["table_edit"].score, 1.0, places=5,
926+
msg=f"table_edit分数应该是1.0,实际: {results['table_edit'].score}")
927+
928+
# Verify the TEDS metric (pinned score for this Markdown-vs-HTML pair; it is not a perfect 1.0)
929+
self.assertIn("table_TEDS", results)
930+
self.assertTrue(results["table_TEDS"].success)
931+
self.assertAlmostEqual(results["table_TEDS"].score, 0.9806224310041104, places=5,
932+
msg=f"table_TEDS分数应该是0.9806224310041104,实际: {results['table_TEDS'].score}")
918933

919934
def test_formula_sample_edit_distance(self):
920935
"""测试公式样本的编辑距离"""

webmainbench/metrics/table_metrics.py

Lines changed: 62 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from .base import BaseMetric, MetricResult
88
from .teds_metrics import TEDSMetric, StructureTEDSMetric
99
from .text_metrics import EditDistanceMetric
10-
10+
from bs4 import BeautifulSoup
1111

1212
class TableEditMetric(EditDistanceMetric):
1313
"""表格编辑距离指标"""
@@ -24,13 +24,21 @@ def _calculate_score(self, predicted: str, groundtruth: str,
2424
# 从content_list中提取表格内容
2525
pred_table = self._extract_table_content(predicted, predicted_content_list)
2626
gt_table = self._extract_table_content(groundtruth, groundtruth_content_list)
27+
28+
# 统一转换为HTML格式(复用TEDSMetric的归一化逻辑)
29+
pred_html = self._normalize_to_html(pred_table)
30+
gt_html = self._normalize_to_html(gt_table)
31+
32+
# 从HTML中提取纯文本内容(忽略标签,仅保留表格数据)
33+
pred_text = self._extract_text_from_html(pred_html)
34+
gt_text = self._extract_text_from_html(gt_html)
2735

2836
# 计算编辑距离
29-
result = super()._calculate_score(pred_table, gt_table, **kwargs)
37+
result = super()._calculate_score(pred_text, gt_text, **kwargs)
3038
result.metric_name = self.name
3139
result.details.update({
32-
"predicted_table_length": len(pred_table),
33-
"groundtruth_table_length": len(gt_table),
40+
"predicted_table_length": len(pred_text),
41+
"groundtruth_table_length": len(gt_text),
3442
"content_type": "table"
3543
})
3644

@@ -41,7 +49,56 @@ def _extract_table_content(self, text: str, content_list: List[Dict[str, Any]] =
4149
# 使用统一的内容分割方法
4250
content_parts = self.split_content(text, content_list)
4351
return content_parts.get('table', '')
44-
52+
53+
def _normalize_to_html(self, table_data: str) -> str:
54+
"""复用TEDSMetric的表格格式归一化逻辑,统一转换为HTML"""
55+
# 若输入为空,直接返回空字符串
56+
if not table_data.strip():
57+
return ""
58+
# 若已为HTML表格,直接返回
59+
if '<table' in table_data.lower():
60+
return table_data
61+
# 若为Markdown表格,转换为HTML
62+
if '|' in table_data:
63+
return self._markdown_to_html(table_data)
64+
# 其他格式视为纯文本表格,简单包裹为HTML
65+
return f"<table><tr><td>{table_data}</td></tr></table>"
66+
67+
def _markdown_to_html(self, markdown: str) -> str:
68+
"""将Markdown表格转换为HTML(复用TEDSMetric逻辑)"""
69+
lines = [line.strip() for line in markdown.split('\n') if line.strip()]
70+
table_lines = [line for line in lines if '|' in line]
71+
if not table_lines:
72+
return ""
73+
html_parts = ["<table>"]
74+
# 过滤分隔线(如 |---|)
75+
data_lines = [line for line in table_lines if not re.match(r'^[\s\|\-:]+$', line)]
76+
for i, line in enumerate(data_lines):
77+
cells = [cell.strip() for cell in line.split('|') if cell.strip()]
78+
if cells:
79+
# 首行视为表头(th),其余为单元格(td)
80+
tag = "th" if i == 0 else "td"
81+
html_parts.append(f"<tr>{''.join(f'<{tag}>{cell}</{tag}>' for cell in cells)}</tr>")
82+
html_parts.append("</table>")
83+
return ''.join(html_parts)
84+
85+
def _extract_text_from_html(self, html: str) -> str:
    """Flatten the first HTML table in ``html`` into plain text.

    Tags are discarded; only cell text is kept. Cells of one row are joined
    with a single space and rows are joined with a newline. Rows whose
    joined text is empty are skipped.

    Args:
        html: HTML markup expected to contain a ``<table>`` element.

    Returns:
        The flattened table text, or ``""`` when ``html`` is blank or no
        ``<table>`` element is found.
    """
    if not html.strip():
        return ""

    # Only the first <table> in the markup is considered.
    first_table = BeautifulSoup(html, 'html.parser').find('table')
    if not first_table:
        return ""

    # One text line per <tr>, built from its <th>/<td> cells.
    row_texts = (
        ' '.join(cell.get_text(strip=True) for cell in tr.find_all(['th', 'td']))
        for tr in first_table.find_all('tr')
    )
    return '\n'.join(text for text in row_texts if text)
101+
45102
def _extract_tables_from_content_list(self, content_list: List[Dict[str, Any]]) -> List[str]:
46103
"""递归从content_list中提取表格内容"""
47104
tables = []

0 commit comments

Comments
 (0)