Skip to content

Commit 2995e2f

Browse files
committed
Optimize table edit distance calculation by reusing TEDSMetric's HTML normalization
1 parent 34f517d commit 2995e2f

1 file changed

Lines changed: 13 additions & 43 deletions

File tree

webmainbench/metrics/table_metrics.py

Lines changed: 13 additions & 43 deletions
Original file line number | Diff line number | Diff line change
@@ -20,26 +20,28 @@ def _calculate_score(self, predicted: str, groundtruth: str,
2020
groundtruth_content_list: List[Dict[str, Any]] = None,
2121
**kwargs) -> MetricResult:
2222
"""计算表格内容的编辑距离"""
23-
24-
# 从content_list中提取表格内容
25-
pred_table = self._extract_table_content(predicted, predicted_content_list)
26-
gt_table = self._extract_table_content(groundtruth, groundtruth_content_list)
2723

28-
# 统一转换为HTML格式(复用TEDSMetric的归一化逻辑)
29-
pred_html = self._normalize_to_html(pred_table)
30-
gt_html = self._normalize_to_html(gt_table)
24+
# 1. 提取原始表格内容
25+
pred_raw = self._extract_table_content(predicted, predicted_content_list)
26+
gt_raw = self._extract_table_content(groundtruth, groundtruth_content_list)
27+
28+
# 2. 复用TEDSMetric的归一化方法,统一转换为HTML格式
29+
teds = TEDSMetric("temp_teds") # 实例化TEDSMetric以调用其方法
30+
pred_html = teds._normalize_to_html(pred_raw) # 调用TEDS的归一化方法
31+
gt_html = teds._normalize_to_html(gt_raw)
3132

32-
# 从HTML中提取纯文本内容(忽略标签,仅保留表格数据)
33+
# 3. 从归一化后的HTML中提取纯文本内容(保留表格结构)
3334
pred_text = self._extract_text_from_html(pred_html)
3435
gt_text = self._extract_text_from_html(gt_html)
35-
36-
# 计算编辑距离
36+
37+
# 4. 基于归一化后的文本计算编辑距离
3738
result = super()._calculate_score(pred_text, gt_text, **kwargs)
3839
result.metric_name = self.name
3940
result.details.update({
4041
"predicted_table_length": len(pred_text),
4142
"groundtruth_table_length": len(gt_text),
42-
"content_type": "table"
43+
"content_type": "table",
44+
"normalization": "teds_based" # 标记使用TEDS的归一化方法
4345
})
4446

4547
return result
@@ -50,38 +52,6 @@ def _extract_table_content(self, text: str, content_list: List[Dict[str, Any]] =
5052
content_parts = self.split_content(text, content_list)
5153
return content_parts.get('table', '')
5254

53-
def _normalize_to_html(self, table_data: str) -> str:
54-
"""复用TEDSMetric的表格格式归一化逻辑,统一转换为HTML"""
55-
# 若输入为空,直接返回空字符串
56-
if not table_data.strip():
57-
return ""
58-
# 若已为HTML表格,直接返回
59-
if '<table' in table_data.lower():
60-
return table_data
61-
# 若为Markdown表格,转换为HTML
62-
if '|' in table_data:
63-
return self._markdown_to_html(table_data)
64-
# 其他格式视为纯文本表格,简单包裹为HTML
65-
return f"<table><tr><td>{table_data}</td></tr></table>"
66-
67-
def _markdown_to_html(self, markdown: str) -> str:
68-
"""将Markdown表格转换为HTML(复用TEDSMetric逻辑)"""
69-
lines = [line.strip() for line in markdown.split('\n') if line.strip()]
70-
table_lines = [line for line in lines if '|' in line]
71-
if not table_lines:
72-
return ""
73-
html_parts = ["<table>"]
74-
# 过滤分隔线(如 |---|)
75-
data_lines = [line for line in table_lines if not re.match(r'^[\s\|\-:]+$', line)]
76-
for i, line in enumerate(data_lines):
77-
cells = [cell.strip() for cell in line.split('|') if cell.strip()]
78-
if cells:
79-
# 首行视为表头(th),其余为单元格(td)
80-
tag = "th" if i == 0 else "td"
81-
html_parts.append(f"<tr>{''.join(f'<{tag}>{cell}</{tag}>' for cell in cells)}</tr>")
82-
html_parts.append("</table>")
83-
return ''.join(html_parts)
84-
8555
def _extract_text_from_html(self, html: str) -> str:
8656
"""从HTML表格中提取纯文本内容(忽略标签,保留数据结构)"""
8757
if not html.strip():

0 commit comments

Comments
 (0)