@@ -20,26 +20,28 @@ def _calculate_score(self, predicted: str, groundtruth: str,
2020 groundtruth_content_list : List [Dict [str , Any ]] = None ,
2121 ** kwargs ) -> MetricResult :
2222 """计算表格内容的编辑距离"""
23-
24- # 从content_list中提取表格内容
25- pred_table = self ._extract_table_content (predicted , predicted_content_list )
26- gt_table = self ._extract_table_content (groundtruth , groundtruth_content_list )
2723
28- # 统一转换为HTML格式(复用TEDSMetric的归一化逻辑)
29- pred_html = self ._normalize_to_html (pred_table )
30- gt_html = self ._normalize_to_html (gt_table )
24+ # 1. 提取原始表格内容
25+ pred_raw = self ._extract_table_content (predicted , predicted_content_list )
26+ gt_raw = self ._extract_table_content (groundtruth , groundtruth_content_list )
27+
28+ # 2. 复用TEDSMetric的归一化方法,统一转换为HTML格式
29+ teds = TEDSMetric ("temp_teds" ) # 实例化TEDSMetric以调用其方法
30+ pred_html = teds ._normalize_to_html (pred_raw ) # 调用TEDS的归一化方法
31+ gt_html = teds ._normalize_to_html (gt_raw )
3132
32- # 从HTML中提取纯文本内容(忽略标签,仅保留表格数据 )
33+ # 3. 从归一化后的HTML中提取纯文本内容(保留表格结构 )
3334 pred_text = self ._extract_text_from_html (pred_html )
3435 gt_text = self ._extract_text_from_html (gt_html )
35-
36- # 计算编辑距离
36+
37+ # 4. 基于归一化后的文本计算编辑距离
3738 result = super ()._calculate_score (pred_text , gt_text , ** kwargs )
3839 result .metric_name = self .name
3940 result .details .update ({
4041 "predicted_table_length" : len (pred_text ),
4142 "groundtruth_table_length" : len (gt_text ),
42- "content_type" : "table"
43+ "content_type" : "table" ,
44+ "normalization" : "teds_based" # 标记使用TEDS的归一化方法
4345 })
4446
4547 return result
@@ -50,38 +52,6 @@ def _extract_table_content(self, text: str, content_list: List[Dict[str, Any]] =
5052 content_parts = self .split_content (text , content_list )
5153 return content_parts .get ('table' , '' )
5254
53- def _normalize_to_html (self , table_data : str ) -> str :
54- """复用TEDSMetric的表格格式归一化逻辑,统一转换为HTML"""
55- # 若输入为空,直接返回空字符串
56- if not table_data .strip ():
57- return ""
58- # 若已为HTML表格,直接返回
59- if '<table' in table_data .lower ():
60- return table_data
61- # 若为Markdown表格,转换为HTML
62- if '|' in table_data :
63- return self ._markdown_to_html (table_data )
64- # 其他格式视为纯文本表格,简单包裹为HTML
65- return f"<table><tr><td>{ table_data } </td></tr></table>"
66-
67- def _markdown_to_html (self , markdown : str ) -> str :
68- """将Markdown表格转换为HTML(复用TEDSMetric逻辑)"""
69- lines = [line .strip () for line in markdown .split ('\n ' ) if line .strip ()]
70- table_lines = [line for line in lines if '|' in line ]
71- if not table_lines :
72- return ""
73- html_parts = ["<table>" ]
74- # 过滤分隔线(如 |---|)
75- data_lines = [line for line in table_lines if not re .match (r'^[\s\|\-:]+$' , line )]
76- for i , line in enumerate (data_lines ):
77- cells = [cell .strip () for cell in line .split ('|' ) if cell .strip ()]
78- if cells :
79- # 首行视为表头(th),其余为单元格(td)
80- tag = "th" if i == 0 else "td"
81- html_parts .append (f"<tr>{ '' .join (f'<{ tag } >{ cell } </{ tag } >' for cell in cells )} </tr>" )
82- html_parts .append ("</table>" )
83- return '' .join (html_parts )
84-
8555 def _extract_text_from_html (self , html : str ) -> str :
8656 """从HTML表格中提取纯文本内容(忽略标签,保留数据结构)"""
8757 if not html .strip ():
0 commit comments