Skip to content

Commit d782c16

Browse files
committed
fix - table分割支持更多场景
- code删除单反引号匹配
1 parent a3428db commit d782c16

4 files changed

Lines changed: 177 additions & 411 deletions

File tree

tests/test_table_extraction.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,51 @@ def test_complex_markdown_table(self):
114114
# 验证文本中表格被移除
115115
# self.assertNotIn('| 姓名 | 年龄 | 职业 | 薪资 |', result['text'])
116116

117+
def test_corner_case_markdown_table(self):
118+
"""测试边界情况的Markdown表格(首行单元格为空、单元格内容跨行)"""
119+
text = """| |
120+
|---|---|
121+
| Scientific classification | |
122+
| Domain: | Eukaryota |
123+
| Kingdom: | Animalia |
124+
| Phylum: | Arthropoda |
125+
| Class: | Insecta |
126+
| Order: | Lepidoptera |
127+
| Family: | Autostichidae |
128+
| Genus: | Pantacordis
129+
|
130+
| Species: | P. scotinella |
131+
| Binomial name | |
132+
Pantacordis scotinella | |
133+
| Synonyms | |
134+
*Borkhausenia scotinella*Rebel, 1916*Pantacordis scotinellum*
135+
|"""
136+
137+
result = self.metric._extract_from_markdown(text)
138+
expected_table = """| |
139+
|---|---|
140+
| Scientific classification | |
141+
| Domain: | Eukaryota |
142+
| Kingdom: | Animalia |
143+
| Phylum: | Arthropoda |
144+
| Class: | Insecta |
145+
| Order: | Lepidoptera |
146+
| Family: | Autostichidae |
147+
| Genus: | Pantacordis
148+
|
149+
| Species: | P. scotinella |
150+
| Binomial name | |
151+
Pantacordis scotinella | |
152+
| Synonyms | |
153+
*Borkhausenia scotinella*Rebel, 1916*Pantacordis scotinellum*
154+
|"""
155+
156+
# 验证提取结果与预期一致
157+
self.assertEqual(result['table'], expected_table)
158+
159+
# 验证文本中表格被移除
160+
# self.assertNotIn('| 姓名 | 年龄 | 职业 | 薪资 |', result['text'])
161+
117162

118163

119164
def test_table_with_alignment(self):

webmainbench/extractors/magic_html_extractor.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from magic_html import GeneralExtractor
1010
import re
1111
import html2text
12+
from ..utils import HTML2TextWrapper
1213

1314

1415
@extractor("magic-html")
@@ -32,7 +33,10 @@ def _extract_content(self, html: str, url: str = None) -> ExtractionResult:
3233

3334
# 从输出中提取所需信息
3435
extracted_html = data.get('html', '')
35-
markdown = html2text.html2text(extracted_html)
36+
# 使用内部HTML2Text方法生成markdown
37+
h = HTML2TextWrapper()
38+
markdown = h(extracted_html)
39+
# markdown = html2text.html2text(extracted_html)
3640
title = data.get('title', '')
3741
# 简单地将由提取的 HTML 转换得到的 Markdown 作为内容
3842
content = markdown

webmainbench/metrics/base.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -284,7 +284,7 @@ def _extract_from_markdown(text: str) -> Dict[str, str]:
284284

285285
def is_md_table_line(line):
286286
"""判断是否可能是 Markdown 表格行"""
287-
if line.count("|") < 3: # 至少三个竖线
287+
if line.count("|") < 1: # 至少一个竖线
288288
return False
289289
return True
290290

@@ -294,13 +294,17 @@ def is_md_separator_line(line):
294294
# 检查是否所有部分都是分隔符格式
295295
for p in parts:
296296
if p and not re.match(r"^:?\-{3,}:?$", p):
297+
298+
# if p and not re.match(r"^:?\-+(:?)$", p): # 调整正则为允许1个及以上短横线
297299
return False
298300
return True
299301

300302
def save_table():
301303
"""保存当前表格并清空缓存"""
302304
nonlocal table_lines
303-
# 只有当表格行数大于等于2,且第二行是分隔行时才保存
305+
# 备选逻辑(当前未启用):原逻辑要求至少2行且第2行是分隔行,可放宽为只要有2行及以上且包含至少1个分隔行即可
306+
# if len(table_lines) >= 2 and any(is_md_separator_line(line) for line in table_lines):
307+
# 只有当表格行数大于等于2,且第二行是分隔行时才保存
304308
if len(table_lines) >= 2 and is_md_separator_line(table_lines[1]):
305309
md_table = '\n'.join(table_lines)
306310
extracted_segments.append(md_table)

0 commit comments

Comments
 (0)