Skip to content

Commit f5eed23

Browse files
authored
Merge pull request #40 from pekopoke/dev
Dev:优化表格分割、删除code行内分割、teds性能提升
2 parents 2bc4523 + 4b16952 commit f5eed23

11 files changed

Lines changed: 755 additions & 465 deletions

examples/multi_extractor_compare.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ def all_extractor_comparison():
88
print("\n=== 多抽取器对比演示 ===\n")
99

1010
# 创建数据集
11-
dataset_path = Path("data/sample_dataset.jsonl")
11+
dataset_path = Path("/home/lulindong/Pycharm_projects/cc/1827_split_jsonl/1-200.jsonl")
1212
dataset = DataLoader.load_jsonl(dataset_path)
1313

1414
# 创建webkit抽取器

requirements.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,4 +11,5 @@ trafilatura
1111
https://github.com/opendatalab/magic-html/releases/download/magic_html-0.1.5-released/magic_html-0.1.5-py3-none-any.whl
1212
streamlit
1313
markdown
14-
jieba
14+
jieba
15+
apted

results/leaderboard.csv

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
extractor,dataset,total_samples,success_rate,overall,code_edit,formula_edit,table_TEDS,table_edit,text_edit
2-
llm-webkit,sample_dataset,4,1.0,0.262,0.5,0.0,0.0,0.0,0.81
3-
magic-html,sample_dataset,4,1.0,0.174,0.1007,0.0,0.0,0.0,0.7693
4-
resiliparse,sample_dataset,4,1.0,0.1667,0.0,0.0,0.0,0.0,0.8333
5-
trafilatura,sample_dataset,4,1.0,0.1325,0.1007,0.0,0.0,0.0,0.5618
1+
extractor,dataset,total_samples,success_rate,overall,code_edit,formula_edit,table_TEDS,table_edit,text_edit
2+
llm-webkit,sample_dataset,4,1.0,0.262,0.5,0.0,0.0,0.0,0.81
3+
magic-html,sample_dataset,4,1.0,0.174,0.1007,0.0,0.0,0.0,0.7693
4+
resiliparse,sample_dataset,4,1.0,0.1667,0.0,0.0,0.0,0.0,0.8333
5+
trafilatura,sample_dataset,4,1.0,0.1325,0.1007,0.0,0.0,0.0,0.5618

tests/test_code_extraction.py

Lines changed: 20 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -38,13 +38,13 @@ def test_empty_text(self):
3838
self.assertEqual(result['code'], '')
3939
self.assertEqual(result['text'], '')
4040

41-
def test_inline_code(self):
42-
"""测试行内代码"""
43-
text = "这是一个`行内代码`的例子"
44-
result = BaseMetric._extract_from_markdown(text)
45-
print(result)
46-
self.assertEqual(result['code'], '行内代码')
47-
self.assertEqual(result['text'], text)
41+
# def test_inline_code(self):
42+
# """测试行内代码"""
43+
# text = "这是一个`行内代码`的例子"
44+
# result = BaseMetric._extract_from_markdown(text)
45+
# print(result)
46+
# self.assertEqual(result['code'], '行内代码')
47+
# self.assertEqual(result['text'], text)
4848

4949
def test_code_block(self):
5050
"""测试代码块"""
@@ -63,7 +63,6 @@ def test_code_block(self):
6363

6464
# 验证提取的代码
6565
expected_code = ("""
66-
"aaaabbbb"
6766
>>> mystr = "abcdefghijkl"
6867
>>> mystr[-4:]
6968
'ijkl'
@@ -79,19 +78,19 @@ def test_code_block(self):
7978
self.assertEqual(result['text'], text)
8079
self.assertEqual(result['formula'], '')
8180

82-
def test_code_with_leading_trailing_spaces(self):
83-
"""测试代码前后有空格的情况"""
84-
text = "前面 ` code ` 后面"
85-
result = BaseMetric._extract_from_markdown(text)
86-
self.assertEqual(result['code'], 'code') # 应该去除空格
87-
self.assertEqual(result['text'], text)
88-
89-
def test_multiline_inline_code(self):
90-
"""测试多行行内代码(不应该匹配)"""
91-
text = "`第一行\n第二行`"
92-
result = BaseMetric._extract_from_markdown(text)
93-
self.assertEqual(result['code'], '') # 不应该匹配多行行内代码
94-
self.assertEqual(result['text'], text) # 原样保留
81+
# def test_code_with_leading_trailing_spaces(self):
82+
# """测试代码前后有空格的情况"""
83+
# text = "前面 ` code ` 后面"
84+
# result = BaseMetric._extract_from_markdown(text)
85+
# self.assertEqual(result['code'], 'code') # 应该去除空格
86+
# self.assertEqual(result['text'], text)
87+
88+
# def test_multiline_inline_code(self):
89+
# """测试多行行内代码(不应该匹配)"""
90+
# text = "`第一行\n第二行`"
91+
# result = BaseMetric._extract_from_markdown(text)
92+
# self.assertEqual(result['code'], '') # 不应该匹配多行行内代码
93+
# self.assertEqual(result['text'], text) # 原样保留
9594

9695
if __name__ == '__main__':
9796
unittest.main()

0 commit comments

Comments (0)