Skip to content

Commit 931e6dd

Browse files
committed
update requirements
1 parent 79140f7 commit 931e6dd

2 files changed

Lines changed: 30 additions & 30 deletions

File tree

examples/basic_usage.py

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -801,8 +801,8 @@ def demo_multi_extraction():
801801

802802
# 配置文件路径
803803
data_dir = Path("../data")
804-
# dataset_path = data_dir / "sample_dataset.jsonl"
805-
dataset_path = "/home/lulindong/Pycharm_projects/cc/test.jsonl"
804+
dataset_path = data_dir / "sample_dataset.jsonl"
805+
# dataset_path = "/home/lulindong/Pycharm_projects/cc/test.jsonl"
806806

807807
print(f"📂 数据集文件: {dataset_path}")
808808

@@ -816,8 +816,8 @@ def demo_multi_extraction():
816816
"preserve_formatting": True
817817
}},
818818

819-
# {"name": "trafilatura", "config": {}},
820-
# {"name": "magic-html", "config": {}},
819+
{"name": "trafilatura", "config": {}},
820+
{"name": "magic-html", "config": {}},
821821
]
822822

823823
# 🔧 选择评测模式:内存模式 vs 批处理模式

tests/test_extractors.py

Lines changed: 26 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -32,32 +32,32 @@ def test_trafilatura_extractor(self):
3232
self.assertEqual(isinstance(result, ExtractionResult), True)
3333
self.assertEqual(result.success in [True, False], True)
3434

35-
# def test_magic_html_extractor(self):
36-
# # 测试 Magic HTML 抽取器
37-
# try:
38-
# extractor = ExtractorFactory.create("magic-html")
39-
# html_content = """
40-
# <html>
41-
# <body>
42-
# <h1 cc-select="true">Python编程教程</h1>
43-
# <p cc-select="true">这是一个Python基础教程,展示如何定义函数。</p>
44-
# <pre cc-select="true"><code>def greet(name):
45-
# ""问候函数""
46-
# return f"Hello, {name}!"
47-
#
48-
# # 使用示例
49-
# result = greet("World")
50-
# print(result)</code></pre>
51-
# <p cc-select="true">这个函数可以用来问候任何人。</p>
52-
# </body>
53-
# </html>
54-
# """
55-
# result = extractor.extract(html_content)
56-
# self.assertEqual(isinstance(result, ExtractionResult), True)
57-
# self.assertEqual(result.success in [True, False], True)
58-
# except ValueError as e:
59-
# # 如果抽取器未注册,跳过测试
60-
# self.skipTest(f"Magic HTML 抽取器未注册: {e}")
35+
def test_magic_html_extractor(self):
36+
# 测试 Magic HTML 抽取器
37+
try:
38+
extractor = ExtractorFactory.create("magic-html")
39+
html_content = """
40+
<html>
41+
<body>
42+
<h1 cc-select="true">Python编程教程</h1>
43+
<p cc-select="true">这是一个Python基础教程,展示如何定义函数。</p>
44+
<pre cc-select="true"><code>def greet(name):
45+
""问候函数""
46+
return f"Hello, {name}!"
47+
48+
# 使用示例
49+
result = greet("World")
50+
print(result)</code></pre>
51+
<p cc-select="true">这个函数可以用来问候任何人。</p>
52+
</body>
53+
</html>
54+
"""
55+
result = extractor.extract(html_content)
56+
self.assertEqual(isinstance(result, ExtractionResult), True)
57+
self.assertEqual(result.success in [True, False], True)
58+
except ValueError as e:
59+
# 如果抽取器未注册,跳过测试
60+
self.skipTest(f"Magic HTML 抽取器未注册: {e}")
6161

6262
def test_resiliparse_extractor(self):
6363
# 测试 Resiliparse 抽取器

0 commit comments

Comments (0)