Skip to content

Commit 534e269

Browse files
committed
Enable table extraction in the trafilatura extractor (include_tables=True)
1 parent 3f335bb commit 534e269

2 files changed

Lines changed: 4 additions & 5 deletions

File tree

examples/basic_usage.py

Lines changed: 3 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -804,7 +804,7 @@ def demo_multi_extraction():
804804
# 配置文件路径
805805
data_dir = Path("../data")
806806
# dataset_path = data_dir / "sample_dataset.jsonl"
807-
dataset_path = "/home/lulindong/Pycharm_projects/cc/WebMainBench_2456_v2_WebMainBench_dataset_merge_with_llm_webkit.jsonl"
807+
dataset_path = "/home/lulindong/Pycharm_projects/cc/WebMainBench_2456_v3_WebMainBench_dataset_merge_with_llm_webkit.jsonl"
808808

809809
print(f"📂 数据集文件: {dataset_path}")
810810

@@ -1090,17 +1090,16 @@ def demo_llm_webkit_with_preprocessed_html_evaluation():
10901090

10911091
print(f"✅ 详细结果已保存到: {results_path}")
10921092
print(f"✅ CSV报告已保存到: {report_path}")
1093-
10941093

10951094

10961095
if __name__ == "__main__":
10971096
try:
10981097
# demo_basic_mock_evaluation()
10991098
# demo_llm_webkit_evaluation() # 使用LLM-WebKit评测示例
1100-
demo_llm_webkit_with_preprocessed_html_evaluation()
1099+
# demo_llm_webkit_with_preprocessed_html_evaluation()
11011100
# demo_extractor_comparison()
11021101
# demo_dataset_with_extraction() # 演示保存带有抽取内容的数据集
1103-
# demo_multi_extraction() # 演示多个抽取器同时评测
1102+
demo_multi_extraction() # 演示多个抽取器同时评测
11041103
print("\n✅ 示例运行完成!")
11051104

11061105
except Exception as e:

webmainbench/extractors/trafilatura_extractor.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -16,7 +16,7 @@ class TrafilaturaInferenceConfig:
1616
favor_precision: bool = True
1717
favor_recall: bool = True
1818
include_comments: bool = False
19-
include_tables: bool = False
19+
include_tables: bool = True
2020
# 可根据需要添加更多trafilatura支持的参数
2121
include_images: bool = False
2222
include_links: bool = False

0 commit comments

Comments
 (0)