Skip to content

Commit 1befae1

Browse files
committed
feat: 添加模型路径参数以增强LLM-WebKit预处理HTML功能的灵活性,并优化DripperExtractor的导入逻辑
1 parent b3a8943 commit 1befae1

2 files changed

Lines changed: 15 additions & 10 deletions

File tree

examples/main_html_eval.py

Lines changed: 6 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -42,7 +42,7 @@ def save_results(result_file: Path, results: list[dict]):
4242

4343

4444

45-
def demo_llm_webkit_with_preprocessed_html_evaluation():
45+
def demo_llm_webkit_with_preprocessed_html_evaluation(model_path: str):
4646
"""演示LLM-WebKit预处理HTML功能的评测"""
4747

4848
print("\n=== LLM-WebKit 预处理HTML功能演示 ===\n")
@@ -63,7 +63,6 @@ def demo_llm_webkit_with_preprocessed_html_evaluation():
6363
# 2. 创建预处理HTML模式的LLM-WebKit抽取器
6464
print("2. 创建预处理HTML模式的LLM-WebKit抽取器...")
6565

66-
model_path = "/home/qiujiuantao/project/html-alg-project/dripper/0.6B_ckpt"
6766
extractor = load_extractor(model_path)
6867
print(f"✅ 抽取器创建成功")
6968
print(f"📋 配置信息:")
@@ -129,8 +128,12 @@ def demo_llm_webkit_with_preprocessed_html_evaluation():
129128

130129

131130
if __name__ == "__main__":
131+
import argparse
132+
parser = argparse.ArgumentParser(description="WebMainBench 基本使用示例")
133+
parser.add_argument("--model_path", required=True, help="LLM model路径")
134+
args = parser.parse_args()
132135
try:
133-
demo_llm_webkit_with_preprocessed_html_evaluation()
136+
demo_llm_webkit_with_preprocessed_html_evaluation(args.model_path)
134137
print("\n✅ 示例运行完成!")
135138

136139
except Exception as e:

webmainbench/extractors/dripper_extractor.py

Lines changed: 9 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -1,21 +1,15 @@
11
"""
22
LLM-WebKit extractor implementation with advanced LLM inference.
33
"""
4-
5-
import json
6-
import re
74
import time
8-
from typing import Dict, Any, Optional, List
5+
from typing import Dict, Any, Optional
96

10-
from dripper.api import Dripper
11-
from dripper.base import DripperInput, DripperOutput
127
from .base import BaseExtractor, ExtractionResult
138
from .factory import extractor
149

1510
from ..utils import HTML2TextWrapper
1611

1712

18-
1913
@extractor("dripper")
2014
class DripperExtractor(BaseExtractor):
2115
"""Extractor using dripper."""
@@ -25,6 +19,14 @@ class DripperExtractor(BaseExtractor):
2519

2620

2721
def __init__(self, name: str, config: Optional[Dict[str, Any]] = None):
22+
23+
try:
24+
25+
from dripper.api import Dripper
26+
from dripper.base import DripperInput, DripperOutput
27+
except ImportError:
28+
raise ImportError("Please install dripper package")
29+
2830
# 先初始化inference_config,再调用父类初始化(因为父类会调用_setup())
2931
self.dripper = Dripper(config)
3032
self.html2text = HTML2TextWrapper()

0 commit comments

Comments
 (0)