Skip to content

Commit 7fa6b19

Browse files
committed
feat: integrate html2text for improved HTML content extraction in evaluators and dripper extractor
1 parent 49fb47e commit 7fa6b19

2 files changed

Lines changed: 11 additions & 3 deletions

File tree

webmainbench/evaluator/main_html_evaluator.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,9 @@ def __init__(self, metric_config: Dict[str, Any] = None):
2424
"""
2525
self.metric_calculator = MainHTMLMetricCalculator(metric_config)
2626
self.metric_config = metric_config or {}
27+
self.html2text = html2text.HTML2Text(bodywidth=0)
28+
self.html2text.ignore_links = True
29+
self.html2text.ignore_images = True
2730

2831

2932
def evaluate(self,
@@ -149,7 +152,8 @@ def _evaluate_sample(self, sample: DataSample, extractor: BaseExtractor) -> Dict
149152
return sample_result
150153

151154
main_html = extract_main_html(sample.html)
152-
convert_gt_main_content = html2text.html2text(main_html, sample.url, bodywidth=0)
155+
self.html2text.baseurl = sample.url
156+
convert_gt_main_content = self.html2text.handle(main_html)
153157
sample_result['groundtruth_content'] = sample.groundtruth_content
154158
sample_result['gt_main_html'] = main_html
155159
sample_result['convert_gt_main_content'] = convert_gt_main_content

webmainbench/extractors/dripper_extractor.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,12 +28,15 @@ class DripperExtractor(BaseExtractor):
2828
def __init__(self, name: str, config: Optional[Dict[str, Any]] = None):
2929
# 先初始化inference_config,再调用父类初始化(因为父类会调用_setup())
3030
self.dripper = Dripper(config)
31+
self.html2text = html2text.HTML2Text(bodywidth=0)
32+
self.html2text.ignore_links = True
33+
self.html2text.ignore_images = True
3134

3235
# 现在可以安全地调用父类初始化(会调用_setup())
3336
super().__init__(name, config)
3437

3538
def _setup(self) -> None:
36-
#self.dripper.get_llm()
39+
self.dripper.get_llm()
3740
self.dripper.get_tokenizer()
3841

3942
def _extract_content(self, html: str, url: str = None) -> ExtractionResult:
@@ -54,7 +57,8 @@ def _extract_content(self, html: str, url: str = None) -> ExtractionResult:
5457
dripper_output : DripperOutput = self.dripper.process([dripper_input])[0]
5558

5659
main_html = dripper_output.main_html
57-
main_content = html2text.html2text(dripper_output.main_html, url, bodywidth=0)
60+
self.html2text.baseurl = url
61+
main_content = self.html2text.handle(main_html)
5862

5963

6064
extraction_time = time.time() - start_time

0 commit comments

Comments
 (0)