Skip to content

Commit b3a8943

Browse files
committed
feat: replace html2text with HTML2TextWrapper for improved HTML to text conversion in evaluators and extractors
1 parent 83d8741 commit b3a8943

4 files changed

Lines changed: 23 additions & 17 deletions

File tree

webmainbench/evaluator/main_html_evaluator.py

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,12 @@
44

55
from typing import Dict, Any, List, Optional, Union
66
from datetime import datetime
7-
import html2text
87

98
from ..data import BenchmarkDataset, DataSample
109
from ..extractors import BaseExtractor, ExtractorFactory
1110
from ..metrics import MainHTMLMetricCalculator
1211
from .evaluator import EvaluationResult, Evaluator
13-
from ..utils import extract_main_html
12+
from ..utils import extract_main_html, HTML2TextWrapper
1413

1514
class MainHTMLEvaluator(Evaluator):
1615
"""Main html evaluator for web content extraction benchmarks."""
@@ -24,9 +23,7 @@ def __init__(self, metric_config: Dict[str, Any] = None):
2423
"""
2524
self.metric_calculator = MainHTMLMetricCalculator(metric_config)
2625
self.metric_config = metric_config or {}
27-
self.html2text = html2text.HTML2Text(bodywidth=0)
28-
self.html2text.ignore_links = True
29-
self.html2text.ignore_images = True
26+
self.html2text = HTML2TextWrapper()
3027

3128

3229
def evaluate(self,
@@ -152,8 +149,7 @@ def _evaluate_sample(self, sample: DataSample, extractor: BaseExtractor) -> Dict
152149
return sample_result
153150

154151
main_html = extract_main_html(sample.html)
155-
self.html2text.baseurl = sample.url
156-
convert_gt_main_content = self.html2text.handle(main_html)
152+
convert_gt_main_content = self.html2text(main_html, sample.url)
157153
sample_result['groundtruth_content'] = sample.groundtruth_content
158154
sample_result['gt_main_html'] = main_html
159155
sample_result['convert_gt_main_content'] = convert_gt_main_content

webmainbench/extractors/dripper_extractor.py

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,12 @@
77
import time
88
from typing import Dict, Any, Optional, List
99

10-
import html2text
1110
from dripper.api import Dripper
1211
from dripper.base import DripperInput, DripperOutput
1312
from .base import BaseExtractor, ExtractionResult
1413
from .factory import extractor
1514

16-
from ..utils import extract_main_html
15+
from ..utils import HTML2TextWrapper
1716

1817

1918

@@ -28,9 +27,7 @@ class DripperExtractor(BaseExtractor):
2827
def __init__(self, name: str, config: Optional[Dict[str, Any]] = None):
2928
# 先初始化inference_config,再调用父类初始化(因为父类会调用_setup())
3029
self.dripper = Dripper(config)
31-
self.html2text = html2text.HTML2Text(bodywidth=0)
32-
self.html2text.ignore_links = True
33-
self.html2text.ignore_images = True
30+
self.html2text = HTML2TextWrapper()
3431

3532
# 现在可以安全地调用父类初始化(会调用_setup())
3633
super().__init__(name, config)
@@ -57,10 +54,8 @@ def _extract_content(self, html: str, url: str = None) -> ExtractionResult:
5754
dripper_output : DripperOutput = self.dripper.process([dripper_input])[0]
5855

5956
main_html = dripper_output.main_html
60-
self.html2text.baseurl = url
61-
main_content = self.html2text.handle(main_html)
57+
main_content = self.html2text(main_html, url)
6258

63-
6459
extraction_time = time.time() - start_time
6560

6661
# 创建结果对象

webmainbench/utils/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,12 @@
33
"""
44

55
from .helpers import setup_logging, validate_config, format_results
6-
from .main_html import extract_main_html
6+
from .main_html import extract_main_html, HTML2TextWrapper
77

88
__all__ = [
99
"setup_logging",
1010
"validate_config",
1111
"format_results",
1212
"extract_main_html",
13+
"HTML2TextWrapper",
1314
]

webmainbench/utils/main_html.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,25 @@
11

22
from lxml import html
3-
import html2text
3+
44

55

66
SELECT_ATTR = 'cc-select'
77

88

9+
class HTML2TextWrapper:
    """Callable wrapper around a single, pre-configured ``html2text.HTML2Text``.

    The underlying converter is created once with ``bodywidth=0`` and with
    links and images ignored, then reused for every call. Only the base URL
    changes from call to call.
    """

    def __init__(self):
        # Imported lazily so that importing this module does not require
        # html2text unless a wrapper is actually instantiated.
        import html2text

        self.converter = html2text.HTML2Text(bodywidth=0)
        self.converter.ignore_links = True
        self.converter.ignore_images = True

    def __call__(self, html_str: str, url: str = '') -> str:
        """Convert ``html_str`` to text using ``url`` as the base URL.

        Args:
            html_str: HTML markup to convert.
            url: Base URL assigned to the converter for this call. ``None``
                is accepted and treated like the empty string, because some
                callers forward an optional URL unchanged.

        Returns:
            The text produced by the converter's ``handle`` method.
        """
        # Keep ``baseurl`` a string even when the caller has no URL.
        self.converter.baseurl = url or ''
        try:
            return self.converter.handle(html_str)
        finally:
            # Always clear the per-call URL, including when ``handle`` raises,
            # so it cannot leak into the next conversion on this shared
            # converter.
            self.converter.baseurl = ''
21+
22+
923
def html_to_element(html_str: str) -> html.HtmlElement:
1024
parser = html.HTMLParser(
1125
collect_ids=False,

0 commit comments

Comments
 (0)