44
55from typing import Dict , Any , List , Optional , Union
66from datetime import datetime
7- import html2text
87
98from ..data import BenchmarkDataset , DataSample
109from ..extractors import BaseExtractor , ExtractorFactory
1110from ..metrics import MainHTMLMetricCalculator
1211from .evaluator import EvaluationResult , Evaluator
13- from ..utils import extract_main_html
12+ from ..utils import extract_main_html , HTML2TextWrapper
1413
1514class MainHTMLEvaluator (Evaluator ):
1615 """Main html evaluator for web content extraction benchmarks."""
@@ -24,9 +23,7 @@ def __init__(self, metric_config: Dict[str, Any] = None):
2423 """
2524 self .metric_calculator = MainHTMLMetricCalculator (metric_config )
2625 self .metric_config = metric_config or {}
27- self .html2text = html2text .HTML2Text (bodywidth = 0 )
28- self .html2text .ignore_links = True
29- self .html2text .ignore_images = True
26+ self .html2text = HTML2TextWrapper ()
3027
3128
3229 def evaluate (self ,
@@ -152,8 +149,7 @@ def _evaluate_sample(self, sample: DataSample, extractor: BaseExtractor) -> Dict
152149 return sample_result
153150
154151 main_html = extract_main_html (sample .html )
155- self .html2text .baseurl = sample .url
156- convert_gt_main_content = self .html2text .handle (main_html )
152+ convert_gt_main_content = self .html2text (main_html , sample .url )
157153 sample_result ['groundtruth_content' ] = sample .groundtruth_content
158154 sample_result ['gt_main_html' ] = main_html
159155 sample_result ['convert_gt_main_content' ] = convert_gt_main_content
0 commit comments