Skip to content

Commit daea117

Browse files
committed
修改命名,防止与extractor混淆 (rename `*Extractor` content helpers to `*Splitter` to avoid confusion with the page-level extractors)
1 parent 96fd8a3 commit daea117

8 files changed

Lines changed: 24 additions & 24 deletions

File tree

examples/multi_extractor_compare.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ def all_extractor_comparison():
88
print("\n=== 多抽取器对比演示 ===\n")
99

1010
# 创建数据集
11-
dataset_path = Path("../data/sample_dataset.jsonl")
11+
dataset_path = Path("../data/test_math.jsonl")
1212
dataset = DataLoader.load_jsonl(dataset_path)
1313

1414
# 创建webkit抽取器

webmainbench/metrics/__init__.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,10 @@
1111
from .teds_metrics import TEDSMetric, StructureTEDSMetric
1212
from .calculator import MetricCalculator
1313
from .mainhtml_calculator import MainHTMLMetricCalculator
14-
from .base_extractor import ContentExtractor
15-
from .formula_extractor import FormulaExtractor
16-
from .code_extractor import CodeExtractor
17-
from .table_extractor import TableExtractor
14+
from .base_content_splitter import BaseContentSplitter
15+
from .formula_extractor import FormulaSplitter
16+
from .code_extractor import CodeSplitter
17+
from .table_extractor import TableSplitter
1818

1919
__all__ = [
2020
"BaseMetric",
@@ -31,8 +31,8 @@
3131
"TextEditMetric",
3232
"MetricCalculator",
3333
"MainHTMLMetricCalculator",
34-
'ContentExtractor',
35-
'FormulaExtractor',
36-
'CodeExtractor',
37-
'TableExtractor',
34+
'BaseContentSplitter',
35+
'FormulaSplitter',
36+
'CodeSplitter',
37+
'TableSplitter',
3838
]

webmainbench/metrics/base.py

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -198,20 +198,20 @@ def _extract_from_markdown(text: str, field_name: str = None) -> Dict[str, str]:
198198

199199
# 创建提取器配置
200200
config = {
201-
'llm_base_url': '',
202-
'llm_api_key': '',
203-
'llm_model': '',
204-
'use_llm': False # 使用时改为True
201+
'llm_base_url': 'http://35.220.164.252:3888/v1/',
202+
'llm_api_key': '<REDACTED — a live API key was committed in this change; rotate the credential immediately and load it from an environment variable instead of source>',
203+
'llm_model': 'deepseek-chat',
204+
'use_llm': True # 使用时改为True
205205
}
206206

207207
# 直接创建具体的提取器实例
208-
from .code_extractor import CodeExtractor
209-
from .formula_extractor import FormulaExtractor
210-
from .table_extractor import TableExtractor
208+
from .code_extractor import CodeSplitter
209+
from .formula_extractor import FormulaSplitter
210+
from .table_extractor import TableSplitter
211211

212-
code_extractor = CodeExtractor(config)
213-
formula_extractor = FormulaExtractor(config)
214-
table_extractor = TableExtractor(config)
212+
code_extractor = CodeSplitter(config)
213+
formula_extractor = FormulaSplitter(config)
214+
table_extractor = TableSplitter(config)
215215

216216
# 提取各类内容
217217
code_content = code_extractor.extract(text, field_name)

webmainbench/metrics/base_content_splitter.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
from abc import ABC, abstractmethod
from typing import List, Dict, Any
import os
import hashlib
import json


class BaseContentSplitter(ABC):
    """Abstract base class for splitting one content type (code, formulas,
    tables, ...) out of a text document.

    NOTE(review): reconstructed from a whitespace-collapsed scrape; statement
    boundaries were re-derived from Python syntax. Runtime strings are kept
    byte-for-byte.
    """

    def __init__(self, config: Dict[str, Any] = None):
        """Initialize the splitter.

        Args:
            config: optional settings; recognised keys are ``use_llm``
                (bool, default ``True``) and ``cache_dir`` (str, default
                ``<package parent>/.cache``).
        """
        self.config = config or {}
        # Master switch: controls whether LLM enhancement is attempted at all.
        self.use_llm = self.config.get('use_llm', True)
        self.cache_dir = self.config.get(
            'cache_dir',
            os.path.join(
                os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
                '.cache'))
        os.makedirs(self.cache_dir, exist_ok=True)

    @abstractmethod
    def extract(self, text: str, field_name: str = None) -> str:
        """Extract the target content type from *text* as a single string."""
        pass

    @abstractmethod
    def extract_basic(self, text: str) -> List[str]:
        """Extract content with the basic method (usually regular expressions)."""
        pass

    def should_use_llm(self, field_name: str) -> bool:
        """Decide whether LLM-enhanced extraction should run for this field."""
        if not self.use_llm:
            return False
        # Ground-truth content is never LLM-enhanced, so the reference
        # answer stays exactly as annotated.
        if field_name == "groundtruth_content":
            print(f"[DEBUG] 检测到groundtruth内容,不使用LLM")
            return False
        return True

    def enhance_with_llm(self, basic_results: List[str], cache_key: str = None) -> List[str]:
        """Run LLM enhancement over *basic_results*, with a JSON file cache.

        Falls back to the unmodified *basic_results* when enhancement fails;
        cache read/write errors are non-fatal.
        """
        if not basic_results:
            print(f"[DEBUG] 输入内容为空,跳过LLM增强")
            return []
        # Derive a stable cache key from the content when none is supplied.
        if cache_key is None:
            content_str = '\n'.join(basic_results)
            cache_key = hashlib.md5(content_str.encode('utf-8')).hexdigest()
        cache_file = os.path.join(
            self.cache_dir,
            f'{self.__class__.__name__.lower()}_cache_{cache_key}.json')
        # Serve from cache when a previous run stored a result.
        if os.path.exists(cache_file):
            try:
                with open(cache_file, 'r', encoding='utf-8') as f:
                    cached_result = json.load(f)
                print(f"[DEBUG] 从缓存加载LLM增强结果: {len(cached_result)} 个")
                return cached_result
            except Exception as e:
                # Corrupt cache entry: log and recompute below.
                print(f"[DEBUG] 缓存读取失败: {e}")
        try:
            enhanced_results = self._llm_enhance(basic_results)
            # Best-effort cache write; a failure only loses the cache.
            try:
                with open(cache_file, 'w', encoding='utf-8') as f:
                    json.dump(enhanced_results, f, ensure_ascii=False, indent=2)
                print(f"[DEBUG] LLM增强结果已缓存到: {cache_file}")
            except Exception as e:
                print(f"[DEBUG] 缓存保存失败: {e}")
            return enhanced_results
        except Exception as e:
            # Enhancement is optional: on any error return the basic results.
            print(f"[DEBUG] LLM增强失败: {type(e).__name__}: {e}")
            return basic_results

    @abstractmethod
    def _llm_enhance(self, basic_results: List[str]) -> List[str]:
        """Concrete LLM enhancement; implemented by subclasses."""
        pass

webmainbench/metrics/base_extractor.py

Lines changed: 0 additions & 1 deletion
This file was deleted.

webmainbench/metrics/code_extractor.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
# NOTE(review): this span is the DELETED side of the diff — the pre-rename
# CodeExtractor(ContentExtractor) from the removed base_extractor module.
# Its logic is identical to the renamed CodeSplitter; kept verbatim as
# diff history (content was collapsed onto one line by the page scrape).
# webmainbench/metrics/extractors/code_extractor.pyimport refrom typing import List, Dict, Anyfrom .base_extractor import ContentExtractorclass CodeExtractor(ContentExtractor): """从文本中提取代码块""" def extract(self, text: str, field_name: str = None) -> str: """提取代码块""" code_blocks = self.extract_basic(text) if self.should_use_llm(field_name): code_parts = self.enhance_with_llm(code_blocks) else: code_parts = code_blocks return '\n'.join(code_parts) def extract_basic(self, text: str) -> List[str]: """使用正则表达式提取代码块""" code_parts = [] # 处理三个反引号包裹的代码块 backtick_pattern = r'(```[\s\S]*?```)' for match in re.finditer(backtick_pattern, text): code_segment = match.group(0) if code_segment.startswith('```'): lines = code_segment.split('\n') content_lines = lines[1:-1] code_content = '\n'.join(content_lines) if code_content: code_parts.append(code_content) # 处理缩进代码块 - 定义缺失的模式 indent_pattern = r'(?:\n\s*\n)((?:(?: {4,}|\t+)[^\n]*(?:\n|$)){2,})(?=\n\s*\n|$)' for match in re.finditer(indent_pattern, text, re.MULTILINE): code_segment = match.group(1) # 验证:确保所有行都是缩进的 lines = code_segment.split('\n') all_indented = all( line.startswith(' ') or line.startswith('\t') or not line.strip() for line in lines if line.strip() ) if not all_indented: continue # 进一步验证代码特征 non_empty_lines = [line.strip() for line in lines if line.strip()] if len(non_empty_lines) < 2: continue # 检查是否有明显的非代码特征 has_list_features = any( re.match(r'^[-•*]\s', line) or re.match(r'^\d+\.\s', line) or re.search(r'\$[\d,]', line) or re.search(r'\b(million|billion|thousand)\b', line, re.IGNORECASE) for line in non_empty_lines ) if has_list_features: continue # 清理代码段 cleaned_lines = [] for line in code_segment.split('\n'): if line.strip(): if line.startswith(' '): cleaned_lines.append(line[4:]) elif line.startswith('\t'): cleaned_lines.append(line[1:]) else: cleaned_lines.append(line) code_content = '\n'.join(cleaned_lines) if code_content.strip(): code_parts.append(code_content) return code_parts def _llm_enhance(self, 
basic_results: List[str]) -> List[str]: """使用LLM增强代码提取结果(未实现)""" print(f"[DEBUG] 代码LLM增强功能尚未实现,返回原始结果") return basic_results
1+
# webmainbench/metrics/extractors/code_extractor.py
import re
from typing import List, Dict, Any
from .base_content_splitter import BaseContentSplitter


class CodeSplitter(BaseContentSplitter):
    """Extract code blocks from text.

    NOTE(review): reconstructed from a whitespace-collapsed scrape;
    statement boundaries were re-derived from Python syntax. Runtime
    strings are kept byte-for-byte.
    """

    def extract(self, text: str, field_name: str = None) -> str:
        """Extract code blocks, optionally LLM-enhanced, joined by newlines."""
        code_blocks = self.extract_basic(text)
        if self.should_use_llm(field_name):
            code_parts = self.enhance_with_llm(code_blocks)
        else:
            code_parts = code_blocks
        return '\n'.join(code_parts)

    def extract_basic(self, text: str) -> List[str]:
        """Extract code blocks with regular expressions."""
        code_parts = []
        # Fenced blocks delimited by triple backticks.
        backtick_pattern = r'(```[\s\S]*?```)'
        for match in re.finditer(backtick_pattern, text):
            code_segment = match.group(0)
            if code_segment.startswith('```'):
                lines = code_segment.split('\n')
                # Drop the opening ```lang line and the closing ``` line.
                content_lines = lines[1:-1]
                code_content = '\n'.join(content_lines)
                if code_content:
                    code_parts.append(code_content)
        # Indented code blocks: >= 2 consecutive lines indented by >= 4
        # spaces or tabs, framed by blank lines.
        indent_pattern = r'(?:\n\s*\n)((?:(?: {4,}|\t+)[^\n]*(?:\n|$)){2,})(?=\n\s*\n|$)'
        for match in re.finditer(indent_pattern, text, re.MULTILINE):
            code_segment = match.group(1)
            # Validate: every non-blank line must be indented.
            # NOTE(review): literal indent widths were collapsed by the
            # scrape; 4 spaces assumed (consistent with line[4:] below) —
            # confirm against the repository file.
            lines = code_segment.split('\n')
            all_indented = all(
                line.startswith('    ') or line.startswith('\t') or not line.strip()
                for line in lines if line.strip()
            )
            if not all_indented:
                continue
            # Require at least two non-empty lines to look like code.
            non_empty_lines = [line.strip() for line in lines if line.strip()]
            if len(non_empty_lines) < 2:
                continue
            # Reject segments with obvious non-code features
            # (bullet/numbered lists, currency amounts, financial prose).
            has_list_features = any(
                re.match(r'^[-•*]\s', line)
                or re.match(r'^\d+\.\s', line)
                or re.search(r'\$[\d,]', line)
                or re.search(r'\b(million|billion|thousand)\b', line, re.IGNORECASE)
                for line in non_empty_lines
            )
            if has_list_features:
                continue
            # Strip one level of indentation (4 spaces or one tab);
            # blank lines are dropped.
            cleaned_lines = []
            for line in code_segment.split('\n'):
                if line.strip():
                    if line.startswith('    '):
                        cleaned_lines.append(line[4:])
                    elif line.startswith('\t'):
                        cleaned_lines.append(line[1:])
                    else:
                        cleaned_lines.append(line)
            code_content = '\n'.join(cleaned_lines)
            if code_content.strip():
                code_parts.append(code_content)
        return code_parts

    def _llm_enhance(self, basic_results: List[str]) -> List[str]:
        """LLM enhancement for code (not implemented); returns input unchanged."""
        print(f"[DEBUG] 代码LLM增强功能尚未实现,返回原始结果")
        return basic_results

webmainbench/metrics/formula_extractor.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,9 @@
22
from typing import List
33
from openai import OpenAI
44

5-
from .base_extractor import ContentExtractor
5+
from .base_content_splitter import BaseContentSplitter
66

7-
class FormulaExtractor(ContentExtractor):
7+
class FormulaSplitter(BaseContentSplitter):
88
"""从文本中提取数学公式"""
99

1010
def extract(self, text: str, field_name: str = None) -> str:

0 commit comments

Comments
 (0)