Merge remote-tracking branch 'origin/chonggou' into chonggou

1041206149 · 1041206149 · commit 26a33d3d4112 · 2025-09-25T16:14:41.000+08:00
# Conflicts:
#	webmainbench/metrics/base_extractor.py
#	webmainbench/metrics/code_extractor.py
#	webmainbench/metrics/formula_extractor.py
#	webmainbench/metrics/table_extractor.py
diff --git a/requirements.txt b/requirements.txt
@@ -12,4 +12,5 @@ https://github.com/opendatalab/magic-html/releases/download/magic_html-0.1.5-rel
 streamlit
 markdown
 jieba
-apted
+apted
+openai
diff --git a/webmainbench/metrics/code_extractor.py b/webmainbench/metrics/code_extractor.py
@@ -1 +1,91 @@
-# webmainbench/metrics/extractors/code_extractor.pyimport refrom typing import List, Dict, Anyfrom .base_content_splitter import BaseContentSplitterclass CodeSplitter(BaseContentSplitter):    """从文本中提取代码块"""    def extract(self, text: str, field_name: str = None) -> str:        """提取代码块"""        code_blocks = self.extract_basic(text)        if self.should_use_llm(field_name):            code_parts = self.enhance_with_llm(code_blocks)        else:            code_parts = code_blocks        return '\n'.join(code_parts)    def extract_basic(self, text: str) -> List[str]:        """使用正则表达式提取代码块"""        code_parts = []        # 处理三个反引号包裹的代码块        backtick_pattern = r'(```[\s\S]*?```)'        for match in re.finditer(backtick_pattern, text):            code_segment = match.group(0)            if code_segment.startswith('```'):                lines = code_segment.split('\n')                content_lines = lines[1:-1]                code_content = '\n'.join(content_lines)                if code_content:                    code_parts.append(code_content)        # 处理缩进代码块 - 定义缺失的模式        indent_pattern = r'(?:\n\s*\n)((?:(?: {4,}|\t+)[^\n]*(?:\n|$)){2,})(?=\n\s*\n|$)'        for match in re.finditer(indent_pattern, text, re.MULTILINE):            code_segment = match.group(1)            # 验证：确保所有行都是缩进的            lines = code_segment.split('\n')            all_indented = all(                line.startswith('    ') or line.startswith('\t') or not line.strip()                for line in lines                if line.strip()            )            if not all_indented:                continue            # 进一步验证代码特征            non_empty_lines = [line.strip() for line in lines if line.strip()]            if len(non_empty_lines) < 2:                continue            # 检查是否有明显的非代码特征            has_list_features = any(                re.match(r'^[-•*]\s', line) or                re.match(r'^\d+\.\s', line) or                re.search(r'\$[\d,]', line) or                re.search(r'\b(million|billion|thousand)\b', line, re.IGNORECASE)                for line in non_empty_lines            )            if has_list_features:                continue            # 清理代码段            cleaned_lines = []            for line in code_segment.split('\n'):                if line.strip():                    if line.startswith('    '):                        cleaned_lines.append(line[4:])                    elif line.startswith('\t'):                        cleaned_lines.append(line[1:])                    else:                        cleaned_lines.append(line)            code_content = '\n'.join(cleaned_lines)            if code_content.strip():                code_parts.append(code_content)        return code_parts    def _llm_enhance(self, basic_results: List[str]) -> List[str]:        """使用LLM增强代码提取结果（未实现）"""        print(f"[DEBUG] 代码LLM增强功能尚未实现，返回原始结果")        return basic_results
+# webmainbench/metrics/extractors/code_extractor.py
+import re
+from typing import List, Dict, Any
+
+from .base_content_splitter import BaseContentSplitter
+
+
+class CodeSplitter(BaseContentSplitter):
+    """从文本中提取代码块"""
+
+    def extract(self, text: str, field_name: str = None) -> str:
+        """提取代码块"""
+        code_blocks = self.extract_basic(text)
+
+        if self.should_use_llm(field_name):
+            code_parts = self.enhance_with_llm(code_blocks)
+        else:
+            code_parts = code_blocks
+
+        return '\n'.join(code_parts)
+
+    def extract_basic(self, text: str) -> List[str]:
+        """使用正则表达式提取代码块"""
+        code_parts = []
+
+        # 处理三个反引号包裹的代码块
+        backtick_pattern = r'(```[\s\S]*?```)'
+        for match in re.finditer(backtick_pattern, text):
+            code_segment = match.group(0)
+            if code_segment.startswith('```'):
+                lines = code_segment.split('\n')
+                content_lines = lines[1:-1]
+                code_content = '\n'.join(content_lines)
+                if code_content:
+                    code_parts.append(code_content)
+
+        # 处理缩进代码块 - 定义缺失的模式
+        indent_pattern = r'(?:\n\s*\n)((?:(?: {4,}|\t+)[^\n]*(?:\n|$)){2,})(?=\n\s*\n|$)'
+
+        for match in re.finditer(indent_pattern, text, re.MULTILINE):
+            code_segment = match.group(1)
+
+            # 验证：确保所有行都是缩进的
+            lines = code_segment.split('\n')
+            all_indented = all(
+                line.startswith('    ') or line.startswith('\t') or not line.strip()
+                for line in lines
+                if line.strip()
+            )
+
+            if not all_indented:
+                continue
+
+            # 进一步验证代码特征
+            non_empty_lines = [line.strip() for line in lines if line.strip()]
+            if len(non_empty_lines) < 2:
+                continue
+
+            # 检查是否有明显的非代码特征
+            has_list_features = any(
+                re.match(r'^[-•*]\s', line) or
+                re.match(r'^\d+\.\s', line) or
+                re.search(r'\$[\d,]', line) or
+                re.search(r'\b(million|billion|thousand)\b', line, re.IGNORECASE)
+                for line in non_empty_lines
+            )
+
+            if has_list_features:
+                continue
+
+            # 清理代码段
+            cleaned_lines = []
+            for line in code_segment.split('\n'):
+                if line.strip():
+                    if line.startswith('    '):
+                        cleaned_lines.append(line[4:])
+                    elif line.startswith('\t'):
+                        cleaned_lines.append(line[1:])
+                    else:
+                        cleaned_lines.append(line)
+
+            code_content = '\n'.join(cleaned_lines)
+            if code_content.strip():
+                code_parts.append(code_content)
+
+        return code_parts
+
+    def _llm_enhance(self, basic_results: List[str]) -> List[str]:
+        """使用LLM增强代码提取结果（未实现）"""
+        print(f"[DEBUG] 代码LLM增强功能尚未实现，返回原始结果")
+        return basic_results