# webmainbench/metrics/extractors/code_extractor.py
import re
from typing import List, Dict, Any

from .base_content_splitter import BaseContentSplitter


class CodeSplitter(BaseContentSplitter):
    """Extract code blocks from Markdown-style text.

    Two block styles are recognised: triple-backtick fenced blocks, and
    runs of two or more lines indented by at least four spaces or a tab
    that follow a blank line.
    """

    # Lazy match from an opening fence to the nearest closing fence.
    _FENCE_RE = re.compile(r'(```[\s\S]*?```)')

    # A blank line followed by two or more indented lines (group 1).
    _INDENT_RE = re.compile(
        r'(?:\n\s*\n)((?:(?: {4,}|\t+)[^\n]*(?:\n|$)){2,})(?=\n\s*\n|$)',
        re.MULTILINE,
    )

    # Hints that an indented run is prose, not code: list bullets,
    # numbered items, dollar amounts and magnitude words.
    _PROSE_RES = (
        re.compile(r'^[-•*]\s'),
        re.compile(r'^\d+\.\s'),
        re.compile(r'\$[\d,]'),
        re.compile(r'\b(million|billion|thousand)\b', re.IGNORECASE),
    )

    def extract(self, text: str, field_name: str = None) -> str:
        """Return every code block found in ``text`` joined by newlines.

        Args:
            text: The document to scan.
            field_name: Passed to ``should_use_llm`` to decide whether the
                regex results are post-processed by ``enhance_with_llm``.
                Neither method is defined in this class; presumably both
                are inherited from ``BaseContentSplitter`` (confirm there).

        Returns:
            The extracted blocks joined with ``'\\n'``; an empty string
            when nothing was found.
        """
        code_parts = self.extract_basic(text)
        if self.should_use_llm(field_name):
            code_parts = self.enhance_with_llm(code_parts)
        return '\n'.join(code_parts)

    def extract_basic(self, text: str) -> List[str]:
        """Extract code blocks from ``text`` using regular expressions.

        Fenced blocks are collected first, then indented blocks, each
        group in document order. Indented runs that overlap a fenced
        block are skipped so the same code is not reported twice.

        Args:
            text: The document to scan.

        Returns:
            The code blocks, without fence markers and with one level of
            indentation removed from indented blocks.
        """
        code_parts = []
        fenced_spans = []

        for match in self._FENCE_RE.finditer(text):
            fenced_spans.append(match.span())
            fenced_code = self._strip_fence(match.group(0))
            if fenced_code:
                code_parts.append(fenced_code)

        for match in self._INDENT_RE.finditer(text):
            start, end = match.span(1)
            # Indented lines inside a fence were already captured above.
            if any(start < fence_end and fence_start < end
                   for fence_start, fence_end in fenced_spans):
                continue
            indented_code = self._clean_indented(match.group(1))
            if indented_code:
                code_parts.append(indented_code)

        return code_parts

    @staticmethod
    def _strip_fence(fenced: str) -> str:
        """Return the body of a fenced block without its fence markers.

        The opening line (fence plus optional language tag) is dropped.
        Code that shares a line with the closing fence is kept. A fence
        that opens and closes on one line yields an empty string.
        """
        lines = fenced.split('\n')
        if len(lines) < 2:
            return ''
        body = lines[1:-1]
        # Text preceding the closing fence on its own line, if any.
        tail = lines[-1][:-3].rstrip()
        if tail.strip():
            body.append(tail)
        return '\n'.join(body)

    @classmethod
    def _clean_indented(cls, segment: str) -> str:
        """Validate an indented run and strip one indentation level.

        Returns an empty string when the run has fewer than two non-blank
        lines or when any line looks like prose (see ``_PROSE_RES``).
        Every line handed in already starts with four spaces or a tab,
        because ``_INDENT_RE`` only matches such lines.
        """
        lines = [line for line in segment.split('\n') if line.strip()]
        if len(lines) < 2:
            return ''

        if any(pattern.search(line.strip())
               for line in lines
               for pattern in cls._PROSE_RES):
            return ''

        dedented = []
        for line in lines:
            if line.startswith('    '):
                dedented.append(line[4:])
            elif line.startswith('\t'):
                dedented.append(line[1:])
            else:
                dedented.append(line)
        return '\n'.join(dedented)

    def _llm_enhance(self, basic_results: List[str]) -> List[str]:
        """Placeholder for LLM-based refinement (not implemented).

        Prints a debug notice and returns ``basic_results`` unchanged.
        """
        print("[DEBUG] 代码LLM增强功能尚未实现,返回原始结果")
        return basic_results
1+ # webmainbench/metrics/extractors/code_extractor.py
2+ import re
3+ from typing import List , Dict , Any
4+
5+ from .base_content_splitter import BaseContentSplitter
6+
7+
class CodeSplitter(BaseContentSplitter):
    """Extract code blocks from Markdown-style text.

    Two block styles are recognised: triple-backtick fenced blocks, and
    runs of two or more lines indented by at least four spaces or a tab
    that follow a blank line.
    """

    # Lazy match from an opening fence to the nearest closing fence.
    _FENCE_RE = re.compile(r'(```[\s\S]*?```)')

    # A blank line followed by two or more indented lines (group 1).
    _INDENT_RE = re.compile(
        r'(?:\n\s*\n)((?:(?: {4,}|\t+)[^\n]*(?:\n|$)){2,})(?=\n\s*\n|$)',
        re.MULTILINE,
    )

    # Hints that an indented run is prose, not code: list bullets,
    # numbered items, dollar amounts and magnitude words.
    _PROSE_RES = (
        re.compile(r'^[-•*]\s'),
        re.compile(r'^\d+\.\s'),
        re.compile(r'\$[\d,]'),
        re.compile(r'\b(million|billion|thousand)\b', re.IGNORECASE),
    )

    def extract(self, text: str, field_name: str = None) -> str:
        """Return every code block found in ``text`` joined by newlines.

        Args:
            text: The document to scan.
            field_name: Passed to ``should_use_llm`` to decide whether the
                regex results are post-processed by ``enhance_with_llm``.
                Neither method is defined in this class; presumably both
                are inherited from ``BaseContentSplitter`` (confirm there).

        Returns:
            The extracted blocks joined with ``'\\n'``; an empty string
            when nothing was found.
        """
        code_parts = self.extract_basic(text)
        if self.should_use_llm(field_name):
            code_parts = self.enhance_with_llm(code_parts)
        return '\n'.join(code_parts)

    def extract_basic(self, text: str) -> List[str]:
        """Extract code blocks from ``text`` using regular expressions.

        Fenced blocks are collected first, then indented blocks, each
        group in document order. Indented runs that overlap a fenced
        block are skipped so the same code is not reported twice.

        Args:
            text: The document to scan.

        Returns:
            The code blocks, without fence markers and with one level of
            indentation removed from indented blocks.
        """
        code_parts = []
        fenced_spans = []

        for match in self._FENCE_RE.finditer(text):
            fenced_spans.append(match.span())
            fenced_code = self._strip_fence(match.group(0))
            if fenced_code:
                code_parts.append(fenced_code)

        for match in self._INDENT_RE.finditer(text):
            start, end = match.span(1)
            # Indented lines inside a fence were already captured above.
            if any(start < fence_end and fence_start < end
                   for fence_start, fence_end in fenced_spans):
                continue
            indented_code = self._clean_indented(match.group(1))
            if indented_code:
                code_parts.append(indented_code)

        return code_parts

    @staticmethod
    def _strip_fence(fenced: str) -> str:
        """Return the body of a fenced block without its fence markers.

        The opening line (fence plus optional language tag) is dropped.
        Code that shares a line with the closing fence is kept. A fence
        that opens and closes on one line yields an empty string.
        """
        lines = fenced.split('\n')
        if len(lines) < 2:
            return ''
        body = lines[1:-1]
        # Text preceding the closing fence on its own line, if any.
        tail = lines[-1][:-3].rstrip()
        if tail.strip():
            body.append(tail)
        return '\n'.join(body)

    @classmethod
    def _clean_indented(cls, segment: str) -> str:
        """Validate an indented run and strip one indentation level.

        Returns an empty string when the run has fewer than two non-blank
        lines or when any line looks like prose (see ``_PROSE_RES``).
        Every line handed in already starts with four spaces or a tab,
        because ``_INDENT_RE`` only matches such lines.
        """
        lines = [line for line in segment.split('\n') if line.strip()]
        if len(lines) < 2:
            return ''

        if any(pattern.search(line.strip())
               for line in lines
               for pattern in cls._PROSE_RES):
            return ''

        dedented = []
        for line in lines:
            if line.startswith('    '):
                dedented.append(line[4:])
            elif line.startswith('\t'):
                dedented.append(line[1:])
            else:
                dedented.append(line)
        return '\n'.join(dedented)

    def _llm_enhance(self, basic_results: List[str]) -> List[str]:
        """Placeholder for LLM-based refinement (not implemented).

        Prints a debug notice and returns ``basic_results`` unchanged.
        """
        print("[DEBUG] 代码LLM增强功能尚未实现,返回原始结果")
        return basic_results
0 commit comments