Skip to content

Commit 26a33d3

Browse files
committed
Merge remote-tracking branch 'origin/chonggou' into chonggou
# Conflicts:
#   webmainbench/metrics/base_extractor.py
#   webmainbench/metrics/code_extractor.py
#   webmainbench/metrics/formula_extractor.py
#   webmainbench/metrics/table_extractor.py
2 parents 6182dfc + e73c2a5 commit 26a33d3

File tree

2 files changed

+93
-2
lines changed

2 files changed

+93
-2
lines changed

requirements.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,4 +12,5 @@ https://github.com/opendatalab/magic-html/releases/download/magic_html-0.1.5-rel
1212
streamlit
1313
markdown
1414
jieba
15-
apted
15+
apted
16+
openai
Lines changed: 91 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1,91 @@
1-
# webmainbench/metrics/extractors/code_extractor.pyimport refrom typing import List, Dict, Anyfrom .base_content_splitter import BaseContentSplitterclass CodeSplitter(BaseContentSplitter): """从文本中提取代码块""" def extract(self, text: str, field_name: str = None) -> str: """提取代码块""" code_blocks = self.extract_basic(text) if self.should_use_llm(field_name): code_parts = self.enhance_with_llm(code_blocks) else: code_parts = code_blocks return '\n'.join(code_parts) def extract_basic(self, text: str) -> List[str]: """使用正则表达式提取代码块""" code_parts = [] # 处理三个反引号包裹的代码块 backtick_pattern = r'(```[\s\S]*?```)' for match in re.finditer(backtick_pattern, text): code_segment = match.group(0) if code_segment.startswith('```'): lines = code_segment.split('\n') content_lines = lines[1:-1] code_content = '\n'.join(content_lines) if code_content: code_parts.append(code_content) # 处理缩进代码块 - 定义缺失的模式 indent_pattern = r'(?:\n\s*\n)((?:(?: {4,}|\t+)[^\n]*(?:\n|$)){2,})(?=\n\s*\n|$)' for match in re.finditer(indent_pattern, text, re.MULTILINE): code_segment = match.group(1) # 验证:确保所有行都是缩进的 lines = code_segment.split('\n') all_indented = all( line.startswith(' ') or line.startswith('\t') or not line.strip() for line in lines if line.strip() ) if not all_indented: continue # 进一步验证代码特征 non_empty_lines = [line.strip() for line in lines if line.strip()] if len(non_empty_lines) < 2: continue # 检查是否有明显的非代码特征 has_list_features = any( re.match(r'^[-•*]\s', line) or re.match(r'^\d+\.\s', line) or re.search(r'\$[\d,]', line) or re.search(r'\b(million|billion|thousand)\b', line, re.IGNORECASE) for line in non_empty_lines ) if has_list_features: continue # 清理代码段 cleaned_lines = [] for line in code_segment.split('\n'): if line.strip(): if line.startswith(' '): cleaned_lines.append(line[4:]) elif line.startswith('\t'): cleaned_lines.append(line[1:]) else: cleaned_lines.append(line) code_content = '\n'.join(cleaned_lines) if code_content.strip(): code_parts.append(code_content) return code_parts def 
_llm_enhance(self, basic_results: List[str]) -> List[str]: """使用LLM增强代码提取结果(未实现)""" print(f"[DEBUG] 代码LLM增强功能尚未实现,返回原始结果") return basic_results
1+
# webmainbench/metrics/extractors/code_extractor.py
2+
import re
3+
from typing import List, Dict, Any
4+
5+
from .base_content_splitter import BaseContentSplitter
6+
7+
8+
class CodeSplitter(BaseContentSplitter):
    """Extract code blocks from markdown-like text.

    Two kinds of blocks are recognised:

    * fenced blocks delimited by triple backticks, and
    * indented blocks: two or more consecutive lines that start with at
      least four spaces or a tab and are preceded by a blank line.
    """

    # Shortest span between two triple-backtick markers.
    _FENCE_RE = re.compile(r'(```[\s\S]*?```)')

    # A blank line followed by >= 2 lines that each start with 4+ spaces or
    # a tab. Group 1 is the run of indented lines.
    _INDENT_RE = re.compile(
        r'(?:\n\s*\n)((?:(?: {4,}|\t+)[^\n]*(?:\n|$)){2,})(?=\n\s*\n|$)',
        re.MULTILINE,
    )

    # Patterns that mark an indented line as list/prose content rather than
    # code: bullet items, numbered items, dollar amounts, magnitude words.
    _PROSE_LINE_RES = (
        re.compile(r'^[-•*]\s'),
        re.compile(r'^\d+\.\s'),
        re.compile(r'\$[\d,]'),
        re.compile(r'\b(million|billion|thousand)\b', re.IGNORECASE),
    )

    def extract(self, text: str, field_name: str = None) -> str:
        """Return all code found in ``text`` joined by newlines.

        Args:
            text: The text to scan for code blocks.
            field_name: Passed to ``self.should_use_llm`` to decide whether
                the regex results are post-processed by
                ``self.enhance_with_llm``.

        Returns:
            The extracted code blocks joined with ``'\\n'``.
        """
        # NOTE(review): should_use_llm / enhance_with_llm are not defined in
        # this class; presumably inherited from BaseContentSplitter - confirm.
        code_parts = self.extract_basic(text)
        if self.should_use_llm(field_name):
            code_parts = self.enhance_with_llm(code_parts)
        return '\n'.join(code_parts)

    def extract_basic(self, text: str) -> List[str]:
        """Extract code blocks from ``text`` using regular expressions.

        Fenced blocks are returned first (in document order), followed by
        indented blocks (in document order). An indented candidate that lies
        entirely inside a fenced block is skipped so the same code is not
        reported twice.

        Args:
            text: The text to scan. ``None`` or an empty string yields ``[]``.

        Returns:
            A list with one string per extracted code block.
        """
        if not text:
            return []

        code_parts = []
        fence_spans = []

        # Fenced blocks: drop the opening line (fence plus optional language
        # tag) and the closing line, keep everything in between.
        for match in self._FENCE_RE.finditer(text):
            fence_spans.append(match.span())
            body = '\n'.join(match.group(0).split('\n')[1:-1])
            if body:
                code_parts.append(body)

        # Indented blocks.
        for match in self._INDENT_RE.finditer(text):
            segment = match.group(1)
            # The pattern guarantees every captured line is indented, so only
            # whitespace-only lines need to be filtered out here.
            lines = [line for line in segment.split('\n') if line.strip()]
            if len(lines) < 2:
                continue

            # Span of the candidate's visible content (surrounding whitespace
            # ignored). If a fenced block covers it completely, the content
            # was already captured above.
            content_start = match.start(1) + len(segment) - len(segment.lstrip())
            content_end = match.start(1) + len(segment.rstrip())
            if any(
                start <= content_start and content_end <= end
                for start, end in fence_spans
            ):
                continue

            # Indented lists, prices and similar prose are not code.
            if any(self._looks_like_prose(line.strip()) for line in lines):
                continue

            code_content = '\n'.join(self._dedent_line(line) for line in lines)
            if code_content.strip():
                code_parts.append(code_content)

        return code_parts

    @classmethod
    def _looks_like_prose(cls, stripped_line: str) -> bool:
        """Return True if a stripped line matches any list/prose pattern."""
        return any(
            pattern.search(stripped_line) for pattern in cls._PROSE_LINE_RES
        )

    @staticmethod
    def _dedent_line(line: str) -> str:
        """Remove one indentation level (four spaces or one tab) from a line."""
        if line.startswith('    '):
            return line[4:]
        if line.startswith('\t'):
            return line[1:]
        return line

    def _llm_enhance(self, basic_results: List[str]) -> List[str]:
        """Placeholder for LLM-based enhancement (not implemented).

        Prints a debug notice and returns ``basic_results`` unchanged.
        """
        print("[DEBUG] 代码LLM增强功能尚未实现,返回原始结果")
        return basic_results

0 commit comments

Comments
 (0)