Skip to content

Commit bd24034

Browse files
committed
fix match formula and code
1 parent 4621d5e commit bd24034

2 files changed

Lines changed: 37 additions & 23 deletions

File tree

examples/basic_usage.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -957,7 +957,7 @@ def demo_llm_webkit_with_preprocessed_html_evaluation():
957957
print("1. 从真实数据集加载预处理HTML数据...")
958958

959959
# 使用DataLoader加载真实的样本数据
960-
dataset_path = Path("/home/lulindong/Pycharm_projects/cc/WebMainBench_1949_v1_WebMainBench_dataset_merge_with_llm_webkit.jsonl")
960+
dataset_path = Path("/home/lulindong/Pycharm_projects/cc/WebMainBench_1848_v1_WebMainBench_dataset_merge_with_llm_webkit.jsonl")
961961
print(f"📂 数据集文件: {dataset_path}")
962962

963963
if not dataset_path.exists():
@@ -1101,7 +1101,6 @@ def demo_llm_webkit_with_preprocessed_html_evaluation():
11011101
# demo_extractor_comparison()
11021102
# demo_dataset_with_extraction() # 演示保存带有抽取内容的数据集
11031103
# demo_multi_extraction() # 演示多个抽取器同时评测
1104-
# demo_lld_workers_extraction()
11051104
print("\n✅ 示例运行完成!")
11061105

11071106
except Exception as e:

webmainbench/metrics/base.py

Lines changed: 36 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -201,34 +201,49 @@ def _extract_from_markdown(text: str) -> Dict[str, str]:
201201

202202
# 收集所有需要移除的内容片段
203203
extracted_segments = []
204-
205-
# 提取代码
206204
code_parts = []
207-
# 代码块 ```code```
208-
for match in re.finditer(r'```[\s\S]*?```', text):
209-
code_block = match.group(0)
210-
extracted_segments.append(code_block)
211-
code_parts.append(code_block.strip('`').strip())
205+
# 同时匹配行内代码 `...` 和代码块 ```...```
206+
pattern = r'(```[\s\S]*?```|`[^`\n]+`)' # 匹配 ```...``` 或 `...`
207+
for match in re.finditer(pattern, text):
208+
code_segment = match.group(0)
209+
210+
# 判断是代码块还是行内代码
211+
if code_segment.startswith('```'):
212+
# 代码块,去掉 ``` 并去除首尾空白
213+
code_content = code_segment[3:-3].strip()
214+
else:
215+
# 行内代码,去掉 `
216+
code_content = code_segment[1:-1]
217+
218+
code_parts.append(code_content)
212219

213-
# 行内代码 `code`
214-
for match in re.finditer(r'`([^`]+)`', text):
215-
inline_code_full = match.group(0) # 包含反引号的完整匹配
216-
inline_code_content = match.group(1) # 只是内容
217-
extracted_segments.append(inline_code_full)
218-
code_parts.append(inline_code_content)
220+
# # 提取代码
221+
# code_parts = []
222+
# # 代码块 ```code```
223+
# for match in re.finditer(r'```[\s\S]*?```', text):
224+
# code_block = match.group(0)
225+
# extracted_segments.append(code_block)
226+
# code_parts.append(code_block.strip('`').strip())
227+
#
228+
# # 行内代码 `code`
229+
# for match in re.finditer(r'`([^`]+)`', text):
230+
# inline_code_full = match.group(0) # 包含反引号的完整匹配
231+
# inline_code_content = match.group(1) # 只是内容
232+
# extracted_segments.append(inline_code_full)
233+
# code_parts.append(inline_code_content)
219234

220235
# 提取公式
221236
formula_parts = []
222237
# 统一的公式提取模式
223238
latex_patterns = [
224-
r'(?<!\\)\$\$([^$]+)\$\$(?!\\)', # Display math (not escaped)
225-
r'(?<!\\)\$([^$\n]+)\$(?![\\\$])', # Inline math (not escaped)
226-
# r'\\begin\{equation\*?\}(.*?)\\end\{equation\*?\}', # Equation environment
227-
# r'\\begin\{align\*?\}(.*?)\\end\{align\*?\}', # Align environment
228-
# r'\\begin\{gather\*?\}(.*?)\\end\{gather\*?\}', # Gather environment
229-
# r'\\begin\{eqnarray\*?\}(.*?)\\end\{eqnarray\*?\}', # Eqnarray environment
230-
# r'\\begin\{multline\*?\}(.*?)\\end\{multline\*?\}', # Multline environment
231-
# r'\\begin\{split\}(.*?)\\end\{split\}', # Split environment
239+
# r'(?<!\\)\$\$([^$]+)\$\$(?!\\)', # Display math (not escaped)
240+
# r'(?<!\\)\$([^$\n]+)\$(?![\\\$])', # Inline math (not escaped)
241+
# r'(?<!\\)\$\$([^$]+)\$\$(?!\\)',
242+
# r'(?<!\\)\$([^$\n\w][^$\n]*[^$\n\w])\$(?![\\\$])',
243+
r'\$\$(.*?)\$\$', # 行间$$...$$
244+
r'\\\[(.*?)\\]', # 行间\[...\]
245+
r'\$(.*?)\$', # 行内$...$
246+
r'\\\((.*?)\\\)', # 行内\(...\)
232247
]
233248

234249
for pattern in latex_patterns:

0 commit comments

Comments
 (0)