@@ -293,43 +293,55 @@ def _extract_from_markdown(text: str, field_name: str = None) -> Dict[str, str]:
293293 if code_content .strip ():
294294 code_parts .append (code_content )
295295
296- # 提取公式 - 根据字段类型决定使用API还是正则
296+ # 提取公式 - 新的两步处理逻辑
297297 formula_parts = []
298298
299- # 如果是groundtruth_content,使用正则提取公式
300- if field_name == "llm_webkit_md" :
301- print (f"[DEBUG] 检测到groundtruth内容,使用正则提取公式" )
302- # 统一的公式提取模式
303- latex_patterns = [
304- r'(?<!\\)\$\$(.*?)(?<!\\)\$\$' , # 行间 $$...$$
305- r'(?<!\\)\\\[(.*?)(?<!\\)\\\]' , # 行间 \[...\]
306- r'(?<!\\)\$(.*?)(?<!\\)\$' , # 行内 $...$
307- r'(?<!\\)\\\((.*?)(?<!\\)\\\)' , # 行内 \(...\)
308- ]
309-
310- for pattern in latex_patterns :
311- for match in re .finditer (pattern , text , re .DOTALL ):
312- formula_full = match .group (0 )
313- formula_content = match .group (1 )
314- extracted_segments .append (formula_full )
315- if formula_content .strip ():
316- formula_parts .append (formula_content .strip ())
299+ # 第一步:先用正则提取公式
300+ regex_formulas = []
301+ latex_patterns = [
302+ r'(?<!\\)\$\$(.*?)(?<!\\)\$\$' , # 行间 $$...$$
303+ r'(?<!\\)\\\[(.*?)(?<!\\)\\\]' , # 行间 \[...\]
304+ r'(?<!\\)\$(.*?)(?<!\\)\$' , # 行内 $...$
305+ r'(?<!\\)\\\((.*?)(?<!\\)\\\)' , # 行内 \(...\)
306+ ]
307+
308+ for pattern in latex_patterns :
309+ for match in re .finditer (pattern , text , re .DOTALL ):
310+ formula_full = match .group (0 )
311+ formula_content = match .group (1 )
312+ extracted_segments .append (formula_full )
313+ if formula_content .strip ():
314+ regex_formulas .append (formula_content .strip ())
315+
316+ # 第二步:根据字段类型决定是否需要API修正
317+ if field_name == "groundtruth_content" :
318+ print (f"[DEBUG] 检测到groundtruth内容,仅使用正则提取公式" )
319+ formula_parts = regex_formulas
317320 else :
318- # 其他内容使用API提取公式
319- cache_dir = os .path .join (os .path .dirname (os .path .abspath (__file__ )), '.cache' )
320- os .makedirs (cache_dir , exist_ok = True )
321-
322- # 使用文本哈希作为缓存文件名
323- text_hash = hashlib .md5 (text .encode ('utf-8' )).hexdigest ()
324- cache_file = os .path .join (cache_dir , f'formula_cache_{ text_hash } .json' )
325-
326- # 使用LLM API提取公式
327- try :
328- from .formula_extractor import extract_formulas_with_llm
329- formula_parts = extract_formulas_with_llm (text , cache_file )
330- print (f"[DEBUG] 公式提取成功,提取到 { len (formula_parts )} 个公式" )
331- except Exception as e :
332- print (f"[DEBUG] 公式提取失败: { type (e ).__name__ } : { e } " )
321+ print (f"[DEBUG] 检测到llm_webkit_md内容,使用正则+API修正模式" )
322+ # 对于llm_webkit_md,将正则结果传递给API进行修正
323+ if regex_formulas :
324+ # 将正则提取的公式作为输入传递给API
325+ regex_formulas_text = '\n ' .join (regex_formulas )
326+ print (f"[DEBUG] 正则提取到 { len (regex_formulas )} 个公式,准备API修正" )
327+
328+ cache_dir = os .path .join (os .path .dirname (os .path .abspath (__file__ )), '.cache' )
329+ os .makedirs (cache_dir , exist_ok = True )
330+
331+ # 使用正则结果的哈希作为缓存文件名
332+ text_hash = hashlib .md5 (regex_formulas_text .encode ('utf-8' )).hexdigest ()
333+ cache_file = os .path .join (cache_dir , f'formula_correction_cache_{ text_hash } .json' )
334+
335+ try :
336+ from .formula_extractor import correct_formulas_with_llm
337+ corrected_formulas = correct_formulas_with_llm (regex_formulas , cache_file )
338+ formula_parts = corrected_formulas
339+ print (f"[DEBUG] API修正成功,最终得到 { len (formula_parts )} 个公式" )
340+ except Exception as e :
341+ print (f"[DEBUG] API修正失败: { type (e ).__name__ } : { e } ,使用正则结果" )
342+ formula_parts = regex_formulas
343+ else :
344+ print (f"[DEBUG] 正则未提取到公式,跳过API修正" )
333345 formula_parts = []
334346
335347 # 提取表格
0 commit comments