@@ -201,34 +201,49 @@ def _extract_from_markdown(text: str) -> Dict[str, str]:
201201
202202 # 收集所有需要移除的内容片段
203203 extracted_segments = []
204-
205- # 提取代码
206204 code_parts = []
207- # 代码块 ```code```
208- for match in re .finditer (r'```[\s\S]*?```' , text ):
209- code_block = match .group (0 )
210- extracted_segments .append (code_block )
211- code_parts .append (code_block .strip ('`' ).strip ())
205+ # 同时匹配行内代码 `...` 和代码块 ```...```
206+ pattern = r'(```[\s\S]*?```|`[^`\n]+`)' # 匹配 ```...``` 或 `...`
207+ for match in re .finditer (pattern , text ):
208+ code_segment = match .group (0 )
209+
210+ # 判断是代码块还是行内代码
211+ if code_segment .startswith ('```' ):
212+ # 代码块,去掉 ``` 并去除首尾空白
213+ code_content = code_segment [3 :- 3 ].strip ()
214+ else :
215+ # 行内代码,去掉 `
216+ code_content = code_segment [1 :- 1 ]
217+
218+ code_parts .append (code_content )
212219
213- # 行内代码 `code`
214- for match in re .finditer (r'`([^`]+)`' , text ):
215- inline_code_full = match .group (0 ) # 包含反引号的完整匹配
216- inline_code_content = match .group (1 ) # 只是内容
217- extracted_segments .append (inline_code_full )
218- code_parts .append (inline_code_content )
220+ # # 提取代码
221+ # code_parts = []
222+ # # 代码块 ```code```
223+ # for match in re.finditer(r'```[\s\S]*?```', text):
224+ # code_block = match.group(0)
225+ # extracted_segments.append(code_block)
226+ # code_parts.append(code_block.strip('`').strip())
227+ #
228+ # # 行内代码 `code`
229+ # for match in re.finditer(r'`([^`]+)`', text):
230+ # inline_code_full = match.group(0) # 包含反引号的完整匹配
231+ # inline_code_content = match.group(1) # 只是内容
232+ # extracted_segments.append(inline_code_full)
233+ # code_parts.append(inline_code_content)
219234
220235 # 提取公式
221236 formula_parts = []
222237 # 统一的公式提取模式
223238 latex_patterns = [
224- r'(?<!\\)\$\$([^$]+)\$\$(?!\\)' , # Display math (not escaped)
225- r'(?<!\\)\$([^$\n]+)\$(?![\\\$])' , # Inline math (not escaped)
226- # r'\\begin\{equation\*?\}(.*?)\\end\{equation\*?\}', # Equation environment
227- # r'\\begin\{align\*?\}(.*?)\\end\{align\*?\}', # Align environment
228- # r'\\begin\{gather\*?\} (.*?)\\end\{gather\*?\} ', # Gather environment
229- # r'\\begin\{eqnarray\*?\} (.*?)\\end\{eqnarray\*?\} ', # Eqnarray environment
230- # r'\\begin\{multline\*?\} (.*?)\\end\{multline\*?\} ', # Multline environment
231- # r'\\begin\{split\}( .*?)\\end\{split\} ', # Split environment
239+ # r'(?<!\\)\$\$([^$]+)\$\$(?!\\)', # Display math (not escaped)
240+ # r'(?<!\\)\$([^$\n]+)\$(?![\\\$])', # Inline math (not escaped)
241+ # r'(?<!\\)\$\$([^$]+)\$\$(?!\\)',
242+ # r'(?<!\\)\$([^$\n\w][^$\n]*[^$\n\w])\$(?![\\\$])',
243+ r'\$\$ (.*?)\$\$ ' , # 行间$$...$$
244+ r'\\\[ (.*?)\\] ' , # 行间\[...\]
245+ r'\$ (.*?)\$ ' , # 行内$...$
246+ r'\\\(( .*?)\\\) ' , # 行内\(...\)
232247 ]
233248
234249 for pattern in latex_patterns :
0 commit comments