mirror of
https://github.com/unclecode/crawl4ai.git
synced 2026-06-11 08:18:01 +00:00
fix: handle nested brackets and parentheses in LINK_PATTERN regex
The previous regex `[^\]]+` stopped at the first `]`, which broke markdown links containing embedded images, for example: `[![alt text](image.png)](https://example.com)`.

The new pattern allows one level of nested [...] in the link text and one level of nested (...) in the URL, correctly handling:
- Embedded images in link text
- Wikipedia-style URLs with parentheses

Fixes #711
This commit is contained in:
@@ -8,7 +8,7 @@ import re
|
||||
from urllib.parse import urljoin
|
||||
|
||||
# Pre-compile the regex pattern
|
||||
LINK_PATTERN = re.compile(r'!?\[([^\]]+)\]\(([^)]+?)(?:\s+"([^"]*)")?\)')
|
||||
LINK_PATTERN = re.compile(r'!?\[((?:[^\[\]]|\[(?:[^\[\]]|\[[^\]]*\])*\])*)\]\(((?:[^()\s]|\([^()]*\))*)(?:\s+"([^"]*)")?\)')
|
||||
|
||||
|
||||
def fast_urljoin(base: str, url: str) -> str:
|
||||
|
||||
Reference in New Issue
Block a user