fix: batch fix for 10 open issues (#1520, #1489, #1374, #1424, #1183, #1354, #880, #1031, #1251, #1758)

- #1520: Preserve trailing slashes in URL normalization (RFC 3986 compliance)
- #1489: Preserve query parameter key casing in normalize_url
- #1374: Close NamedTemporaryFile handle before reopening (Windows fix)
- #1424: Fix CosineStrategy returning empty results (delimiter fallback + at_least_k >= 1)
- #1183: Fix extract_xml_data regex matching tag names in prose text
- #1354: Make import_knowledge_base async (fix asyncio.run in running loop)
- #880: Fix 404 sample_ecommerce.html gist URL in docs (6 occurrences)
- #1031: Make Docker playground code editor resizable with overflow-auto
- #1251: Add DEFAULT_CONFIG with deep-merge in load_config to prevent KeyError crashes
- #1758: Change screenshot stitching format from BMP to PNG
2026-06-10 15:58:15 +00:00 · 2026-03-07 09:47:38 +00:00
parent 0c9e3c427e
commit 3a75dd3f4c
15 changed files with 139 additions and 29 deletions
--- a/crawl4ai/adaptive_crawler.py
+++ b/crawl4ai/adaptive_crawler.py
@@ -1842,7 +1842,7 @@ class AdaptiveCrawler:
        
        return export_dict
    
-    def import_knowledge_base(self, filepath: Union[str, Path], format: str = "jsonl") -> None:
+    async def import_knowledge_base(self, filepath: Union[str, Path], format: str = "jsonl") -> None:
        """Import a knowledge base from a file
        
        Args:
@@ -1871,7 +1871,7 @@ class AdaptiveCrawler:
            self.state.knowledge_base.extend(imported_results)
            
            # Update state with imported data
-            asyncio.run(self.strategy.update_state(self.state, imported_results))
+            await self.strategy.update_state(self.state, imported_results)
            
            print(f"Imported {len(imported_results)} documents from {filepath}")
        else:
--- a/crawl4ai/async_crawler_strategy.py
+++ b/crawl4ai/async_crawler_strategy.py
@@ -1882,7 +1882,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):

            buffered = BytesIO()
            stitched = stitched.convert("RGB")
-            stitched.save(buffered, format="BMP", quality=85)
+            stitched.save(buffered, format="PNG")
            encoded = base64.b64encode(buffered.getvalue()).decode("utf-8")

            return encoded
--- a/crawl4ai/extraction_strategy.py
+++ b/crawl4ai/extraction_strategy.py
@@ -296,7 +296,7 @@ class CosineStrategy(ExtractionStrategy):
            return documents

        if len(documents) < at_least_k:
-            at_least_k = len(documents) // 2
+            at_least_k = max(1, len(documents) // 2)

        from sklearn.metrics.pairwise import cosine_similarity

@@ -451,7 +451,10 @@ class CosineStrategy(ExtractionStrategy):
        """
        # Assume `html` is a list of text chunks for this strategy
        t = time.time()
-        text_chunks = html.split(self.DEL)  # Split by lines or paragraphs as needed
+        # Split by delimiter; fall back to double-newline splitting for raw text
+        text_chunks = html.split(self.DEL)
+        if len(text_chunks) == 1:
+            text_chunks = [chunk.strip() for chunk in html.split("\n\n") if chunk.strip()]

        # Pre-filter documents using embeddings and semantic_filter
        text_chunks = self.filter_documents_embeddings(
--- a/crawl4ai/processors/pdf/init.py
+++ b/crawl4ai/processors/pdf/init.py
@@ -145,6 +145,7 @@ class PDFContentScrapingStrategy(ContentScrapingStrategy):
            
            # Create temp file with .pdf extension
            temp_file = tempfile.NamedTemporaryFile(suffix='.pdf', delete=False)
+            temp_file.close()  # Close handle immediately; file persists due to delete=False
            self._temp_files.append(temp_file.name)
            
            try:
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -1697,7 +1697,7 @@ def extract_xml_data_legacy(tags, string):
    data = {}

    for tag in tags:
-        pattern = f"<{tag}>(.*?)</{tag}>"
+        pattern = f"<{tag}>((?:(?!<{tag}>).)*)</{tag}>"
        match = re.search(pattern, string, re.DOTALL)
        if match:
            data[tag] = match.group(1).strip()
@@ -1726,7 +1726,7 @@ def extract_xml_data(tags, string):
    data = {}

    for tag in tags:
-        pattern = f"<{tag}>(.*?)</{tag}>"
+        pattern = f"<{tag}>((?:(?!<{tag}>).)*)</{tag}>"
        matches = re.findall(pattern, string, re.DOTALL)
        
        if matches:
@@ -2294,14 +2294,14 @@ def normalize_url(
    # IMPORTANT: Don't use quote(unquote()) as it mangles + signs in URLs
    # The path from urlparse is already properly encoded
    path = parsed.path
-    if path.endswith('/') and path != '/':
-        path = path.rstrip('/')
+    # Preserve trailing slashes -- they are semantically significant per RFC 3986
+    # e.g. /page/9123/ and /page/9123 may return different responses

    # ── query ──
    query = parsed.query
    if query:
        # explode, mutate, then rebuild
-        params = [(k.lower(), v) for k, v in parse_qsl(query, keep_blank_values=True)]
+        params = [(k, v) for k, v in parse_qsl(query, keep_blank_values=True)]

        if drop_query_tracking:
            default_tracking = {
@@ -2310,7 +2310,7 @@ def normalize_url(
            }
            if extra_drop_params:
                default_tracking |= {p.lower() for p in extra_drop_params}
-            params = [(k, v) for k, v in params if k not in default_tracking]
+            params = [(k, v) for k, v in params if k.lower() not in default_tracking]

        if sort_query:
            params.sort(key=lambda kv: kv[0])
@@ -2383,7 +2383,7 @@ def normalize_url_for_deep_crawl(href, base_url, preserve_https=False, original_
    normalized = urlunparse((
        parsed.scheme,
        netloc,
-        parsed.path.rstrip('/'),  # Normalize trailing slash
+        parsed.path or '/',  # Preserve trailing slash
        parsed.params,
        query,
        fragment
@@ -2422,7 +2422,7 @@ def efficient_normalize_url_for_deep_crawl(href, base_url, preserve_https=False,
    normalized = urlunparse((
        parsed.scheme,
        parsed.netloc.lower(),
-        parsed.path.rstrip('/'),
+        parsed.path or '/',  # Preserve trailing slash
        parsed.params,
        parsed.query,
        ''  # Remove fragment
--- a/deploy/docker/c4ai-doc-context.md
+++ b/deploy/docker/c4ai-doc-context.md
@@ -8589,7 +8589,7 @@ Real sites often have **nested** or repeated data—like categories containing p

 We have a **sample e-commerce** HTML file on GitHub (example):
 ```
-https://gist.githubusercontent.com/githubusercontent/2d7b8ba3cd8ab6cf3c8da771ddb36878/raw/1ae2f90c6861ce7dd84cc50d3df9920dee5e1fd2/sample_ecommerce.html
+https://raw.githubusercontent.com/unclecode/crawl4ai/main/docs/examples/sample_ecommerce.html
 ```
 This snippet includes categories, products, features, reviews, and related items. Let’s see how to define a schema that fully captures that structure **without LLM**.

@@ -8721,7 +8721,7 @@ async def extract_ecommerce_data():
    
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
-            url="https://gist.githubusercontent.com/githubusercontent/2d7b8ba3cd8ab6cf3c8da771ddb36878/raw/1ae2f90c6861ce7dd84cc50d3df9920dee5e1fd2/sample_ecommerce.html",
+            url="https://raw.githubusercontent.com/unclecode/crawl4ai/main/docs/examples/sample_ecommerce.html",
            extraction_strategy=strategy,
            config=config
        )
--- a/deploy/docker/static/playground/index.html
+++ b/deploy/docker/static/playground/index.html
@@ -128,6 +128,10 @@
            opacity: 1;
        }

+        #adv-editor .CodeMirror {
+            height: 100% !important;
+        }
+
        /* copid text highlighted */
        .highlighted {
            background-color: rgba(78, 255, 255, 0.2) !important;
@@ -267,7 +271,7 @@
                    </div>

                    <!-- CodeMirror host -->
-                    <div id="adv-editor" class="mt-2 border border-border rounded overflow-hidden h-40"></div>
+                    <div id="adv-editor" class="mt-2 border border-border rounded overflow-auto" style="height: 160px; min-height: 160px; max-height: 500px; resize: vertical;"></div>
                </details>

                <div class="flex space-x-2">
--- a/deploy/docker/utils.py
+++ b/deploy/docker/utils.py
@@ -19,11 +19,113 @@ class FilterType(str, Enum):
    BM25 = "bm25"
    LLM = "llm"

+DEFAULT_CONFIG = {
+    "app": {
+        "title": "Crawl4AI API",
+        "version": "1.0.0",
+        "host": "0.0.0.0",
+        "port": 11235,
+        "reload": False,
+        "workers": 1,
+        "timeout_keep_alive": 300,
+    },
+    "llm": {
+        "provider": "openai/gpt-4o-mini",
+    },
+    "redis": {
+        "host": "localhost",
+        "port": 6379,
+        "db": 0,
+        "password": "",
+        "task_ttl_seconds": 3600,
+        "ssl": False,
+    },
+    "rate_limiting": {
+        "enabled": True,
+        "default_limit": "1000/minute",
+        "trusted_proxies": [],
+        "storage_uri": "memory://",
+    },
+    "security": {
+        "enabled": False,
+        "jwt_enabled": False,
+        "api_token": "",
+        "https_redirect": False,
+        "trusted_hosts": ["*"],
+        "headers": {
+            "x_content_type_options": "nosniff",
+            "x_frame_options": "DENY",
+            "content_security_policy": "default-src 'self'",
+            "strict_transport_security": "max-age=63072000; includeSubDomains",
+        },
+    },
+    "crawler": {
+        "base_config": {"simulate_user": True},
+        "memory_threshold_percent": 95.0,
+        "rate_limiter": {"enabled": True, "base_delay": [1.0, 2.0]},
+        "timeouts": {"stream_init": 30.0, "batch_process": 300.0},
+        "pool": {"max_pages": 40, "idle_ttl_sec": 300},
+        "browser": {
+            "kwargs": {"headless": True, "text_mode": True},
+            "extra_args": [
+                "--no-sandbox",
+                "--disable-dev-shm-usage",
+                "--disable-gpu",
+                "--disable-software-rasterizer",
+                "--disable-web-security",
+                "--allow-insecure-localhost",
+                "--ignore-certificate-errors",
+            ],
+        },
+    },
+    "logging": {
+        "level": "INFO",
+        "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+    },
+    "observability": {
+        "prometheus": {"enabled": True, "endpoint": "/metrics"},
+        "health_check": {"endpoint": "/health"},
+    },
+    "webhooks": {
+        "enabled": True,
+        "default_url": None,
+        "data_in_payload": False,
+        "retry": {
+            "max_attempts": 5,
+            "initial_delay_ms": 1000,
+            "max_delay_ms": 32000,
+            "timeout_ms": 30000,
+        },
+        "headers": {"User-Agent": "Crawl4AI-Webhook/1.0"},
+    },
+}
+
+
+def _deep_merge(base: dict, override: dict) -> dict:
+    """Merge override into a deep copy of base, recursing into nested dicts; override wins."""
+    import copy  # local import: only this helper needs it
+    # Deep-copy so the result never aliases DEFAULT_CONFIG's (or the caller's) nested dicts/lists.
+    merged = copy.deepcopy(base)
+    for key, value in override.items():
+        nested = isinstance(merged.get(key), dict) and isinstance(value, dict)
+        merged[key] = _deep_merge(merged[key], value) if nested else copy.deepcopy(value)
+    return merged
+
+
 def load_config() -> Dict:
    """Load and return application configuration with environment variable overrides."""
    config_path = Path(__file__).parent / "config.yml"
    with open(config_path, "r") as config_file:
-        config = yaml.safe_load(config_file)
+        user_config = yaml.safe_load(config_file) or {}
+
+    # Deep-merge user config on top of defaults so missing keys get safe values
+    config = _deep_merge(DEFAULT_CONFIG, user_config)
+
+    for section in DEFAULT_CONFIG:
+        if section not in user_config:
+            logging.warning(
+                f"Config section '{section}' missing from config.yml, using defaults"
+            )
    
    # Override LLM provider from environment if set
    llm_provider = os.environ.get("LLM_PROVIDER")
--- a/docs/examples/adaptive_crawling/export_import_kb.py
+++ b/docs/examples/adaptive_crawling/export_import_kb.py
@@ -114,7 +114,7 @@ async def import_and_continue():
        
        # Import existing knowledge base
        print(f"\n1. Importing knowledge base from {kb_path}")
-        adaptive.import_knowledge_base(kb_path)
+        await adaptive.import_knowledge_base(kb_path)
        
        print(f"   - Imported {len(adaptive.state.knowledge_base)} documents")
        print(f"   - Existing URLs: {len(adaptive.state.crawled_urls)}")
@@ -175,10 +175,10 @@ async def share_knowledge_bases():
        merged_crawler = AdaptiveCrawler(crawler)
        
        # Import both knowledge bases
-        merged_crawler.import_knowledge_base(project_a_kb)
+        await merged_crawler.import_knowledge_base(project_a_kb)
        initial_size = len(merged_crawler.state.knowledge_base)
-        
-        merged_crawler.import_knowledge_base(project_b_kb)
+
+        await merged_crawler.import_knowledge_base(project_b_kb)
        final_size = len(merged_crawler.state.knowledge_base)
        
        print(f"   - Project A documents: {initial_size}")
--- a/docs/md_v2/api/adaptive-crawler.md
+++ b/docs/md_v2/api/adaptive-crawler.md
@@ -161,7 +161,7 @@ adaptive.export_knowledge_base("my_knowledge.jsonl")
 Import a previously exported knowledge base.

 ```python
-def import_knowledge_base(
+async def import_knowledge_base(
    self,
    path: Union[str, Path]
 ) -> None
--- a/docs/md_v2/complete-sdk-reference.md
+++ b/docs/md_v2/complete-sdk-reference.md
@@ -4128,7 +4128,7 @@ That's how you keep the config self-contained, illustrate **XPath** usage, and d
 ## 3. Advanced Schema & Nested Structures
 ### Sample E-Commerce HTML
 ```
-https://gist.githubusercontent.com/githubusercontent/2d7b8ba3cd8ab6cf3c8da771ddb36878/raw/1ae2f90c6861ce7dd84cc50d3df9920dee5e1fd2/sample_ecommerce.html
+https://raw.githubusercontent.com/unclecode/crawl4ai/main/docs/examples/sample_ecommerce.html
 ```
 ```python
 schema = {
@@ -4253,7 +4253,7 @@ async def extract_ecommerce_data():

    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
-            url="https://gist.githubusercontent.com/githubusercontent/2d7b8ba3cd8ab6cf3c8da771ddb36878/raw/1ae2f90c6861ce7dd84cc50d3df9920dee5e1fd2/sample_ecommerce.html",
+            url="https://raw.githubusercontent.com/unclecode/crawl4ai/main/docs/examples/sample_ecommerce.html",
            extraction_strategy=strategy,
            config=config
        )
--- a/docs/md_v2/core/adaptive-crawling.md
+++ b/docs/md_v2/core/adaptive-crawling.md
@@ -274,7 +274,7 @@ adaptive.export_knowledge_base("knowledge_base.jsonl")

 # Import into another session
 new_adaptive = AdaptiveCrawler(crawler)
-new_adaptive.import_knowledge_base("knowledge_base.jsonl")
+await new_adaptive.import_knowledge_base("knowledge_base.jsonl")
 ```

 ## Best Practices
--- a/docs/md_v2/extraction/no-llm-strategies.md
+++ b/docs/md_v2/extraction/no-llm-strategies.md
@@ -191,7 +191,7 @@ Real sites often have **nested** or repeated data—like categories containing p

 We have a **sample e-commerce** HTML file on GitHub (example):
 ```
-https://gist.githubusercontent.com/githubusercontent/2d7b8ba3cd8ab6cf3c8da771ddb36878/raw/1ae2f90c6861ce7dd84cc50d3df9920dee5e1fd2/sample_ecommerce.html
+https://raw.githubusercontent.com/unclecode/crawl4ai/main/docs/examples/sample_ecommerce.html
 ```
 This snippet includes categories, products, features, reviews, and related items. Let's see how to define a schema that fully captures that structure **without LLM**.

@@ -323,7 +323,7 @@ async def extract_ecommerce_data():
    
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
-            url="https://gist.githubusercontent.com/githubusercontent/2d7b8ba3cd8ab6cf3c8da771ddb36878/raw/1ae2f90c6861ce7dd84cc50d3df9920dee5e1fd2/sample_ecommerce.html",
+            url="https://raw.githubusercontent.com/unclecode/crawl4ai/main/docs/examples/sample_ecommerce.html",
            extraction_strategy=strategy,
            config=config
        )
--- a/docs/releases_review/crawl4ai_v0_7_0_showcase.py
+++ b/docs/releases_review/crawl4ai_v0_7_0_showcase.py
@@ -543,7 +543,7 @@ async def adaptive_crawling_demo(auto_mode=False):
        adaptive2 = AdaptiveCrawler(crawler, export_config)
        
        # Import the knowledge base
-        adaptive2.import_knowledge_base(kb_export)
+        await adaptive2.import_knowledge_base(kb_export)
        console.print(f"✓ Imported {len(adaptive2.state.knowledge_base)} documents")
        console.print(f"✓ Starting confidence: {int(adaptive2.confidence * 100)}%")
        
--- a/tests/adaptive/test_embedding_strategy.py
+++ b/tests/adaptive/test_embedding_strategy.py
@@ -233,7 +233,7 @@ async def test_knowledge_export_import():
        crawler2 = AdaptiveCrawler(crawler=crawler, config=config)
        
        console.print("\n[cyan]Importing knowledge base...[/cyan]")
-        crawler2.import_knowledge_base(export_path)
+        await crawler2.import_knowledge_base(export_path)
        
        # Continue with new query - should be faster
        console.print("\n[cyan]Extending with new query...[/cyan]")