diff --git a/crawl4ai/adaptive_crawler.py b/crawl4ai/adaptive_crawler.py
index 5611d820..6aa1d3c2 100644
--- a/crawl4ai/adaptive_crawler.py
+++ b/crawl4ai/adaptive_crawler.py
@@ -1842,7 +1842,7 @@ class AdaptiveCrawler:
return export_dict
- def import_knowledge_base(self, filepath: Union[str, Path], format: str = "jsonl") -> None:
+ async def import_knowledge_base(self, filepath: Union[str, Path], format: str = "jsonl") -> None:
"""Import a knowledge base from a file
Args:
@@ -1871,7 +1871,7 @@ class AdaptiveCrawler:
self.state.knowledge_base.extend(imported_results)
# Update state with imported data
- asyncio.run(self.strategy.update_state(self.state, imported_results))
+ await self.strategy.update_state(self.state, imported_results)
print(f"Imported {len(imported_results)} documents from {filepath}")
else:
diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py
index dcc7130c..5ec368f1 100644
--- a/crawl4ai/async_crawler_strategy.py
+++ b/crawl4ai/async_crawler_strategy.py
@@ -1882,7 +1882,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
buffered = BytesIO()
stitched = stitched.convert("RGB")
- stitched.save(buffered, format="BMP", quality=85)
+ stitched.save(buffered, format="PNG")
encoded = base64.b64encode(buffered.getvalue()).decode("utf-8")
return encoded
diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py
index c50916f1..a3156016 100644
--- a/crawl4ai/extraction_strategy.py
+++ b/crawl4ai/extraction_strategy.py
@@ -296,7 +296,7 @@ class CosineStrategy(ExtractionStrategy):
return documents
if len(documents) < at_least_k:
- at_least_k = len(documents) // 2
+ at_least_k = max(1, len(documents) // 2)
from sklearn.metrics.pairwise import cosine_similarity
@@ -451,7 +451,10 @@ class CosineStrategy(ExtractionStrategy):
"""
# Assume `html` is a list of text chunks for this strategy
t = time.time()
- text_chunks = html.split(self.DEL) # Split by lines or paragraphs as needed
+ # Split by delimiter; fall back to double-newline splitting for raw text
+ text_chunks = html.split(self.DEL)
+ if len(text_chunks) == 1:
+ text_chunks = [chunk.strip() for chunk in html.split("\n\n") if chunk.strip()]
# Pre-filter documents using embeddings and semantic_filter
text_chunks = self.filter_documents_embeddings(
diff --git a/crawl4ai/processors/pdf/__init__.py b/crawl4ai/processors/pdf/__init__.py
index a6627f13..69a6f75a 100644
--- a/crawl4ai/processors/pdf/__init__.py
+++ b/crawl4ai/processors/pdf/__init__.py
@@ -145,6 +145,7 @@ class PDFContentScrapingStrategy(ContentScrapingStrategy):
# Create temp file with .pdf extension
temp_file = tempfile.NamedTemporaryFile(suffix='.pdf', delete=False)
+ temp_file.close() # Close handle immediately; file persists due to delete=False
self._temp_files.append(temp_file.name)
try:
diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py
index ec68c47b..4b3d9690 100644
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -1697,7 +1697,7 @@ def extract_xml_data_legacy(tags, string):
data = {}
for tag in tags:
- pattern = f"<{tag}>(.*?){tag}>"
+ pattern = f"<{tag}>((?:(?!<{tag}>).)*){tag}>"
match = re.search(pattern, string, re.DOTALL)
if match:
data[tag] = match.group(1).strip()
@@ -1726,7 +1726,7 @@ def extract_xml_data(tags, string):
data = {}
for tag in tags:
- pattern = f"<{tag}>(.*?){tag}>"
+ pattern = f"<{tag}>((?:(?!<{tag}>).)*){tag}>"
matches = re.findall(pattern, string, re.DOTALL)
if matches:
@@ -2294,14 +2294,14 @@ def normalize_url(
# IMPORTANT: Don't use quote(unquote()) as it mangles + signs in URLs
# The path from urlparse is already properly encoded
path = parsed.path
- if path.endswith('/') and path != '/':
- path = path.rstrip('/')
+ # Preserve trailing slashes -- they are semantically significant per RFC 3986
+ # e.g. /page/9123/ and /page/9123 may return different responses
# ── query ──
query = parsed.query
if query:
# explode, mutate, then rebuild
- params = [(k.lower(), v) for k, v in parse_qsl(query, keep_blank_values=True)]
+ params = [(k, v) for k, v in parse_qsl(query, keep_blank_values=True)]
if drop_query_tracking:
default_tracking = {
@@ -2310,7 +2310,7 @@ def normalize_url(
}
if extra_drop_params:
default_tracking |= {p.lower() for p in extra_drop_params}
- params = [(k, v) for k, v in params if k not in default_tracking]
+ params = [(k, v) for k, v in params if k.lower() not in default_tracking]
if sort_query:
params.sort(key=lambda kv: kv[0])
@@ -2383,7 +2383,7 @@ def normalize_url_for_deep_crawl(href, base_url, preserve_https=False, original_
normalized = urlunparse((
parsed.scheme,
netloc,
- parsed.path.rstrip('/'), # Normalize trailing slash
+ parsed.path or '/', # Preserve trailing slash
parsed.params,
query,
fragment
@@ -2422,7 +2422,7 @@ def efficient_normalize_url_for_deep_crawl(href, base_url, preserve_https=False,
normalized = urlunparse((
parsed.scheme,
parsed.netloc.lower(),
- parsed.path.rstrip('/'),
+ parsed.path or '/', # Preserve trailing slash
parsed.params,
parsed.query,
'' # Remove fragment
diff --git a/deploy/docker/c4ai-doc-context.md b/deploy/docker/c4ai-doc-context.md
index abfd3637..0120e1b6 100644
--- a/deploy/docker/c4ai-doc-context.md
+++ b/deploy/docker/c4ai-doc-context.md
@@ -8589,7 +8589,7 @@ Real sites often have **nested** or repeated data—like categories containing p
We have a **sample e-commerce** HTML file on GitHub (example):
```
-https://gist.githubusercontent.com/githubusercontent/2d7b8ba3cd8ab6cf3c8da771ddb36878/raw/1ae2f90c6861ce7dd84cc50d3df9920dee5e1fd2/sample_ecommerce.html
+https://raw.githubusercontent.com/unclecode/crawl4ai/main/docs/examples/sample_ecommerce.html
```
This snippet includes categories, products, features, reviews, and related items. Let’s see how to define a schema that fully captures that structure **without LLM**.
@@ -8721,7 +8721,7 @@ async def extract_ecommerce_data():
async with AsyncWebCrawler(verbose=True) as crawler:
result = await crawler.arun(
- url="https://gist.githubusercontent.com/githubusercontent/2d7b8ba3cd8ab6cf3c8da771ddb36878/raw/1ae2f90c6861ce7dd84cc50d3df9920dee5e1fd2/sample_ecommerce.html",
+ url="https://raw.githubusercontent.com/unclecode/crawl4ai/main/docs/examples/sample_ecommerce.html",
extraction_strategy=strategy,
config=config
)
diff --git a/deploy/docker/static/playground/index.html b/deploy/docker/static/playground/index.html
index 510a6620..e8c55037 100644
--- a/deploy/docker/static/playground/index.html
+++ b/deploy/docker/static/playground/index.html
@@ -128,6 +128,10 @@
opacity: 1;
}
+ #adv-editor .CodeMirror {
+ height: 100% !important;
+ }
+
/* copid text highlighted */
.highlighted {
background-color: rgba(78, 255, 255, 0.2) !important;
@@ -267,7 +271,7 @@
-
+
diff --git a/deploy/docker/utils.py b/deploy/docker/utils.py
index de44852b..585d8941 100644
--- a/deploy/docker/utils.py
+++ b/deploy/docker/utils.py
@@ -19,11 +19,113 @@ class FilterType(str, Enum):
BM25 = "bm25"
LLM = "llm"
+# Built-in fallback configuration. load_config() deep-merges the parsed
+# config.yml on top of this dict, so any key the file omits keeps the value
+# given here, and a warning is logged for each top-level section that the
+# file leaves out entirely.
+DEFAULT_CONFIG = {
+ "app": {
+ "title": "Crawl4AI API",
+ "version": "1.0.0",
+ # NOTE(review): "0.0.0.0" presumably means listen on all interfaces -- confirm
+ # this is the intended built-in default for deployments without a config.yml entry.
+ "host": "0.0.0.0",
+ "port": 11235,
+ "reload": False,
+ "workers": 1,
+ "timeout_keep_alive": 300,
+ },
+ "llm": {
+ # load_config() reads the LLM_PROVIDER environment variable after merging;
+ # presumably that value replaces this one when set -- verify in load_config().
+ "provider": "openai/gpt-4o-mini",
+ },
+ "redis": {
+ "host": "localhost",
+ "port": 6379,
+ "db": 0,
+ "password": "",
+ "task_ttl_seconds": 3600,
+ "ssl": False,
+ },
+ "rate_limiting": {
+ "enabled": True,
+ "default_limit": "1000/minute",
+ "trusted_proxies": [],
+ "storage_uri": "memory://",
+ },
+ # NOTE(review): security and JWT default to disabled and api_token is empty;
+ # presumably this mirrors the shipped config.yml -- confirm before relying on it.
+ "security": {
+ "enabled": False,
+ "jwt_enabled": False,
+ "api_token": "",
+ "https_redirect": False,
+ "trusted_hosts": ["*"],
+ "headers": {
+ "x_content_type_options": "nosniff",
+ "x_frame_options": "DENY",
+ "content_security_policy": "default-src 'self'",
+ "strict_transport_security": "max-age=63072000; includeSubDomains",
+ },
+ },
+ "crawler": {
+ "base_config": {"simulate_user": True},
+ "memory_threshold_percent": 95.0,
+ "rate_limiter": {"enabled": True, "base_delay": [1.0, 2.0]},
+ "timeouts": {"stream_init": 30.0, "batch_process": 300.0},
+ "pool": {"max_pages": 40, "idle_ttl_sec": 300},
+ "browser": {
+ "kwargs": {"headless": True, "text_mode": True},
+ # NOTE(review): --disable-web-security and --ignore-certificate-errors relax
+ # browser-side security checks; confirm they belong in the built-in defaults.
+ "extra_args": [
+ "--no-sandbox",
+ "--disable-dev-shm-usage",
+ "--disable-gpu",
+ "--disable-software-rasterizer",
+ "--disable-web-security",
+ "--allow-insecure-localhost",
+ "--ignore-certificate-errors",
+ ],
+ },
+ },
+ "logging": {
+ "level": "INFO",
+ "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+ },
+ "observability": {
+ "prometheus": {"enabled": True, "endpoint": "/metrics"},
+ "health_check": {"endpoint": "/health"},
+ },
+ "webhooks": {
+ "enabled": True,
+ "default_url": None,
+ "data_in_payload": False,
+ "retry": {
+ "max_attempts": 5,
+ "initial_delay_ms": 1000,
+ "max_delay_ms": 32000,
+ "timeout_ms": 30000,
+ },
+ "headers": {"User-Agent": "Crawl4AI-Webhook/1.0"},
+ },
+}
+
+
+def _deep_merge(base: dict, override: dict) -> dict:
+    """Recursively merge ``override`` into a copy of ``base``.
+
+    Nested dicts present on both sides are merged key by key; for every other
+    key the ``override`` value replaces the ``base`` value.
+
+    Args:
+        base: Mapping that supplies the fallback values. It is never mutated.
+        override: Mapping whose values take precedence. It is never mutated.
+
+    Returns:
+        A new dict that shares no mutable containers with either argument.
+    """
+    # Function-scope import so the module's import block does not need to change.
+    from copy import deepcopy
+
+    # A shallow base.copy() would leave nested dicts/lists shared with ``base``,
+    # so mutating the merged result later would silently edit the caller's defaults.
+    merged = deepcopy(base)
+    for key, value in override.items():
+        if isinstance(merged.get(key), dict) and isinstance(value, dict):
+            merged[key] = _deep_merge(merged[key], value)
+        else:
+            # Copy override values too, so the result does not alias ``override``.
+            merged[key] = deepcopy(value)
+    return merged
+
+
def load_config() -> Dict:
"""Load and return application configuration with environment variable overrides."""
config_path = Path(__file__).parent / "config.yml"
with open(config_path, "r") as config_file:
- config = yaml.safe_load(config_file)
+ user_config = yaml.safe_load(config_file) or {}
+
+ # Deep-merge user config on top of defaults so missing keys get safe values
+ config = _deep_merge(DEFAULT_CONFIG, user_config)
+
+ for section in DEFAULT_CONFIG:
+ if section not in user_config:
+ logging.warning(
+ f"Config section '{section}' missing from config.yml, using defaults"
+ )
# Override LLM provider from environment if set
llm_provider = os.environ.get("LLM_PROVIDER")
diff --git a/docs/examples/adaptive_crawling/export_import_kb.py b/docs/examples/adaptive_crawling/export_import_kb.py
index c0a72c2c..476eb700 100644
--- a/docs/examples/adaptive_crawling/export_import_kb.py
+++ b/docs/examples/adaptive_crawling/export_import_kb.py
@@ -114,7 +114,7 @@ async def import_and_continue():
# Import existing knowledge base
print(f"\n1. Importing knowledge base from {kb_path}")
- adaptive.import_knowledge_base(kb_path)
+ await adaptive.import_knowledge_base(kb_path)
print(f" - Imported {len(adaptive.state.knowledge_base)} documents")
print(f" - Existing URLs: {len(adaptive.state.crawled_urls)}")
@@ -175,10 +175,10 @@ async def share_knowledge_bases():
merged_crawler = AdaptiveCrawler(crawler)
# Import both knowledge bases
- merged_crawler.import_knowledge_base(project_a_kb)
+ await merged_crawler.import_knowledge_base(project_a_kb)
initial_size = len(merged_crawler.state.knowledge_base)
-
- merged_crawler.import_knowledge_base(project_b_kb)
+
+ await merged_crawler.import_knowledge_base(project_b_kb)
final_size = len(merged_crawler.state.knowledge_base)
print(f" - Project A documents: {initial_size}")
diff --git a/docs/md_v2/api/adaptive-crawler.md b/docs/md_v2/api/adaptive-crawler.md
index af92ee3a..5bd5bf44 100644
--- a/docs/md_v2/api/adaptive-crawler.md
+++ b/docs/md_v2/api/adaptive-crawler.md
@@ -161,7 +161,7 @@ adaptive.export_knowledge_base("my_knowledge.jsonl")
Import a previously exported knowledge base.
```python
-def import_knowledge_base(
+async def import_knowledge_base(
self,
path: Union[str, Path]
) -> None
diff --git a/docs/md_v2/complete-sdk-reference.md b/docs/md_v2/complete-sdk-reference.md
index 6fd974ea..aa0517b2 100644
--- a/docs/md_v2/complete-sdk-reference.md
+++ b/docs/md_v2/complete-sdk-reference.md
@@ -4128,7 +4128,7 @@ That's how you keep the config self-contained, illustrate **XPath** usage, and d
## 3. Advanced Schema & Nested Structures
### Sample E-Commerce HTML
```
-https://gist.githubusercontent.com/githubusercontent/2d7b8ba3cd8ab6cf3c8da771ddb36878/raw/1ae2f90c6861ce7dd84cc50d3df9920dee5e1fd2/sample_ecommerce.html
+https://raw.githubusercontent.com/unclecode/crawl4ai/main/docs/examples/sample_ecommerce.html
```
```python
schema = {
@@ -4253,7 +4253,7 @@ async def extract_ecommerce_data():
async with AsyncWebCrawler(verbose=True) as crawler:
result = await crawler.arun(
- url="https://gist.githubusercontent.com/githubusercontent/2d7b8ba3cd8ab6cf3c8da771ddb36878/raw/1ae2f90c6861ce7dd84cc50d3df9920dee5e1fd2/sample_ecommerce.html",
+ url="https://raw.githubusercontent.com/unclecode/crawl4ai/main/docs/examples/sample_ecommerce.html",
extraction_strategy=strategy,
config=config
)
diff --git a/docs/md_v2/core/adaptive-crawling.md b/docs/md_v2/core/adaptive-crawling.md
index 1a43c9f2..b3cf2672 100644
--- a/docs/md_v2/core/adaptive-crawling.md
+++ b/docs/md_v2/core/adaptive-crawling.md
@@ -274,7 +274,7 @@ adaptive.export_knowledge_base("knowledge_base.jsonl")
# Import into another session
new_adaptive = AdaptiveCrawler(crawler)
-new_adaptive.import_knowledge_base("knowledge_base.jsonl")
+await new_adaptive.import_knowledge_base("knowledge_base.jsonl")
```
## Best Practices
diff --git a/docs/md_v2/extraction/no-llm-strategies.md b/docs/md_v2/extraction/no-llm-strategies.md
index 318b5106..63138b30 100644
--- a/docs/md_v2/extraction/no-llm-strategies.md
+++ b/docs/md_v2/extraction/no-llm-strategies.md
@@ -191,7 +191,7 @@ Real sites often have **nested** or repeated data—like categories containing p
We have a **sample e-commerce** HTML file on GitHub (example):
```
-https://gist.githubusercontent.com/githubusercontent/2d7b8ba3cd8ab6cf3c8da771ddb36878/raw/1ae2f90c6861ce7dd84cc50d3df9920dee5e1fd2/sample_ecommerce.html
+https://raw.githubusercontent.com/unclecode/crawl4ai/main/docs/examples/sample_ecommerce.html
```
This snippet includes categories, products, features, reviews, and related items. Let's see how to define a schema that fully captures that structure **without LLM**.
@@ -323,7 +323,7 @@ async def extract_ecommerce_data():
async with AsyncWebCrawler(verbose=True) as crawler:
result = await crawler.arun(
- url="https://gist.githubusercontent.com/githubusercontent/2d7b8ba3cd8ab6cf3c8da771ddb36878/raw/1ae2f90c6861ce7dd84cc50d3df9920dee5e1fd2/sample_ecommerce.html",
+ url="https://raw.githubusercontent.com/unclecode/crawl4ai/main/docs/examples/sample_ecommerce.html",
extraction_strategy=strategy,
config=config
)
diff --git a/docs/releases_review/crawl4ai_v0_7_0_showcase.py b/docs/releases_review/crawl4ai_v0_7_0_showcase.py
index 29c056f0..d78af7f8 100644
--- a/docs/releases_review/crawl4ai_v0_7_0_showcase.py
+++ b/docs/releases_review/crawl4ai_v0_7_0_showcase.py
@@ -543,7 +543,7 @@ async def adaptive_crawling_demo(auto_mode=False):
adaptive2 = AdaptiveCrawler(crawler, export_config)
# Import the knowledge base
- adaptive2.import_knowledge_base(kb_export)
+ await adaptive2.import_knowledge_base(kb_export)
console.print(f"✓ Imported {len(adaptive2.state.knowledge_base)} documents")
console.print(f"✓ Starting confidence: {int(adaptive2.confidence * 100)}%")
diff --git a/tests/adaptive/test_embedding_strategy.py b/tests/adaptive/test_embedding_strategy.py
index 37433065..6e34b85e 100644
--- a/tests/adaptive/test_embedding_strategy.py
+++ b/tests/adaptive/test_embedding_strategy.py
@@ -233,7 +233,7 @@ async def test_knowledge_export_import():
crawler2 = AdaptiveCrawler(crawler=crawler, config=config)
console.print("\n[cyan]Importing knowledge base...[/cyan]")
- crawler2.import_knowledge_base(export_path)
+ await crawler2.import_knowledge_base(export_path)
# Continue with new query - should be faster
console.print("\n[cyan]Extending with new query...[/cyan]")