diff --git a/crawl4ai/adaptive_crawler.py b/crawl4ai/adaptive_crawler.py index 5611d820..6aa1d3c2 100644 --- a/crawl4ai/adaptive_crawler.py +++ b/crawl4ai/adaptive_crawler.py @@ -1842,7 +1842,7 @@ class AdaptiveCrawler: return export_dict - def import_knowledge_base(self, filepath: Union[str, Path], format: str = "jsonl") -> None: + async def import_knowledge_base(self, filepath: Union[str, Path], format: str = "jsonl") -> None: """Import a knowledge base from a file Args: @@ -1871,7 +1871,7 @@ class AdaptiveCrawler: self.state.knowledge_base.extend(imported_results) # Update state with imported data - asyncio.run(self.strategy.update_state(self.state, imported_results)) + await self.strategy.update_state(self.state, imported_results) print(f"Imported {len(imported_results)} documents from {filepath}") else: diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index dcc7130c..5ec368f1 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -1882,7 +1882,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): buffered = BytesIO() stitched = stitched.convert("RGB") - stitched.save(buffered, format="BMP", quality=85) + stitched.save(buffered, format="PNG") encoded = base64.b64encode(buffered.getvalue()).decode("utf-8") return encoded diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py index c50916f1..a3156016 100644 --- a/crawl4ai/extraction_strategy.py +++ b/crawl4ai/extraction_strategy.py @@ -296,7 +296,7 @@ class CosineStrategy(ExtractionStrategy): return documents if len(documents) < at_least_k: - at_least_k = len(documents) // 2 + at_least_k = max(1, len(documents) // 2) from sklearn.metrics.pairwise import cosine_similarity @@ -451,7 +451,10 @@ class CosineStrategy(ExtractionStrategy): """ # Assume `html` is a list of text chunks for this strategy t = time.time() - text_chunks = html.split(self.DEL) # Split by lines or paragraphs as needed + # Split by delimiter; fall back to double-newline splitting for raw text + text_chunks = html.split(self.DEL) + if len(text_chunks) == 1: + text_chunks = [chunk.strip() for chunk in html.split("\n\n") if chunk.strip()] # Pre-filter documents using embeddings and semantic_filter text_chunks = self.filter_documents_embeddings( diff --git a/crawl4ai/processors/pdf/__init__.py b/crawl4ai/processors/pdf/__init__.py index a6627f13..69a6f75a 100644 --- a/crawl4ai/processors/pdf/__init__.py +++ b/crawl4ai/processors/pdf/__init__.py @@ -145,6 +145,7 @@ class PDFContentScrapingStrategy(ContentScrapingStrategy): # Create temp file with .pdf extension temp_file = tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) + temp_file.close() # Close handle immediately; file persists due to delete=False self._temp_files.append(temp_file.name) try: diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index ec68c47b..4b3d9690 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -1697,7 +1697,7 @@ def extract_xml_data_legacy(tags, string): data = {} for tag in tags: - pattern = f"<{tag}>(.*?)" + pattern = f"<{tag}>((?:(?!<{tag}>).)*)" match = re.search(pattern, string, re.DOTALL) if match: data[tag] = match.group(1).strip() @@ -1726,7 +1726,7 @@ def extract_xml_data(tags, string): data = {} for tag in tags: - pattern = f"<{tag}>(.*?)" + pattern = f"<{tag}>((?:(?!<{tag}>).)*)" matches = re.findall(pattern, string, re.DOTALL) if matches: @@ -2294,14 +2294,14 @@ def normalize_url( # IMPORTANT: Don't use quote(unquote()) as it mangles + signs in URLs # The path from urlparse is already properly encoded path = parsed.path - if path.endswith('/') and path != '/': - path = path.rstrip('/') + # Preserve trailing slashes -- they are semantically significant per RFC 3986 + # e.g. /page/9123/ and /page/9123 may return different responses # ── query ── query = parsed.query if query: # explode, mutate, then rebuild - params = [(k.lower(), v) for k, v in parse_qsl(query, keep_blank_values=True)] + params = [(k, v) for k, v in parse_qsl(query, keep_blank_values=True)] if drop_query_tracking: default_tracking = { @@ -2310,7 +2310,7 @@ def normalize_url( } if extra_drop_params: default_tracking |= {p.lower() for p in extra_drop_params} - params = [(k, v) for k, v in params if k not in default_tracking] + params = [(k, v) for k, v in params if k.lower() not in default_tracking] if sort_query: params.sort(key=lambda kv: kv[0]) @@ -2383,7 +2383,7 @@ def normalize_url_for_deep_crawl(href, base_url, preserve_https=False, original_ normalized = urlunparse(( parsed.scheme, netloc, - parsed.path.rstrip('/'), # Normalize trailing slash + parsed.path or '/', # Preserve trailing slash parsed.params, query, fragment @@ -2422,7 +2422,7 @@ def efficient_normalize_url_for_deep_crawl(href, base_url, preserve_https=False, normalized = urlunparse(( parsed.scheme, parsed.netloc.lower(), - parsed.path.rstrip('/'), + parsed.path or '/', # Preserve trailing slash parsed.params, parsed.query, '' # Remove fragment diff --git a/deploy/docker/c4ai-doc-context.md b/deploy/docker/c4ai-doc-context.md index abfd3637..0120e1b6 100644 --- a/deploy/docker/c4ai-doc-context.md +++ b/deploy/docker/c4ai-doc-context.md @@ -8589,7 +8589,7 @@ Real sites often have **nested** or repeated data—like categories containing p We have a **sample e-commerce** HTML file on GitHub (example): ``` -https://gist.githubusercontent.com/githubusercontent/2d7b8ba3cd8ab6cf3c8da771ddb36878/raw/1ae2f90c6861ce7dd84cc50d3df9920dee5e1fd2/sample_ecommerce.html +https://raw.githubusercontent.com/unclecode/crawl4ai/main/docs/examples/sample_ecommerce.html ``` This snippet includes categories, products, features, reviews, and related items. Let’s see how to define a schema that fully captures that structure **without LLM**. @@ -8721,7 +8721,7 @@ async def extract_ecommerce_data(): async with AsyncWebCrawler(verbose=True) as crawler: result = await crawler.arun( - url="https://gist.githubusercontent.com/githubusercontent/2d7b8ba3cd8ab6cf3c8da771ddb36878/raw/1ae2f90c6861ce7dd84cc50d3df9920dee5e1fd2/sample_ecommerce.html", + url="https://raw.githubusercontent.com/unclecode/crawl4ai/main/docs/examples/sample_ecommerce.html", extraction_strategy=strategy, config=config ) diff --git a/deploy/docker/static/playground/index.html b/deploy/docker/static/playground/index.html index 510a6620..e8c55037 100644 --- a/deploy/docker/static/playground/index.html +++ b/deploy/docker/static/playground/index.html @@ -128,6 +128,10 @@ opacity: 1; } + #adv-editor .CodeMirror { + height: 100% !important; + } + /* copid text highlighted */ .highlighted { background-color: rgba(78, 255, 255, 0.2) !important; @@ -267,7 +271,7 @@ -
+
diff --git a/deploy/docker/utils.py b/deploy/docker/utils.py index de44852b..585d8941 100644 --- a/deploy/docker/utils.py +++ b/deploy/docker/utils.py @@ -19,11 +19,113 @@ class FilterType(str, Enum): BM25 = "bm25" LLM = "llm" +DEFAULT_CONFIG = { + "app": { + "title": "Crawl4AI API", + "version": "1.0.0", + "host": "0.0.0.0", + "port": 11235, + "reload": False, + "workers": 1, + "timeout_keep_alive": 300, + }, + "llm": { + "provider": "openai/gpt-4o-mini", + }, + "redis": { + "host": "localhost", + "port": 6379, + "db": 0, + "password": "", + "task_ttl_seconds": 3600, + "ssl": False, + }, + "rate_limiting": { + "enabled": True, + "default_limit": "1000/minute", + "trusted_proxies": [], + "storage_uri": "memory://", + }, + "security": { + "enabled": False, + "jwt_enabled": False, + "api_token": "", + "https_redirect": False, + "trusted_hosts": ["*"], + "headers": { + "x_content_type_options": "nosniff", + "x_frame_options": "DENY", + "content_security_policy": "default-src 'self'", + "strict_transport_security": "max-age=63072000; includeSubDomains", + }, + }, + "crawler": { + "base_config": {"simulate_user": True}, + "memory_threshold_percent": 95.0, + "rate_limiter": {"enabled": True, "base_delay": [1.0, 2.0]}, + "timeouts": {"stream_init": 30.0, "batch_process": 300.0}, + "pool": {"max_pages": 40, "idle_ttl_sec": 300}, + "browser": { + "kwargs": {"headless": True, "text_mode": True}, + "extra_args": [ + "--no-sandbox", + "--disable-dev-shm-usage", + "--disable-gpu", + "--disable-software-rasterizer", + "--disable-web-security", + "--allow-insecure-localhost", + "--ignore-certificate-errors", + ], + }, + }, + "logging": { + "level": "INFO", + "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s", + }, + "observability": { + "prometheus": {"enabled": True, "endpoint": "/metrics"}, + "health_check": {"endpoint": "/health"}, + }, + "webhooks": { + "enabled": True, + "default_url": None, + "data_in_payload": False, + "retry": { + "max_attempts": 5, + "initial_delay_ms": 1000, + "max_delay_ms": 32000, + "timeout_ms": 30000, + }, + "headers": {"User-Agent": "Crawl4AI-Webhook/1.0"}, + }, +} + + +def _deep_merge(base: dict, override: dict) -> dict: + """Recursively merge override into base. Override values take precedence.""" + merged = base.copy() + for key, value in override.items(): + if key in merged and isinstance(merged[key], dict) and isinstance(value, dict): + merged[key] = _deep_merge(merged[key], value) + else: + merged[key] = value + return merged + + def load_config() -> Dict: """Load and return application configuration with environment variable overrides.""" config_path = Path(__file__).parent / "config.yml" with open(config_path, "r") as config_file: - config = yaml.safe_load(config_file) + user_config = yaml.safe_load(config_file) or {} + + # Deep-merge user config on top of defaults so missing keys get safe values + config = _deep_merge(DEFAULT_CONFIG, user_config) + + for section in DEFAULT_CONFIG: + if section not in user_config: + logging.warning( + f"Config section '{section}' missing from config.yml, using defaults" + ) # Override LLM provider from environment if set llm_provider = os.environ.get("LLM_PROVIDER") diff --git a/docs/examples/adaptive_crawling/export_import_kb.py b/docs/examples/adaptive_crawling/export_import_kb.py index c0a72c2c..476eb700 100644 --- a/docs/examples/adaptive_crawling/export_import_kb.py +++ b/docs/examples/adaptive_crawling/export_import_kb.py @@ -114,7 +114,7 @@ async def import_and_continue(): # Import existing knowledge base print(f"\n1. Importing knowledge base from {kb_path}") - adaptive.import_knowledge_base(kb_path) + await adaptive.import_knowledge_base(kb_path) print(f" - Imported {len(adaptive.state.knowledge_base)} documents") print(f" - Existing URLs: {len(adaptive.state.crawled_urls)}") @@ -175,10 +175,10 @@ async def share_knowledge_bases(): merged_crawler = AdaptiveCrawler(crawler) # Import both knowledge bases - merged_crawler.import_knowledge_base(project_a_kb) + await merged_crawler.import_knowledge_base(project_a_kb) initial_size = len(merged_crawler.state.knowledge_base) - - merged_crawler.import_knowledge_base(project_b_kb) + + await merged_crawler.import_knowledge_base(project_b_kb) final_size = len(merged_crawler.state.knowledge_base) print(f" - Project A documents: {initial_size}") diff --git a/docs/md_v2/api/adaptive-crawler.md b/docs/md_v2/api/adaptive-crawler.md index af92ee3a..5bd5bf44 100644 --- a/docs/md_v2/api/adaptive-crawler.md +++ b/docs/md_v2/api/adaptive-crawler.md @@ -161,7 +161,7 @@ adaptive.export_knowledge_base("my_knowledge.jsonl") Import a previously exported knowledge base. ```python -def import_knowledge_base( +async def import_knowledge_base( self, path: Union[str, Path] ) -> None diff --git a/docs/md_v2/complete-sdk-reference.md b/docs/md_v2/complete-sdk-reference.md index 6fd974ea..aa0517b2 100644 --- a/docs/md_v2/complete-sdk-reference.md +++ b/docs/md_v2/complete-sdk-reference.md @@ -4128,7 +4128,7 @@ That's how you keep the config self-contained, illustrate **XPath** usage, and d ## 3. Advanced Schema & Nested Structures ### Sample E-Commerce HTML ``` -https://gist.githubusercontent.com/githubusercontent/2d7b8ba3cd8ab6cf3c8da771ddb36878/raw/1ae2f90c6861ce7dd84cc50d3df9920dee5e1fd2/sample_ecommerce.html +https://raw.githubusercontent.com/unclecode/crawl4ai/main/docs/examples/sample_ecommerce.html ``` ```python schema = { @@ -4253,7 +4253,7 @@ async def extract_ecommerce_data(): async with AsyncWebCrawler(verbose=True) as crawler: result = await crawler.arun( - url="https://gist.githubusercontent.com/githubusercontent/2d7b8ba3cd8ab6cf3c8da771ddb36878/raw/1ae2f90c6861ce7dd84cc50d3df9920dee5e1fd2/sample_ecommerce.html", + url="https://raw.githubusercontent.com/unclecode/crawl4ai/main/docs/examples/sample_ecommerce.html", extraction_strategy=strategy, config=config ) diff --git a/docs/md_v2/core/adaptive-crawling.md b/docs/md_v2/core/adaptive-crawling.md index 1a43c9f2..b3cf2672 100644 --- a/docs/md_v2/core/adaptive-crawling.md +++ b/docs/md_v2/core/adaptive-crawling.md @@ -274,7 +274,7 @@ adaptive.export_knowledge_base("knowledge_base.jsonl") # Import into another session new_adaptive = AdaptiveCrawler(crawler) -new_adaptive.import_knowledge_base("knowledge_base.jsonl") +await new_adaptive.import_knowledge_base("knowledge_base.jsonl") ``` ## Best Practices diff --git a/docs/md_v2/extraction/no-llm-strategies.md b/docs/md_v2/extraction/no-llm-strategies.md index 318b5106..63138b30 100644 --- a/docs/md_v2/extraction/no-llm-strategies.md +++ b/docs/md_v2/extraction/no-llm-strategies.md @@ -191,7 +191,7 @@ Real sites often have **nested** or repeated data—like categories containing p We have a **sample e-commerce** HTML file on GitHub (example): ``` -https://gist.githubusercontent.com/githubusercontent/2d7b8ba3cd8ab6cf3c8da771ddb36878/raw/1ae2f90c6861ce7dd84cc50d3df9920dee5e1fd2/sample_ecommerce.html +https://raw.githubusercontent.com/unclecode/crawl4ai/main/docs/examples/sample_ecommerce.html ``` This snippet includes categories, products, features, reviews, and related items. Let's see how to define a schema that fully captures that structure **without LLM**. @@ -323,7 +323,7 @@ async def extract_ecommerce_data(): async with AsyncWebCrawler(verbose=True) as crawler: result = await crawler.arun( - url="https://gist.githubusercontent.com/githubusercontent/2d7b8ba3cd8ab6cf3c8da771ddb36878/raw/1ae2f90c6861ce7dd84cc50d3df9920dee5e1fd2/sample_ecommerce.html", + url="https://raw.githubusercontent.com/unclecode/crawl4ai/main/docs/examples/sample_ecommerce.html", extraction_strategy=strategy, config=config ) diff --git a/docs/releases_review/crawl4ai_v0_7_0_showcase.py b/docs/releases_review/crawl4ai_v0_7_0_showcase.py index 29c056f0..d78af7f8 100644 --- a/docs/releases_review/crawl4ai_v0_7_0_showcase.py +++ b/docs/releases_review/crawl4ai_v0_7_0_showcase.py @@ -543,7 +543,7 @@ async def adaptive_crawling_demo(auto_mode=False): adaptive2 = AdaptiveCrawler(crawler, export_config) # Import the knowledge base - adaptive2.import_knowledge_base(kb_export) + await adaptive2.import_knowledge_base(kb_export) console.print(f"✓ Imported {len(adaptive2.state.knowledge_base)} documents") console.print(f"✓ Starting confidence: {int(adaptive2.confidence * 100)}%") diff --git a/tests/adaptive/test_embedding_strategy.py b/tests/adaptive/test_embedding_strategy.py index 37433065..6e34b85e 100644 --- a/tests/adaptive/test_embedding_strategy.py +++ b/tests/adaptive/test_embedding_strategy.py @@ -233,7 +233,7 @@ async def test_knowledge_export_import(): crawler2 = AdaptiveCrawler(crawler=crawler, config=config) console.print("\n[cyan]Importing knowledge base...[/cyan]") - crawler2.import_knowledge_base(export_path) + await crawler2.import_knowledge_base(export_path) # Continue with new query - should be faster console.print("\n[cyan]Extending with new query...[/cyan]")