fix: batch fix for 10 open issues (#1520, #1489, #1374, #1424, #1183, #1354, #880, #1031, #1251, #1758)

- #1520: Preserve trailing slashes in URL normalization (RFC 3986 compliance)
- #1489: Preserve query parameter key casing in normalize_url
- #1374: Close NamedTemporaryFile handle before reopening (Windows fix)
- #1424: Fix CosineStrategy returning empty results (delimiter fallback + at_least_k >= 1)
- #1183: Fix extract_xml_data regex matching tag names in prose text
- #1354: Make import_knowledge_base async (fix asyncio.run in running loop)
- #880: Fix 404 sample_ecommerce.html gist URL in docs (6 occurrences)
- #1031: Make Docker playground code editor resizable with overflow-auto
- #1251: Add DEFAULT_CONFIG with deep-merge in load_config to prevent KeyError crashes
- #1758: Change screenshot stitching format from BMP to PNG
This commit is contained in:
unclecode
2026-03-07 09:47:38 +00:00
parent 0c9e3c427e
commit 3a75dd3f4c
15 changed files with 139 additions and 29 deletions

View File

@@ -1842,7 +1842,7 @@ class AdaptiveCrawler:
return export_dict
def import_knowledge_base(self, filepath: Union[str, Path], format: str = "jsonl") -> None:
async def import_knowledge_base(self, filepath: Union[str, Path], format: str = "jsonl") -> None:
"""Import a knowledge base from a file
Args:
@@ -1871,7 +1871,7 @@ class AdaptiveCrawler:
self.state.knowledge_base.extend(imported_results)
# Update state with imported data
asyncio.run(self.strategy.update_state(self.state, imported_results))
await self.strategy.update_state(self.state, imported_results)
print(f"Imported {len(imported_results)} documents from {filepath}")
else:

View File

@@ -1882,7 +1882,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
buffered = BytesIO()
stitched = stitched.convert("RGB")
stitched.save(buffered, format="BMP", quality=85)
stitched.save(buffered, format="PNG")
encoded = base64.b64encode(buffered.getvalue()).decode("utf-8")
return encoded

View File

@@ -296,7 +296,7 @@ class CosineStrategy(ExtractionStrategy):
return documents
if len(documents) < at_least_k:
at_least_k = len(documents) // 2
at_least_k = max(1, len(documents) // 2)
from sklearn.metrics.pairwise import cosine_similarity
@@ -451,7 +451,10 @@ class CosineStrategy(ExtractionStrategy):
"""
# Assume `html` is a list of text chunks for this strategy
t = time.time()
text_chunks = html.split(self.DEL) # Split by lines or paragraphs as needed
# Split by delimiter; fall back to double-newline splitting for raw text
text_chunks = html.split(self.DEL)
if len(text_chunks) == 1:
text_chunks = [chunk.strip() for chunk in html.split("\n\n") if chunk.strip()]
# Pre-filter documents using embeddings and semantic_filter
text_chunks = self.filter_documents_embeddings(

View File

@@ -145,6 +145,7 @@ class PDFContentScrapingStrategy(ContentScrapingStrategy):
# Create temp file with .pdf extension
temp_file = tempfile.NamedTemporaryFile(suffix='.pdf', delete=False)
temp_file.close() # Close handle immediately; file persists due to delete=False
self._temp_files.append(temp_file.name)
try:

View File

@@ -1697,7 +1697,7 @@ def extract_xml_data_legacy(tags, string):
data = {}
for tag in tags:
pattern = f"<{tag}>(.*?)</{tag}>"
pattern = f"<{tag}>((?:(?!<{tag}>).)*)</{tag}>"
match = re.search(pattern, string, re.DOTALL)
if match:
data[tag] = match.group(1).strip()
@@ -1726,7 +1726,7 @@ def extract_xml_data(tags, string):
data = {}
for tag in tags:
pattern = f"<{tag}>(.*?)</{tag}>"
pattern = f"<{tag}>((?:(?!<{tag}>).)*)</{tag}>"
matches = re.findall(pattern, string, re.DOTALL)
if matches:
@@ -2294,14 +2294,14 @@ def normalize_url(
# IMPORTANT: Don't use quote(unquote()) as it mangles + signs in URLs
# The path from urlparse is already properly encoded
path = parsed.path
if path.endswith('/') and path != '/':
path = path.rstrip('/')
# Preserve trailing slashes -- they are semantically significant per RFC 3986
# e.g. /page/9123/ and /page/9123 may return different responses
# ── query ──
query = parsed.query
if query:
# explode, mutate, then rebuild
params = [(k.lower(), v) for k, v in parse_qsl(query, keep_blank_values=True)]
params = [(k, v) for k, v in parse_qsl(query, keep_blank_values=True)]
if drop_query_tracking:
default_tracking = {
@@ -2310,7 +2310,7 @@ def normalize_url(
}
if extra_drop_params:
default_tracking |= {p.lower() for p in extra_drop_params}
params = [(k, v) for k, v in params if k not in default_tracking]
params = [(k, v) for k, v in params if k.lower() not in default_tracking]
if sort_query:
params.sort(key=lambda kv: kv[0])
@@ -2383,7 +2383,7 @@ def normalize_url_for_deep_crawl(href, base_url, preserve_https=False, original_
normalized = urlunparse((
parsed.scheme,
netloc,
parsed.path.rstrip('/'), # Normalize trailing slash
parsed.path or '/', # Preserve trailing slash
parsed.params,
query,
fragment
@@ -2422,7 +2422,7 @@ def efficient_normalize_url_for_deep_crawl(href, base_url, preserve_https=False,
normalized = urlunparse((
parsed.scheme,
parsed.netloc.lower(),
parsed.path.rstrip('/'),
parsed.path or '/', # Preserve trailing slash
parsed.params,
parsed.query,
'' # Remove fragment

View File

@@ -8589,7 +8589,7 @@ Real sites often have **nested** or repeated data—like categories containing p
We have a **sample e-commerce** HTML file on GitHub (example):
```
https://gist.githubusercontent.com/githubusercontent/2d7b8ba3cd8ab6cf3c8da771ddb36878/raw/1ae2f90c6861ce7dd84cc50d3df9920dee5e1fd2/sample_ecommerce.html
https://raw.githubusercontent.com/unclecode/crawl4ai/main/docs/examples/sample_ecommerce.html
```
This snippet includes categories, products, features, reviews, and related items. Let's see how to define a schema that fully captures that structure **without LLM**.
@@ -8721,7 +8721,7 @@ async def extract_ecommerce_data():
async with AsyncWebCrawler(verbose=True) as crawler:
result = await crawler.arun(
url="https://gist.githubusercontent.com/githubusercontent/2d7b8ba3cd8ab6cf3c8da771ddb36878/raw/1ae2f90c6861ce7dd84cc50d3df9920dee5e1fd2/sample_ecommerce.html",
url="https://raw.githubusercontent.com/unclecode/crawl4ai/main/docs/examples/sample_ecommerce.html",
extraction_strategy=strategy,
config=config
)

View File

@@ -128,6 +128,10 @@
opacity: 1;
}
#adv-editor .CodeMirror {
height: 100% !important;
}
/* copied text highlighted */
.highlighted {
background-color: rgba(78, 255, 255, 0.2) !important;
@@ -267,7 +271,7 @@
</div>
<!-- CodeMirror host -->
<div id="adv-editor" class="mt-2 border border-border rounded overflow-hidden h-40"></div>
<div id="adv-editor" class="mt-2 border border-border rounded overflow-auto" style="height: 160px; min-height: 160px; max-height: 500px; resize: vertical;"></div>
</details>
<div class="flex space-x-2">

View File

@@ -19,11 +19,113 @@ class FilterType(str, Enum):
BM25 = "bm25"
LLM = "llm"
# Built-in fallback configuration.
# load_config() deep-merges the parsed config.yml on top of this dict, so any
# section or key that the file omits falls back to the value defined here
# instead of raising KeyError later.
# Treat this as read-only: it is module-level state shared by every
# load_config() call.
DEFAULT_CONFIG = {
"app": {
"title": "Crawl4AI API",
"version": "1.0.0",
# NOTE(review): "0.0.0.0" presumably binds on all interfaces — confirm this
# is the intended default outside containers.
"host": "0.0.0.0",
"port": 11235,
"reload": False,
"workers": 1,
"timeout_keep_alive": 300,
},
"llm": {
# load_config() may override this from the LLM_PROVIDER environment variable.
"provider": "openai/gpt-4o-mini",
},
"redis": {
"host": "localhost",
"port": 6379,
"db": 0,
"password": "",
"task_ttl_seconds": 3600,
"ssl": False,
},
"rate_limiting": {
"enabled": True,
"default_limit": "1000/minute",
"trusted_proxies": [],
"storage_uri": "memory://",
},
"security": {
"enabled": False,
"jwt_enabled": False,
"api_token": "",
"https_redirect": False,
"trusted_hosts": ["*"],
"headers": {
"x_content_type_options": "nosniff",
"x_frame_options": "DENY",
"content_security_policy": "default-src 'self'",
"strict_transport_security": "max-age=63072000; includeSubDomains",
},
},
"crawler": {
"base_config": {"simulate_user": True},
"memory_threshold_percent": 95.0,
"rate_limiter": {"enabled": True, "base_delay": [1.0, 2.0]},
"timeouts": {"stream_init": 30.0, "batch_process": 300.0},
"pool": {"max_pages": 40, "idle_ttl_sec": 300},
"browser": {
"kwargs": {"headless": True, "text_mode": True},
# NOTE(review): several of these browser flags appear to relax security
# checks (web security, certificate errors) — confirm they are acceptable
# as defaults when config.yml omits this section.
"extra_args": [
"--no-sandbox",
"--disable-dev-shm-usage",
"--disable-gpu",
"--disable-software-rasterizer",
"--disable-web-security",
"--allow-insecure-localhost",
"--ignore-certificate-errors",
],
},
},
"logging": {
"level": "INFO",
"format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s",
},
"observability": {
"prometheus": {"enabled": True, "endpoint": "/metrics"},
"health_check": {"endpoint": "/health"},
},
"webhooks": {
"enabled": True,
"default_url": None,
"data_in_payload": False,
"retry": {
"max_attempts": 5,
"initial_delay_ms": 1000,
"max_delay_ms": 32000,
"timeout_ms": 30000,
},
"headers": {"User-Agent": "Crawl4AI-Webhook/1.0"},
},
}
def _deep_merge(base: dict, override: dict) -> dict:
"""Recursively merge override into base. Override values take precedence."""
merged = base.copy()
for key, value in override.items():
if key in merged and isinstance(merged[key], dict) and isinstance(value, dict):
merged[key] = _deep_merge(merged[key], value)
else:
merged[key] = value
return merged
def load_config() -> Dict:
"""Load and return application configuration with environment variable overrides."""
config_path = Path(__file__).parent / "config.yml"
with open(config_path, "r") as config_file:
config = yaml.safe_load(config_file)
user_config = yaml.safe_load(config_file) or {}
# Deep-merge user config on top of defaults so missing keys get safe values
config = _deep_merge(DEFAULT_CONFIG, user_config)
for section in DEFAULT_CONFIG:
if section not in user_config:
logging.warning(
f"Config section '{section}' missing from config.yml, using defaults"
)
# Override LLM provider from environment if set
llm_provider = os.environ.get("LLM_PROVIDER")

View File

@@ -114,7 +114,7 @@ async def import_and_continue():
# Import existing knowledge base
print(f"\n1. Importing knowledge base from {kb_path}")
adaptive.import_knowledge_base(kb_path)
await adaptive.import_knowledge_base(kb_path)
print(f" - Imported {len(adaptive.state.knowledge_base)} documents")
print(f" - Existing URLs: {len(adaptive.state.crawled_urls)}")
@@ -175,10 +175,10 @@ async def share_knowledge_bases():
merged_crawler = AdaptiveCrawler(crawler)
# Import both knowledge bases
merged_crawler.import_knowledge_base(project_a_kb)
await merged_crawler.import_knowledge_base(project_a_kb)
initial_size = len(merged_crawler.state.knowledge_base)
merged_crawler.import_knowledge_base(project_b_kb)
await merged_crawler.import_knowledge_base(project_b_kb)
final_size = len(merged_crawler.state.knowledge_base)
print(f" - Project A documents: {initial_size}")

View File

@@ -161,7 +161,7 @@ adaptive.export_knowledge_base("my_knowledge.jsonl")
Import a previously exported knowledge base.
```python
def import_knowledge_base(
async def import_knowledge_base(
self,
path: Union[str, Path]
) -> None

View File

@@ -4128,7 +4128,7 @@ That's how you keep the config self-contained, illustrate **XPath** usage, and d
## 3. Advanced Schema & Nested Structures
### Sample E-Commerce HTML
```
https://gist.githubusercontent.com/githubusercontent/2d7b8ba3cd8ab6cf3c8da771ddb36878/raw/1ae2f90c6861ce7dd84cc50d3df9920dee5e1fd2/sample_ecommerce.html
https://raw.githubusercontent.com/unclecode/crawl4ai/main/docs/examples/sample_ecommerce.html
```
```python
schema = {
@@ -4253,7 +4253,7 @@ async def extract_ecommerce_data():
async with AsyncWebCrawler(verbose=True) as crawler:
result = await crawler.arun(
url="https://gist.githubusercontent.com/githubusercontent/2d7b8ba3cd8ab6cf3c8da771ddb36878/raw/1ae2f90c6861ce7dd84cc50d3df9920dee5e1fd2/sample_ecommerce.html",
url="https://raw.githubusercontent.com/unclecode/crawl4ai/main/docs/examples/sample_ecommerce.html",
extraction_strategy=strategy,
config=config
)

View File

@@ -274,7 +274,7 @@ adaptive.export_knowledge_base("knowledge_base.jsonl")
# Import into another session
new_adaptive = AdaptiveCrawler(crawler)
new_adaptive.import_knowledge_base("knowledge_base.jsonl")
await new_adaptive.import_knowledge_base("knowledge_base.jsonl")
```
## Best Practices

View File

@@ -191,7 +191,7 @@ Real sites often have **nested** or repeated data—like categories containing p
We have a **sample e-commerce** HTML file on GitHub (example):
```
https://gist.githubusercontent.com/githubusercontent/2d7b8ba3cd8ab6cf3c8da771ddb36878/raw/1ae2f90c6861ce7dd84cc50d3df9920dee5e1fd2/sample_ecommerce.html
https://raw.githubusercontent.com/unclecode/crawl4ai/main/docs/examples/sample_ecommerce.html
```
This snippet includes categories, products, features, reviews, and related items. Let's see how to define a schema that fully captures that structure **without LLM**.
@@ -323,7 +323,7 @@ async def extract_ecommerce_data():
async with AsyncWebCrawler(verbose=True) as crawler:
result = await crawler.arun(
url="https://gist.githubusercontent.com/githubusercontent/2d7b8ba3cd8ab6cf3c8da771ddb36878/raw/1ae2f90c6861ce7dd84cc50d3df9920dee5e1fd2/sample_ecommerce.html",
url="https://raw.githubusercontent.com/unclecode/crawl4ai/main/docs/examples/sample_ecommerce.html",
extraction_strategy=strategy,
config=config
)

View File

@@ -543,7 +543,7 @@ async def adaptive_crawling_demo(auto_mode=False):
adaptive2 = AdaptiveCrawler(crawler, export_config)
# Import the knowledge base
adaptive2.import_knowledge_base(kb_export)
await adaptive2.import_knowledge_base(kb_export)
console.print(f"✓ Imported {len(adaptive2.state.knowledge_base)} documents")
console.print(f"✓ Starting confidence: {int(adaptive2.confidence * 100)}%")

View File

@@ -233,7 +233,7 @@ async def test_knowledge_export_import():
crawler2 = AdaptiveCrawler(crawler=crawler, config=config)
console.print("\n[cyan]Importing knowledge base...[/cyan]")
crawler2.import_knowledge_base(export_path)
await crawler2.import_knowledge_base(export_path)
# Continue with new query - should be faster
console.print("\n[cyan]Extending with new query...[/cyan]")