mirror of
https://github.com/unclecode/crawl4ai.git
synced 2026-06-10 15:58:15 +00:00
fix: batch fix for 10 open issues (#1520, #1489, #1374, #1424, #1183, #1354, #880, #1031, #1251, #1758)
- #1520: Preserve trailing slashes in URL normalization (RFC 3986 compliance)
- #1489: Preserve query parameter key casing in normalize_url
- #1374: Close NamedTemporaryFile handle before reopening (Windows fix)
- #1424: Fix CosineStrategy returning empty results (delimiter fallback + at_least_k >= 1)
- #1183: Fix extract_xml_data regex matching tag names in prose text
- #1354: Make import_knowledge_base async (fix asyncio.run in running loop)
- #880: Fix 404 sample_ecommerce.html gist URL in docs (6 occurrences)
- #1031: Make Docker playground code editor resizable with overflow-auto
- #1251: Add DEFAULT_CONFIG with deep-merge in load_config to prevent KeyError crashes
- #1758: Change screenshot stitching format from BMP to PNG
This commit is contained in:
@@ -1842,7 +1842,7 @@ class AdaptiveCrawler:
|
||||
|
||||
return export_dict
|
||||
|
||||
def import_knowledge_base(self, filepath: Union[str, Path], format: str = "jsonl") -> None:
|
||||
async def import_knowledge_base(self, filepath: Union[str, Path], format: str = "jsonl") -> None:
|
||||
"""Import a knowledge base from a file
|
||||
|
||||
Args:
|
||||
@@ -1871,7 +1871,7 @@ class AdaptiveCrawler:
|
||||
self.state.knowledge_base.extend(imported_results)
|
||||
|
||||
# Update state with imported data
|
||||
asyncio.run(self.strategy.update_state(self.state, imported_results))
|
||||
await self.strategy.update_state(self.state, imported_results)
|
||||
|
||||
print(f"Imported {len(imported_results)} documents from {filepath}")
|
||||
else:
|
||||
|
||||
@@ -1882,7 +1882,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
|
||||
buffered = BytesIO()
|
||||
stitched = stitched.convert("RGB")
|
||||
stitched.save(buffered, format="BMP", quality=85)
|
||||
stitched.save(buffered, format="PNG")
|
||||
encoded = base64.b64encode(buffered.getvalue()).decode("utf-8")
|
||||
|
||||
return encoded
|
||||
|
||||
@@ -296,7 +296,7 @@ class CosineStrategy(ExtractionStrategy):
|
||||
return documents
|
||||
|
||||
if len(documents) < at_least_k:
|
||||
at_least_k = len(documents) // 2
|
||||
at_least_k = max(1, len(documents) // 2)
|
||||
|
||||
from sklearn.metrics.pairwise import cosine_similarity
|
||||
|
||||
@@ -451,7 +451,10 @@ class CosineStrategy(ExtractionStrategy):
|
||||
"""
|
||||
# Assume `html` is a list of text chunks for this strategy
|
||||
t = time.time()
|
||||
text_chunks = html.split(self.DEL) # Split by lines or paragraphs as needed
|
||||
# Split by delimiter; fall back to double-newline splitting for raw text
|
||||
text_chunks = html.split(self.DEL)
|
||||
if len(text_chunks) == 1:
|
||||
text_chunks = [chunk.strip() for chunk in html.split("\n\n") if chunk.strip()]
|
||||
|
||||
# Pre-filter documents using embeddings and semantic_filter
|
||||
text_chunks = self.filter_documents_embeddings(
|
||||
|
||||
@@ -145,6 +145,7 @@ class PDFContentScrapingStrategy(ContentScrapingStrategy):
|
||||
|
||||
# Create temp file with .pdf extension
|
||||
temp_file = tempfile.NamedTemporaryFile(suffix='.pdf', delete=False)
|
||||
temp_file.close() # Close handle immediately; file persists due to delete=False
|
||||
self._temp_files.append(temp_file.name)
|
||||
|
||||
try:
|
||||
|
||||
@@ -1697,7 +1697,7 @@ def extract_xml_data_legacy(tags, string):
|
||||
data = {}
|
||||
|
||||
for tag in tags:
|
||||
pattern = f"<{tag}>(.*?)</{tag}>"
|
||||
pattern = f"<{tag}>((?:(?!<{tag}>).)*)</{tag}>"
|
||||
match = re.search(pattern, string, re.DOTALL)
|
||||
if match:
|
||||
data[tag] = match.group(1).strip()
|
||||
@@ -1726,7 +1726,7 @@ def extract_xml_data(tags, string):
|
||||
data = {}
|
||||
|
||||
for tag in tags:
|
||||
pattern = f"<{tag}>(.*?)</{tag}>"
|
||||
pattern = f"<{tag}>((?:(?!<{tag}>).)*)</{tag}>"
|
||||
matches = re.findall(pattern, string, re.DOTALL)
|
||||
|
||||
if matches:
|
||||
@@ -2294,14 +2294,14 @@ def normalize_url(
|
||||
# IMPORTANT: Don't use quote(unquote()) as it mangles + signs in URLs
|
||||
# The path from urlparse is already properly encoded
|
||||
path = parsed.path
|
||||
if path.endswith('/') and path != '/':
|
||||
path = path.rstrip('/')
|
||||
# Preserve trailing slashes -- they are semantically significant per RFC 3986
|
||||
# e.g. /page/9123/ and /page/9123 may return different responses
|
||||
|
||||
# ── query ──
|
||||
query = parsed.query
|
||||
if query:
|
||||
# explode, mutate, then rebuild
|
||||
params = [(k.lower(), v) for k, v in parse_qsl(query, keep_blank_values=True)]
|
||||
params = [(k, v) for k, v in parse_qsl(query, keep_blank_values=True)]
|
||||
|
||||
if drop_query_tracking:
|
||||
default_tracking = {
|
||||
@@ -2310,7 +2310,7 @@ def normalize_url(
|
||||
}
|
||||
if extra_drop_params:
|
||||
default_tracking |= {p.lower() for p in extra_drop_params}
|
||||
params = [(k, v) for k, v in params if k not in default_tracking]
|
||||
params = [(k, v) for k, v in params if k.lower() not in default_tracking]
|
||||
|
||||
if sort_query:
|
||||
params.sort(key=lambda kv: kv[0])
|
||||
@@ -2383,7 +2383,7 @@ def normalize_url_for_deep_crawl(href, base_url, preserve_https=False, original_
|
||||
normalized = urlunparse((
|
||||
parsed.scheme,
|
||||
netloc,
|
||||
parsed.path.rstrip('/'), # Normalize trailing slash
|
||||
parsed.path or '/', # Preserve trailing slash
|
||||
parsed.params,
|
||||
query,
|
||||
fragment
|
||||
@@ -2422,7 +2422,7 @@ def efficient_normalize_url_for_deep_crawl(href, base_url, preserve_https=False,
|
||||
normalized = urlunparse((
|
||||
parsed.scheme,
|
||||
parsed.netloc.lower(),
|
||||
parsed.path.rstrip('/'),
|
||||
parsed.path or '/', # Preserve trailing slash
|
||||
parsed.params,
|
||||
parsed.query,
|
||||
'' # Remove fragment
|
||||
|
||||
@@ -8589,7 +8589,7 @@ Real sites often have **nested** or repeated data—like categories containing p
|
||||
|
||||
We have a **sample e-commerce** HTML file on GitHub (example):
|
||||
```
|
||||
https://gist.githubusercontent.com/githubusercontent/2d7b8ba3cd8ab6cf3c8da771ddb36878/raw/1ae2f90c6861ce7dd84cc50d3df9920dee5e1fd2/sample_ecommerce.html
|
||||
https://raw.githubusercontent.com/unclecode/crawl4ai/main/docs/examples/sample_ecommerce.html
|
||||
```
|
||||
This snippet includes categories, products, features, reviews, and related items. Let’s see how to define a schema that fully captures that structure **without LLM**.
|
||||
|
||||
@@ -8721,7 +8721,7 @@ async def extract_ecommerce_data():
|
||||
|
||||
async with AsyncWebCrawler(verbose=True) as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://gist.githubusercontent.com/githubusercontent/2d7b8ba3cd8ab6cf3c8da771ddb36878/raw/1ae2f90c6861ce7dd84cc50d3df9920dee5e1fd2/sample_ecommerce.html",
|
||||
url="https://raw.githubusercontent.com/unclecode/crawl4ai/main/docs/examples/sample_ecommerce.html",
|
||||
extraction_strategy=strategy,
|
||||
config=config
|
||||
)
|
||||
|
||||
@@ -128,6 +128,10 @@
|
||||
opacity: 1;
|
||||
}
|
||||
|
||||
#adv-editor .CodeMirror {
|
||||
height: 100% !important;
|
||||
}
|
||||
|
||||
/* copied text highlighted */
|
||||
.highlighted {
|
||||
background-color: rgba(78, 255, 255, 0.2) !important;
|
||||
@@ -267,7 +271,7 @@
|
||||
</div>
|
||||
|
||||
<!-- CodeMirror host -->
|
||||
<div id="adv-editor" class="mt-2 border border-border rounded overflow-hidden h-40"></div>
|
||||
<div id="adv-editor" class="mt-2 border border-border rounded overflow-auto" style="height: 160px; min-height: 160px; max-height: 500px; resize: vertical;"></div>
|
||||
</details>
|
||||
|
||||
<div class="flex space-x-2">
|
||||
|
||||
@@ -19,11 +19,113 @@ class FilterType(str, Enum):
|
||||
BM25 = "bm25"
|
||||
LLM = "llm"
|
||||
|
||||
DEFAULT_CONFIG = {
|
||||
"app": {
|
||||
"title": "Crawl4AI API",
|
||||
"version": "1.0.0",
|
||||
"host": "0.0.0.0",
|
||||
"port": 11235,
|
||||
"reload": False,
|
||||
"workers": 1,
|
||||
"timeout_keep_alive": 300,
|
||||
},
|
||||
"llm": {
|
||||
"provider": "openai/gpt-4o-mini",
|
||||
},
|
||||
"redis": {
|
||||
"host": "localhost",
|
||||
"port": 6379,
|
||||
"db": 0,
|
||||
"password": "",
|
||||
"task_ttl_seconds": 3600,
|
||||
"ssl": False,
|
||||
},
|
||||
"rate_limiting": {
|
||||
"enabled": True,
|
||||
"default_limit": "1000/minute",
|
||||
"trusted_proxies": [],
|
||||
"storage_uri": "memory://",
|
||||
},
|
||||
"security": {
|
||||
"enabled": False,
|
||||
"jwt_enabled": False,
|
||||
"api_token": "",
|
||||
"https_redirect": False,
|
||||
"trusted_hosts": ["*"],
|
||||
"headers": {
|
||||
"x_content_type_options": "nosniff",
|
||||
"x_frame_options": "DENY",
|
||||
"content_security_policy": "default-src 'self'",
|
||||
"strict_transport_security": "max-age=63072000; includeSubDomains",
|
||||
},
|
||||
},
|
||||
"crawler": {
|
||||
"base_config": {"simulate_user": True},
|
||||
"memory_threshold_percent": 95.0,
|
||||
"rate_limiter": {"enabled": True, "base_delay": [1.0, 2.0]},
|
||||
"timeouts": {"stream_init": 30.0, "batch_process": 300.0},
|
||||
"pool": {"max_pages": 40, "idle_ttl_sec": 300},
|
||||
"browser": {
|
||||
"kwargs": {"headless": True, "text_mode": True},
|
||||
"extra_args": [
|
||||
"--no-sandbox",
|
||||
"--disable-dev-shm-usage",
|
||||
"--disable-gpu",
|
||||
"--disable-software-rasterizer",
|
||||
"--disable-web-security",
|
||||
"--allow-insecure-localhost",
|
||||
"--ignore-certificate-errors",
|
||||
],
|
||||
},
|
||||
},
|
||||
"logging": {
|
||||
"level": "INFO",
|
||||
"format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s",
|
||||
},
|
||||
"observability": {
|
||||
"prometheus": {"enabled": True, "endpoint": "/metrics"},
|
||||
"health_check": {"endpoint": "/health"},
|
||||
},
|
||||
"webhooks": {
|
||||
"enabled": True,
|
||||
"default_url": None,
|
||||
"data_in_payload": False,
|
||||
"retry": {
|
||||
"max_attempts": 5,
|
||||
"initial_delay_ms": 1000,
|
||||
"max_delay_ms": 32000,
|
||||
"timeout_ms": 30000,
|
||||
},
|
||||
"headers": {"User-Agent": "Crawl4AI-Webhook/1.0"},
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
def _deep_merge(base: dict, override: dict) -> dict:
|
||||
"""Recursively merge override into base. Override values take precedence."""
|
||||
merged = base.copy()
|
||||
for key, value in override.items():
|
||||
if key in merged and isinstance(merged[key], dict) and isinstance(value, dict):
|
||||
merged[key] = _deep_merge(merged[key], value)
|
||||
else:
|
||||
merged[key] = value
|
||||
return merged
|
||||
|
||||
|
||||
def load_config() -> Dict:
|
||||
"""Load and return application configuration with environment variable overrides."""
|
||||
config_path = Path(__file__).parent / "config.yml"
|
||||
with open(config_path, "r") as config_file:
|
||||
config = yaml.safe_load(config_file)
|
||||
user_config = yaml.safe_load(config_file) or {}
|
||||
|
||||
# Deep-merge user config on top of defaults so missing keys get safe values
|
||||
config = _deep_merge(DEFAULT_CONFIG, user_config)
|
||||
|
||||
for section in DEFAULT_CONFIG:
|
||||
if section not in user_config:
|
||||
logging.warning(
|
||||
f"Config section '{section}' missing from config.yml, using defaults"
|
||||
)
|
||||
|
||||
# Override LLM provider from environment if set
|
||||
llm_provider = os.environ.get("LLM_PROVIDER")
|
||||
|
||||
@@ -114,7 +114,7 @@ async def import_and_continue():
|
||||
|
||||
# Import existing knowledge base
|
||||
print(f"\n1. Importing knowledge base from {kb_path}")
|
||||
adaptive.import_knowledge_base(kb_path)
|
||||
await adaptive.import_knowledge_base(kb_path)
|
||||
|
||||
print(f" - Imported {len(adaptive.state.knowledge_base)} documents")
|
||||
print(f" - Existing URLs: {len(adaptive.state.crawled_urls)}")
|
||||
@@ -175,10 +175,10 @@ async def share_knowledge_bases():
|
||||
merged_crawler = AdaptiveCrawler(crawler)
|
||||
|
||||
# Import both knowledge bases
|
||||
merged_crawler.import_knowledge_base(project_a_kb)
|
||||
await merged_crawler.import_knowledge_base(project_a_kb)
|
||||
initial_size = len(merged_crawler.state.knowledge_base)
|
||||
|
||||
merged_crawler.import_knowledge_base(project_b_kb)
|
||||
|
||||
await merged_crawler.import_knowledge_base(project_b_kb)
|
||||
final_size = len(merged_crawler.state.knowledge_base)
|
||||
|
||||
print(f" - Project A documents: {initial_size}")
|
||||
|
||||
@@ -161,7 +161,7 @@ adaptive.export_knowledge_base("my_knowledge.jsonl")
|
||||
Import a previously exported knowledge base.
|
||||
|
||||
```python
|
||||
def import_knowledge_base(
|
||||
async def import_knowledge_base(
|
||||
self,
|
||||
path: Union[str, Path]
|
||||
) -> None
|
||||
|
||||
@@ -4128,7 +4128,7 @@ That's how you keep the config self-contained, illustrate **XPath** usage, and d
|
||||
## 3. Advanced Schema & Nested Structures
|
||||
### Sample E-Commerce HTML
|
||||
```
|
||||
https://gist.githubusercontent.com/githubusercontent/2d7b8ba3cd8ab6cf3c8da771ddb36878/raw/1ae2f90c6861ce7dd84cc50d3df9920dee5e1fd2/sample_ecommerce.html
|
||||
https://raw.githubusercontent.com/unclecode/crawl4ai/main/docs/examples/sample_ecommerce.html
|
||||
```
|
||||
```python
|
||||
schema = {
|
||||
@@ -4253,7 +4253,7 @@ async def extract_ecommerce_data():
|
||||
|
||||
async with AsyncWebCrawler(verbose=True) as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://gist.githubusercontent.com/githubusercontent/2d7b8ba3cd8ab6cf3c8da771ddb36878/raw/1ae2f90c6861ce7dd84cc50d3df9920dee5e1fd2/sample_ecommerce.html",
|
||||
url="https://raw.githubusercontent.com/unclecode/crawl4ai/main/docs/examples/sample_ecommerce.html",
|
||||
extraction_strategy=strategy,
|
||||
config=config
|
||||
)
|
||||
|
||||
@@ -274,7 +274,7 @@ adaptive.export_knowledge_base("knowledge_base.jsonl")
|
||||
|
||||
# Import into another session
|
||||
new_adaptive = AdaptiveCrawler(crawler)
|
||||
new_adaptive.import_knowledge_base("knowledge_base.jsonl")
|
||||
await new_adaptive.import_knowledge_base("knowledge_base.jsonl")
|
||||
```
|
||||
|
||||
## Best Practices
|
||||
|
||||
@@ -191,7 +191,7 @@ Real sites often have **nested** or repeated data—like categories containing p
|
||||
|
||||
We have a **sample e-commerce** HTML file on GitHub (example):
|
||||
```
|
||||
https://gist.githubusercontent.com/githubusercontent/2d7b8ba3cd8ab6cf3c8da771ddb36878/raw/1ae2f90c6861ce7dd84cc50d3df9920dee5e1fd2/sample_ecommerce.html
|
||||
https://raw.githubusercontent.com/unclecode/crawl4ai/main/docs/examples/sample_ecommerce.html
|
||||
```
|
||||
This snippet includes categories, products, features, reviews, and related items. Let's see how to define a schema that fully captures that structure **without LLM**.
|
||||
|
||||
@@ -323,7 +323,7 @@ async def extract_ecommerce_data():
|
||||
|
||||
async with AsyncWebCrawler(verbose=True) as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://gist.githubusercontent.com/githubusercontent/2d7b8ba3cd8ab6cf3c8da771ddb36878/raw/1ae2f90c6861ce7dd84cc50d3df9920dee5e1fd2/sample_ecommerce.html",
|
||||
url="https://raw.githubusercontent.com/unclecode/crawl4ai/main/docs/examples/sample_ecommerce.html",
|
||||
extraction_strategy=strategy,
|
||||
config=config
|
||||
)
|
||||
|
||||
@@ -543,7 +543,7 @@ async def adaptive_crawling_demo(auto_mode=False):
|
||||
adaptive2 = AdaptiveCrawler(crawler, export_config)
|
||||
|
||||
# Import the knowledge base
|
||||
adaptive2.import_knowledge_base(kb_export)
|
||||
await adaptive2.import_knowledge_base(kb_export)
|
||||
console.print(f"✓ Imported {len(adaptive2.state.knowledge_base)} documents")
|
||||
console.print(f"✓ Starting confidence: {int(adaptive2.confidence * 100)}%")
|
||||
|
||||
|
||||
@@ -233,7 +233,7 @@ async def test_knowledge_export_import():
|
||||
crawler2 = AdaptiveCrawler(crawler=crawler, config=config)
|
||||
|
||||
console.print("\n[cyan]Importing knowledge base...[/cyan]")
|
||||
crawler2.import_knowledge_base(export_path)
|
||||
await crawler2.import_knowledge_base(export_path)
|
||||
|
||||
# Continue with new query - should be faster
|
||||
console.print("\n[cyan]Extending with new query...[/cyan]")
|
||||
|
||||
Reference in New Issue
Block a user