mirror of
https://github.com/unclecode/crawl4ai.git
synced 2026-06-10 07:48:50 +00:00
When the Docker API receives markdown_generator as JSON with "options" instead of "params", from_serializable_dict silently passes the raw dict through. This later crashes with a confusing "'dict' object has no attribute 'generate_markdown'" deep in the crawl pipeline.

Add type validation for markdown_generator in CrawlerRunConfig.__init__ (matching the existing extraction_strategy/chunking_strategy validation). When a dict slips through, the error now clearly states:
- What type was expected vs. received
- That "params" is the required key (not "options")

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
159 lines
6.1 KiB
Python
159 lines
6.1 KiB
Python
"""
|
|
Tests for #1880: markdown_generator deserialization validation in CrawlerRunConfig

Ensures that:
1. Correct {"type": ..., "params": {...}} format deserializes properly
2. Wrong key names ("options") raise a clear ValueError, not a cryptic AttributeError
3. Nested content_filter deserializes correctly
|
|
"""
|
|
import pytest
|
|
|
|
|
|
class TestMarkdownGeneratorDeserialization:
    """Test CrawlerRunConfig.load() with markdown_generator configs.

    The tests cover three groups of input:

    * well-formed ``{"type": ..., "params": {...}}`` payloads, which are
      expected to deserialize into real generator / content-filter objects;
    * malformed payloads (wrong key such as ``"options"``, or an arbitrary
      dict), which are expected to raise ``ValueError``;
    * values passed straight to the constructor (``None`` or an already
      built instance), which are expected to be stored unchanged.

    crawl4ai names are imported inside each test rather than at module level.
    """

    def test_params_key_deserializes_correctly(self):
        """{"type": ..., "params": {...}} should produce a real object."""
        from crawl4ai.async_configs import CrawlerRunConfig
        from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator

        data = {
            "markdown_generator": {
                "type": "DefaultMarkdownGenerator",
                "params": {},
            }
        }
        config = CrawlerRunConfig.load(data)
        assert isinstance(config.markdown_generator, DefaultMarkdownGenerator)

    def test_params_with_content_filter(self):
        """Nested BM25ContentFilter should deserialize inside markdown_generator."""
        from crawl4ai.async_configs import CrawlerRunConfig
        from crawl4ai.content_filter_strategy import BM25ContentFilter

        data = {
            "markdown_generator": {
                "type": "DefaultMarkdownGenerator",
                "params": {
                    "content_filter": {
                        "type": "BM25ContentFilter",
                        "params": {
                            "user_query": "example",
                            "bm25_threshold": 0.9,
                        },
                    }
                },
            }
        }
        config = CrawlerRunConfig.load(data)
        content_filter = config.markdown_generator.content_filter
        assert isinstance(content_filter, BM25ContentFilter)
        # The nested "params" values must survive the round trip unchanged.
        assert content_filter.user_query == "example"
        assert content_filter.bm25_threshold == 0.9

    def test_params_with_pruning_filter(self):
        """PruningContentFilter should also work."""
        from crawl4ai.async_configs import CrawlerRunConfig
        from crawl4ai.content_filter_strategy import PruningContentFilter

        data = {
            "markdown_generator": {
                "type": "DefaultMarkdownGenerator",
                "params": {
                    "content_filter": {
                        "type": "PruningContentFilter",
                        "params": {},
                    }
                },
            }
        }
        config = CrawlerRunConfig.load(data)
        assert isinstance(config.markdown_generator.content_filter, PruningContentFilter)

    def test_options_key_raises_clear_error(self):
        """Using "options" instead of "params" should raise ValueError with hint."""
        from crawl4ai.async_configs import CrawlerRunConfig

        data = {
            "markdown_generator": {
                "type": "DefaultMarkdownGenerator",
                "options": {"content_filter": {}},
            }
        }
        # The message must say that "params" is the required key.
        with pytest.raises(ValueError, match="params.*required"):
            CrawlerRunConfig.load(data)

    def test_arbitrary_key_raises_clear_error(self):
        """Any non-"params" key should raise ValueError."""
        from crawl4ai.async_configs import CrawlerRunConfig

        data = {
            "markdown_generator": {
                "type": "DefaultMarkdownGenerator",
                "settings": {},
            }
        }
        with pytest.raises(ValueError, match="markdown_generator must be an instance"):
            CrawlerRunConfig.load(data)

    def test_plain_dict_raises_clear_error(self):
        """A dict without type/params structure should raise ValueError."""
        from crawl4ai.async_configs import CrawlerRunConfig

        data = {
            "markdown_generator": {"foo": "bar"}
        }
        # The message is expected to name the received type.
        with pytest.raises(ValueError, match="got dict"):
            CrawlerRunConfig.load(data)

    def test_error_message_mentions_params_key(self):
        """Error message should specifically mention that 'params' is required."""
        from crawl4ai.async_configs import CrawlerRunConfig

        data = {
            "markdown_generator": {
                "type": "DefaultMarkdownGenerator",
                "options": {},
            }
        }
        with pytest.raises(ValueError) as exc_info:
            CrawlerRunConfig.load(data)
        msg = str(exc_info.value)
        assert "params" in msg
        # Either wording is accepted for pointing out the offending key.
        assert "options" in msg or "not recognized" in msg

    def test_none_markdown_generator_uses_default(self):
        """An explicit None should pass validation and be stored as None.

        The config object itself does not substitute a generator: the
        attribute stays ``None`` after construction.
        """
        from crawl4ai.async_configs import CrawlerRunConfig

        config = CrawlerRunConfig(markdown_generator=None)
        # None is allowed — the crawler falls back to default behavior
        # NOTE(review): that fallback lives outside this file; only the
        # stored value is checked here.
        assert config.markdown_generator is None

    def test_valid_instance_passes_validation(self):
        """Passing an actual instance should work fine."""
        from crawl4ai.async_configs import CrawlerRunConfig
        from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
        from crawl4ai.content_filter_strategy import BM25ContentFilter

        gen = DefaultMarkdownGenerator(
            content_filter=BM25ContentFilter(user_query="test")
        )
        config = CrawlerRunConfig(markdown_generator=gen)
        # The very same object must be kept, not a copy.
        assert config.markdown_generator is gen
        assert config.markdown_generator.content_filter.user_query == "test"
|
|
|
|
|
|
class TestExistingValidationStillWorks:
    """Guard the extraction_strategy/chunking_strategy checks against regressions.

    Each test hands a raw dict to the CrawlerRunConfig constructor and
    expects a ValueError whose message names the offending parameter.
    """

    def test_extraction_strategy_dict_raises(self):
        """A dict given as extraction_strategy is rejected with ValueError."""
        from crawl4ai.async_configs import CrawlerRunConfig

        raw_strategy = {"type": "bad"}
        with pytest.raises(ValueError, match="extraction_strategy"):
            CrawlerRunConfig(extraction_strategy=raw_strategy)

    def test_chunking_strategy_dict_raises(self):
        """A dict given as chunking_strategy is rejected with ValueError."""
        from crawl4ai.async_configs import CrawlerRunConfig

        raw_strategy = {"type": "bad"}
        with pytest.raises(ValueError, match="chunking_strategy"):
            CrawlerRunConfig(chunking_strategy=raw_strategy)
|