Files
crawl4ai/tests/test_issue_1848_logger_serialize.py
hafezparast e603e4a722 fix: skip non-allowlisted types in serialization/deserialization (#1848)
to_serializable_dict now skips types not in ALLOWED_DESERIALIZE_TYPES
(returns None), preventing objects like logging.Logger from being
serialized as {"type": "Logger", "params": {...}} which then fails
deserialization. from_serializable_dict returns None for unknown types
instead of raising ValueError, handling payloads from older clients.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-23 08:17:02 +08:00

223 lines
8.2 KiB
Python

"""
Tests for issue #1848: ValueError on deserialization of 'Logger' type.
When BFSDeepCrawlStrategy is serialized for the Docker API, its logger field
(a logging.Logger) was serialized as {"type": "Logger", "params": {...}},
which then failed deserialization because Logger is not in the allowlist.
Fix: to_serializable_dict skips non-allowlisted types (returns None),
and from_serializable_dict returns None for unknown types instead of raising.
"""
import logging
import pytest
from crawl4ai.async_configs import (
to_serializable_dict,
from_serializable_dict,
ALLOWED_DESERIALIZE_TYPES,
)
from crawl4ai import (
BFSDeepCrawlStrategy,
DFSDeepCrawlStrategy,
CrawlerRunConfig,
BrowserConfig,
CacheMode,
)
# -- Serialization: non-allowlisted types are skipped --
class TestSerializationSkipsNonAllowlisted:
    """to_serializable_dict should return None for types not in the allowlist.

    Each test checks that such a value comes back as ``None`` instead of a
    ``{"type": ..., "params": ...}`` mapping.
    """

    def test_logger_serialized_as_none(self):
        """A bare logging.Logger passed directly serializes to None."""
        logger = logging.getLogger("test")
        result = to_serializable_dict(logger)
        assert result is None

    def test_bfs_strategy_logger_is_none_in_output(self):
        """The serialized BFS strategy carries no Logger payload in its params."""
        strategy = BFSDeepCrawlStrategy(max_depth=2, max_pages=10)
        serialized = to_serializable_dict(strategy)
        assert serialized["type"] == "BFSDeepCrawlStrategy"
        # Logger should be None, not a {"type": "Logger", ...} dict.
        # dict.get() also yields None when the "logger" key is absent, so a
        # single identity check covers both "omitted" and "explicitly None".
        logger_val = serialized["params"].get("logger")
        assert logger_val is None

    def test_dfs_strategy_logger_is_none_in_output(self):
        """The serialized DFS strategy carries no Logger payload in its params."""
        strategy = DFSDeepCrawlStrategy(max_depth=3, max_pages=5)
        serialized = to_serializable_dict(strategy)
        assert serialized["type"] == "DFSDeepCrawlStrategy"
        logger_val = serialized["params"].get("logger")
        assert logger_val is None

    def test_allowlisted_types_still_serialized(self):
        """Types in the allowlist should serialize normally."""
        strategy = BFSDeepCrawlStrategy(max_depth=2, max_pages=10)
        serialized = to_serializable_dict(strategy)
        assert serialized["type"] == "BFSDeepCrawlStrategy"
        assert serialized["params"]["max_depth"] == 2
        assert serialized["params"]["max_pages"] == 10

    def test_callable_serialized_as_none(self):
        """Callables (like on_state_change) should also be skipped."""

        async def callback(state):
            pass

        result = to_serializable_dict(callback)
        assert result is None
# -- Deserialization: unknown types return None instead of raising --
class TestDeserializationSkipsUnknown:
    """Unknown type names must deserialize to None rather than raise."""

    def test_logger_type_returns_none(self):
        """The exact payload from the bug report should not raise."""
        logger_params = {"name": "crawl4ai.deep_crawling.bfs_strategy"}
        payload = {"type": "Logger", "params": logger_params}
        assert from_serializable_dict(payload) is None

    def test_arbitrary_unknown_type_returns_none(self):
        """An unrecognised type name is handled the same way as Logger."""
        payload = {"type": "SomeRandomClass", "params": {"foo": "bar"}}
        restored = from_serializable_dict(payload)
        assert restored is None

    def test_known_types_still_deserialize(self):
        """A known type name is still rebuilt into a real instance."""
        strategy_params = {"max_depth": 2, "max_pages": 10}
        restored = from_serializable_dict(
            {"type": "BFSDeepCrawlStrategy", "params": strategy_params}
        )
        assert isinstance(restored, BFSDeepCrawlStrategy)
        assert restored.max_depth == 2

    def test_no_valueerror_raised(self):
        """Must never raise ValueError for unknown types."""
        # The call itself is the check: any exception here fails the test.
        outcome = from_serializable_dict(
            {"type": "Logger", "params": {"name": "test"}}
        )
        assert outcome is None
# -- Roundtrip: serialize then deserialize --
class TestSerializationRoundtrip:
    """Serialize-then-deserialize checks for strategies and configs."""

    def test_bfs_strategy_roundtrip(self):
        """A BFS strategy survives the roundtrip with its limits intact."""
        original = BFSDeepCrawlStrategy(max_depth=2, max_pages=10)
        rebuilt = from_serializable_dict(to_serializable_dict(original))
        assert isinstance(rebuilt, BFSDeepCrawlStrategy)
        assert rebuilt.max_depth == 2
        assert rebuilt.max_pages == 10

    def test_dfs_strategy_serialization_no_logger(self):
        """DFS strategy should not serialize Logger either."""
        import json

        payload = to_serializable_dict(
            DFSDeepCrawlStrategy(max_depth=3, max_pages=5)
        )
        assert payload["type"] == "DFSDeepCrawlStrategy"
        # Search the JSON text so a Logger entry at any nesting depth is caught.
        dumped = json.dumps(payload)
        assert '"type": "Logger"' not in dumped

    def test_crawler_config_with_deep_crawl_roundtrip(self):
        """A run config holding a BFS strategy roundtrips with the strategy attached."""
        run_config = CrawlerRunConfig(
            deep_crawl_strategy=BFSDeepCrawlStrategy(max_depth=2, max_pages=10),
            cache_mode=CacheMode.BYPASS,
            verbose=False,
        )
        rebuilt = from_serializable_dict(to_serializable_dict(run_config))
        assert isinstance(rebuilt, CrawlerRunConfig)
        rebuilt_strategy = rebuilt.deep_crawl_strategy
        assert isinstance(rebuilt_strategy, BFSDeepCrawlStrategy)
        assert rebuilt_strategy.max_depth == 2

    def test_browser_config_roundtrip(self):
        """A BrowserConfig roundtrips back to a BrowserConfig instance."""
        payload = to_serializable_dict(BrowserConfig(headless=True))
        assert isinstance(from_serializable_dict(payload), BrowserConfig)
# -- Reporter's exact scenario --
class TestReporterScenario:
    """Replay the payloads described in issue #1848."""

    @staticmethod
    def _bfs_payload_with_logger():
        """Build a fresh BFS strategy payload that still embeds a Logger entry."""
        logger_entry = {
            "type": "Logger",
            "params": {"name": "crawl4ai.deep_crawling.bfs_strategy"},
        }
        return {
            "type": "BFSDeepCrawlStrategy",
            "params": {"max_depth": 2, "max_pages": 10, "logger": logger_entry},
        }

    def test_reporter_payload_deserializes(self):
        """The exact JSON from the bug report should work."""
        request = {
            "type": "CrawlerRunConfig",
            "params": {"deep_crawl_strategy": self._bfs_payload_with_logger()},
        }
        restored = from_serializable_dict(request)
        assert isinstance(restored, CrawlerRunConfig)
        strategy = restored.deep_crawl_strategy
        assert isinstance(strategy, BFSDeepCrawlStrategy)
        assert strategy.max_depth == 2
        assert strategy.max_pages == 10

    def test_reporter_full_request(self):
        """Full request payload from the bug report."""
        # Key order matches the reported request.
        params = {
            "scraping_strategy": {"type": "LXMLWebScrapingStrategy", "params": {}},
            "table_extraction": {"type": "DefaultTableExtraction", "params": {}},
            "verbose": False,
            "deep_crawl_strategy": self._bfs_payload_with_logger(),
        }
        restored = from_serializable_dict(
            {"type": "CrawlerRunConfig", "params": params}
        )
        assert isinstance(restored, CrawlerRunConfig)
        assert restored.deep_crawl_strategy.max_depth == 2

    def test_client_side_serialization_clean(self):
        """New client serialization should not include Logger at all."""
        import json

        run_config = CrawlerRunConfig(
            deep_crawl_strategy=BFSDeepCrawlStrategy(max_depth=2, max_pages=10),
            verbose=False,
        )
        # Dump to JSON text so a "Logger" type at any depth would be found.
        dumped = json.dumps(to_serializable_dict(run_config))
        assert '"type": "Logger"' not in dumped