mirror of
https://github.com/unclecode/crawl4ai.git
synced 2026-06-11 08:18:01 +00:00
fix: expose crawler_config on all MCP scrape tools (#1963)
MCP tools (md, html, screenshot, pdf, execute_js) hardcoded
CrawlerRunConfig() with no user input, so wait_until,
delay_before_return_html, cache_mode, and all other
CrawlerRunConfig fields were silently ignored. /crawl already
had full passthrough; this brings the remaining tools to parity.
- schemas.py: add crawler_config: Optional[Dict] to all five
request schemas so mcp_bridge.py exposes the field in MCP
tool inputSchemas automatically
- server.py: handlers now load via CrawlerRunConfig.load() then
stamp endpoint-required fields on top (screenshot, pdf, js_code);
fix screenshot_wait_for/wait_for_images defaults from 2/False to
None so they only override crawler_config when explicitly passed
- api.py: handle_markdown_request accepts crawler_config kwarg;
cache_mode precedence uses key-presence check instead of falsy
check so crawler_config.cache_mode correctly wins over the legacy `c` parameter
Tests: tests/mcp/test_mcp_crawler_config.py — 7 MCP SSE tests
proving delay_before_return_html is honoured server-side on all tools
This commit is contained in:
@@ -268,7 +268,8 @@ async def handle_markdown_request(
|
||||
config: Optional[dict] = None,
|
||||
provider: Optional[str] = None,
|
||||
temperature: Optional[float] = None,
|
||||
base_url: Optional[str] = None
|
||||
base_url: Optional[str] = None,
|
||||
crawler_config: Optional[dict] = None,
|
||||
) -> str:
|
||||
"""Handle markdown generation requests."""
|
||||
crawler = None
|
||||
@@ -313,14 +314,13 @@ async def handle_markdown_request(
|
||||
**_cfg["crawler"]["browser"].get("kwargs", {}),
|
||||
)
|
||||
crawler = await get_crawler(browser_cfg)
|
||||
result = await crawler.arun(
|
||||
url=decoded_url,
|
||||
config=CrawlerRunConfig(
|
||||
markdown_generator=md_generator,
|
||||
scraping_strategy=LXMLWebScrapingStrategy(),
|
||||
cache_mode=cache_mode
|
||||
)
|
||||
)
|
||||
cc = crawler_config or {}
|
||||
cfg = CrawlerRunConfig.load(cc)
|
||||
cfg.markdown_generator = md_generator
|
||||
cfg.scraping_strategy = LXMLWebScrapingStrategy()
|
||||
if 'cache_mode' not in cc:
|
||||
cfg.cache_mode = cache_mode
|
||||
result = await crawler.arun(url=decoded_url, config=cfg)
|
||||
|
||||
if not result.success:
|
||||
raise HTTPException(
|
||||
|
||||
@@ -6,8 +6,25 @@ from utils import FilterType
|
||||
|
||||
class CrawlRequest(BaseModel):
|
||||
urls: List[str] = Field(min_length=1, max_length=100)
|
||||
browser_config: Optional[Dict] = Field(default_factory=dict)
|
||||
crawler_config: Optional[Dict] = Field(default_factory=dict)
|
||||
browser_config: Optional[Dict] = Field(
|
||||
default_factory=dict,
|
||||
description="Optional BrowserConfig overrides (e.g. headless, user_agent, proxy, viewport)"
|
||||
)
|
||||
crawler_config: Optional[Dict] = Field(
|
||||
default_factory=dict,
|
||||
description=(
|
||||
"Optional CrawlerRunConfig overrides. Key parameters: "
|
||||
"wait_until ('load', 'domcontentloaded', 'networkidle', 'commit') — when to consider navigation done; "
|
||||
"delay_before_return_html (float, seconds) — extra wait before capturing HTML, useful for SPAs; "
|
||||
"cache_mode ('enabled', 'disabled', 'read_only', 'write_only', 'bypass') — cache behaviour; "
|
||||
"js_code (str | list) — JavaScript to execute after page load; "
|
||||
"wait_for (str) — CSS selector or JS expression to wait for before returning; "
|
||||
"screenshot (bool) — capture a screenshot; pdf (bool) — generate a PDF; "
|
||||
"extraction_strategy (dict) — structured extraction config; "
|
||||
"markdown_generator (dict) — markdown generation config. "
|
||||
"All CrawlerRunConfig fields are accepted; unknown keys are silently ignored."
|
||||
)
|
||||
)
|
||||
crawler_configs: Optional[List[Dict]] = Field(
|
||||
default=None,
|
||||
description=(
|
||||
@@ -70,6 +87,12 @@ class MarkdownRequest(BaseModel):
|
||||
provider: Optional[str] = Field(None, description="LLM provider override (e.g., 'anthropic/claude-3-opus')")
|
||||
temperature: Optional[float] = Field(None, description="LLM temperature override (0.0-2.0)")
|
||||
base_url: Optional[str] = Field(None, description="LLM API base URL override")
|
||||
crawler_config: Optional[Dict] = Field(
|
||||
default_factory=dict,
|
||||
description="Optional CrawlerRunConfig overrides (e.g. wait_until, delay_before_return_html, cache_mode). "
|
||||
"Takes precedence over the 'c' cache parameter when cache_mode is specified here. "
|
||||
"scraping_strategy is always set to LXMLWebScrapingStrategy by this endpoint and cannot be overridden."
|
||||
)
|
||||
|
||||
|
||||
class RawCode(BaseModel):
|
||||
@@ -77,16 +100,30 @@ class RawCode(BaseModel):
|
||||
|
||||
class HTMLRequest(BaseModel):
|
||||
url: str
|
||||
|
||||
crawler_config: Optional[Dict] = Field(
|
||||
default_factory=dict,
|
||||
description="Optional CrawlerRunConfig overrides (e.g. wait_until, delay_before_return_html, cache_mode)"
|
||||
)
|
||||
|
||||
class ScreenshotRequest(BaseModel):
|
||||
url: str
|
||||
screenshot_wait_for: Optional[float] = 2
|
||||
wait_for_images: Optional[bool] = False
|
||||
screenshot_wait_for: Optional[float] = None
|
||||
wait_for_images: Optional[bool] = None
|
||||
output_path: Optional[str] = None
|
||||
crawler_config: Optional[Dict] = Field(
|
||||
default_factory=dict,
|
||||
description="Optional CrawlerRunConfig overrides (e.g. wait_until, delay_before_return_html, cache_mode). "
|
||||
"screenshot=True is always enforced."
|
||||
)
|
||||
|
||||
class PDFRequest(BaseModel):
|
||||
url: str
|
||||
output_path: Optional[str] = None
|
||||
crawler_config: Optional[Dict] = Field(
|
||||
default_factory=dict,
|
||||
description="Optional CrawlerRunConfig overrides (e.g. wait_until, delay_before_return_html, cache_mode). "
|
||||
"pdf=True is always enforced."
|
||||
)
|
||||
|
||||
|
||||
class JSEndpointRequest(BaseModel):
|
||||
@@ -95,6 +132,11 @@ class JSEndpointRequest(BaseModel):
|
||||
...,
|
||||
description="List of separated JavaScript snippets to execute"
|
||||
)
|
||||
crawler_config: Optional[Dict] = Field(
|
||||
default_factory=dict,
|
||||
description="Optional CrawlerRunConfig overrides (e.g. wait_until, delay_before_return_html, cache_mode). "
|
||||
"js_code is always set from the scripts parameter and cannot be overridden via crawler_config."
|
||||
)
|
||||
|
||||
|
||||
class WebhookConfig(BaseModel):
|
||||
|
||||
@@ -434,7 +434,8 @@ async def get_markdown(
|
||||
400, "Invalid URL format. Must start with http://, https://, or for raw HTML (raw:, raw://)")
|
||||
markdown = await handle_markdown_request(
|
||||
body.url, body.f, body.q, body.c, config, body.provider,
|
||||
body.temperature, body.base_url
|
||||
body.temperature, body.base_url,
|
||||
crawler_config=body.crawler_config
|
||||
)
|
||||
return JSONResponse({
|
||||
"url": body.url,
|
||||
@@ -459,9 +460,9 @@ async def generate_html(
|
||||
Use when you need sanitized HTML structures for building schemas or further processing.
|
||||
"""
|
||||
validate_url_scheme(body.url, allow_raw=True)
|
||||
cfg = CrawlerRunConfig()
|
||||
crawler = None
|
||||
try:
|
||||
cfg = CrawlerRunConfig.load(body.crawler_config or {})
|
||||
crawler = await get_crawler(get_default_browser_config())
|
||||
results = await crawler.arun(url=body.url, config=cfg)
|
||||
if not results[0].success:
|
||||
@@ -496,7 +497,12 @@ async def generate_screenshot(
|
||||
validate_url_scheme(body.url)
|
||||
crawler = None
|
||||
try:
|
||||
cfg = CrawlerRunConfig(screenshot=True, screenshot_wait_for=body.screenshot_wait_for, wait_for_images=body.wait_for_images)
|
||||
cfg = CrawlerRunConfig.load(body.crawler_config or {})
|
||||
cfg.screenshot = True
|
||||
if body.screenshot_wait_for is not None:
|
||||
cfg.screenshot_wait_for = body.screenshot_wait_for
|
||||
if body.wait_for_images is not None:
|
||||
cfg.wait_for_images = body.wait_for_images
|
||||
crawler = await get_crawler(get_default_browser_config())
|
||||
results = await crawler.arun(url=body.url, config=cfg)
|
||||
if not results[0].success:
|
||||
@@ -534,7 +540,8 @@ async def generate_pdf(
|
||||
validate_url_scheme(body.url)
|
||||
crawler = None
|
||||
try:
|
||||
cfg = CrawlerRunConfig(pdf=True)
|
||||
cfg = CrawlerRunConfig.load(body.crawler_config or {})
|
||||
cfg.pdf = True
|
||||
crawler = await get_crawler(get_default_browser_config())
|
||||
results = await crawler.arun(url=body.url, config=cfg)
|
||||
if not results[0].success:
|
||||
@@ -610,7 +617,8 @@ async def execute_js(
|
||||
validate_url_scheme(body.url)
|
||||
crawler = None
|
||||
try:
|
||||
cfg = CrawlerRunConfig(js_code=body.scripts)
|
||||
cfg = CrawlerRunConfig.load(body.crawler_config or {})
|
||||
cfg.js_code = body.scripts
|
||||
crawler = await get_crawler(get_default_browser_config())
|
||||
results = await crawler.arun(url=body.url, config=cfg)
|
||||
if not results[0].success:
|
||||
|
||||
281
tests/mcp/test_mcp_crawler_config.py
Normal file
281
tests/mcp/test_mcp_crawler_config.py
Normal file
@@ -0,0 +1,281 @@
|
||||
"""
|
||||
Tests that each MCP tool correctly accepts and honours crawler_config.
|
||||
|
||||
Common crawler_config used in every test:
|
||||
wait_until = "domcontentloaded"
|
||||
delay_before_return_html = 2.0 (the DELAY constant)
|
||||
cache_mode = "bypass"
|
||||
|
||||
Run:
|
||||
source .venv/bin/activate
|
||||
python tests/mcp/test_mcp_crawler_config.py
|
||||
"""
|
||||
|
||||
import anyio, json, time
|
||||
from mcp.client.sse import sse_client
|
||||
from mcp.client.session import ClientSession
|
||||
|
||||
# SSE endpoint the client connects to, and the page every tool call targets.
SSE_URL = "http://localhost:11235/mcp/sse"
TARGET_URL = "https://example.com"

# Minimum wall-clock time (seconds) asserted for every timed call.
# 2 s is large enough to be unambiguous even with network variance.
DELAY = 2.0

# crawler_config payload shared by the tool calls below.
COMMON_CRAWLER_CONFIG = dict(
    wait_until="domcontentloaded",
    delay_before_return_html=DELAY,
    cache_mode="bypass",
)
|
||||
|
||||
# ── helpers ───────────────────────────────────────────────────────────────────
|
||||
|
||||
def _ok(name: str, detail: str = "") -> None:
    """Print a success line for *name*, appending *detail* when it is non-empty."""
    line = f" ✓ {name}"
    if detail:
        line += f" — {detail}"
    print(line)
|
||||
|
||||
def _fail(name: str, reason: str) -> None:
    """Print a failure line for *name* followed by *reason*."""
    message = " ✗ {} FAILED: {}".format(name, reason)
    print(message)
|
||||
|
||||
def _parse(res) -> dict | list:
|
||||
return json.loads(res.content[0].text)
|
||||
|
||||
def _assert_delay(elapsed: float, name: str) -> None:
    """Raise AssertionError when *elapsed* is shorter than DELAY.

    *name* is accepted for call-site symmetry but is not used here.
    """
    too_fast = elapsed < DELAY
    if too_fast:
        problem = (
            f"delay_before_return_html={DELAY}s was NOT honoured: "
            f"response returned in {elapsed:.2f}s (expected >= {DELAY}s)"
        )
        raise AssertionError(problem)
|
||||
|
||||
# ── individual tests ──────────────────────────────────────────────────────────
|
||||
|
||||
async def test_schema_exposes_crawler_config(s: ClientSession) -> bool:
    """Check that every expected MCP tool advertises crawler_config.

    Lists the tools available on session *s* and verifies that each expected
    tool exists and has a ``crawler_config`` entry under ``properties`` in its
    inputSchema.

    Returns:
        True when all expected tools pass. False otherwise, after printing the
        offending tools. Any exception is reported as a failure and also
        yields False.
    """
    name = "schema check"
    expected = {"md", "html", "screenshot", "pdf", "execute_js", "crawl"}
    try:
        tool_map = {t.name: t for t in (await s.list_tools()).tools}
        missing = []
        # Sorted so the failure report is stable across runs; plain set
        # iteration order for strings varies with hash randomisation.
        for tool_name in sorted(expected):
            tool = tool_map.get(tool_name)
            if tool is None:
                missing.append(f"{tool_name}(not found)")
                continue
            props = (tool.inputSchema or {}).get("properties", {})
            if "crawler_config" not in props:
                missing.append(f"{tool_name}(no crawler_config in schema)")
        if missing:
            _fail(name, ", ".join(missing))
            return False
        _ok(name, f"all {len(expected)} tools expose crawler_config")
        return True
    except Exception as e:
        _fail(name, str(e))
        return False
|
||||
|
||||
|
||||
async def test_md(s: ClientSession) -> bool:
    """Call the md tool with the shared crawler_config; check output and timing.

    Returns True on success; on any failure prints the reason and returns False.
    """
    name = "md"
    arguments = {
        "url": TARGET_URL,
        "f": "fit",
        "crawler_config": COMMON_CRAWLER_CONFIG,
    }
    try:
        started = time.monotonic()
        res = _parse(await s.call_tool(name, arguments))
        elapsed = time.monotonic() - started
        assert res.get("success"), f"success=False: {res}"

        md = res.get("markdown", "")
        assert md, "markdown field is empty"
        # Heuristic only: the text must contain at least one markdown-ish token.
        looks_like_markdown = any(tok in md for tok in ("#", "[", "*", "---"))
        assert looks_like_markdown, \
            f"content does not look like markdown: {md[:200]!r}"

        _assert_delay(elapsed, name)
        _ok(name, f"len={len(md)}, elapsed={elapsed:.2f}s, preview={md[:60]!r}")
        return True
    except Exception as e:
        _fail(name, str(e))
        return False
|
||||
|
||||
|
||||
async def test_html(s: ClientSession) -> bool:
    """Call the html tool with the shared crawler_config; check output and timing.

    Returns True on success; on any failure prints the reason and returns False.
    """
    name = "html"
    arguments = {
        "url": TARGET_URL,
        "crawler_config": COMMON_CRAWLER_CONFIG,
    }
    try:
        started = time.monotonic()
        res = _parse(await s.call_tool(name, arguments))
        elapsed = time.monotonic() - started
        assert res.get("success"), f"success=False: {res}"

        html = res.get("html", "")
        assert html, "html field is empty"
        # The payload must contain angle brackets ...
        assert "<" in html and ">" in html, \
            f"content does not look like HTML: {html[:200]!r}"
        # ... and at least one recognisable tag prefix (case-insensitive).
        lowered = html.lower()
        known_tags = ("<html", "<body", "<div", "<p", "<h")
        assert any(tag in lowered for tag in known_tags), \
            f"no block-level HTML tags found: {html[:200]!r}"

        _assert_delay(elapsed, name)
        _ok(name, f"len={len(html)}, elapsed={elapsed:.2f}s, first_tag={html[:40]!r}")
        return True
    except Exception as e:
        _fail(name, str(e))
        return False
|
||||
|
||||
|
||||
async def test_screenshot(s: ClientSession) -> bool:
    """Call the screenshot tool with the shared crawler_config; check the PNG and timing.

    The ``screenshot`` field of the response is base64-decoded and must start
    with the full 8-byte PNG signature and be larger than 1000 bytes.

    Returns True on success; on any failure prints the reason and returns False.
    """
    name = "screenshot"
    try:
        import base64

        t0 = time.monotonic()
        res = _parse(await s.call_tool(name, {
            "url": TARGET_URL,
            "screenshot_wait_for": 1.0,
            "crawler_config": COMMON_CRAWLER_CONFIG,
        }))
        elapsed = time.monotonic() - t0
        assert res.get("success"), f"success=False: {res}"

        shot_b64 = res.get("screenshot", "")
        assert shot_b64, "screenshot field is empty"
        raw = base64.b64decode(shot_b64)
        # Compare the complete PNG signature (\x89PNG\r\n\x1a\n), not just its
        # first four bytes, so truncated or mangled payloads are rejected.
        png_signature = b"\x89PNG\r\n\x1a\n"
        assert raw[:8] == png_signature, \
            f"screenshot is not a PNG — magic bytes: {raw[:8]!r}"
        assert len(raw) > 1000, f"PNG suspiciously small: {len(raw)} bytes"

        _assert_delay(elapsed, name)
        _ok(name, f"PNG {len(raw)//1024}KB, elapsed={elapsed:.2f}s")
        return True
    except Exception as e:
        _fail(name, str(e))
        return False
|
||||
|
||||
|
||||
async def test_pdf(s: ClientSession) -> bool:
    """Call the pdf tool with the shared crawler_config; check the PDF and timing.

    The ``pdf`` field of the response is base64-decoded and must start with
    the ``%PDF-`` header and be larger than 500 bytes.

    Returns True on success; on any failure prints the reason and returns False.
    """
    name = "pdf"
    try:
        import base64

        t0 = time.monotonic()
        res = _parse(await s.call_tool(name, {
            "url": TARGET_URL,
            "crawler_config": COMMON_CRAWLER_CONFIG,
        }))
        elapsed = time.monotonic() - t0
        assert res.get("success"), f"success=False: {res}"

        pdf_b64 = res.get("pdf", "")
        assert pdf_b64, "pdf field is empty"
        raw = base64.b64decode(pdf_b64)
        # Check the whole "%PDF-" header, including the dash that precedes the
        # version number, rather than only the first four bytes.
        assert raw[:5] == b"%PDF-", \
            f"response is not a PDF — magic bytes: {raw[:8]!r}"
        assert len(raw) > 500, f"PDF suspiciously small: {len(raw)} bytes"

        _assert_delay(elapsed, name)
        _ok(name, f"PDF {len(raw)//1024}KB, elapsed={elapsed:.2f}s")
        return True
    except Exception as e:
        _fail(name, str(e))
        return False
|
||||
|
||||
|
||||
async def test_execute_js(s: ClientSession) -> bool:
    """Call the execute_js tool with the shared crawler_config; check payload and timing.

    Returns True on success; on any failure prints the reason and returns False.
    """
    name = "execute_js"
    # The single script reads the page title; its return value is not
    # inspected here, only the surrounding crawl result.
    arguments = {
        "url": TARGET_URL,
        "scripts": ["document.title"],
        "crawler_config": COMMON_CRAWLER_CONFIG,
    }
    try:
        started = time.monotonic()
        res = _parse(await s.call_tool(name, arguments))
        elapsed = time.monotonic() - started
        assert res.get("success"), f"success=False: {res}"

        # The response must carry the crawl result fields.
        assert res.get("html"), "CrawlResult missing html"
        assert res.get("url"), "CrawlResult missing url"

        _assert_delay(elapsed, name)
        summary = f"html len={len(res['html'])}, url={res['url']}, elapsed={elapsed:.2f}s"
        _ok(name, summary)
        return True
    except Exception as e:
        _fail(name, str(e))
        return False
|
||||
|
||||
|
||||
async def test_crawl(s: ClientSession) -> bool:
    """Call the crawl tool with the shared crawler_config (wait_until overridden
    to "networkidle"); check the first result and timing.

    Returns True on success; on any failure prints the reason and returns False.
    """
    name = "crawl"
    try:
        crawler_config = dict(COMMON_CRAWLER_CONFIG)
        crawler_config["wait_until"] = "networkidle"
        arguments = {
            "urls": [TARGET_URL],
            "browser_config": {},
            "crawler_config": crawler_config,
        }

        started = time.monotonic()
        res = _parse(await s.call_tool(name, arguments))
        elapsed = time.monotonic() - started

        results = res.get("results", [])
        assert results, "no results list"
        first = results[0]
        assert first.get("success"), f"result[0] not successful: {first}"

        # The first result must carry html, url and a 200 status code.
        assert first.get("html"), "result missing html"
        assert first.get("url"), "result missing url"
        status = first.get("status_code")
        assert status == 200, \
            f"unexpected status_code: {first.get('status_code')}"

        _assert_delay(elapsed, name)
        _ok(name, f"html len={len(first['html'])}, status={first['status_code']}, elapsed={elapsed:.2f}s")
        return True
    except Exception as e:
        _fail(name, str(e))
        return False
|
||||
|
||||
|
||||
# ── runner ────────────────────────────────────────────────────────────────────
|
||||
|
||||
async def main() -> None:
    """Connect to SSE_URL, run every test in order and print a summary.

    Raises:
        SystemExit: With exit code 1 when at least one test returned a falsy
            result.
    """
    print(f"\nConnecting to {SSE_URL} …")
    async with sse_client(SSE_URL) as (r, w):
        async with ClientSession(r, w) as s:
            await s.initialize()
            listing = await s.list_tools()
            tools = [t.name for t in listing.tools]
            print(f"Available tools: {tools}\n")

            tests = [
                ("schema check", test_schema_exposes_crawler_config),
                ("md", test_md),
                ("html", test_html),
                ("screenshot", test_screenshot),
                ("pdf", test_pdf),
                ("execute_js", test_execute_js),
                ("crawl", test_crawl),
            ]

            # Run strictly one after another on the shared session.
            outcomes = [await fn(s) for _label, fn in tests]
            passed = sum(1 for ok in outcomes if ok)
            failed = len(outcomes) - passed

    print(f"\n{'='*50}")
    print(f"Results: {passed} passed, {failed} failed out of {len(tests)} tests")
    if failed:
        raise SystemExit(1)
|
||||
|
||||
if __name__ == "__main__":
    # Guarded so that merely importing this module (for example when a test
    # collector imports every test_*.py file) does not start a network session.
    anyio.run(main)
|
||||
Reference in New Issue
Block a user