fix: expose crawler_config on all MCP scrape tools (#1963)

MCP tools (md, html, screenshot, pdf, execute_js) hardcoded
  CrawlerRunConfig() with no user input, so wait_until,
  delay_before_return_html, cache_mode, and all other
  CrawlerRunConfig fields were silently ignored. /crawl already
  had full passthrough; this brings the remaining tools to parity.

  - schemas.py: add crawler_config: Optional[Dict] to all five
    request schemas so mcp_bridge.py exposes the field in MCP
    tool inputSchemas automatically
  - server.py: handlers now load via CrawlerRunConfig.load() then
    stamp endpoint-required fields on top (screenshot, pdf, js_code);
    fix screenshot_wait_for/wait_for_images defaults from 2/False to
    None so they only override crawler_config when explicitly passed
  - api.py: handle_markdown_request accepts crawler_config kwarg;
    cache_mode precedence uses key-presence check instead of falsy
    check so crawler_config.cache_mode correctly wins over the legacy 'c' cache parameter

  Tests: tests/mcp/test_mcp_crawler_config.py — 7 MCP SSE tests
  proving delay_before_return_html is honoured server-side on all tools
This commit is contained in:
Soham Kukreti
2026-05-12 21:13:17 +05:30
parent dfb525edec
commit d709d670f9
4 changed files with 350 additions and 19 deletions

View File

@@ -268,7 +268,8 @@ async def handle_markdown_request(
config: Optional[dict] = None,
provider: Optional[str] = None,
temperature: Optional[float] = None,
base_url: Optional[str] = None
base_url: Optional[str] = None,
crawler_config: Optional[dict] = None,
) -> str:
"""Handle markdown generation requests."""
crawler = None
@@ -313,14 +314,13 @@ async def handle_markdown_request(
**_cfg["crawler"]["browser"].get("kwargs", {}),
)
crawler = await get_crawler(browser_cfg)
result = await crawler.arun(
url=decoded_url,
config=CrawlerRunConfig(
markdown_generator=md_generator,
scraping_strategy=LXMLWebScrapingStrategy(),
cache_mode=cache_mode
)
)
cc = crawler_config or {}
cfg = CrawlerRunConfig.load(cc)
cfg.markdown_generator = md_generator
cfg.scraping_strategy = LXMLWebScrapingStrategy()
if 'cache_mode' not in cc:
cfg.cache_mode = cache_mode
result = await crawler.arun(url=decoded_url, config=cfg)
if not result.success:
raise HTTPException(

View File

@@ -6,8 +6,25 @@ from utils import FilterType
class CrawlRequest(BaseModel):
urls: List[str] = Field(min_length=1, max_length=100)
browser_config: Optional[Dict] = Field(default_factory=dict)
crawler_config: Optional[Dict] = Field(default_factory=dict)
browser_config: Optional[Dict] = Field(
default_factory=dict,
description="Optional BrowserConfig overrides (e.g. headless, user_agent, proxy, viewport)"
)
crawler_config: Optional[Dict] = Field(
default_factory=dict,
description=(
"Optional CrawlerRunConfig overrides. Key parameters: "
"wait_until ('load', 'domcontentloaded', 'networkidle', 'commit') — when to consider navigation done; "
"delay_before_return_html (float, seconds) — extra wait before capturing HTML, useful for SPAs; "
"cache_mode ('enabled', 'disabled', 'read_only', 'write_only', 'bypass') — cache behaviour; "
"js_code (str | list) — JavaScript to execute after page load; "
"wait_for (str) — CSS selector or JS expression to wait for before returning; "
"screenshot (bool) — capture a screenshot; pdf (bool) — generate a PDF; "
"extraction_strategy (dict) — structured extraction config; "
"markdown_generator (dict) — markdown generation config. "
"All CrawlerRunConfig fields are accepted; unknown keys are silently ignored."
)
)
crawler_configs: Optional[List[Dict]] = Field(
default=None,
description=(
@@ -70,6 +87,12 @@ class MarkdownRequest(BaseModel):
provider: Optional[str] = Field(None, description="LLM provider override (e.g., 'anthropic/claude-3-opus')")
temperature: Optional[float] = Field(None, description="LLM temperature override (0.0-2.0)")
base_url: Optional[str] = Field(None, description="LLM API base URL override")
crawler_config: Optional[Dict] = Field(
default_factory=dict,
description="Optional CrawlerRunConfig overrides (e.g. wait_until, delay_before_return_html, cache_mode). "
"Takes precedence over the 'c' cache parameter when cache_mode is specified here. "
"scraping_strategy is always set to LXMLWebScrapingStrategy by this endpoint and cannot be overridden."
)
class RawCode(BaseModel):
@@ -77,16 +100,30 @@ class RawCode(BaseModel):
class HTMLRequest(BaseModel):
url: str
crawler_config: Optional[Dict] = Field(
default_factory=dict,
description="Optional CrawlerRunConfig overrides (e.g. wait_until, delay_before_return_html, cache_mode)"
)
class ScreenshotRequest(BaseModel):
url: str
screenshot_wait_for: Optional[float] = 2
wait_for_images: Optional[bool] = False
screenshot_wait_for: Optional[float] = None
wait_for_images: Optional[bool] = None
output_path: Optional[str] = None
crawler_config: Optional[Dict] = Field(
default_factory=dict,
description="Optional CrawlerRunConfig overrides (e.g. wait_until, delay_before_return_html, cache_mode). "
"screenshot=True is always enforced."
)
class PDFRequest(BaseModel):
url: str
output_path: Optional[str] = None
crawler_config: Optional[Dict] = Field(
default_factory=dict,
description="Optional CrawlerRunConfig overrides (e.g. wait_until, delay_before_return_html, cache_mode). "
"pdf=True is always enforced."
)
class JSEndpointRequest(BaseModel):
@@ -95,6 +132,11 @@ class JSEndpointRequest(BaseModel):
...,
description="List of separated JavaScript snippets to execute"
)
crawler_config: Optional[Dict] = Field(
default_factory=dict,
description="Optional CrawlerRunConfig overrides (e.g. wait_until, delay_before_return_html, cache_mode). "
"js_code is always set from the scripts parameter and cannot be overridden via crawler_config."
)
class WebhookConfig(BaseModel):

View File

@@ -434,7 +434,8 @@ async def get_markdown(
400, "Invalid URL format. Must start with http://, https://, or for raw HTML (raw:, raw://)")
markdown = await handle_markdown_request(
body.url, body.f, body.q, body.c, config, body.provider,
body.temperature, body.base_url
body.temperature, body.base_url,
crawler_config=body.crawler_config
)
return JSONResponse({
"url": body.url,
@@ -459,9 +460,9 @@ async def generate_html(
Use when you need sanitized HTML structures for building schemas or further processing.
"""
validate_url_scheme(body.url, allow_raw=True)
cfg = CrawlerRunConfig()
crawler = None
try:
cfg = CrawlerRunConfig.load(body.crawler_config or {})
crawler = await get_crawler(get_default_browser_config())
results = await crawler.arun(url=body.url, config=cfg)
if not results[0].success:
@@ -496,7 +497,12 @@ async def generate_screenshot(
validate_url_scheme(body.url)
crawler = None
try:
cfg = CrawlerRunConfig(screenshot=True, screenshot_wait_for=body.screenshot_wait_for, wait_for_images=body.wait_for_images)
cfg = CrawlerRunConfig.load(body.crawler_config or {})
cfg.screenshot = True
if body.screenshot_wait_for is not None:
cfg.screenshot_wait_for = body.screenshot_wait_for
if body.wait_for_images is not None:
cfg.wait_for_images = body.wait_for_images
crawler = await get_crawler(get_default_browser_config())
results = await crawler.arun(url=body.url, config=cfg)
if not results[0].success:
@@ -534,7 +540,8 @@ async def generate_pdf(
validate_url_scheme(body.url)
crawler = None
try:
cfg = CrawlerRunConfig(pdf=True)
cfg = CrawlerRunConfig.load(body.crawler_config or {})
cfg.pdf = True
crawler = await get_crawler(get_default_browser_config())
results = await crawler.arun(url=body.url, config=cfg)
if not results[0].success:
@@ -610,7 +617,8 @@ async def execute_js(
validate_url_scheme(body.url)
crawler = None
try:
cfg = CrawlerRunConfig(js_code=body.scripts)
cfg = CrawlerRunConfig.load(body.crawler_config or {})
cfg.js_code = body.scripts
crawler = await get_crawler(get_default_browser_config())
results = await crawler.arun(url=body.url, config=cfg)
if not results[0].success:

View File

@@ -0,0 +1,281 @@
"""
Tests that each MCP tool correctly accepts and honours crawler_config.
Common crawler_config used in every test:
wait_until = "domcontentloaded"
delay_before_return_html = 2.0  (the DELAY constant below)
cache_mode = "bypass"
Run:
source .venv/bin/activate
python tests/mcp/test_mcp_crawler_config.py
"""
import anyio, json, time
from mcp.client.sse import sse_client
from mcp.client.session import ClientSession
# MCP server SSE endpoint on localhost, and the page every test requests.
SSE_URL = "http://localhost:11235/mcp/sse"
TARGET_URL = "https://example.com"

# Seconds passed as delay_before_return_html; every timed test asserts that
# the whole call took at least this long. 2 s is meant to stand out clearly
# from ordinary network variance.
DELAY = 2.0

# crawler_config payload shared by the tests.
COMMON_CRAWLER_CONFIG = dict(
    wait_until="domcontentloaded",
    delay_before_return_html=DELAY,
    cache_mode="bypass",
)
# ── helpers ───────────────────────────────────────────────────────────────────
def _ok(name: str, detail: str = "") -> None:
suffix = f"{detail}" if detail else ""
print(f"{name}{suffix}")
def _fail(name: str, reason: str) -> None:
print(f"{name} FAILED: {reason}")
def _parse(res) -> dict | list:
return json.loads(res.content[0].text)
def _assert_delay(elapsed: float, name: str) -> None:
    """Raise AssertionError when ``elapsed`` is shorter than ``DELAY`` seconds.

    A round trip at least as long as ``DELAY`` is taken as evidence that the
    server waited. ``name`` is currently unused by the body.
    """
    if not elapsed < DELAY:
        return
    message = (
        f"delay_before_return_html={DELAY}s was NOT honoured: "
        f"response returned in {elapsed:.2f}s (expected >= {DELAY}s)"
    )
    raise AssertionError(message)
# ── individual tests ──────────────────────────────────────────────────────────
async def test_schema_exposes_crawler_config(s: ClientSession) -> bool:
    """Check that every expected MCP tool lists ``crawler_config`` in its inputSchema.

    Returns True when all tools are present and expose the property. Otherwise
    (or on any exception) the problem is reported through ``_fail`` and False
    is returned.
    """
    name = "schema check"
    expected = {"md", "html", "screenshot", "pdf", "execute_js", "crawl"}
    missing = []
    try:
        listing = await s.list_tools()
        by_name = {tool.name: tool for tool in listing.tools}
        for tool_name in expected:
            tool = by_name.get(tool_name)
            if tool is None:
                missing.append(f"{tool_name}(not found)")
                continue
            # A tool without an inputSchema is treated as having no properties.
            properties = (tool.inputSchema or {}).get("properties", {})
            if "crawler_config" not in properties:
                missing.append(f"{tool_name}(no crawler_config in schema)")
        if not missing:
            _ok(name, f"all {len(expected)} tools expose crawler_config")
            return True
        _fail(name, ", ".join(missing))
        return False
    except Exception as e:
        _fail(name, str(e))
        return False
async def test_md(s: ClientSession) -> bool:
    """Call the ``md`` tool with the shared crawler_config; check content and delay.

    Returns True on success; any failed assertion or exception is reported via
    ``_fail`` and yields False.
    """
    name = "md"
    try:
        started = time.monotonic()
        arguments = {
            "url": TARGET_URL,
            "f": "fit",
            "crawler_config": COMMON_CRAWLER_CONFIG,
        }
        res = _parse(await s.call_tool(name, arguments))
        elapsed = time.monotonic() - started

        assert res.get("success"), f"success=False: {res}"
        markdown = res.get("markdown", "")
        assert markdown, "markdown field is empty"
        # Cheap plausibility check: some markdown punctuation must be present.
        markers = ("#", "[", "*", "---")
        assert any(marker in markdown for marker in markers), \
            f"content does not look like markdown: {markdown[:200]!r}"
        _assert_delay(elapsed, name)

        _ok(name, f"len={len(markdown)}, elapsed={elapsed:.2f}s, preview={markdown[:60]!r}")
        return True
    except Exception as e:
        _fail(name, str(e))
        return False
async def test_html(s: ClientSession) -> bool:
    """Call the ``html`` tool with the shared crawler_config; check content and delay.

    Returns True on success; any failed assertion or exception is reported via
    ``_fail`` and yields False.
    """
    name = "html"
    try:
        started = time.monotonic()
        arguments = {
            "url": TARGET_URL,
            "crawler_config": COMMON_CRAWLER_CONFIG,
        }
        res = _parse(await s.call_tool(name, arguments))
        elapsed = time.monotonic() - started

        assert res.get("success"), f"success=False: {res}"
        html = res.get("html", "")
        assert html, "html field is empty"
        # The payload has to contain angle brackets at the very least ...
        assert "<" in html and ">" in html, \
            f"content does not look like HTML: {html[:200]!r}"
        # ... and at least one of these common opening-tag prefixes.
        lowered = html.lower()
        tag_prefixes = ("<html", "<body", "<div", "<p", "<h")
        assert any(prefix in lowered for prefix in tag_prefixes), \
            f"no block-level HTML tags found: {html[:200]!r}"
        _assert_delay(elapsed, name)

        _ok(name, f"len={len(html)}, elapsed={elapsed:.2f}s, first_tag={html[:40]!r}")
        return True
    except Exception as e:
        _fail(name, str(e))
        return False
async def test_screenshot(s: ClientSession) -> bool:
    """Call the ``screenshot`` tool with the shared crawler_config; check PNG and delay.

    Returns True on success; any failed assertion or exception is reported via
    ``_fail`` and yields False.
    """
    name = "screenshot"
    try:
        started = time.monotonic()
        arguments = {
            "url": TARGET_URL,
            "screenshot_wait_for": 1.0,
            "crawler_config": COMMON_CRAWLER_CONFIG,
        }
        res = _parse(await s.call_tool(name, arguments))
        elapsed = time.monotonic() - started

        assert res.get("success"), f"success=False: {res}"
        encoded = res.get("screenshot", "")
        assert encoded, "screenshot field is empty"

        # The payload is base64; the decoded bytes must begin with the first
        # four bytes of the PNG signature and be of a plausible size.
        import base64
        raw = base64.b64decode(encoded)
        assert raw.startswith(b"\x89PNG"), \
            f"screenshot is not a PNG — magic bytes: {raw[:8]!r}"
        assert len(raw) > 1000, f"PNG suspiciously small: {len(raw)} bytes"
        _assert_delay(elapsed, name)

        _ok(name, f"PNG {len(raw) // 1024}KB, elapsed={elapsed:.2f}s")
        return True
    except Exception as e:
        _fail(name, str(e))
        return False
async def test_pdf(s: ClientSession) -> bool:
    """Call the ``pdf`` tool with the shared crawler_config; check PDF bytes and delay.

    Returns True on success; any failed assertion or exception is reported via
    ``_fail`` and yields False.
    """
    name = "pdf"
    try:
        started = time.monotonic()
        arguments = {
            "url": TARGET_URL,
            "crawler_config": COMMON_CRAWLER_CONFIG,
        }
        res = _parse(await s.call_tool(name, arguments))
        elapsed = time.monotonic() - started

        assert res.get("success"), f"success=False: {res}"
        encoded = res.get("pdf", "")
        assert encoded, "pdf field is empty"

        # The payload is base64; the decoded bytes must begin with "%PDF"
        # and be of a plausible size.
        import base64
        raw = base64.b64decode(encoded)
        assert raw.startswith(b"%PDF"), \
            f"response is not a PDF — magic bytes: {raw[:8]!r}"
        assert len(raw) > 500, f"PDF suspiciously small: {len(raw)} bytes"
        _assert_delay(elapsed, name)

        _ok(name, f"PDF {len(raw) // 1024}KB, elapsed={elapsed:.2f}s")
        return True
    except Exception as e:
        _fail(name, str(e))
        return False
async def test_execute_js(s: ClientSession) -> bool:
    """Call the ``execute_js`` tool with the shared crawler_config; check payload and delay.

    Returns True on success; any failed assertion or exception is reported via
    ``_fail`` and yields False.
    """
    name = "execute_js"
    try:
        started = time.monotonic()
        # The single script reads the page title. Its return value is not
        # asserted here; only the surrounding crawl payload is checked.
        arguments = {
            "url": TARGET_URL,
            "scripts": ["document.title"],
            "crawler_config": COMMON_CRAWLER_CONFIG,
        }
        res = _parse(await s.call_tool(name, arguments))
        elapsed = time.monotonic() - started

        assert res.get("success"), f"success=False: {res}"
        # The response is expected to carry the crawl result fields.
        assert res.get("html"), "CrawlResult missing html"
        assert res.get("url"), "CrawlResult missing url"
        _assert_delay(elapsed, name)

        _ok(name, f"html len={len(res['html'])}, url={res['url']}, elapsed={elapsed:.2f}s")
        return True
    except Exception as e:
        _fail(name, str(e))
        return False
async def test_crawl(s: ClientSession) -> bool:
    """Call the ``crawl`` tool for one URL; check the first result and the delay.

    Uses the shared crawler_config with ``wait_until`` switched to
    ``"networkidle"``. Returns True on success; any failed assertion or
    exception is reported via ``_fail`` and yields False.
    """
    name = "crawl"
    try:
        started = time.monotonic()
        arguments = {
            "urls": [TARGET_URL],
            "browser_config": {},
            "crawler_config": COMMON_CRAWLER_CONFIG | {"wait_until": "networkidle"},
        }
        res = _parse(await s.call_tool(name, arguments))
        elapsed = time.monotonic() - started

        results = res.get("results", [])
        assert results, "no results list"
        first = results[0]
        assert first.get("success"), f"result[0] not successful: {first}"
        # The first result must carry html, url and a 200 status code.
        assert first.get("html"), "result missing html"
        assert first.get("url"), "result missing url"
        status = first.get("status_code")
        assert status == 200, \
            f"unexpected status_code: {status}"
        _assert_delay(elapsed, name)

        _ok(name, f"html len={len(first['html'])}, status={first['status_code']}, elapsed={elapsed:.2f}s")
        return True
    except Exception as e:
        _fail(name, str(e))
        return False
# ── runner ────────────────────────────────────────────────────────────────────
async def main() -> None:
    """Open an MCP session over SSE, run every test in order, and print totals.

    Raises:
        SystemExit: with status 1 when at least one test returned a falsy result.
    """
    print(f"\nConnecting to {SSE_URL}")
    async with sse_client(SSE_URL) as (r, w):
        async with ClientSession(r, w) as s:
            await s.initialize()
            listing = await s.list_tools()
            tools = [t.name for t in listing.tools]
            print(f"Available tools: {tools}\n")

            tests = [
                ("schema check", test_schema_exposes_crawler_config),
                ("md", test_md),
                ("html", test_html),
                ("screenshot", test_screenshot),
                ("pdf", test_pdf),
                ("execute_js", test_execute_js),
                ("crawl", test_crawl),
            ]
            # Run sequentially on the shared session; each test returns a bool.
            outcomes = [bool(await fn(s)) for _label, fn in tests]
            passed = sum(outcomes)
            failed = len(outcomes) - passed

    print(f"\n{'='*50}")
    print(f"Results: {passed} passed, {failed} failed out of {len(tests)} tests")
    if failed:
        raise SystemExit(1)
# Run the suite only when this file is executed as a script. Without the
# guard, merely importing the module (e.g. by a test collector picking up
# this test_*.py file) would connect to the server and could raise SystemExit.
if __name__ == "__main__":
    anyio.run(main)