Files
crawl4ai/tests/test_pr_1795_1798_1734.py
unclecode 7c0cc3ed88 fix: batch merge of community PRs (#1622, #1786, #1796, #1795, #1798, #1734, #1290, #1668)
Bug fixes:
- Verify redirect targets are alive before returning from URL seeder (#1622)
- Wire mean_delay/max_range from CrawlerRunConfig into dispatcher rate limiter (#1786)
- Use DOMParser instead of innerHTML in process_iframes to prevent XSS (#1796)

Security/Docker:
- Require api_token for /token endpoint when configured (#1795)
- Deep-crawl streaming now mirrors Python library behavior via arun() (#1798)

CI:
- Bump GitHub Actions to latest versions - checkout v6, setup-python v6,
  build-push-action v6, setup-buildx v4, login v4 (#1734)

Features:
- Support type-list pipeline in JsonCssExtractionStrategy for chained
  extraction like ["attribute", "regex"] (#1290)
- Add --json-ensure-ascii CLI flag and JSON_ENSURE_ASCII config setting
  for Unicode preservation in JSON output (#1668)
2026-03-07 08:45:11 +00:00

223 lines
9.1 KiB
Python

"""
Tests for PR #1795, #1798, #1734
- #1795: /token endpoint requires api_token when configured
- #1798: Deep-crawl streaming branches to arun() for single URL
- #1734: GitHub Actions versions bumped to latest
"""
import pytest
import yaml
import ast
from unittest.mock import AsyncMock, MagicMock, patch
from pathlib import Path
ROOT = Path(__file__).resolve().parent.parent
# ── PR #1795: api_token protection on /token endpoint ────────────────────
class TestTokenEndpointAuth:
"""Test the api_token gating logic added to the /token endpoint."""
def test_token_request_model_has_api_token_field(self):
"""auth.py TokenRequest should have an api_token field in its source."""
source = (ROOT / "deploy" / "docker" / "auth.py").read_text()
# Parse the AST to verify the field exists on the class
tree = ast.parse(source)
token_request = None
for node in ast.walk(tree):
if isinstance(node, ast.ClassDef) and node.name == "TokenRequest":
token_request = node
break
assert token_request is not None, "TokenRequest class not found"
field_names = [
stmt.target.id
for stmt in token_request.body
if isinstance(stmt, ast.AnnAssign) and isinstance(stmt.target, ast.Name)
]
assert "email" in field_names, "TokenRequest missing email field"
assert "api_token" in field_names, "TokenRequest missing api_token field"
def test_server_token_check_logic_no_config(self):
"""When api_token is empty in config, any request should pass."""
config = {"security": {"api_token": ""}}
expected_token = config.get("security", {}).get("api_token", "")
# Empty string is falsy, so check should be skipped
assert not expected_token
def test_server_token_check_logic_with_config_match(self):
"""When api_token is set and request matches, should pass."""
config = {"security": {"api_token": "my-secret"}}
expected_token = config.get("security", {}).get("api_token", "")
req_token = "my-secret"
assert expected_token and req_token == expected_token
def test_server_token_check_logic_with_config_mismatch(self):
"""When api_token is set and request doesn't match, should reject."""
config = {"security": {"api_token": "my-secret"}}
expected_token = config.get("security", {}).get("api_token", "")
req_token = "wrong-token"
assert expected_token and req_token != expected_token
def test_server_token_check_logic_with_config_none(self):
"""When api_token is set and request sends None, should reject."""
config = {"security": {"api_token": "my-secret"}}
expected_token = config.get("security", {}).get("api_token", "")
req_token = None
assert expected_token and req_token != expected_token
def test_config_yml_has_api_token_field(self):
"""config.yml should include api_token under security."""
with open(ROOT / "deploy" / "docker" / "config.yml") as f:
cfg = yaml.safe_load(f)
assert "api_token" in cfg["security"]
# Default should be empty (disabled)
assert cfg["security"]["api_token"] == ""
def test_server_py_contains_token_check(self):
"""server.py get_token function should check api_token."""
source = (ROOT / "deploy" / "docker" / "server.py").read_text()
assert "api_token" in source
assert 'config.get("security", {}).get("api_token"' in source
assert "401" in source # HTTPException 401
# ── PR #1798: Deep-crawl streaming branches on strategy ──────────────────
class TestDeepCrawlStreamBranching:
"""Test the branching logic in handle_stream_crawl_request."""
def test_api_py_has_deep_crawl_branch(self):
"""api.py should branch on deep_crawl_strategy for streaming."""
source = (ROOT / "deploy" / "docker" / "api.py").read_text()
assert "deep_crawl_strategy is not None" in source
assert "crawler.arun(" in source # single-URL deep crawl path
assert "crawler.arun_many(" in source # multi-URL path
def test_api_py_rejects_multi_url_deep_crawl(self):
"""api.py should raise 400 for deep crawl with multiple URLs."""
source = (ROOT / "deploy" / "docker" / "api.py").read_text()
assert "exactly one URL per request" in source
assert "HTTP_400_BAD_REQUEST" in source
@pytest.mark.asyncio
async def test_deep_crawl_single_url_uses_arun(self):
"""With deep_crawl_strategy + 1 URL, should call crawler.arun()."""
from crawl4ai import CrawlerRunConfig, BrowserConfig
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
cfg = CrawlerRunConfig(
deep_crawl_strategy=BFSDeepCrawlStrategy(max_depth=1, max_pages=5),
stream=True,
)
# Verify the config has deep_crawl_strategy set
assert cfg.deep_crawl_strategy is not None
assert cfg.stream is True
# Simulate the branching logic from api.py
urls = ["https://example.com"]
if cfg.deep_crawl_strategy is not None and len(urls) == 1:
path = "arun"
else:
path = "arun_many"
assert path == "arun"
@pytest.mark.asyncio
async def test_no_deep_crawl_uses_arun_many(self):
"""Without deep_crawl_strategy, should use arun_many()."""
from crawl4ai import CrawlerRunConfig
cfg = CrawlerRunConfig(stream=True)
assert cfg.deep_crawl_strategy is None
urls = ["https://a.com", "https://b.com"]
if cfg.deep_crawl_strategy is not None and len(urls) == 1:
path = "arun"
else:
path = "arun_many"
assert path == "arun_many"
@pytest.mark.asyncio
async def test_deep_crawl_multi_url_rejected(self):
"""Deep crawl + multiple URLs should be rejected."""
from crawl4ai import CrawlerRunConfig
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
cfg = CrawlerRunConfig(
deep_crawl_strategy=BFSDeepCrawlStrategy(max_depth=1, max_pages=5),
stream=True,
)
urls = ["https://a.com", "https://b.com"]
# This is what api.py does — raise before crawling
should_reject = cfg.deep_crawl_strategy is not None and len(urls) != 1
assert should_reject
# ── PR #1734: GitHub Actions version bumps ────────────────────────────────
class TestGitHubActionsVersions:
"""Verify all GitHub Actions are on current major versions."""
EXPECTED_VERSIONS = {
"actions/checkout": "v6",
"actions/setup-python": "v6",
"docker/build-push-action": "v6",
"docker/setup-buildx-action": "v4",
"docker/login-action": "v4",
"softprops/action-gh-release": "v2",
}
def _extract_actions(self, workflow_path):
"""Extract action@version pairs from a workflow file."""
with open(workflow_path) as f:
data = yaml.safe_load(f)
actions = {}
for job_name, job in data.get("jobs", {}).items():
for step in job.get("steps", []):
uses = step.get("uses", "")
if "@" in uses:
name, version = uses.rsplit("@", 1)
actions[name] = version
return actions
def test_docker_release_workflow(self):
"""docker-release.yml should use latest action versions."""
actions = self._extract_actions(
ROOT / ".github" / "workflows" / "docker-release.yml"
)
for name, expected in self.EXPECTED_VERSIONS.items():
if name in actions:
assert actions[name] == expected, (
f"{name} should be @{expected}, got @{actions[name]}"
)
def test_release_workflow(self):
"""release.yml should use latest action versions."""
actions = self._extract_actions(
ROOT / ".github" / "workflows" / "release.yml"
)
for name, expected in self.EXPECTED_VERSIONS.items():
if name in actions:
assert actions[name] == expected, (
f"{name} should be @{expected}, got @{actions[name]}"
)
def test_no_v4_or_v5_checkout_remaining(self):
"""No workflow should still reference checkout@v4 or v5."""
for wf in (ROOT / ".github" / "workflows").glob("*.yml"):
content = wf.read_text()
assert "actions/checkout@v4" not in content, f"{wf.name} still uses checkout@v4"
assert "actions/checkout@v5" not in content, f"{wf.name} still uses checkout@v5"
def test_no_old_build_push_action(self):
"""No workflow should still reference build-push-action@v5."""
for wf in (ROOT / ".github" / "workflows").glob("*.yml"):
content = wf.read_text()
assert "build-push-action@v5" not in content, (
f"{wf.name} still uses build-push-action@v5"
)