Files
crawl4ai/tests/test_pr_1290_1668.py
unclecode 7c0cc3ed88 fix: batch merge of community PRs (#1622, #1786, #1796, #1795, #1798, #1734, #1290, #1668)
Bug fixes:
- Verify redirect targets are alive before returning from URL seeder (#1622)
- Wire mean_delay/max_range from CrawlerRunConfig into dispatcher rate limiter (#1786)
- Use DOMParser instead of innerHTML in process_iframes to prevent XSS (#1796)

Security/Docker:
- Require api_token for /token endpoint when configured (#1795)
- Deep-crawl streaming now mirrors Python library behavior via arun() (#1798)

CI:
- Bump GitHub Actions to latest versions - checkout v6, setup-python v6,
  build-push-action v6, setup-buildx v4, login v4 (#1734)

Features:
- Support type-list pipeline in JsonCssExtractionStrategy for chained
  extraction like ["attribute", "regex"] (#1290)
- Add --json-ensure-ascii CLI flag and JSON_ENSURE_ASCII config setting
  for Unicode preservation in JSON output (#1668)
2026-03-07 08:45:11 +00:00

168 lines
6.7 KiB
Python

"""
Tests for PR #1290 and #1668
- #1290: Type-list pipeline in JsonCssExtractionStrategy._extract_single_field
- #1668: --json-ensure-ascii CLI flag and JSON_ENSURE_ASCII config
"""
import json
import pytest
from bs4 import BeautifulSoup
# ── PR #1290: Type-list pipeline in _extract_single_field ─────────────────
class TestTypePipeline:
    """Exercise string-valued and list-valued ``type`` in ``_extract_single_field``.

    Each test builds a field spec, runs it against a small parsed HTML
    fragment, and compares the extracted value with the expected one.
    """

    @pytest.fixture
    def strategy(self):
        """Provide a ``JsonCssExtractionStrategy`` built from a schema with no fields."""
        from crawl4ai.extraction_strategy import JsonCssExtractionStrategy

        return JsonCssExtractionStrategy(
            {"name": "test", "baseSelector": "div", "fields": []}
        )

    @pytest.fixture
    def element(self):
        """Provide the parsed ``<div>`` wrapping a single product link."""
        markup = '<div><a class="link" href="/product/12345?ref=home">Product Name</a></div>'
        return BeautifulSoup(markup, "html.parser").find("div")

    def test_single_type_text_still_works(self, strategy, element):
        """A plain ``"text"`` type yields the link's text."""
        spec = {"selector": "a", "type": "text"}
        assert strategy._extract_single_field(element, spec) == "Product Name"

    def test_single_type_attribute_still_works(self, strategy, element):
        """A plain ``"attribute"`` type yields the named attribute's value."""
        spec = {"selector": "a", "type": "attribute", "attribute": "href"}
        assert strategy._extract_single_field(element, spec) == "/product/12345?ref=home"

    def test_single_type_html_still_works(self, strategy, element):
        """A plain ``"html"`` type yields markup holding both the text and the attribute."""
        spec = {"selector": "a", "type": "html"}
        rendered = strategy._extract_single_field(element, spec)
        for fragment in ("Product Name", "href"):
            assert fragment in rendered

    def test_single_type_regex_still_works(self, strategy, element):
        """A plain ``"regex"`` type applies the pattern to the link's text."""
        spec = {"selector": "a", "type": "regex", "pattern": r"Product (\w+)"}
        assert strategy._extract_single_field(element, spec) == "Name"

    def test_pipeline_attribute_then_regex(self, strategy, element):
        """``["attribute", "regex"]`` runs the pattern against the attribute value."""
        spec = {
            "selector": "a",
            "type": ["attribute", "regex"],
            "attribute": "href",
            "pattern": r"/product/(\d+)",
        }
        assert strategy._extract_single_field(element, spec) == "12345"

    def test_pipeline_html_then_regex(self, strategy, element):
        """``["html", "regex"]`` runs the pattern against the element's HTML."""
        spec = {
            "selector": "a",
            "type": ["html", "regex"],
            "pattern": r'href="([^"]+)"',
        }
        assert strategy._extract_single_field(element, spec) == "/product/12345?ref=home"

    def test_pipeline_text_then_regex(self, strategy, element):
        """``["text", "regex"]`` gives the same answer as the single ``"regex"`` type."""
        spec = {
            "selector": "a",
            "type": ["text", "regex"],
            "pattern": r"Product (\w+)",
        }
        assert strategy._extract_single_field(element, spec) == "Name"

    def test_pipeline_stops_on_none(self, strategy, element):
        """A regex step with no match makes the field fall back to its ``default``."""
        spec = {
            "selector": "a",
            "type": ["attribute", "regex"],
            "attribute": "href",
            "pattern": r"NOMATCH(\d+)",
            "default": "N/A",
        }
        assert strategy._extract_single_field(element, spec) == "N/A"

    def test_pipeline_custom_group(self, strategy):
        """The ``group`` key selects which capture group the regex step returns."""
        markup = '<div><span data-info="key:value123">text</span></div>'
        container = BeautifulSoup(markup, "html.parser").find("div")
        spec = {
            "selector": "span",
            "type": ["attribute", "regex"],
            "attribute": "data-info",
            "pattern": r"(\w+):(\w+)",
            "group": 2,
        }
        assert strategy._extract_single_field(container, spec) == "value123"

    def test_single_element_list_same_as_string(self, strategy, element):
        """A one-item type list extracts the same value as the bare string type."""
        from_string = strategy._extract_single_field(
            element, {"selector": "a", "type": "text"}
        )
        from_list = strategy._extract_single_field(
            element, {"selector": "a", "type": ["text"]}
        )
        assert from_string == from_list
# ── PR #1668: JSON_ENSURE_ASCII config setting ────────────────────────────
class TestJsonEnsureAsciiConfig:
    """Test the JSON_ENSURE_ASCII configuration setting and its CLI option."""

    def test_user_settings_has_json_ensure_ascii(self):
        """USER_SETTINGS should include JSON_ENSURE_ASCII as a boolean defaulting to True."""
        from crawl4ai.config import USER_SETTINGS

        assert "JSON_ENSURE_ASCII" in USER_SETTINGS
        setting = USER_SETTINGS["JSON_ENSURE_ASCII"]
        assert setting["default"] is True
        assert setting["type"] == "boolean"

    def test_ensure_ascii_true_escapes_unicode(self):
        """With ensure_ascii=True, non-ASCII chars should be escaped."""
        data = {"name": "Ján Kováč"}
        output = json.dumps(data, ensure_ascii=True)
        assert "\\u" in output
        assert "Ján" not in output

    def test_ensure_ascii_false_preserves_unicode(self):
        """With ensure_ascii=False, non-ASCII chars should be preserved."""
        data = {"name": "Ján Kováč"}
        output = json.dumps(data, ensure_ascii=False)
        assert "Ján Kováč" in output
        assert "\\u" not in output

    def test_cli_has_json_ensure_ascii_option(self):
        """The crawl_cmd should accept --json-ensure-ascii flag."""
        from crawl4ai.cli import crawl_cmd

        # Get the click command's params
        param_names = [p.name for p in crawl_cmd.params]
        assert "json_ensure_ascii" in param_names

    def test_default_cmd_has_json_ensure_ascii_option(self):
        """The default command should accept --json-ensure-ascii flag."""
        from crawl4ai.cli import default

        param_names = [p.name for p in default.params]
        assert "json_ensure_ascii" in param_names

    def test_cli_source_uses_ensure_ascii_in_dumps(self):
        """cli.py should pass ensure_ascii to json.dumps calls."""
        import inspect

        from crawl4ai import cli

        # Source-text check: looks for the literal keyword-argument text in
        # the module's source rather than invoking the CLI.
        source = inspect.getsource(cli)
        assert "ensure_ascii=ensure_ascii" in source