""" Tests for PR #1290 and #1668 - #1290: Type-list pipeline in JsonCssExtractionStrategy._extract_single_field - #1668: --json-ensure-ascii CLI flag and JSON_ENSURE_ASCII config """ import json import pytest from bs4 import BeautifulSoup # ── PR #1290: Type-list pipeline in _extract_single_field ───────────────── class TestTypePipeline: """Test that field type can be a list for chained extraction.""" @pytest.fixture def strategy(self): from crawl4ai.extraction_strategy import JsonCssExtractionStrategy schema = {"name": "test", "baseSelector": "div", "fields": []} return JsonCssExtractionStrategy(schema) @pytest.fixture def element(self): html = '
' soup = BeautifulSoup(html, "html.parser") return soup.find("div") def test_single_type_text_still_works(self, strategy, element): """Single string type 'text' should still work as before.""" field = {"selector": "a", "type": "text"} result = strategy._extract_single_field(element, field) assert result == "Product Name" def test_single_type_attribute_still_works(self, strategy, element): """Single string type 'attribute' should still work.""" field = {"selector": "a", "type": "attribute", "attribute": "href"} result = strategy._extract_single_field(element, field) assert result == "/product/12345?ref=home" def test_single_type_html_still_works(self, strategy, element): """Single string type 'html' should still work.""" field = {"selector": "a", "type": "html"} result = strategy._extract_single_field(element, field) assert "Product Name" in result assert "href" in result def test_single_type_regex_still_works(self, strategy, element): """Single string type 'regex' should still work (reads text, applies pattern).""" field = {"selector": "a", "type": "regex", "pattern": r"Product (\w+)"} result = strategy._extract_single_field(element, field) assert result == "Name" def test_pipeline_attribute_then_regex(self, strategy, element): """Pipeline: get attribute, then regex-extract from it.""" field = { "selector": "a", "type": ["attribute", "regex"], "attribute": "href", "pattern": r"/product/(\d+)", } result = strategy._extract_single_field(element, field) assert result == "12345" def test_pipeline_html_then_regex(self, strategy, element): """Pipeline: get HTML, then regex-extract from it.""" field = { "selector": "a", "type": ["html", "regex"], "pattern": r'href="([^"]+)"', } result = strategy._extract_single_field(element, field) assert result == "/product/12345?ref=home" def test_pipeline_text_then_regex(self, strategy, element): """Pipeline: get text, then regex — same as single 'regex' type.""" field = { "selector": "a", "type": ["text", "regex"], "pattern": r"Product (\w+)", } result = strategy._extract_single_field(element, field) assert result == "Name" def test_pipeline_stops_on_none(self, strategy, element): """Pipeline should stop and return default when a step yields None.""" field = { "selector": "a", "type": ["attribute", "regex"], "attribute": "href", "pattern": r"NOMATCH(\d+)", "default": "N/A", } result = strategy._extract_single_field(element, field) assert result == "N/A" def test_pipeline_custom_group(self, strategy): """Pipeline with custom regex group number.""" html = '