"""
Regression tests for Crawl4AI extraction strategies.

Covers JsonCssExtractionStrategy, JsonXPathExtractionStrategy,
JsonLxmlExtractionStrategy, RegexExtractionStrategy, NoExtractionStrategy,
and CosineStrategy (optional, requires sklearn).

Run:
    pytest tests/regression/test_reg_extraction.py -v
    pytest tests/regression/test_reg_extraction.py -v -m "not network"
"""

import json
import time

import pytest

from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
from crawl4ai.extraction_strategy import (
    JsonCssExtractionStrategy,
    JsonXPathExtractionStrategy,
    JsonLxmlExtractionStrategy,
    RegexExtractionStrategy,
    NoExtractionStrategy,
)

try:
    from crawl4ai.extraction_strategy import CosineStrategy
    # CosineStrategy requires torch and sklearn at instantiation time;
    # verify they are actually available before declaring it usable.
    # NOTE(review): only torch is probed below even though sklearn is also
    # named as a requirement -- confirm whether an sklearn probe is needed.
    import torch  # noqa: F401
    HAS_COSINE = True
except ImportError:
    # ModuleNotFoundError is a subclass of ImportError, so this single
    # clause covers both "module missing" and "import failed".
    HAS_COSINE = False


# ---------------------------------------------------------------------------
# JsonCssExtractionStrategy
# ---------------------------------------------------------------------------

# Schema for the product cards served at /products: one record per
# ``div.product`` with four text fields and the details link's href.
PRODUCT_CSS_SCHEMA = {
    "baseSelector": "div.product",
    "fields": [
        {"name": "name", "selector": "h2.name", "type": "text"},
        {"name": "price", "selector": "span.price", "type": "text"},
        {"name": "description", "selector": "p.description", "type": "text"},
        {"name": "category", "selector": "span.category", "type": "text"},
        {
            "name": "link",
            "selector": "a.details-link",
            "type": "attribute",
            "attribute": "href",
        },
    ],
}

# Same as PRODUCT_CSS_SCHEMA, plus a ``baseFields`` entry that reads the
# ``data-id`` attribute into ``product_id``.
PRODUCT_CSS_SCHEMA_WITH_ID = {
    "baseSelector": "div.product",
    "baseFields": [
        {
            "name": "product_id",
            "type": "attribute",
            "attribute": "data-id",
        },
    ],
    "fields": [
        {"name": "name", "selector": "h2.name", "type": "text"},
        {"name": "price", "selector": "span.price", "type": "text"},
        {"name": "description", "selector": "p.description", "type": "text"},
        {"name": "category", "selector": "span.category", "type": "text"},
        {
            "name": "link",
            "selector": "a.details-link",
            "type": "attribute",
            "attribute": "href",
        },
    ],
}


@pytest.mark.asyncio
async def test_css_extract_products(local_server):
    """Extract all 5 products from /products using JsonCssExtractionStrategy.

    Verifies the record count and the first product's name, price, and
    product_id (taken from the ``baseFields`` attribute).
    """
    strategy = JsonCssExtractionStrategy(schema=PRODUCT_CSS_SCHEMA_WITH_ID)
    config = CrawlerRunConfig(extraction_strategy=strategy)
    browser_config = BrowserConfig(headless=True, verbose=False)

    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(url=f"{local_server}/products", config=config)
        assert result.success, f"Crawl failed: {result.error_message}"

        extracted = json.loads(result.extracted_content)
        assert isinstance(extracted, list)
        assert len(extracted) == 5, f"Expected 5 products, got {len(extracted)}"

        first = extracted[0]
        assert first["name"] == "Wireless Mouse"
        assert first["price"] == "$29.99"
        assert first["product_id"] == "1"


@pytest.mark.asyncio
async def test_css_extract_with_default(local_server):
    """Use a field whose selector matches nothing and declares a default.

    Verifies that every extracted record carries the default value.
    """
    schema = {
        "baseSelector": "div.product",
        "fields": [
            {"name": "name", "selector": "h2.name", "type": "text"},
            {
                "name": "sku",
                "selector": "span.sku-number",
                "type": "text",
                "default": "N/A",
            },
        ],
    }
    strategy = JsonCssExtractionStrategy(schema=schema)
    config = CrawlerRunConfig(extraction_strategy=strategy)
    browser_config = BrowserConfig(headless=True, verbose=False)

    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(url=f"{local_server}/products", config=config)
        assert result.success

        extracted = json.loads(result.extracted_content)
        assert len(extracted) > 0
        for item in extracted:
            # .get() keeps a missing key an assertion failure with the
            # descriptive message below rather than a bare KeyError.
            assert item.get("sku") == "N/A", (
                f"Expected default 'N/A' for missing sku, got: {item.get('sku')}"
            )


@pytest.mark.asyncio
async def test_css_extract_nested(local_server):
    """Test nested type extraction using JsonCssExtractionStrategy.

    Extracts a nested ``details`` object (the rating's ``data-stars``
    attribute) from within each product element.
    """
    schema = {
        "baseSelector": "div.product",
        "fields": [
            {"name": "name", "selector": "h2.name", "type": "text"},
            {
                "name": "details",
                "selector": "div.rating",
                "type": "nested",
                "fields": [
                    {
                        "name": "stars",
                        "type": "attribute",
                        "attribute": "data-stars",
                    },
                ],
            },
        ],
    }
    strategy = JsonCssExtractionStrategy(schema=schema)
    config = CrawlerRunConfig(extraction_strategy=strategy)
    browser_config = BrowserConfig(headless=True, verbose=False)

    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(url=f"{local_server}/products", config=config)
        assert result.success

        extracted = json.loads(result.extracted_content)
        assert len(extracted) == 5

        first = extracted[0]
        assert "details" in first
        assert first["details"]["stars"] == "4.5"


@pytest.mark.asyncio
async def test_css_extract_empty_results(local_server):
    """Use a baseSelector that matches nothing and verify an empty list is returned."""
    schema = {
        "baseSelector": "div.nonexistent-class-xyz",
        "fields": [
            {"name": "text", "selector": "p", "type": "text"},
        ],
    }
    strategy = JsonCssExtractionStrategy(schema=schema)
    config = CrawlerRunConfig(extraction_strategy=strategy)
    browser_config = BrowserConfig(headless=True, verbose=False)

    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(url=f"{local_server}/products", config=config)
        assert result.success

        extracted = json.loads(result.extracted_content)
        assert isinstance(extracted, list)
        assert len(extracted) == 0


@pytest.mark.asyncio
async def test_css_extract_table(local_server):
    """Extract table rows from /tables using CSS selectors.

    Verifies 4 quarterly rows and the quarter, revenue, and growth values
    of the first row.
    """
    schema = {
        "baseSelector": "#sales-table tbody tr",
        "fields": [
            {"name": "quarter", "selector": "td:nth-child(1)", "type": "text"},
            {"name": "revenue", "selector": "td:nth-child(2)", "type": "text"},
            {"name": "growth", "selector": "td:nth-child(3)", "type": "text"},
        ],
    }
    strategy = JsonCssExtractionStrategy(schema=schema)
    config = CrawlerRunConfig(extraction_strategy=strategy)
    browser_config = BrowserConfig(headless=True, verbose=False)

    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(url=f"{local_server}/tables", config=config)
        assert result.success

        extracted = json.loads(result.extracted_content)
        assert len(extracted) == 4, f"Expected 4 rows, got {len(extracted)}"

        first_row = extracted[0]
        assert first_row["quarter"] == "Q1 2025"
        assert first_row["revenue"] == "$1,234,567"
        assert first_row["growth"] == "12.5%"


@pytest.mark.asyncio
@pytest.mark.network
async def test_css_real_quotes():
    """Crawl quotes.toscrape.com and extract quotes with CSS selectors.

    Verifies that at least one quote is extracted and that every quote
    has non-empty text and author.
    """
    schema = {
        "baseSelector": "div.quote",
        "fields": [
            {"name": "text", "selector": "span.text", "type": "text"},
            {"name": "author", "selector": "small.author", "type": "text"},
        ],
    }
    strategy = JsonCssExtractionStrategy(schema=schema)
    config = CrawlerRunConfig(extraction_strategy=strategy)
    browser_config = BrowserConfig(headless=True, verbose=False)

    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(
            url="https://quotes.toscrape.com", config=config
        )
        assert result.success

        extracted = json.loads(result.extracted_content)
        assert len(extracted) > 0, "Expected quotes to be extracted"
        for quote in extracted:
            assert "text" in quote and quote["text"], f"Quote missing text: {quote}"
            assert "author" in quote and quote["author"], f"Quote missing author: {quote}"


@pytest.mark.asyncio
@pytest.mark.network
async def test_css_real_books():
    """Crawl books.toscrape.com and extract book titles and prices."""
    schema = {
        "baseSelector": "article.product_pod",
        "fields": [
            {"name": "title", "selector": "h3 a", "type": "attribute", "attribute": "title"},
            {"name": "price", "selector": "p.price_color", "type": "text"},
        ],
    }
    strategy = JsonCssExtractionStrategy(schema=schema)
    config = CrawlerRunConfig(extraction_strategy=strategy)
    browser_config = BrowserConfig(headless=True, verbose=False)

    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(
            url="https://books.toscrape.com", config=config
        )
        assert result.success

        extracted = json.loads(result.extracted_content)
        assert len(extracted) > 0, "Expected books to be extracted"
        for book in extracted:
            assert "title" in book and book["title"]
            assert "price" in book and book["price"]
            # Price should start with a currency symbol
            assert book["price"].startswith(("£", "$", "€"))


# ---------------------------------------------------------------------------
# JsonXPathExtractionStrategy
# ---------------------------------------------------------------------------


@pytest.mark.asyncio
async def test_xpath_extract_products(local_server):
    """Extract products using XPath selectors.

    Verifies the same count, first name, and first price that the CSS
    version expects.
    """
    schema = {
        # Use exact class match to avoid matching 'product-list' parent
        "baseSelector": "//div[contains(concat(' ', normalize-space(@class), ' '), ' product ')]",
        "fields": [
            {
                "name": "name",
                "selector": ".//h2[contains(@class, 'name')]",
                "type": "text",
            },
            {
                "name": "price",
                "selector": ".//span[contains(@class, 'price')]",
                "type": "text",
            },
        ],
    }
    strategy = JsonXPathExtractionStrategy(schema=schema)
    config = CrawlerRunConfig(extraction_strategy=strategy)
    browser_config = BrowserConfig(headless=True, verbose=False)

    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(url=f"{local_server}/products", config=config)
        assert result.success

        extracted = json.loads(result.extracted_content)
        assert len(extracted) == 5, f"Expected 5 products via XPath, got {len(extracted)}"
        assert extracted[0]["name"] == "Wireless Mouse"
        assert extracted[0]["price"] == "$29.99"


# ---------------------------------------------------------------------------
# JsonLxmlExtractionStrategy
# ---------------------------------------------------------------------------


@pytest.mark.asyncio
async def test_lxml_extract_products(local_server):
    """Extract products using JsonLxmlExtractionStrategy with the CSS-style schema.

    Verifies the same count, first name, and first price as the JsonCss test.
    """
    strategy = JsonLxmlExtractionStrategy(schema=PRODUCT_CSS_SCHEMA)
    config = CrawlerRunConfig(extraction_strategy=strategy)
    browser_config = BrowserConfig(headless=True, verbose=False)

    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(url=f"{local_server}/products", config=config)
        assert result.success

        extracted = json.loads(result.extracted_content)
        assert len(extracted) == 5, f"Expected 5 products via lxml, got {len(extracted)}"
        assert extracted[0]["name"] == "Wireless Mouse"
        assert extracted[0]["price"] == "$29.99"


@pytest.mark.asyncio
async def test_lxml_caching_performance(local_server):
    """Extract twice with the same JsonLxmlExtractionStrategy instance.

    Both runs must succeed and yield 5 records. The second run is only
    required to take less than 3x the first run's wall-clock time; it is
    not required to be strictly faster.
    """
    strategy = JsonLxmlExtractionStrategy(schema=PRODUCT_CSS_SCHEMA)
    config = CrawlerRunConfig(extraction_strategy=strategy)
    browser_config = BrowserConfig(headless=True, verbose=False)

    async with AsyncWebCrawler(config=browser_config) as crawler:
        # First run
        started = time.perf_counter()
        result1 = await crawler.arun(url=f"{local_server}/products", config=config)
        first_time = time.perf_counter() - started

        # Second run (caching should help)
        started = time.perf_counter()
        result2 = await crawler.arun(url=f"{local_server}/products", config=config)
        second_time = time.perf_counter() - started

        assert result1.success and result2.success

        data1 = json.loads(result1.extracted_content)
        data2 = json.loads(result2.extracted_content)
        assert len(data1) == len(data2) == 5

        # Allow generous tolerance -- caching may not always be faster due to
        # browser overhead, but it should certainly not be drastically slower
        assert second_time < first_time * 3, (
            f"Second run ({second_time:.3f}s) significantly slower than first ({first_time:.3f}s)"
        )


# ---------------------------------------------------------------------------
# RegexExtractionStrategy
# ---------------------------------------------------------------------------


@pytest.mark.asyncio
async def test_regex_email(local_server):
    """Extract emails from /regex-test using the Email pattern.

    Verifies both expected addresses are found.
    """
    strategy = RegexExtractionStrategy(pattern=RegexExtractionStrategy.Email)
    config = CrawlerRunConfig(extraction_strategy=strategy)
    browser_config = BrowserConfig(headless=True, verbose=False)

    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(url=f"{local_server}/regex-test", config=config)
        assert result.success

        extracted = json.loads(result.extracted_content)
        values = [item["value"] for item in extracted]
        assert any("support@crawl4ai.com" in v for v in values), (
            f"Expected support@crawl4ai.com in {values}"
        )
        assert any("sales@example.org" in v for v in values), (
            f"Expected sales@example.org in {values}"
        )


@pytest.mark.asyncio
async def test_regex_phone(local_server):
    """Extract US phone numbers from /regex-test."""
    strategy = RegexExtractionStrategy(pattern=RegexExtractionStrategy.PhoneUS)
    config = CrawlerRunConfig(extraction_strategy=strategy)
    browser_config = BrowserConfig(headless=True, verbose=False)

    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(url=f"{local_server}/regex-test", config=config)
        assert result.success

        extracted = json.loads(result.extracted_content)
        values = [item["value"] for item in extracted]
        assert len(values) > 0, "Expected at least one phone number"

        # At least one phone number should contain expected digits
        all_vals = " ".join(values)
        assert "555" in all_vals, f"Expected phone with 555 in {values}"


@pytest.mark.asyncio
async def test_regex_url(local_server):
    """Extract URLs from /regex-test using the Url pattern."""
    strategy = RegexExtractionStrategy(pattern=RegexExtractionStrategy.Url)
    config = CrawlerRunConfig(extraction_strategy=strategy)
    browser_config = BrowserConfig(headless=True, verbose=False)

    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(url=f"{local_server}/regex-test", config=config)
        assert result.success

        extracted = json.loads(result.extracted_content)
        values = [item["value"] for item in extracted]
        assert len(values) > 0, "Expected URLs to be extracted"

        all_vals = " ".join(values)
        assert "crawl4ai.com" in all_vals


@pytest.mark.asyncio
async def test_regex_all(local_server):
    """Use RegexExtractionStrategy.All to run all built-in patterns.

    Verifies that the result labels include email, url, and at least one
    of the date labels.
    """
    strategy = RegexExtractionStrategy(pattern=RegexExtractionStrategy.All)
    config = CrawlerRunConfig(extraction_strategy=strategy)
    browser_config = BrowserConfig(headless=True, verbose=False)

    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(url=f"{local_server}/regex-test", config=config)
        assert result.success

        extracted = json.loads(result.extracted_content)
        labels = {item["label"] for item in extracted}

        # Should find at least emails, URLs, and dates
        assert "email" in labels, f"Expected 'email' in labels: {labels}"
        assert "url" in labels, f"Expected 'url' in labels: {labels}"
        assert "date_iso" in labels or "date_us" in labels, (
            f"Expected date patterns in labels: {labels}"
        )


@pytest.mark.asyncio
async def test_regex_custom(local_server):
    """Use a custom regex pattern to extract IPv4 addresses.

    Verifies 192.168.1.100 is found.
    """
    strategy = RegexExtractionStrategy(
        custom={"ip_address": r"(?:\d{1,3}\.){3}\d{1,3}"}
    )
    config = CrawlerRunConfig(extraction_strategy=strategy)
    browser_config = BrowserConfig(headless=True, verbose=False)

    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(url=f"{local_server}/regex-test", config=config)
        assert result.success

        extracted = json.loads(result.extracted_content)
        values = [item["value"] for item in extracted]
        assert "192.168.1.100" in values, f"Expected 192.168.1.100 in {values}"


@pytest.mark.asyncio
async def test_regex_output_format(local_server):
    """Verify each regex extraction result has the keys url, label, value, span.

    Also checks that ``span`` is a two-element sequence of integers.
    """
    strategy = RegexExtractionStrategy(pattern=RegexExtractionStrategy.Email)
    config = CrawlerRunConfig(extraction_strategy=strategy)
    browser_config = BrowserConfig(headless=True, verbose=False)

    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(url=f"{local_server}/regex-test", config=config)
        assert result.success

        extracted = json.loads(result.extracted_content)
        assert len(extracted) > 0
        for item in extracted:
            assert "url" in item, f"Missing 'url' key in {item}"
            assert "label" in item, f"Missing 'label' key in {item}"
            assert "value" in item, f"Missing 'value' key in {item}"
            assert "span" in item, f"Missing 'span' key in {item}"

            # Span should be a list/tuple of two ints
            span = item["span"]
            assert isinstance(span, (list, tuple)) and len(span) == 2
            assert all(isinstance(offset, int) for offset in span), (
                f"Span offsets should be ints: {span}"
            )


@pytest.mark.asyncio
async def test_regex_span_accuracy(local_server):
    """Verify that each span is internally consistent with its value.

    The exact text the regex ran on is not available to the test, so the
    slice ``source[span[0]:span[1]]`` cannot be compared directly. Instead
    this checks that the span is non-empty and ordered, and that its length
    equals the length of the extracted value.
    """
    strategy = RegexExtractionStrategy(pattern=RegexExtractionStrategy.Email)
    config = CrawlerRunConfig(extraction_strategy=strategy)
    browser_config = BrowserConfig(headless=True, verbose=False)

    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(url=f"{local_server}/regex-test", config=config)
        assert result.success

        extracted = json.loads(result.extracted_content)
        assert len(extracted) > 0

        # The regex runs on the content source (fit_html by default).
        # Since we cannot easily get the exact input text the regex ran on,
        # we verify span[0] < span[1] and the value is non-empty.
        for item in extracted:
            span = item["span"]
            assert span[0] < span[1], f"Invalid span: {span}"
            assert len(item["value"]) > 0
            assert span[1] - span[0] == len(item["value"]), (
                f"Span length ({span[1] - span[0]}) != value length ({len(item['value'])})"
            )


# ---------------------------------------------------------------------------
# NoExtractionStrategy
# ---------------------------------------------------------------------------


@pytest.mark.asyncio
async def test_no_extraction(local_server):
    """Crawl with NoExtractionStrategy and verify structured extraction is skipped.

    The crawler deliberately bypasses extraction for NoExtractionStrategy,
    leaving extracted_content as None. The actual page content is still
    available via markdown and html.
    """
    strategy = NoExtractionStrategy()
    config = CrawlerRunConfig(extraction_strategy=strategy)
    browser_config = BrowserConfig(headless=True, verbose=False)

    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(url=f"{local_server}/", config=config)
        assert result.success

        # The framework explicitly skips extraction for NoExtractionStrategy,
        # so extracted_content should be None (passthrough -- no processing).
        assert result.extracted_content is None

        # But the page content is still fully available
        assert result.html is not None and len(result.html) > 0
        assert result.markdown is not None and "Welcome" in result.markdown


# ---------------------------------------------------------------------------
# CosineStrategy (optional - requires sklearn)
# ---------------------------------------------------------------------------


@pytest.mark.skipif(not HAS_COSINE, reason="CosineStrategy requires sklearn+torch")
def test_cosine_basic():
    """Call CosineStrategy.extract() directly on pre-chunked text.

    Verifies that a non-empty list of clusters is returned and that each
    cluster carries 'content' and 'index' keys.
    """
    # CosineStrategy.extract() expects text with <|DEL|> or \\n\\n separators.
    # We test the strategy directly to avoid browser overhead and isolate the logic.
    topics = [
        "Machine learning algorithms process large datasets to identify complex patterns "
        "and make accurate predictions using neural networks and deep learning models.",
        "Cloud computing provides scalable infrastructure for deploying web applications "
        "globally across multiple regions and availability zones for high availability.",
        "Database optimization requires careful indexing strategies and query performance "
        "tuning to handle millions of transactions per second efficiently.",
        "Network security involves configuring firewalls intrusion detection systems and "
        "encrypted communications to protect against cyber threats and attacks.",
        "Mobile development frameworks enable building cross-platform applications with "
        "shared codebases that deploy to both iOS and Android platforms.",
    ]
    text = "<|DEL|>".join(topics)

    strategy = CosineStrategy(
        semantic_filter=None,
        word_count_threshold=5,
        max_dist=0.5,
    )
    result = strategy.extract(url="http://test.com", html=text)

    assert isinstance(result, list)
    assert len(result) > 0, "Expected clusters from CosineStrategy"

    # Each cluster should have 'content' and 'index' keys
    for item in result:
        assert "content" in item
        assert "index" in item


# ---------------------------------------------------------------------------
# Extraction with real URLs
# ---------------------------------------------------------------------------


@pytest.mark.asyncio
@pytest.mark.network
async def test_extraction_real_quotes_css():
    """Run the full pipeline against quotes.toscrape.com with JsonCss.

    Expects at least five quotes, each with non-empty text and author.
    The schema also requests a nested ``tags`` object, which is extracted
    but not asserted on.
    """
    tags_field = {
        "name": "tags",
        "selector": "div.tags",
        "type": "nested",
        "fields": [
            {
                "name": "tag_list",
                "selector": "a.tag",
                "type": "text",
            },
        ],
    }
    quote_schema = {
        "baseSelector": "div.quote",
        "fields": [
            {"name": "text", "selector": "span.text", "type": "text"},
            {"name": "author", "selector": "small.author", "type": "text"},
            tags_field,
        ],
    }
    run_config = CrawlerRunConfig(
        extraction_strategy=JsonCssExtractionStrategy(schema=quote_schema)
    )
    browser_config = BrowserConfig(headless=True, verbose=False)

    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(
            url="https://quotes.toscrape.com", config=run_config
        )
        assert result.success

        quotes = json.loads(result.extracted_content)
        quote_count = len(quotes)
        assert quote_count >= 5, f"Expected at least 5 quotes, got {quote_count}"
        for quote in quotes:
            assert quote.get("text"), "Quote text should not be empty"
            assert quote.get("author"), "Quote author should not be empty"


@pytest.mark.asyncio
@pytest.mark.network
async def test_extraction_real_books_css():
    """Crawl books.toscrape.com and extract book listings.

    Expects at least ten books, each with a non-empty title and price.
    Availability is extracted as well but not asserted on.
    """
    book_schema = {
        "baseSelector": "article.product_pod",
        "fields": [
            {"name": "title", "selector": "h3 a", "type": "attribute", "attribute": "title"},
            {"name": "price", "selector": "p.price_color", "type": "text"},
            {"name": "availability", "selector": "p.availability", "type": "text"},
        ],
    }
    run_config = CrawlerRunConfig(
        extraction_strategy=JsonCssExtractionStrategy(schema=book_schema)
    )
    browser_config = BrowserConfig(headless=True, verbose=False)

    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(
            url="https://books.toscrape.com", config=run_config
        )
        assert result.success

        books = json.loads(result.extracted_content)
        book_count = len(books)
        assert book_count >= 10, f"Expected at least 10 books, got {book_count}"
        for book in books:
            assert book.get("title"), "Book title should not be empty"
            assert book.get("price"), "Book price should not be empty"