""" Regression tests for Crawl4AI content processing pipeline. Covers markdown generation, content filtering (BM25, Pruning), link/image/table extraction, metadata extraction, tag exclusion, CSS selector targeting, and real-URL content quality. Run: pytest tests/regression/test_reg_content.py -v pytest tests/regression/test_reg_content.py -v -m "not network" """ import pytest import json from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator from crawl4ai.content_filter_strategy import BM25ContentFilter, PruningContentFilter # --------------------------------------------------------------------------- # Markdown generation # --------------------------------------------------------------------------- @pytest.mark.asyncio async def test_markdown_raw(local_server): """Crawl the home page and verify raw markdown is a non-empty string containing the expected heading text and heading markers.""" async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler: result = await crawler.arun(url=f"{local_server}/", config=CrawlerRunConfig()) assert result.success, f"Crawl failed: {result.error_message}" md = result.markdown assert md is not None assert isinstance(md, str) assert len(md) > 0 assert "Welcome to the Crawl4AI Test Site" in md # Should have at least one markdown heading marker assert "#" in md @pytest.mark.asyncio async def test_markdown_has_headings(local_server): """Verify markdown contains the expected h1 and h2 headings.""" async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler: result = await crawler.arun(url=f"{local_server}/", config=CrawlerRunConfig()) assert result.success md = result.markdown assert "# Welcome" in md or "# Welcome to the Crawl4AI Test Site" in md # h2 heading for Features Overview assert "## Features" in md or "## Features Overview" in md @pytest.mark.asyncio async def test_markdown_has_code_block(local_server): """Verify markdown preserves the code block with triple backticks.""" async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler: result = await crawler.arun(url=f"{local_server}/", config=CrawlerRunConfig()) assert result.success md = result.markdown assert "```" in md assert "AsyncWebCrawler" in md @pytest.mark.asyncio async def test_markdown_has_list(local_server): """Verify markdown contains list items from the home page features list.""" async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler: result = await crawler.arun(url=f"{local_server}/", config=CrawlerRunConfig()) assert result.success md = result.markdown # Markdown list items should contain at least some of these assert "Content extraction" in md or "content extraction" in md assert "Link discovery" in md or "link discovery" in md @pytest.mark.asyncio async def test_markdown_citations(local_server): """Access markdown_with_citations and verify it contains numbered citation references.""" async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler: result = await crawler.arun(url=f"{local_server}/", config=CrawlerRunConfig()) assert result.success citations_md = result.markdown.markdown_with_citations assert isinstance(citations_md, str) assert len(citations_md) > 0 # Should have at least one citation reference like [1] or similar has_citation = any(f"[{i}]" in citations_md for i in range(1, 20)) # Some implementations use a different format assert has_citation or "⟨" in citations_md or "[" in citations_md @pytest.mark.asyncio async def test_markdown_references(local_server): """Access references_markdown and verify it contains URLs.""" async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler: result = await crawler.arun(url=f"{local_server}/", config=CrawlerRunConfig()) assert result.success refs = result.markdown.references_markdown assert isinstance(refs, str) # References should mention URLs or link targets assert "http" in refs or "/" in refs @pytest.mark.asyncio async def test_markdown_string_compat(local_server): """Verify StringCompatibleMarkdown behaves like a string: str() works, equality with raw_markdown, and 'in' operator.""" async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler: result = await crawler.arun(url=f"{local_server}/", config=CrawlerRunConfig()) assert result.success md = result.markdown raw = md.raw_markdown # str(result.markdown) should equal raw_markdown assert str(md) == raw # 'in' operator should work on the string content assert "Welcome" in md # --------------------------------------------------------------------------- # Content filtering - BM25 # --------------------------------------------------------------------------- @pytest.mark.asyncio async def test_bm25_fit_markdown(local_server): """Crawl with BM25ContentFilter and verify fit_markdown is shorter than the full raw_markdown (content was filtered).""" gen = DefaultMarkdownGenerator( content_filter=BM25ContentFilter(user_query="features") ) config = CrawlerRunConfig(markdown_generator=gen) async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler: result = await crawler.arun(url=f"{local_server}/", config=config) assert result.success fit = result.markdown.fit_markdown raw = result.markdown.raw_markdown assert fit is not None assert len(fit) > 0 assert len(fit) < len(raw), ( "fit_markdown should be shorter than raw_markdown after BM25 filtering" ) # --------------------------------------------------------------------------- # Content filtering - Pruning # --------------------------------------------------------------------------- @pytest.mark.asyncio async def test_pruning_fit_markdown(local_server): """Crawl with PruningContentFilter and verify fit_markdown exists and is shorter than the full raw_markdown.""" gen = DefaultMarkdownGenerator(content_filter=PruningContentFilter()) config = CrawlerRunConfig(markdown_generator=gen) async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler: result = await crawler.arun(url=f"{local_server}/", config=config) assert result.success fit = result.markdown.fit_markdown raw = result.markdown.raw_markdown assert fit is not None assert len(fit) > 0 assert len(fit) <= len(raw), ( "fit_markdown should not be longer than raw_markdown" ) # --------------------------------------------------------------------------- # Link extraction # --------------------------------------------------------------------------- @pytest.mark.asyncio async def test_links_internal(local_server): """Crawl /links-page and verify internal links are extracted with href keys.""" async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler: result = await crawler.arun(url=f"{local_server}/links-page", config=CrawlerRunConfig()) assert result.success internal = result.links.get("internal", []) assert isinstance(internal, list) assert len(internal) > 0, "Expected internal links to be found" # Each link dict should have an href for link in internal: assert "href" in link, f"Link missing 'href' key: {link}" @pytest.mark.asyncio async def test_links_external(local_server): """Verify external links include the expected domains.""" async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler: result = await crawler.arun(url=f"{local_server}/links-page", config=CrawlerRunConfig()) assert result.success external = result.links.get("external", []) assert len(external) > 0, "Expected external links to be found" hrefs = [link["href"] for link in external] all_hrefs = " ".join(hrefs) assert "example.com" in all_hrefs assert "github.com" in all_hrefs assert "python.org" in all_hrefs @pytest.mark.asyncio async def test_links_exclude_external(local_server): """Crawl with exclude_external_links=True and verify no external links remain.""" config = CrawlerRunConfig(exclude_external_links=True) async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler: result = await crawler.arun(url=f"{local_server}/links-page", config=config) assert result.success external = result.links.get("external", []) assert len(external) == 0, f"Expected no external links, got {len(external)}" @pytest.mark.asyncio async def test_links_exclude_social(local_server): """Crawl with exclude_social_media_links=True and verify no social media links appear in the external links list.""" config = CrawlerRunConfig(exclude_social_media_links=True) async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler: result = await crawler.arun(url=f"{local_server}/links-page", config=config) assert result.success external = result.links.get("external", []) social_domains = ["twitter.com", "facebook.com", "linkedin.com"] for link in external: href = link.get("href", "") for domain in social_domains: assert domain not in href, ( f"Social media link should be excluded: {href}" ) @pytest.mark.asyncio @pytest.mark.network async def test_links_real_url(): """Crawl a real URL (quotes.toscrape.com) and verify internal links are found (pagination links exist on the main page).""" async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler: result = await crawler.arun( url="https://quotes.toscrape.com", config=CrawlerRunConfig(), ) assert result.success internal = result.links.get("internal", []) assert len(internal) > 0, "Expected internal links on quotes.toscrape.com" # --------------------------------------------------------------------------- # Image extraction # --------------------------------------------------------------------------- @pytest.mark.asyncio async def test_images_extracted(local_server): """Crawl /images-page and verify images are extracted.""" async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler: result = await crawler.arun(url=f"{local_server}/images-page", config=CrawlerRunConfig()) assert result.success images = result.media.get("images", []) assert isinstance(images, list) assert len(images) > 0, "Expected images to be extracted" @pytest.mark.asyncio async def test_images_have_fields(local_server): """Verify each extracted image dict has src, alt, and score keys.""" async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler: result = await crawler.arun(url=f"{local_server}/images-page", config=CrawlerRunConfig()) assert result.success images = result.media.get("images", []) assert len(images) > 0 for img in images: assert "src" in img, f"Image missing 'src': {img}" assert "alt" in img, f"Image missing 'alt': {img}" assert "score" in img, f"Image missing 'score': {img}" @pytest.mark.asyncio async def test_images_scoring(local_server): """High-quality images (large, with alt text) should score higher than small icons without alt text.""" async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler: result = await crawler.arun(url=f"{local_server}/images-page", config=CrawlerRunConfig()) assert result.success images = result.media.get("images", []) assert len(images) >= 2 # Find the hero/landscape image and the small icon hero = None icon = None for img in images: src = img.get("src", "") if "landscape" in src or "hero" in src: hero = img elif "icon" in src and img.get("alt", "") == "": icon = img if hero and icon: assert hero["score"] > icon["score"], ( f"Hero score ({hero['score']}) should exceed icon score ({icon['score']})" ) @pytest.mark.asyncio async def test_images_exclude_all(local_server): """Crawl with exclude_all_images=True and verify no images are returned.""" config = CrawlerRunConfig(exclude_all_images=True) async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler: result = await crawler.arun(url=f"{local_server}/images-page", config=config) assert result.success images = result.media.get("images", []) assert len(images) == 0, f"Expected no images with exclude_all_images, got {len(images)}" # --------------------------------------------------------------------------- # Table extraction # --------------------------------------------------------------------------- @pytest.mark.asyncio async def test_tables_extracted(local_server): """Crawl /tables and verify tables appear in the result (either in result.media, result.tables, or markdown pipe formatting).""" async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler: result = await crawler.arun(url=f"{local_server}/tables", config=CrawlerRunConfig()) assert result.success # Tables may appear in result.tables, result.media, or markdown has_tables = ( len(getattr(result, "tables", []) or []) > 0 or "tables" in result.media or "|" in str(result.markdown) ) assert has_tables, "Expected table data to be found in the result" @pytest.mark.asyncio async def test_tables_in_markdown(local_server): """Verify the markdown output contains table formatting with pipes and dashes.""" async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler: result = await crawler.arun(url=f"{local_server}/tables", config=CrawlerRunConfig()) assert result.success md = str(result.markdown) assert "|" in md, "Expected pipe character in markdown tables" assert "---" in md or "- -" in md, "Expected separator row in markdown tables" # --------------------------------------------------------------------------- # Metadata extraction # --------------------------------------------------------------------------- @pytest.mark.asyncio async def test_metadata_title(local_server): """Crawl /structured-data and verify the page title is in metadata.""" async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler: result = await crawler.arun( url=f"{local_server}/structured-data", config=CrawlerRunConfig() ) assert result.success assert result.metadata is not None # Title should be "Article with Structured Data" title = result.metadata.get("title", "") assert "Article with Structured Data" in title or "Structured Data" in title @pytest.mark.asyncio async def test_metadata_og_tags(local_server): """Verify og:title, og:description, og:image are present in metadata.""" async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler: result = await crawler.arun( url=f"{local_server}/structured-data", config=CrawlerRunConfig() ) assert result.success meta = result.metadata assert meta is not None # Check for og tags -- they may be stored with different key formats og_title = meta.get("og:title", meta.get("og_title", "")) og_desc = meta.get("og:description", meta.get("og_description", "")) og_image = meta.get("og:image", meta.get("og_image", "")) assert og_title, f"Missing og:title in metadata: {meta}" assert og_desc, f"Missing og:description in metadata: {meta}" assert og_image, f"Missing og:image in metadata: {meta}" @pytest.mark.asyncio async def test_metadata_description(local_server): """Verify meta description is present in metadata.""" async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler: result = await crawler.arun( url=f"{local_server}/structured-data", config=CrawlerRunConfig() ) assert result.success meta = result.metadata assert meta is not None desc = meta.get("description", "") assert desc, f"Missing description in metadata: {meta}" assert "web crawling" in desc.lower() @pytest.mark.asyncio @pytest.mark.network async def test_metadata_real(): """Crawl https://example.com and verify title metadata exists.""" async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler: result = await crawler.arun( url="https://example.com", config=CrawlerRunConfig() ) assert result.success assert result.metadata is not None title = result.metadata.get("title", "") assert title, "Expected title metadata from example.com" # --------------------------------------------------------------------------- # Excluded tags # --------------------------------------------------------------------------- @pytest.mark.asyncio async def test_excluded_tags_nav(local_server): """Crawl / with excluded_tags=["nav"] and verify navigation links are removed from cleaned_html.""" config = CrawlerRunConfig(excluded_tags=["nav"]) async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler: result = await crawler.arun(url=f"{local_server}/", config=config) assert result.success cleaned = result.cleaned_html or "" # The nav element contained links to Products, Links, Tables # After exclusion these should be absent from cleaned_html assert " assert "Footer content" not in md @pytest.mark.asyncio async def test_css_selector_product(local_server): """Crawl /products with css_selector targeting only product #1 and verify only the first product is extracted.""" config = CrawlerRunConfig(css_selector=".product[data-id='1']") async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler: result = await crawler.arun(url=f"{local_server}/products", config=config) assert result.success md = str(result.markdown) assert "Wireless Mouse" in md # Other products should not appear assert "Mechanical Keyboard" not in md assert "USB-C Hub" not in md # --------------------------------------------------------------------------- # Real URL content tests # --------------------------------------------------------------------------- @pytest.mark.asyncio @pytest.mark.network async def test_real_url_markdown_quality(): """Crawl https://example.com and verify markdown has reasonable content with more than 50 chars and contains 'Example Domain'.""" async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler: result = await crawler.arun( url="https://example.com", config=CrawlerRunConfig() ) assert result.success md = str(result.markdown) assert len(md) > 50, f"Markdown too short ({len(md)} chars)" assert "Example Domain" in md @pytest.mark.asyncio @pytest.mark.network async def test_real_url_links(): """Crawl https://books.toscrape.com and verify internal links (product links) and images (book covers) are found.""" async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler: result = await crawler.arun( url="https://books.toscrape.com", config=CrawlerRunConfig() ) assert result.success internal = result.links.get("internal", []) assert len(internal) > 0, "Expected product links on books.toscrape.com" images = result.media.get("images", []) assert len(images) > 0, "Expected book cover images on books.toscrape.com"