diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index 7d663414..e7946f0a 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -579,6 +579,11 @@ class BrowserConfig: process to reclaim leaked memory. 0 = disabled. Recommended: 500-1000 for long-running crawlers. Default: 0. + avoid_ads (bool): If True, blocks ad-related and tracker network requests at the + browser context level using a curated blocklist of top ad/tracker + domains. Default: False. + avoid_css (bool): If True, blocks loading of CSS files (css, less, scss, sass) to + reduce resource usage and speed up crawling. Default: False. """ def __init__( @@ -627,6 +632,8 @@ class BrowserConfig: debugging_port: int = 9222, host: str = "localhost", enable_stealth: bool = False, + avoid_ads: bool = False, + avoid_css: bool = False, init_scripts: List[str] = None, memory_saving_mode: bool = False, max_pages_before_recycle: int = 0, @@ -692,6 +699,8 @@ class BrowserConfig: self.debugging_port = debugging_port self.host = host self.enable_stealth = enable_stealth + self.avoid_ads = avoid_ads + self.avoid_css = avoid_css self.init_scripts = init_scripts if init_scripts is not None else [] self.memory_saving_mode = memory_saving_mode self.max_pages_before_recycle = max_pages_before_recycle @@ -785,6 +794,8 @@ class BrowserConfig: "debugging_port": self.debugging_port, "host": self.host, "enable_stealth": self.enable_stealth, + "avoid_ads": self.avoid_ads, + "avoid_css": self.avoid_css, "init_scripts": self.init_scripts, "memory_saving_mode": self.memory_saving_mode, "max_pages_before_recycle": self.max_pages_before_recycle, diff --git a/crawl4ai/browser_manager.py b/crawl4ai/browser_manager.py index 64ca153b..6cbe27f3 100644 --- a/crawl4ai/browser_manager.py +++ b/crawl4ai/browser_manager.py @@ -1258,59 +1258,47 @@ class BrowserManager: } proxy_settings = {"server": self.config.proxy} if self.config.proxy else None - blocked_extensions = [ + # CSS extensions (blocked separately 
via avoid_css flag) + css_extensions = ["css", "less", "scss", "sass"] + + # Static resource extensions (blocked when text_mode is enabled) + static_extensions = [ # Images - "jpg", - "jpeg", - "png", - "gif", - "webp", - "svg", - "ico", - "bmp", - "tiff", - "psd", + "jpg", "jpeg", "png", "gif", "webp", "svg", "ico", "bmp", "tiff", "psd", # Fonts - "woff", - "woff2", - "ttf", - "otf", - "eot", - # Styles - # 'css', 'less', 'scss', 'sass', + "woff", "woff2", "ttf", "otf", "eot", # Media - "mp4", - "webm", - "ogg", - "avi", - "mov", - "wmv", - "flv", - "m4v", - "mp3", - "wav", - "aac", - "m4a", - "opus", - "flac", + "mp4", "webm", "ogg", "avi", "mov", "wmv", "flv", "m4v", + "mp3", "wav", "aac", "m4a", "opus", "flac", # Documents - "pdf", - "doc", - "docx", - "xls", - "xlsx", - "ppt", - "pptx", + "pdf", "doc", "docx", "xls", "xlsx", "ppt", "pptx", # Archives - "zip", - "rar", - "7z", - "tar", - "gz", + "zip", "rar", "7z", "tar", "gz", # Scripts and data - "xml", - "swf", - "wasm", + "xml", "swf", "wasm", + ] + + # Ad and tracker domain patterns (curated from uBlock/EasyList sources) + ad_tracker_patterns = [ + "**/google-analytics.com/**", + "**/googletagmanager.com/**", + "**/googlesyndication.com/**", + "**/doubleclick.net/**", + "**/adservice.google.com/**", + "**/adsystem.com/**", + "**/adzerk.net/**", + "**/adnxs.com/**", + "**/ads.linkedin.com/**", + "**/facebook.net/**", + "**/analytics.twitter.com/**", + "**/ads-twitter.com/**", + "**/hotjar.com/**", + "**/clarity.ms/**", + "**/scorecardresearch.com/**", + "**/pixel.wp.com/**", + "**/amazon-adsystem.com/**", + "**/mixpanel.com/**", + "**/segment.com/**", ] # Common context settings @@ -1364,11 +1352,21 @@ class BrowserManager: # Create and return the context with all settings context = await self.browser.new_context(**context_settings) - # Apply text mode settings if enabled + # Build dynamic blocking list based on config flags + to_block = [] + if self.config.avoid_css: + to_block.extend(css_extensions) if 
self.config.text_mode: - # Create and apply route patterns for each extension - for ext in blocked_extensions: + to_block.extend(static_extensions) + + if to_block: + for ext in to_block: await context.route(f"**/*.{ext}", lambda route: route.abort()) + + if self.config.avoid_ads: + for pattern in ad_tracker_patterns: + await context.route(pattern, lambda route: route.abort()) + return context def _make_config_signature(self, crawlerRunConfig: CrawlerRunConfig) -> str: diff --git a/deploy/docker/api.py b/deploy/docker/api.py index 81cd312a..431a0d68 100644 --- a/deploy/docker/api.py +++ b/deploy/docker/api.py @@ -67,7 +67,8 @@ async def handle_llm_qa( config: dict ) -> str: """Process QA using LLM with crawled content as context.""" - from crawler_pool import get_crawler + from crawler_pool import get_crawler, release_crawler + crawler = None try: if not url.startswith(('http://', 'https://')) and not url.startswith(("raw:", "raw://")): url = 'https://' + url @@ -121,6 +122,9 @@ async def handle_llm_qa( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e) ) + finally: + if crawler: + await release_crawler(crawler) async def process_llm_extraction( redis: aioredis.Redis, @@ -249,6 +253,7 @@ async def handle_markdown_request( base_url: Optional[str] = None ) -> str: """Handle markdown generation requests.""" + crawler = None try: # Validate provider if using LLM filter if filter_type == FilterType.LLM: @@ -282,7 +287,7 @@ async def handle_markdown_request( cache_mode = CacheMode.ENABLED if cache == "1" else CacheMode.WRITE_ONLY - from crawler_pool import get_crawler + from crawler_pool import get_crawler, release_crawler from utils import load_config as _load_config _cfg = _load_config() browser_cfg = BrowserConfig( @@ -315,6 +320,9 @@ async def handle_markdown_request( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e) ) + finally: + if crawler: + await release_crawler(crawler) async def handle_llm_request( redis: aioredis.Redis, @@ -481,6 
+489,7 @@ async def stream_results(crawler: AsyncWebCrawler, results_gen: AsyncGenerator) """Stream results with heartbeats and completion markers.""" import json from utils import datetime_handler + from crawler_pool import release_crawler try: async for result in results_gen: @@ -507,11 +516,8 @@ async def stream_results(crawler: AsyncWebCrawler, results_gen: AsyncGenerator) except asyncio.CancelledError: logger.warning("Client disconnected during streaming") finally: - # try: - # await crawler.close() - # except Exception as e: - # logger.error(f"Crawler cleanup error: {e}") - pass + if crawler: + await release_crawler(crawler) async def handle_crawl_request( urls: List[str], @@ -523,6 +529,7 @@ async def handle_crawl_request( """Handle non-streaming crawl requests with optional hooks.""" # Track request start request_id = f"req_{uuid4().hex[:8]}" + crawler = None try: from monitor import get_monitor await get_monitor().track_request_start( @@ -549,11 +556,8 @@ async def handle_crawl_request( ) if config["crawler"]["rate_limiter"]["enabled"] else None ) - from crawler_pool import get_crawler + from crawler_pool import get_crawler, release_crawler crawler = await get_crawler(browser_config) - - # crawler: AsyncWebCrawler = AsyncWebCrawler(config=browser_config) - # await crawler.start() # Attach hooks if provided hooks_status = {} @@ -589,8 +593,6 @@ async def handle_crawl_request( if not isinstance(results, list): results = [results] - # await crawler.close() - end_mem_mb = _get_memory_mb() # <--- Get memory after end_time = time.time() @@ -689,13 +691,6 @@ async def handle_crawl_request( except: pass - if 'crawler' in locals() and crawler.ready: # Check if crawler was initialized and started - # try: - # await crawler.close() - # except Exception as close_e: - # logger.error(f"Error closing crawler during exception handling: {close_e}") - logger.error(f"Error closing crawler during exception handling: {str(e)}") - # Measure memory even on error if possible 
end_mem_mb_error = _get_memory_mb() if start_mem_mb is not None and end_mem_mb_error is not None: @@ -709,6 +704,9 @@ async def handle_crawl_request( "server_peak_memory_mb": max(peak_mem_mb if peak_mem_mb else 0, end_mem_mb_error or 0) }) ) + finally: + if crawler: + await release_crawler(crawler) async def handle_stream_crawl_request( urls: List[str], @@ -719,6 +717,7 @@ async def handle_stream_crawl_request( ) -> Tuple[AsyncWebCrawler, AsyncGenerator, Optional[Dict]]: """Handle streaming crawl requests with optional hooks.""" hooks_info = None + crawler = None try: browser_config = BrowserConfig.load(browser_config) # browser_config.verbose = True # Set to False or remove for production stress testing @@ -734,11 +733,8 @@ async def handle_stream_crawl_request( ) ) - from crawler_pool import get_crawler + from crawler_pool import get_crawler, release_crawler crawler = await get_crawler(browser_config) - - # crawler = AsyncWebCrawler(config=browser_config) - # await crawler.start() # Attach hooks if provided if hooks_config: @@ -763,13 +759,10 @@ async def handle_stream_crawl_request( return crawler, results_gen, hooks_info except Exception as e: - # Make sure to close crawler if started during an error here - if 'crawler' in locals() and crawler.ready: - # try: - # await crawler.close() - # except Exception as close_e: - # logger.error(f"Error closing crawler during stream setup exception: {close_e}") - logger.error(f"Error closing crawler during stream setup exception: {str(e)}") + # Release crawler on setup error (for successful streams, + # release happens in stream_results finally block) + if crawler: + await release_crawler(crawler) logger.error(f"Stream crawl error: {str(e)}", exc_info=True) # Raising HTTPException here will prevent streaming response raise HTTPException( diff --git a/deploy/docker/crawler_pool.py b/deploy/docker/crawler_pool.py index 509cbba9..c310588d 100644 --- a/deploy/docker/crawler_pool.py +++ b/deploy/docker/crawler_pool.py @@ -39,6 
+39,9 @@ async def get_crawler(cfg: BrowserConfig) -> AsyncWebCrawler: if PERMANENT and _is_default_config(sig): LAST_USED[sig] = time.time() USAGE_COUNT[sig] = USAGE_COUNT.get(sig, 0) + 1 + if not hasattr(PERMANENT, 'active_requests'): + PERMANENT.active_requests = 0 + PERMANENT.active_requests += 1 logger.info("πŸ”₯ Using permanent browser") return PERMANENT @@ -46,13 +49,21 @@ async def get_crawler(cfg: BrowserConfig) -> AsyncWebCrawler: if sig in HOT_POOL: LAST_USED[sig] = time.time() USAGE_COUNT[sig] = USAGE_COUNT.get(sig, 0) + 1 - logger.info(f"♨️ Using hot pool browser (sig={sig[:8]})") - return HOT_POOL[sig] + crawler = HOT_POOL[sig] + if not hasattr(crawler, 'active_requests'): + crawler.active_requests = 0 + crawler.active_requests += 1 + logger.info(f"♨️ Using hot pool browser (sig={sig[:8]}, active={crawler.active_requests})") + return crawler # Check cold pool (promote to hot if used 3+ times) if sig in COLD_POOL: LAST_USED[sig] = time.time() USAGE_COUNT[sig] = USAGE_COUNT.get(sig, 0) + 1 + crawler = COLD_POOL[sig] + if not hasattr(crawler, 'active_requests'): + crawler.active_requests = 0 + crawler.active_requests += 1 if USAGE_COUNT[sig] >= 3: logger.info(f"⬆️ Promoting to hot pool (sig={sig[:8]}, count={USAGE_COUNT[sig]})") @@ -68,7 +79,7 @@ async def get_crawler(cfg: BrowserConfig) -> AsyncWebCrawler: return HOT_POOL[sig] logger.info(f"❄️ Using cold pool browser (sig={sig[:8]})") - return COLD_POOL[sig] + return crawler # Memory check before creating new mem_pct = get_container_memory_percent() @@ -80,11 +91,23 @@ async def get_crawler(cfg: BrowserConfig) -> AsyncWebCrawler: logger.info(f"πŸ†• Creating new browser in cold pool (sig={sig[:8]}, mem={mem_pct:.1f}%)") crawler = AsyncWebCrawler(config=cfg, thread_safe=False) await crawler.start() + crawler.active_requests = 1 COLD_POOL[sig] = crawler LAST_USED[sig] = time.time() USAGE_COUNT[sig] = 1 return crawler +async def release_crawler(crawler: AsyncWebCrawler): + """Decrement active request count 
for a pooled crawler. + + Call this in a finally block after finishing work with a crawler + obtained via get_crawler() so the janitor knows when it's safe + to close idle browsers. + """ + async with LOCK: + if hasattr(crawler, 'active_requests'): + crawler.active_requests = max(0, crawler.active_requests - 1) + async def init_permanent(cfg: BrowserConfig): """Initialize permanent default browser.""" global PERMANENT, DEFAULT_CONFIG_SIG @@ -132,10 +155,13 @@ async def janitor(): # Clean cold pool for sig in list(COLD_POOL.keys()): if now - LAST_USED.get(sig, now) > cold_ttl: + crawler = COLD_POOL[sig] + if getattr(crawler, 'active_requests', 0) > 0: + continue # still serving requests, skip idle_time = now - LAST_USED[sig] logger.info(f"🧹 Closing cold browser (sig={sig[:8]}, idle={idle_time:.0f}s)") with suppress(Exception): - await COLD_POOL[sig].close() + await crawler.close() COLD_POOL.pop(sig, None) LAST_USED.pop(sig, None) USAGE_COUNT.pop(sig, None) @@ -150,10 +176,13 @@ async def janitor(): # Clean hot pool (more conservative) for sig in list(HOT_POOL.keys()): if now - LAST_USED.get(sig, now) > hot_ttl: + crawler = HOT_POOL[sig] + if getattr(crawler, 'active_requests', 0) > 0: + continue # still serving requests, skip idle_time = now - LAST_USED[sig] logger.info(f"🧹 Closing hot browser (sig={sig[:8]}, idle={idle_time:.0f}s)") with suppress(Exception): - await HOT_POOL[sig].close() + await crawler.close() HOT_POOL.pop(sig, None) LAST_USED.pop(sig, None) USAGE_COUNT.pop(sig, None) diff --git a/deploy/docker/server.py b/deploy/docker/server.py index e4d5b055..210f421b 100644 --- a/deploy/docker/server.py +++ b/deploy/docker/server.py @@ -7,7 +7,7 @@ Crawl4AI FastAPI entry‑point """ # ── stdlib & 3rd‑party imports ─────────────────────────────── -from crawler_pool import get_crawler, close_all, janitor +from crawler_pool import get_crawler, release_crawler, close_all, janitor from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig from 
crawl4ai.__version__ import __version__ from auth import create_access_token, get_token_dependency, TokenRequest @@ -367,8 +367,8 @@ async def generate_html( Use when you need sanitized HTML structures for building schemas or further processing. """ validate_url_scheme(body.url, allow_raw=True) - from crawler_pool import get_crawler cfg = CrawlerRunConfig() + crawler = None try: crawler = await get_crawler(get_default_browser_config()) results = await crawler.arun(url=body.url, config=cfg) @@ -381,6 +381,9 @@ async def generate_html( return JSONResponse({"html": processed_html, "url": body.url, "success": True}) except Exception as e: raise HTTPException(500, detail=str(e)) + finally: + if crawler: + await release_crawler(crawler) # Screenshot endpoint @@ -399,7 +402,7 @@ async def generate_screenshot( Then in result instead of the screenshot you will get a path to the saved file. """ validate_url_scheme(body.url) - from crawler_pool import get_crawler + crawler = None try: cfg = CrawlerRunConfig(screenshot=True, screenshot_wait_for=body.screenshot_wait_for) crawler = await get_crawler(get_default_browser_config()) @@ -416,6 +419,9 @@ async def generate_screenshot( return {"success": True, "screenshot": screenshot_data} except Exception as e: raise HTTPException(500, detail=str(e)) + finally: + if crawler: + await release_crawler(crawler) # PDF endpoint @@ -434,7 +440,7 @@ async def generate_pdf( Then in result instead of the PDF you will get a path to the saved file. 
""" validate_url_scheme(body.url) - from crawler_pool import get_crawler + crawler = None try: cfg = CrawlerRunConfig(pdf=True) crawler = await get_crawler(get_default_browser_config()) @@ -451,6 +457,9 @@ async def generate_pdf( return {"success": True, "pdf": base64.b64encode(pdf_data).decode()} except Exception as e: raise HTTPException(500, detail=str(e)) + finally: + if crawler: + await release_crawler(crawler) @app.post("/execute_js") @@ -507,7 +516,7 @@ async def execute_js( """ validate_url_scheme(body.url) - from crawler_pool import get_crawler + crawler = None try: cfg = CrawlerRunConfig(js_code=body.scripts) crawler = await get_crawler(get_default_browser_config()) @@ -518,6 +527,9 @@ async def execute_js( return JSONResponse(data) except Exception as e: raise HTTPException(500, detail=str(e)) + finally: + if crawler: + await release_crawler(crawler) @app.get("/llm/{url:path}") diff --git a/docs/md_v2/api/parameters.md b/docs/md_v2/api/parameters.md index 064a2388..439d0ef6 100644 --- a/docs/md_v2/api/parameters.md +++ b/docs/md_v2/api/parameters.md @@ -49,6 +49,8 @@ browser_cfg = BrowserConfig( | **`user_agent_generator_config`** | `dict` (default: `{}`) | Configuration dict for user agent generation when `user_agent_mode="random"`. | | **`text_mode`** | `bool` (default: `False`) | If `True`, tries to disable images/other heavy content for speed. | | **`light_mode`** | `bool` (default: `False`) | Disables some background features for performance gains. | +| **`avoid_ads`** | `bool` (default: `False`) | If `True`, blocks requests to common ad/tracker domains (Google Analytics, DoubleClick, Facebook, Hotjar, etc.) at the browser context level. | +| **`avoid_css`** | `bool` (default: `False`) | If `True`, blocks loading of CSS files (`.css`, `.less`, `.scss`, `.sass`) for faster, leaner crawls when only text content is needed. | | **`extra_args`** | `list` (default: `[]`) | Additional flags for the underlying browser process, e.g. 
`["--disable-extensions"]`. | | **`enable_stealth`** | `bool` (default: `False`) | Enable playwright-stealth mode to bypass bot detection. Cannot be used with `browser_mode="builtin"`. | diff --git a/docs/md_v2/complete-sdk-reference.md b/docs/md_v2/complete-sdk-reference.md index e79b63b8..8f263fae 100644 --- a/docs/md_v2/complete-sdk-reference.md +++ b/docs/md_v2/complete-sdk-reference.md @@ -1402,6 +1402,8 @@ class BrowserConfig: user_agent=None, text_mode=False, light_mode=False, + avoid_ads=False, + avoid_css=False, extra_args=None, enable_stealth=False, # ... other advanced parameters omitted here @@ -1440,15 +1442,19 @@ class BrowserConfig: 8. **`user_agent`**: - Custom User-Agent string. If `None`, a default is used. - You can also set `user_agent_mode="random"` for randomization (if you want to fight bot detection). -9. **`text_mode`** & **`light_mode`**: - - `text_mode=True` disables images, possibly speeding up text-only crawls. - - `light_mode=True` turns off certain background features for performance. -10. **`extra_args`**: - - Additional flags for the underlying browser. +9. **`text_mode`** & **`light_mode`**: + - `text_mode=True` disables images, possibly speeding up text-only crawls. + - `light_mode=True` turns off certain background features for performance. +10. **`avoid_ads`** & **`avoid_css`**: + - `avoid_ads=True` blocks requests to common ad and tracker domains (Google Analytics, DoubleClick, Facebook, Hotjar, etc.) at the browser context level. Reduces network overhead and memory usage. + - `avoid_css=True` blocks loading of CSS files (`.css`, `.less`, `.scss`, `.sass`), useful when you only need text content and want faster, leaner crawls. + - Both default to `False` (opt-in). Can be combined with each other and with `text_mode`. +11. **`extra_args`**: + - Additional flags for the underlying browser. - E.g. `["--disable-extensions"]`. -11. **`enable_stealth`**: - - If `True`, enables stealth mode using playwright-stealth. 
- - Modifies browser fingerprints to avoid basic bot detection. +12. **`enable_stealth`**: + - If `True`, enables stealth mode using playwright-stealth. + - Modifies browser fingerprints to avoid basic bot detection. - Default is `False`. Recommended for sites with bot protection. ### Helper Methods Both configuration classes provide a `clone()` method to create modified copies: diff --git a/docs/md_v2/core/browser-crawler-config.md b/docs/md_v2/core/browser-crawler-config.md index 983fd071..d9946c68 100644 --- a/docs/md_v2/core/browser-crawler-config.md +++ b/docs/md_v2/core/browser-crawler-config.md @@ -109,17 +109,22 @@ class BrowserConfig: - `user_agent`: Custom User-Agent string. If `None`, a default is used. - `user_agent_mode`: Set to `"random"` for randomization (helps fight bot detection). -12.β €**`text_mode`** & **`light_mode`** - - `text_mode=True` disables images, possibly speeding up text-only crawls. - - `light_mode=True` turns off certain background features for performance. +12.β €**`text_mode`** & **`light_mode`** + - `text_mode=True` disables images, possibly speeding up text-only crawls. + - `light_mode=True` turns off certain background features for performance. -13.β €**`extra_args`** +13.β €**`avoid_ads`** & **`avoid_css`** + - `avoid_ads=True` blocks requests to common ad and tracker domains (Google Analytics, DoubleClick, Facebook, Hotjar, etc.) at the browser context level. Reduces network overhead and memory usage. + - `avoid_css=True` blocks loading of CSS files (`.css`, `.less`, `.scss`, `.sass`), useful when you only need text content and want faster, leaner crawls. + - Both default to `False` (opt-in). Can be combined with each other and with `text_mode`. + +14.β €**`extra_args`** - Additional flags for the underlying browser. - E.g. `["--disable-extensions"]`. -14.β €**`enable_stealth`** - - If `True`, enables stealth mode using playwright-stealth. - - Modifies browser fingerprints to avoid basic bot detection. 
+15.β €**`enable_stealth`** + - If `True`, enables stealth mode using playwright-stealth. + - Modifies browser fingerprints to avoid basic bot detection. - Default is `False`. Recommended for sites with bot protection. ### Helper Methods diff --git a/tests/browser/test_resource_filtering.py b/tests/browser/test_resource_filtering.py new file mode 100644 index 00000000..552aadd2 --- /dev/null +++ b/tests/browser/test_resource_filtering.py @@ -0,0 +1,178 @@ +"""E2E tests for avoid_ads / avoid_css resource filtering. + +These tests launch real browsers and crawl real websites to verify +that route-based resource blocking actually works. + +Domains used: + - books.toscrape.com (CSS-heavy practice site, designed for scraping) + - quotes.toscrape.com (simple practice site) + - httpbin.org/html (static HTML, no trackers) + - en.wikipedia.org (real site with analytics) +""" + +import pytest +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig + + +# --------------------------------------------------------------------------- +# Basic success tests β€” flags should not break crawling +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_crawl_with_avoid_css_succeeds(): + """Crawl books.toscrape.com with avoid_css=True β€” page should load fine.""" + browser_config = BrowserConfig(headless=True, avoid_css=True) + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun( + url="https://books.toscrape.com", + config=CrawlerRunConfig(cache_mode="bypass"), + ) + assert result.success, f"Crawl failed: {result.error_message}" + assert len(result.html) > 500, "Page HTML is suspiciously short" + + +@pytest.mark.asyncio +async def test_crawl_with_avoid_ads_succeeds(): + """Crawl Wikipedia with avoid_ads=True β€” content should be intact.""" + browser_config = BrowserConfig(headless=True, avoid_ads=True) + async with AsyncWebCrawler(config=browser_config) as crawler: 
+ result = await crawler.arun( + url="https://en.wikipedia.org/wiki/Web_scraping", + config=CrawlerRunConfig(cache_mode="bypass"), + ) + assert result.success, f"Crawl failed: {result.error_message}" + # Wikipedia article content must be present + html_lower = result.html.lower() + assert "web scraping" in html_lower, "Wikipedia content missing" + + +@pytest.mark.asyncio +async def test_crawl_with_both_flags_succeeds(): + """Both avoid_css and avoid_ads enabled simultaneously.""" + browser_config = BrowserConfig(headless=True, avoid_css=True, avoid_ads=True) + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun( + url="https://quotes.toscrape.com", + config=CrawlerRunConfig(cache_mode="bypass"), + ) + assert result.success, f"Crawl failed: {result.error_message}" + html_lower = result.html.lower() + assert "quote" in html_lower or "toscrape" in html_lower + + +@pytest.mark.asyncio +async def test_avoid_ads_does_not_block_page_content(): + """avoid_ads must not interfere with first-party page content.""" + browser_config = BrowserConfig(headless=True, avoid_ads=True) + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun( + url="https://httpbin.org/html", + config=CrawlerRunConfig(cache_mode="bypass"), + ) + assert result.success, f"Crawl failed: {result.error_message}" + # httpbin.org/html serves a Moby Dick excerpt + assert "Herman Melville" in result.html, "First-party content missing" + + +# --------------------------------------------------------------------------- +# Network-level verification β€” prove routes actually block requests +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_without_flags_css_loads_normally(): + """Baseline: without avoid_css, CSS responses should appear in network log.""" + browser_config = BrowserConfig(headless=True) + async with AsyncWebCrawler(config=browser_config) as crawler: + 
result = await crawler.arun( + url="https://books.toscrape.com", + config=CrawlerRunConfig( + cache_mode="bypass", + capture_network_requests=True, + ), + ) + assert result.success + assert result.network_requests is not None, "Network requests not captured" + + # There should be successful CSS responses + css_responses = [ + r + for r in result.network_requests + if r.get("event_type") == "response" and ".css" in r.get("url", "") + ] + assert ( + len(css_responses) > 0 + ), "CSS should load normally without avoid_css flag" + + +@pytest.mark.asyncio +async def test_avoid_css_blocks_css_requests(): + """With avoid_css=True, CSS requests must be aborted (no successful responses).""" + browser_config = BrowserConfig(headless=True, avoid_css=True) + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun( + url="https://books.toscrape.com", + config=CrawlerRunConfig( + cache_mode="bypass", + capture_network_requests=True, + ), + ) + assert result.success + assert result.network_requests is not None, "Network requests not captured" + + # No CSS should have gotten a successful response + css_responses = [ + r + for r in result.network_requests + if r.get("event_type") == "response" and ".css" in r.get("url", "") + ] + assert ( + len(css_responses) == 0 + ), f"CSS responses should be blocked, but found: {[r['url'] for r in css_responses]}" + + # There SHOULD be request_failed events for CSS (proves blocking happened) + css_failures = [ + r + for r in result.network_requests + if r.get("event_type") == "request_failed" + and ".css" in r.get("url", "") + ] + assert ( + len(css_failures) > 0 + ), "Expected request_failed events for blocked CSS files" + + +@pytest.mark.asyncio +async def test_avoid_css_with_text_mode_combines(): + """Both avoid_css and text_mode should combine their blocking rules.""" + browser_config = BrowserConfig( + headless=True, avoid_css=True, text_mode=True + ) + async with AsyncWebCrawler(config=browser_config) as 
crawler: + result = await crawler.arun( + url="https://books.toscrape.com", + config=CrawlerRunConfig( + cache_mode="bypass", + capture_network_requests=True, + ), + ) + assert result.success + assert result.network_requests is not None + + successful = [ + r for r in result.network_requests if r.get("event_type") == "response" + ] + + # CSS should be blocked (via avoid_css) + css_hits = [r for r in successful if ".css" in r.get("url", "")] + assert len(css_hits) == 0, "CSS should be blocked by avoid_css" + + # Images should be blocked (via text_mode) + img_exts = (".jpg", ".jpeg", ".png", ".gif", ".webp") + img_hits = [ + r + for r in successful + if any(r.get("url", "").lower().endswith(ext) for ext in img_exts) + ] + assert len(img_hits) == 0, "Images should be blocked by text_mode" diff --git a/tests/docker/test_pool_release.py b/tests/docker/test_pool_release.py new file mode 100644 index 00000000..6c81b3e5 --- /dev/null +++ b/tests/docker/test_pool_release.py @@ -0,0 +1,155 @@ +"""Tests for crawler pool release_crawler() and active_requests tracking. + +These tests validate the pool lifecycle without requiring Docker or a running +server. They test the release logic directly using mock crawler objects. 
+"""
+
+import asyncio
+import pytest
+from unittest.mock import MagicMock
+
+
+# ---------------------------------------------------------------------------
+# Standalone release_crawler implementation for testing
+# (mirrors the logic that will be added to deploy/docker/crawler_pool.py)
+# ---------------------------------------------------------------------------
+
+_TEST_LOCK = asyncio.Lock()
+
+
+async def _release_crawler(crawler, lock=None):
+    """Decrement crawler.active_requests (min 0) under lock; no-op if the attribute is absent."""
+    lock = _TEST_LOCK if lock is None else lock  # default: shared module-level lock
+    async with lock:
+        if hasattr(crawler, "active_requests"):
+            crawler.active_requests = max(0, crawler.active_requests - 1)
+
+
+# ---------------------------------------------------------------------------
+# Tests
+# ---------------------------------------------------------------------------
+
+
+class TestReleaseCrawler:
+    """Tests for the release_crawler function."""
+
+    @pytest.mark.asyncio
+    async def test_release_decrements_active_requests(self):
+        """release_crawler should decrement active_requests by 1."""
+        crawler = MagicMock()
+        crawler.active_requests = 3
+
+        await _release_crawler(crawler)
+        assert crawler.active_requests == 2
+
+    @pytest.mark.asyncio
+    async def test_release_floors_at_zero(self):
+        """active_requests should never go below 0."""
+        crawler = MagicMock()
+        crawler.active_requests = 0
+
+        await _release_crawler(crawler)
+        assert crawler.active_requests == 0
+
+    @pytest.mark.asyncio
+    async def test_release_from_one_to_zero(self):
+        """Standard case: single request finishes."""
+        crawler = MagicMock()
+        crawler.active_requests = 1
+
+        await _release_crawler(crawler)
+        assert crawler.active_requests == 0
+
+    @pytest.mark.asyncio
+    async def test_release_handles_missing_attribute(self):
+        """Should not crash if crawler has no active_requests attribute."""
+        crawler = MagicMock(spec=[])  # no attributes at all
+        await _release_crawler(crawler)  # must not raise
+        assert not hasattr(crawler, "active_requests")  # and must not invent the counter
+
+    @pytest.mark.asyncio
+    async def test_multiple_releases_decrement_correctly(self):
+        """Multiple sequential releases should each decrement by 1."""
+        crawler = MagicMock()
+        crawler.active_requests = 5
+
+        for expected in [4, 3, 2, 1, 0, 0]:  # last one should floor at 0
+            await _release_crawler(crawler)
+            assert crawler.active_requests == expected
+
+    @pytest.mark.asyncio
+    async def test_concurrent_releases_are_safe(self):
+        """Concurrent releases should not corrupt the counter."""
+        crawler = MagicMock()
+        crawler.active_requests = 100
+        lock = asyncio.Lock()  # one lock shared by every task below
+
+        async def release_n_times(n):
+            for _ in range(n):
+                await _release_crawler(crawler, lock=lock)
+
+        # 10 concurrent tasks each releasing 10 times = 100 total
+        tasks = [asyncio.create_task(release_n_times(10)) for _ in range(10)]
+        await asyncio.gather(*tasks)
+
+        assert crawler.active_requests == 0
+
+
+class TestActiveRequestsTracking:
+    """Tests for the get/release lifecycle pattern."""
+
+    @pytest.mark.asyncio
+    async def test_get_sets_active_requests(self):
+        """Simulated get_crawler should set active_requests to 1 for new crawlers."""
+        crawler = MagicMock()
+        # Simulate what get_crawler does for a new browser (no pool code is called here)
+        crawler.active_requests = 1
+
+        assert crawler.active_requests == 1
+
+    @pytest.mark.asyncio
+    async def test_get_increments_existing(self):
+        """Simulated get_crawler should increment for existing pooled crawlers."""
+        crawler = MagicMock()
+        crawler.active_requests = 2
+
+        # Simulate another get_crawler call returning same browser (plain increment on the mock)
+        crawler.active_requests += 1
+        assert crawler.active_requests == 3
+
+    @pytest.mark.asyncio
+    async def test_full_get_release_lifecycle(self):
+        """Full lifecycle: get -> use -> release -> get -> release."""
+        crawler = MagicMock()
+
+        # First request gets the crawler
+        crawler.active_requests = 1
+
+        # Second concurrent request gets same crawler
+        crawler.active_requests += 1
+        assert crawler.active_requests == 2
+
+        # First request finishes
+        await _release_crawler(crawler)
+        assert crawler.active_requests == 1
+
+        # Second request finishes
+        await _release_crawler(crawler)
+        assert crawler.active_requests == 0
+
+    @pytest.mark.asyncio
+    async def test_janitor_safety_check(self):
+        """Janitor should only close browsers with active_requests == 0."""
+        crawler = MagicMock()
+        crawler.active_requests = 1
+
+        # Janitor check: should NOT close
+        should_close = getattr(crawler, "active_requests", 0) == 0
+        assert should_close is False
+
+        # Request finishes
+        await _release_crawler(crawler)
+
+        # Janitor check: now safe to close
+        should_close = getattr(crawler, "active_requests", 0) == 0
+        assert should_close is True
diff --git a/tests/unit/test_resource_filtering_config.py b/tests/unit/test_resource_filtering_config.py
new file mode 100644
index 00000000..738cc46c
--- /dev/null
+++ b/tests/unit/test_resource_filtering_config.py
@@ -0,0 +1,100 @@
+"""Unit tests for BrowserConfig avoid_ads / avoid_css flags.
+
+Tests the config plumbing: defaults, serialization, cloning, roundtrips.
+No browser or network required.
+"""
+
+import pytest
+from crawl4ai.async_configs import BrowserConfig
+
+
+@pytest.fixture(autouse=True)
+def _reset_defaults():
+    """Call BrowserConfig.reset_defaults() before and after each test (autouse)."""
+    BrowserConfig.reset_defaults()
+    yield  # the test runs while the fixture is suspended here
+    BrowserConfig.reset_defaults()
+
+
+class TestResourceFilteringDefaults:
+    """Both flags must default to False (opt-in only)."""
+
+    def test_default_values_are_false(self):  # no keyword arguments given
+        config = BrowserConfig()
+        assert config.avoid_ads is False
+        assert config.avoid_css is False
+
+    def test_custom_values(self):  # both flags switched on together
+        config = BrowserConfig(avoid_ads=True, avoid_css=True)
+        assert config.avoid_ads is True
+        assert config.avoid_css is True
+
+    def test_mixed_values(self):  # each flag is set independently of the other
+        c1 = BrowserConfig(avoid_ads=True, avoid_css=False)
+        assert c1.avoid_ads is True
+        assert c1.avoid_css is False
+
+        c2 = BrowserConfig(avoid_ads=False, avoid_css=True)
+        assert c2.avoid_ads is False
+        assert c2.avoid_css is True
+
+
+class TestResourceFilteringSerialization:
+    """Flags must survive to_dict / from_kwargs / dump / load roundtrips."""
+
+    def test_to_dict_includes_flags(self):  # checks key presence and the stored values
+        config = BrowserConfig(avoid_ads=True, avoid_css=True)
+        d = config.to_dict()
+        assert "avoid_ads" in d
+        assert "avoid_css" in d
+        assert d["avoid_ads"] is True
+        assert d["avoid_css"] is True
+
+    def test_to_dict_includes_false_values(self):  # False flags must still appear as keys
+        config = BrowserConfig()
+        d = config.to_dict()
+        assert d["avoid_ads"] is False
+        assert d["avoid_css"] is False
+
+    def test_from_kwargs_roundtrip(self):  # to_dict() output is fed straight into from_kwargs()
+        original = BrowserConfig(avoid_ads=True, avoid_css=False)
+        d = original.to_dict()
+        restored = BrowserConfig.from_kwargs(d)
+        assert restored.avoid_ads is True
+        assert restored.avoid_css is False
+
+    def test_from_kwargs_with_true_values(self):  # dict holding only the two flags
+        restored = BrowserConfig.from_kwargs({"avoid_ads": True, "avoid_css": True})
+        assert restored.avoid_ads is True
+        assert restored.avoid_css is True
+
+    def test_dump_load_roundtrip(self):  # dump() output is fed straight into load()
+        original = BrowserConfig(avoid_ads=True, avoid_css=True)
+        dumped = original.dump()
+        restored = BrowserConfig.load(dumped)
+        assert restored.avoid_ads is True
+        assert restored.avoid_css is True
+
+
+class TestResourceFilteringClone:
+    """clone() must preserve flags and allow overrides."""
+
+    def test_clone_preserves_flags(self):  # clone() called with no overrides
+        config = BrowserConfig(avoid_ads=True, avoid_css=True)
+        cloned = config.clone()
+        assert cloned.avoid_ads is True
+        assert cloned.avoid_css is True
+
+    def test_clone_allows_override(self):  # keyword passed to clone() replaces that one flag
+        config = BrowserConfig(avoid_ads=True, avoid_css=False)
+        cloned = config.clone(avoid_css=True)
+        assert cloned.avoid_ads is True
+        assert cloned.avoid_css is True
+        # original unchanged
+        assert config.avoid_css is False
+
+    def test_clone_can_disable_flag(self):  # an override may also turn a flag off
+        config = BrowserConfig(avoid_ads=True, avoid_css=True)
+        cloned = config.clone(avoid_ads=False)
+        assert cloned.avoid_ads is False
+        assert cloned.avoid_css is True