feat: add avoid_ads/avoid_css resource filtering and pool release lifecycle

Add opt-in BrowserConfig flags (avoid_ads, avoid_css) for blocking ad/tracker
domains and CSS resources at the browser context level. Refactor crawler pool
with release_crawler() and active_requests tracking to prevent janitor from
closing browsers with in-flight requests. Add proper finally blocks to all
Docker API/server handlers. Update docs for new config options.

Inspired by #1689.
This commit is contained in:
unclecode
2026-02-25 05:56:29 +00:00
parent 8d35d17d01
commit c0912f7234
11 changed files with 595 additions and 106 deletions

View File

@@ -579,6 +579,11 @@ class BrowserConfig:
process to reclaim leaked memory. 0 = disabled.
Recommended: 500-1000 for long-running crawlers.
Default: 0.
avoid_ads (bool): If True, blocks ad-related and tracker network requests at the
browser context level using a curated blocklist of top ad/tracker
domains. Default: False.
avoid_css (bool): If True, blocks loading of CSS files (css, less, scss, sass) to
reduce resource usage and speed up crawling. Default: False.
"""
def __init__(
@@ -627,6 +632,8 @@ class BrowserConfig:
debugging_port: int = 9222,
host: str = "localhost",
enable_stealth: bool = False,
avoid_ads: bool = False,
avoid_css: bool = False,
init_scripts: List[str] = None,
memory_saving_mode: bool = False,
max_pages_before_recycle: int = 0,
@@ -692,6 +699,8 @@ class BrowserConfig:
self.debugging_port = debugging_port
self.host = host
self.enable_stealth = enable_stealth
self.avoid_ads = avoid_ads
self.avoid_css = avoid_css
self.init_scripts = init_scripts if init_scripts is not None else []
self.memory_saving_mode = memory_saving_mode
self.max_pages_before_recycle = max_pages_before_recycle
@@ -785,6 +794,8 @@ class BrowserConfig:
"debugging_port": self.debugging_port,
"host": self.host,
"enable_stealth": self.enable_stealth,
"avoid_ads": self.avoid_ads,
"avoid_css": self.avoid_css,
"init_scripts": self.init_scripts,
"memory_saving_mode": self.memory_saving_mode,
"max_pages_before_recycle": self.max_pages_before_recycle,

View File

@@ -1258,59 +1258,47 @@ class BrowserManager:
}
proxy_settings = {"server": self.config.proxy} if self.config.proxy else None
blocked_extensions = [
# CSS extensions (blocked separately via avoid_css flag)
css_extensions = ["css", "less", "scss", "sass"]
# Static resource extensions (blocked when text_mode is enabled)
static_extensions = [
# Images
"jpg",
"jpeg",
"png",
"gif",
"webp",
"svg",
"ico",
"bmp",
"tiff",
"psd",
"jpg", "jpeg", "png", "gif", "webp", "svg", "ico", "bmp", "tiff", "psd",
# Fonts
"woff",
"woff2",
"ttf",
"otf",
"eot",
# Styles
# 'css', 'less', 'scss', 'sass',
"woff", "woff2", "ttf", "otf", "eot",
# Media
"mp4",
"webm",
"ogg",
"avi",
"mov",
"wmv",
"flv",
"m4v",
"mp3",
"wav",
"aac",
"m4a",
"opus",
"flac",
"mp4", "webm", "ogg", "avi", "mov", "wmv", "flv", "m4v",
"mp3", "wav", "aac", "m4a", "opus", "flac",
# Documents
"pdf",
"doc",
"docx",
"xls",
"xlsx",
"ppt",
"pptx",
"pdf", "doc", "docx", "xls", "xlsx", "ppt", "pptx",
# Archives
"zip",
"rar",
"7z",
"tar",
"gz",
"zip", "rar", "7z", "tar", "gz",
# Scripts and data
"xml",
"swf",
"wasm",
"xml", "swf", "wasm",
]
# Ad and tracker domain patterns (curated from uBlock/EasyList sources)
ad_tracker_patterns = [
"**/google-analytics.com/**",
"**/googletagmanager.com/**",
"**/googlesyndication.com/**",
"**/doubleclick.net/**",
"**/adservice.google.com/**",
"**/adsystem.com/**",
"**/adzerk.net/**",
"**/adnxs.com/**",
"**/ads.linkedin.com/**",
"**/facebook.net/**",
"**/analytics.twitter.com/**",
"**/ads-twitter.com/**",
"**/hotjar.com/**",
"**/clarity.ms/**",
"**/scorecardresearch.com/**",
"**/pixel.wp.com/**",
"**/amazon-adsystem.com/**",
"**/mixpanel.com/**",
"**/segment.com/**",
]
# Common context settings
@@ -1364,11 +1352,21 @@ class BrowserManager:
# Create and return the context with all settings
context = await self.browser.new_context(**context_settings)
# Apply text mode settings if enabled
# Build dynamic blocking list based on config flags
to_block = []
if self.config.avoid_css:
to_block.extend(css_extensions)
if self.config.text_mode:
# Create and apply route patterns for each extension
for ext in blocked_extensions:
to_block.extend(static_extensions)
if to_block:
for ext in to_block:
await context.route(f"**/*.{ext}", lambda route: route.abort())
if self.config.avoid_ads:
for pattern in ad_tracker_patterns:
await context.route(pattern, lambda route: route.abort())
return context
def _make_config_signature(self, crawlerRunConfig: CrawlerRunConfig) -> str:

View File

@@ -67,7 +67,8 @@ async def handle_llm_qa(
config: dict
) -> str:
"""Process QA using LLM with crawled content as context."""
from crawler_pool import get_crawler
from crawler_pool import get_crawler, release_crawler
crawler = None
try:
if not url.startswith(('http://', 'https://')) and not url.startswith(("raw:", "raw://")):
url = 'https://' + url
@@ -121,6 +122,9 @@ async def handle_llm_qa(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=str(e)
)
finally:
if crawler:
await release_crawler(crawler)
async def process_llm_extraction(
redis: aioredis.Redis,
@@ -249,6 +253,7 @@ async def handle_markdown_request(
base_url: Optional[str] = None
) -> str:
"""Handle markdown generation requests."""
crawler = None
try:
# Validate provider if using LLM filter
if filter_type == FilterType.LLM:
@@ -282,7 +287,7 @@ async def handle_markdown_request(
cache_mode = CacheMode.ENABLED if cache == "1" else CacheMode.WRITE_ONLY
from crawler_pool import get_crawler
from crawler_pool import get_crawler, release_crawler
from utils import load_config as _load_config
_cfg = _load_config()
browser_cfg = BrowserConfig(
@@ -315,6 +320,9 @@ async def handle_markdown_request(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=str(e)
)
finally:
if crawler:
await release_crawler(crawler)
async def handle_llm_request(
redis: aioredis.Redis,
@@ -481,6 +489,7 @@ async def stream_results(crawler: AsyncWebCrawler, results_gen: AsyncGenerator)
"""Stream results with heartbeats and completion markers."""
import json
from utils import datetime_handler
from crawler_pool import release_crawler
try:
async for result in results_gen:
@@ -507,11 +516,8 @@ async def stream_results(crawler: AsyncWebCrawler, results_gen: AsyncGenerator)
except asyncio.CancelledError:
logger.warning("Client disconnected during streaming")
finally:
# try:
# await crawler.close()
# except Exception as e:
# logger.error(f"Crawler cleanup error: {e}")
pass
if crawler:
await release_crawler(crawler)
async def handle_crawl_request(
urls: List[str],
@@ -523,6 +529,7 @@ async def handle_crawl_request(
"""Handle non-streaming crawl requests with optional hooks."""
# Track request start
request_id = f"req_{uuid4().hex[:8]}"
crawler = None
try:
from monitor import get_monitor
await get_monitor().track_request_start(
@@ -549,11 +556,8 @@ async def handle_crawl_request(
) if config["crawler"]["rate_limiter"]["enabled"] else None
)
from crawler_pool import get_crawler
from crawler_pool import get_crawler, release_crawler
crawler = await get_crawler(browser_config)
# crawler: AsyncWebCrawler = AsyncWebCrawler(config=browser_config)
# await crawler.start()
# Attach hooks if provided
hooks_status = {}
@@ -589,8 +593,6 @@ async def handle_crawl_request(
if not isinstance(results, list):
results = [results]
# await crawler.close()
end_mem_mb = _get_memory_mb() # <--- Get memory after
end_time = time.time()
@@ -689,13 +691,6 @@ async def handle_crawl_request(
except:
pass
if 'crawler' in locals() and crawler.ready: # Check if crawler was initialized and started
# try:
# await crawler.close()
# except Exception as close_e:
# logger.error(f"Error closing crawler during exception handling: {close_e}")
logger.error(f"Error closing crawler during exception handling: {str(e)}")
# Measure memory even on error if possible
end_mem_mb_error = _get_memory_mb()
if start_mem_mb is not None and end_mem_mb_error is not None:
@@ -709,6 +704,9 @@ async def handle_crawl_request(
"server_peak_memory_mb": max(peak_mem_mb if peak_mem_mb else 0, end_mem_mb_error or 0)
})
)
finally:
if crawler:
await release_crawler(crawler)
async def handle_stream_crawl_request(
urls: List[str],
@@ -719,6 +717,7 @@ async def handle_stream_crawl_request(
) -> Tuple[AsyncWebCrawler, AsyncGenerator, Optional[Dict]]:
"""Handle streaming crawl requests with optional hooks."""
hooks_info = None
crawler = None
try:
browser_config = BrowserConfig.load(browser_config)
# browser_config.verbose = True # Set to False or remove for production stress testing
@@ -734,11 +733,8 @@ async def handle_stream_crawl_request(
)
)
from crawler_pool import get_crawler
from crawler_pool import get_crawler, release_crawler
crawler = await get_crawler(browser_config)
# crawler = AsyncWebCrawler(config=browser_config)
# await crawler.start()
# Attach hooks if provided
if hooks_config:
@@ -763,13 +759,10 @@ async def handle_stream_crawl_request(
return crawler, results_gen, hooks_info
except Exception as e:
# Make sure to close crawler if started during an error here
if 'crawler' in locals() and crawler.ready:
# try:
# await crawler.close()
# except Exception as close_e:
# logger.error(f"Error closing crawler during stream setup exception: {close_e}")
logger.error(f"Error closing crawler during stream setup exception: {str(e)}")
# Release crawler on setup error (for successful streams,
# release happens in stream_results finally block)
if crawler:
await release_crawler(crawler)
logger.error(f"Stream crawl error: {str(e)}", exc_info=True)
# Raising HTTPException here will prevent streaming response
raise HTTPException(

View File

@@ -39,6 +39,9 @@ async def get_crawler(cfg: BrowserConfig) -> AsyncWebCrawler:
if PERMANENT and _is_default_config(sig):
LAST_USED[sig] = time.time()
USAGE_COUNT[sig] = USAGE_COUNT.get(sig, 0) + 1
if not hasattr(PERMANENT, 'active_requests'):
PERMANENT.active_requests = 0
PERMANENT.active_requests += 1
logger.info("🔥 Using permanent browser")
return PERMANENT
@@ -46,13 +49,21 @@ async def get_crawler(cfg: BrowserConfig) -> AsyncWebCrawler:
if sig in HOT_POOL:
LAST_USED[sig] = time.time()
USAGE_COUNT[sig] = USAGE_COUNT.get(sig, 0) + 1
logger.info(f"♨️ Using hot pool browser (sig={sig[:8]})")
return HOT_POOL[sig]
crawler = HOT_POOL[sig]
if not hasattr(crawler, 'active_requests'):
crawler.active_requests = 0
crawler.active_requests += 1
logger.info(f"♨️ Using hot pool browser (sig={sig[:8]}, active={crawler.active_requests})")
return crawler
# Check cold pool (promote to hot if used 3+ times)
if sig in COLD_POOL:
LAST_USED[sig] = time.time()
USAGE_COUNT[sig] = USAGE_COUNT.get(sig, 0) + 1
crawler = COLD_POOL[sig]
if not hasattr(crawler, 'active_requests'):
crawler.active_requests = 0
crawler.active_requests += 1
if USAGE_COUNT[sig] >= 3:
logger.info(f"⬆️ Promoting to hot pool (sig={sig[:8]}, count={USAGE_COUNT[sig]})")
@@ -68,7 +79,7 @@ async def get_crawler(cfg: BrowserConfig) -> AsyncWebCrawler:
return HOT_POOL[sig]
logger.info(f"❄️ Using cold pool browser (sig={sig[:8]})")
return COLD_POOL[sig]
return crawler
# Memory check before creating new
mem_pct = get_container_memory_percent()
@@ -80,11 +91,23 @@ async def get_crawler(cfg: BrowserConfig) -> AsyncWebCrawler:
logger.info(f"🆕 Creating new browser in cold pool (sig={sig[:8]}, mem={mem_pct:.1f}%)")
crawler = AsyncWebCrawler(config=cfg, thread_safe=False)
await crawler.start()
crawler.active_requests = 1
COLD_POOL[sig] = crawler
LAST_USED[sig] = time.time()
USAGE_COUNT[sig] = 1
return crawler
async def release_crawler(crawler: AsyncWebCrawler):
    """Mark one request on a pooled crawler as finished.

    Intended to be awaited from a ``finally`` block once the caller is done
    with a crawler handed out by ``get_crawler()``. The janitor skips
    browsers whose ``active_requests`` is above zero, so a missed release
    leaves the browser looking busy.

    The counter is clamped at zero, and a crawler that has no
    ``active_requests`` attribute is left untouched.
    """
    async with LOCK:
        if not hasattr(crawler, 'active_requests'):
            return
        remaining = crawler.active_requests - 1
        # Never let the counter go negative (e.g. on a double release).
        crawler.active_requests = remaining if remaining > 0 else 0
async def init_permanent(cfg: BrowserConfig):
"""Initialize permanent default browser."""
global PERMANENT, DEFAULT_CONFIG_SIG
@@ -132,10 +155,13 @@ async def janitor():
# Clean cold pool
for sig in list(COLD_POOL.keys()):
if now - LAST_USED.get(sig, now) > cold_ttl:
crawler = COLD_POOL[sig]
if getattr(crawler, 'active_requests', 0) > 0:
continue # still serving requests, skip
idle_time = now - LAST_USED[sig]
logger.info(f"🧹 Closing cold browser (sig={sig[:8]}, idle={idle_time:.0f}s)")
with suppress(Exception):
await COLD_POOL[sig].close()
await crawler.close()
COLD_POOL.pop(sig, None)
LAST_USED.pop(sig, None)
USAGE_COUNT.pop(sig, None)
@@ -150,10 +176,13 @@ async def janitor():
# Clean hot pool (more conservative)
for sig in list(HOT_POOL.keys()):
if now - LAST_USED.get(sig, now) > hot_ttl:
crawler = HOT_POOL[sig]
if getattr(crawler, 'active_requests', 0) > 0:
continue # still serving requests, skip
idle_time = now - LAST_USED[sig]
logger.info(f"🧹 Closing hot browser (sig={sig[:8]}, idle={idle_time:.0f}s)")
with suppress(Exception):
await HOT_POOL[sig].close()
await crawler.close()
HOT_POOL.pop(sig, None)
LAST_USED.pop(sig, None)
USAGE_COUNT.pop(sig, None)

View File

@@ -7,7 +7,7 @@ Crawl4AI FastAPI entrypoint
"""
# ── stdlib & 3rdparty imports ───────────────────────────────
from crawler_pool import get_crawler, close_all, janitor
from crawler_pool import get_crawler, release_crawler, close_all, janitor
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
from crawl4ai.__version__ import __version__
from auth import create_access_token, get_token_dependency, TokenRequest
@@ -367,8 +367,8 @@ async def generate_html(
Use when you need sanitized HTML structures for building schemas or further processing.
"""
validate_url_scheme(body.url, allow_raw=True)
from crawler_pool import get_crawler
cfg = CrawlerRunConfig()
crawler = None
try:
crawler = await get_crawler(get_default_browser_config())
results = await crawler.arun(url=body.url, config=cfg)
@@ -381,6 +381,9 @@ async def generate_html(
return JSONResponse({"html": processed_html, "url": body.url, "success": True})
except Exception as e:
raise HTTPException(500, detail=str(e))
finally:
if crawler:
await release_crawler(crawler)
# Screenshot endpoint
@@ -399,7 +402,7 @@ async def generate_screenshot(
Then in result instead of the screenshot you will get a path to the saved file.
"""
validate_url_scheme(body.url)
from crawler_pool import get_crawler
crawler = None
try:
cfg = CrawlerRunConfig(screenshot=True, screenshot_wait_for=body.screenshot_wait_for)
crawler = await get_crawler(get_default_browser_config())
@@ -416,6 +419,9 @@ async def generate_screenshot(
return {"success": True, "screenshot": screenshot_data}
except Exception as e:
raise HTTPException(500, detail=str(e))
finally:
if crawler:
await release_crawler(crawler)
# PDF endpoint
@@ -434,7 +440,7 @@ async def generate_pdf(
Then in result instead of the PDF you will get a path to the saved file.
"""
validate_url_scheme(body.url)
from crawler_pool import get_crawler
crawler = None
try:
cfg = CrawlerRunConfig(pdf=True)
crawler = await get_crawler(get_default_browser_config())
@@ -451,6 +457,9 @@ async def generate_pdf(
return {"success": True, "pdf": base64.b64encode(pdf_data).decode()}
except Exception as e:
raise HTTPException(500, detail=str(e))
finally:
if crawler:
await release_crawler(crawler)
@app.post("/execute_js")
@@ -507,7 +516,7 @@ async def execute_js(
"""
validate_url_scheme(body.url)
from crawler_pool import get_crawler
crawler = None
try:
cfg = CrawlerRunConfig(js_code=body.scripts)
crawler = await get_crawler(get_default_browser_config())
@@ -518,6 +527,9 @@ async def execute_js(
return JSONResponse(data)
except Exception as e:
raise HTTPException(500, detail=str(e))
finally:
if crawler:
await release_crawler(crawler)
@app.get("/llm/{url:path}")

View File

@@ -49,6 +49,8 @@ browser_cfg = BrowserConfig(
| **`user_agent_generator_config`** | `dict` (default: `{}`) | Configuration dict for user agent generation when `user_agent_mode="random"`. |
| **`text_mode`** | `bool` (default: `False`) | If `True`, tries to disable images/other heavy content for speed. |
| **`light_mode`** | `bool` (default: `False`) | Disables some background features for performance gains. |
| **`avoid_ads`** | `bool` (default: `False`) | If `True`, blocks requests to common ad/tracker domains (Google Analytics, DoubleClick, Facebook, Hotjar, etc.) at the browser context level. |
| **`avoid_css`** | `bool` (default: `False`) | If `True`, blocks loading of CSS files (`.css`, `.less`, `.scss`, `.sass`) for faster, leaner crawls when only text content is needed. |
| **`extra_args`** | `list` (default: `[]`) | Additional flags for the underlying browser process, e.g. `["--disable-extensions"]`. |
| **`enable_stealth`** | `bool` (default: `False`) | Enable playwright-stealth mode to bypass bot detection. Cannot be used with `browser_mode="builtin"`. |

View File

@@ -1402,6 +1402,8 @@ class BrowserConfig:
user_agent=None,
text_mode=False,
light_mode=False,
avoid_ads=False,
avoid_css=False,
extra_args=None,
enable_stealth=False,
# ... other advanced parameters omitted here
@@ -1440,15 +1442,19 @@ class BrowserConfig:
8. **`user_agent`**:
- Custom User-Agent string. If `None`, a default is used.
- You can also set `user_agent_mode="random"` for randomization (if you want to fight bot detection).
9. **`text_mode`** & **`light_mode`**:
- `text_mode=True` disables images, possibly speeding up text-only crawls.
- `light_mode=True` turns off certain background features for performance.
10. **`extra_args`**:
- Additional flags for the underlying browser.
9. **`text_mode`** & **`light_mode`**:
- `text_mode=True` disables images, possibly speeding up text-only crawls.
- `light_mode=True` turns off certain background features for performance.
10. **`avoid_ads`** & **`avoid_css`**:
- `avoid_ads=True` blocks requests to common ad and tracker domains (Google Analytics, DoubleClick, Facebook, Hotjar, etc.) at the browser context level. Reduces network overhead and memory usage.
- `avoid_css=True` blocks loading of CSS files (`.css`, `.less`, `.scss`, `.sass`), useful when you only need text content and want faster, leaner crawls.
- Both default to `False` (opt-in). Can be combined with each other and with `text_mode`.
11. **`extra_args`**:
- Additional flags for the underlying browser.
- E.g. `["--disable-extensions"]`.
11. **`enable_stealth`**:
- If `True`, enables stealth mode using playwright-stealth.
- Modifies browser fingerprints to avoid basic bot detection.
12. **`enable_stealth`**:
- If `True`, enables stealth mode using playwright-stealth.
- Modifies browser fingerprints to avoid basic bot detection.
- Default is `False`. Recommended for sites with bot protection.
### Helper Methods
Both configuration classes provide a `clone()` method to create modified copies:

View File

@@ -109,17 +109,22 @@ class BrowserConfig:
- `user_agent`: Custom User-Agent string. If `None`, a default is used.
- `user_agent_mode`: Set to `"random"` for randomization (helps fight bot detection).
12.**`text_mode`** & **`light_mode`**
- `text_mode=True` disables images, possibly speeding up text-only crawls.
- `light_mode=True` turns off certain background features for performance.
12.**`text_mode`** & **`light_mode`**
- `text_mode=True` disables images, possibly speeding up text-only crawls.
- `light_mode=True` turns off certain background features for performance.
13.**`extra_args`**
13.**`avoid_ads`** & **`avoid_css`**
- `avoid_ads=True` blocks requests to common ad and tracker domains (Google Analytics, DoubleClick, Facebook, Hotjar, etc.) at the browser context level. Reduces network overhead and memory usage.
- `avoid_css=True` blocks loading of CSS files (`.css`, `.less`, `.scss`, `.sass`), useful when you only need text content and want faster, leaner crawls.
- Both default to `False` (opt-in). Can be combined with each other and with `text_mode`.
14.**`extra_args`**
- Additional flags for the underlying browser.
- E.g. `["--disable-extensions"]`.
14.**`enable_stealth`**
- If `True`, enables stealth mode using playwright-stealth.
- Modifies browser fingerprints to avoid basic bot detection.
15.**`enable_stealth`**
- If `True`, enables stealth mode using playwright-stealth.
- Modifies browser fingerprints to avoid basic bot detection.
- Default is `False`. Recommended for sites with bot protection.
### Helper Methods

View File

@@ -0,0 +1,178 @@
"""E2E tests for avoid_ads / avoid_css resource filtering.
These tests launch real browsers and crawl real websites to verify
that route-based resource blocking actually works.
Domains used:
- books.toscrape.com (CSS-heavy practice site, designed for scraping)
- quotes.toscrape.com (simple practice site)
- httpbin.org/html (static HTML, no trackers)
- en.wikipedia.org (real site with analytics)
"""
import pytest
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
# ---------------------------------------------------------------------------
# Basic success tests — flags should not break crawling
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
async def test_crawl_with_avoid_css_succeeds():
    """With avoid_css=True, books.toscrape.com must still crawl successfully.

    This is a smoke test: it checks the success flag and that a non-trivial
    amount of HTML came back. It does not inspect network traffic.
    """
    css_free_browser = BrowserConfig(headless=True, avoid_css=True)
    async with AsyncWebCrawler(config=css_free_browser) as crawler:
        run_cfg = CrawlerRunConfig(cache_mode="bypass")
        outcome = await crawler.arun(url="https://books.toscrape.com", config=run_cfg)
        assert outcome.success, f"Crawl failed: {outcome.error_message}"
        assert len(outcome.html) > 500, "Page HTML is suspiciously short"
@pytest.mark.asyncio
async def test_crawl_with_avoid_ads_succeeds():
    """With avoid_ads=True, a Wikipedia article must still crawl with its text intact."""
    target = "https://en.wikipedia.org/wiki/Web_scraping"
    ad_free_browser = BrowserConfig(headless=True, avoid_ads=True)
    async with AsyncWebCrawler(config=ad_free_browser) as crawler:
        run_cfg = CrawlerRunConfig(cache_mode="bypass")
        outcome = await crawler.arun(url=target, config=run_cfg)
        assert outcome.success, f"Crawl failed: {outcome.error_message}"
        # Case-insensitive check that the article text made it through.
        assert "web scraping" in outcome.html.lower(), "Wikipedia content missing"
@pytest.mark.asyncio
async def test_crawl_with_both_flags_succeeds():
    """avoid_css and avoid_ads enabled together must not break a simple crawl."""
    filtered_browser = BrowserConfig(headless=True, avoid_css=True, avoid_ads=True)
    async with AsyncWebCrawler(config=filtered_browser) as crawler:
        run_cfg = CrawlerRunConfig(cache_mode="bypass")
        outcome = await crawler.arun(url="https://quotes.toscrape.com", config=run_cfg)
        assert outcome.success, f"Crawl failed: {outcome.error_message}"
        page = outcome.html.lower()
        # Either marker is enough to show that real page content came back.
        assert any(marker in page for marker in ("quote", "toscrape"))
@pytest.mark.asyncio
async def test_avoid_ads_does_not_block_page_content():
    """Ad/tracker blocking must leave first-party page content alone."""
    ad_free_browser = BrowserConfig(headless=True, avoid_ads=True)
    async with AsyncWebCrawler(config=ad_free_browser) as crawler:
        run_cfg = CrawlerRunConfig(cache_mode="bypass")
        outcome = await crawler.arun(url="https://httpbin.org/html", config=run_cfg)
        assert outcome.success, f"Crawl failed: {outcome.error_message}"
        # The test relies on httpbin.org/html serving a Moby Dick excerpt.
        assert "Herman Melville" in outcome.html, "First-party content missing"
# ---------------------------------------------------------------------------
# Network-level verification — prove routes actually block requests
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
async def test_without_flags_css_loads_normally():
    """Baseline: with no filtering flags, CSS responses show up in the network log.

    Serves as the control for the avoid_css blocking test, so a missing
    stylesheet there can be attributed to the flag rather than to the site.
    """
    plain_browser = BrowserConfig(headless=True)
    async with AsyncWebCrawler(config=plain_browser) as crawler:
        run_cfg = CrawlerRunConfig(cache_mode="bypass", capture_network_requests=True)
        outcome = await crawler.arun(url="https://books.toscrape.com", config=run_cfg)
        assert outcome.success
        assert outcome.network_requests is not None, "Network requests not captured"

        # Response events whose URL mentions a stylesheet.
        css_loaded = [
            event
            for event in outcome.network_requests
            if event.get("event_type") == "response" and ".css" in event.get("url", "")
        ]
        assert css_loaded, "CSS should load normally without avoid_css flag"
@pytest.mark.asyncio
async def test_avoid_css_blocks_css_requests():
    """With avoid_css=True, stylesheet requests are aborted, not answered.

    Two things are checked in the captured network log: no ``response``
    event exists for a ``.css`` URL, and at least one ``request_failed``
    event does, which shows the requests were attempted and then blocked.
    """
    css_free_browser = BrowserConfig(headless=True, avoid_css=True)
    async with AsyncWebCrawler(config=css_free_browser) as crawler:
        run_cfg = CrawlerRunConfig(cache_mode="bypass", capture_network_requests=True)
        outcome = await crawler.arun(url="https://books.toscrape.com", config=run_cfg)
        assert outcome.success
        assert outcome.network_requests is not None, "Network requests not captured"

        def css_events(kind):
            """Return logged events of the given type whose URL contains '.css'."""
            return [
                event
                for event in outcome.network_requests
                if event.get("event_type") == kind and ".css" in event.get("url", "")
            ]

        # No stylesheet may have received a successful response.
        css_loaded = css_events("response")
        assert (
            not css_loaded
        ), f"CSS responses should be blocked, but found: {[r['url'] for r in css_loaded]}"

        # Aborted routes surface as request_failed events.
        css_aborted = css_events("request_failed")
        assert css_aborted, "Expected request_failed events for blocked CSS files"
@pytest.mark.asyncio
async def test_avoid_css_with_text_mode_combines():
    """avoid_css and text_mode must both take effect when enabled together.

    Neither stylesheets (avoid_css) nor common image types (text_mode) may
    appear among the successful responses in the network log.
    """
    combined_browser = BrowserConfig(headless=True, avoid_css=True, text_mode=True)
    async with AsyncWebCrawler(config=combined_browser) as crawler:
        run_cfg = CrawlerRunConfig(cache_mode="bypass", capture_network_requests=True)
        outcome = await crawler.arun(url="https://books.toscrape.com", config=run_cfg)
        assert outcome.success
        assert outcome.network_requests is not None

        responses = [
            event
            for event in outcome.network_requests
            if event.get("event_type") == "response"
        ]

        # Stylesheets are covered by avoid_css.
        css_hits = [event for event in responses if ".css" in event.get("url", "")]
        assert not css_hits, "CSS should be blocked by avoid_css"

        # Images are covered by text_mode; str.endswith accepts a tuple of suffixes.
        image_suffixes = (".jpg", ".jpeg", ".png", ".gif", ".webp")
        img_hits = [
            event
            for event in responses
            if event.get("url", "").lower().endswith(image_suffixes)
        ]
        assert not img_hits, "Images should be blocked by text_mode"

View File

@@ -0,0 +1,155 @@
"""Tests for crawler pool release_crawler() and active_requests tracking.
These tests validate the pool lifecycle without requiring Docker or a running
server. They test the release logic directly using mock crawler objects.
"""
import asyncio
import pytest
from unittest.mock import MagicMock
# ---------------------------------------------------------------------------
# Standalone release_crawler implementation for testing
# (mirrors release_crawler() in deploy/docker/crawler_pool.py; keep the two in sync)
# ---------------------------------------------------------------------------
_TEST_LOCK = asyncio.Lock()
async def _release_crawler(crawler, lock=None):
    """Test-local copy of the pool's release logic.

    Decrements ``crawler.active_requests`` by one under ``lock``, never
    going below zero. Objects without the attribute are ignored. When no
    lock is supplied the module-level ``_TEST_LOCK`` is used.
    """
    guard = lock if lock else _TEST_LOCK
    async with guard:
        if not hasattr(crawler, "active_requests"):
            return
        crawler.active_requests = max(crawler.active_requests - 1, 0)
# ---------------------------------------------------------------------------
# Tests
# ---------------------------------------------------------------------------
class TestReleaseCrawler:
    """Behaviour of the release helper when applied to mock crawlers."""

    @pytest.mark.asyncio
    async def test_release_decrements_active_requests(self):
        """One release lowers active_requests by exactly one."""
        mock_crawler = MagicMock(active_requests=3)
        await _release_crawler(mock_crawler)
        assert mock_crawler.active_requests == 2

    @pytest.mark.asyncio
    async def test_release_floors_at_zero(self):
        """Releasing an already-idle crawler leaves the counter at zero."""
        mock_crawler = MagicMock(active_requests=0)
        await _release_crawler(mock_crawler)
        assert mock_crawler.active_requests == 0

    @pytest.mark.asyncio
    async def test_release_from_one_to_zero(self):
        """The common case: the only in-flight request finishes."""
        mock_crawler = MagicMock(active_requests=1)
        await _release_crawler(mock_crawler)
        assert mock_crawler.active_requests == 0

    @pytest.mark.asyncio
    async def test_release_handles_missing_attribute(self):
        """A crawler without active_requests must be accepted silently."""
        # spec=[] yields a mock on which every attribute lookup fails.
        bare_crawler = MagicMock(spec=[])
        # Passing means no exception was raised.
        await _release_crawler(bare_crawler)

    @pytest.mark.asyncio
    async def test_multiple_releases_decrement_correctly(self):
        """Sequential releases step down one at a time and stop at zero."""
        mock_crawler = MagicMock(active_requests=5)
        # Six releases from five: the final one must stay clamped at 0.
        for remaining in (4, 3, 2, 1, 0, 0):
            await _release_crawler(mock_crawler)
            assert mock_crawler.active_requests == remaining

    @pytest.mark.asyncio
    async def test_concurrent_releases_are_safe(self):
        """Interleaved releases from many tasks must not lose decrements."""
        shared_lock = asyncio.Lock()
        mock_crawler = MagicMock(active_requests=100)

        async def drain(count):
            for _ in range(count):
                await _release_crawler(mock_crawler, lock=shared_lock)

        # 10 tasks x 10 releases each = 100 decrements in total.
        workers = [asyncio.create_task(drain(10)) for _ in range(10)]
        await asyncio.gather(*workers)
        assert mock_crawler.active_requests == 0
class TestActiveRequestsTracking:
    """Simulated acquire/release sequences on a mock crawler.

    The "get" side is imitated by assigning or incrementing
    ``active_requests`` directly; only the release side runs real logic.
    """

    @pytest.mark.asyncio
    async def test_get_sets_active_requests(self):
        """A freshly created crawler starts with a count of one."""
        mock_crawler = MagicMock()
        # What get_crawler does when it creates a new browser.
        mock_crawler.active_requests = 1
        assert mock_crawler.active_requests == 1

    @pytest.mark.asyncio
    async def test_get_increments_existing(self):
        """Handing out an already-pooled crawler bumps the count by one."""
        mock_crawler = MagicMock(active_requests=2)
        # A further get_crawler call that returns the same browser.
        mock_crawler.active_requests += 1
        assert mock_crawler.active_requests == 3

    @pytest.mark.asyncio
    async def test_full_get_release_lifecycle(self):
        """Two overlapping requests acquire and then release in turn."""
        mock_crawler = MagicMock()

        # Request A acquires, then request B acquires the same crawler.
        mock_crawler.active_requests = 1
        mock_crawler.active_requests += 1
        assert mock_crawler.active_requests == 2

        # Request A completes.
        await _release_crawler(mock_crawler)
        assert mock_crawler.active_requests == 1

        # Request B completes.
        await _release_crawler(mock_crawler)
        assert mock_crawler.active_requests == 0

    @pytest.mark.asyncio
    async def test_janitor_safety_check(self):
        """A crawler counts as closable only once its counter is back at zero."""
        mock_crawler = MagicMock(active_requests=1)

        def is_idle():
            """Evaluate the closability condition used by this test."""
            return getattr(mock_crawler, "active_requests", 0) == 0

        # One request in flight: must not be closed.
        assert is_idle() is False

        # The request finishes.
        await _release_crawler(mock_crawler)

        # Nothing in flight any more: closing is safe.
        assert is_idle() is True

View File

@@ -0,0 +1,100 @@
"""Unit tests for BrowserConfig avoid_ads / avoid_css flags.
Tests the config plumbing: defaults, serialization, cloning, roundtrips.
No browser or network required.
"""
import pytest
from crawl4ai.async_configs import BrowserConfig
@pytest.fixture(autouse=True)
def _reset_defaults():
    """Call ``BrowserConfig.reset_defaults()`` before and after every test.

    Declared autouse, so each test in this module is wrapped automatically.
    NOTE(review): presumably this stops default overrides from leaking
    between tests; reset_defaults() itself is not shown here, confirm.
    """
    BrowserConfig.reset_defaults()
    yield
    BrowserConfig.reset_defaults()
class TestResourceFilteringDefaults:
    """Constructor handling of avoid_ads / avoid_css: off unless requested."""

    def test_default_values_are_false(self):
        """A config built with no arguments has both flags set to False."""
        cfg = BrowserConfig()
        for flag in ("avoid_ads", "avoid_css"):
            assert getattr(cfg, flag) is False

    def test_custom_values(self):
        """Both flags can be switched on through the constructor."""
        cfg = BrowserConfig(avoid_ads=True, avoid_css=True)
        for flag in ("avoid_ads", "avoid_css"):
            assert getattr(cfg, flag) is True

    def test_mixed_values(self):
        """Each flag is stored independently of the other."""
        ads_only = BrowserConfig(avoid_ads=True, avoid_css=False)
        assert ads_only.avoid_ads is True
        assert ads_only.avoid_css is False

        css_only = BrowserConfig(avoid_ads=False, avoid_css=True)
        assert css_only.avoid_ads is False
        assert css_only.avoid_css is True
class TestResourceFilteringSerialization:
    """The flags must round-trip through to_dict, from_kwargs, dump and load."""

    def test_to_dict_includes_flags(self):
        """to_dict() exposes both flags with their True values."""
        as_dict = BrowserConfig(avoid_ads=True, avoid_css=True).to_dict()
        flags = ("avoid_ads", "avoid_css")
        for flag in flags:
            assert flag in as_dict
        for flag in flags:
            assert as_dict[flag] is True

    def test_to_dict_includes_false_values(self):
        """Flags left at their default still appear in to_dict() as False."""
        as_dict = BrowserConfig().to_dict()
        for flag in ("avoid_ads", "avoid_css"):
            assert as_dict[flag] is False

    def test_from_kwargs_roundtrip(self):
        """to_dict() followed by from_kwargs() reproduces a mixed setting."""
        source_cfg = BrowserConfig(avoid_ads=True, avoid_css=False)
        rebuilt = BrowserConfig.from_kwargs(source_cfg.to_dict())
        assert rebuilt.avoid_ads is True
        assert rebuilt.avoid_css is False

    def test_from_kwargs_with_true_values(self):
        """from_kwargs() accepts a plain dict that enables both flags."""
        payload = {"avoid_ads": True, "avoid_css": True}
        rebuilt = BrowserConfig.from_kwargs(payload)
        for flag in ("avoid_ads", "avoid_css"):
            assert getattr(rebuilt, flag) is True

    def test_dump_load_roundtrip(self):
        """dump() followed by load() keeps both flags enabled."""
        source_cfg = BrowserConfig(avoid_ads=True, avoid_css=True)
        rebuilt = BrowserConfig.load(source_cfg.dump())
        for flag in ("avoid_ads", "avoid_css"):
            assert getattr(rebuilt, flag) is True
class TestResourceFilteringClone:
    """clone() carries the flags over and honours keyword overrides."""

    def test_clone_preserves_flags(self):
        """A plain clone() keeps both flags as they were."""
        duplicate = BrowserConfig(avoid_ads=True, avoid_css=True).clone()
        for flag in ("avoid_ads", "avoid_css"):
            assert getattr(duplicate, flag) is True

    def test_clone_allows_override(self):
        """clone(avoid_css=True) enables the flag on the copy only."""
        base_cfg = BrowserConfig(avoid_ads=True, avoid_css=False)
        duplicate = base_cfg.clone(avoid_css=True)
        assert duplicate.avoid_ads is True
        assert duplicate.avoid_css is True
        # The config that was cloned must not have been modified.
        assert base_cfg.avoid_css is False

    def test_clone_can_disable_flag(self):
        """clone(avoid_ads=False) turns one flag off and keeps the other."""
        base_cfg = BrowserConfig(avoid_ads=True, avoid_css=True)
        duplicate = base_cfg.clone(avoid_ads=False)
        assert duplicate.avoid_ads is False
        assert duplicate.avoid_css is True