mirror of
https://github.com/unclecode/crawl4ai.git
synced 2026-06-10 15:58:15 +00:00
feat: add avoid_ads/avoid_css resource filtering and pool release lifecycle
Add opt-in BrowserConfig flags (avoid_ads, avoid_css) for blocking ad/tracker domains and CSS resources at the browser context level. Refactor crawler pool with release_crawler() and active_requests tracking to prevent janitor from closing browsers with in-flight requests. Add proper finally blocks to all Docker API/server handlers. Update docs for new config options. Inspired by #1689.
This commit is contained in:
@@ -579,6 +579,11 @@ class BrowserConfig:
|
||||
process to reclaim leaked memory. 0 = disabled.
|
||||
Recommended: 500-1000 for long-running crawlers.
|
||||
Default: 0.
|
||||
avoid_ads (bool): If True, blocks ad-related and tracker network requests at the
|
||||
browser context level using a curated blocklist of top ad/tracker
|
||||
domains. Default: False.
|
||||
avoid_css (bool): If True, blocks loading of CSS files (css, less, scss, sass) to
|
||||
reduce resource usage and speed up crawling. Default: False.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
@@ -627,6 +632,8 @@ class BrowserConfig:
|
||||
debugging_port: int = 9222,
|
||||
host: str = "localhost",
|
||||
enable_stealth: bool = False,
|
||||
avoid_ads: bool = False,
|
||||
avoid_css: bool = False,
|
||||
init_scripts: List[str] = None,
|
||||
memory_saving_mode: bool = False,
|
||||
max_pages_before_recycle: int = 0,
|
||||
@@ -692,6 +699,8 @@ class BrowserConfig:
|
||||
self.debugging_port = debugging_port
|
||||
self.host = host
|
||||
self.enable_stealth = enable_stealth
|
||||
self.avoid_ads = avoid_ads
|
||||
self.avoid_css = avoid_css
|
||||
self.init_scripts = init_scripts if init_scripts is not None else []
|
||||
self.memory_saving_mode = memory_saving_mode
|
||||
self.max_pages_before_recycle = max_pages_before_recycle
|
||||
@@ -785,6 +794,8 @@ class BrowserConfig:
|
||||
"debugging_port": self.debugging_port,
|
||||
"host": self.host,
|
||||
"enable_stealth": self.enable_stealth,
|
||||
"avoid_ads": self.avoid_ads,
|
||||
"avoid_css": self.avoid_css,
|
||||
"init_scripts": self.init_scripts,
|
||||
"memory_saving_mode": self.memory_saving_mode,
|
||||
"max_pages_before_recycle": self.max_pages_before_recycle,
|
||||
|
||||
@@ -1258,59 +1258,47 @@ class BrowserManager:
|
||||
}
|
||||
proxy_settings = {"server": self.config.proxy} if self.config.proxy else None
|
||||
|
||||
blocked_extensions = [
|
||||
# CSS extensions (blocked separately via avoid_css flag)
|
||||
css_extensions = ["css", "less", "scss", "sass"]
|
||||
|
||||
# Static resource extensions (blocked when text_mode is enabled)
|
||||
static_extensions = [
|
||||
# Images
|
||||
"jpg",
|
||||
"jpeg",
|
||||
"png",
|
||||
"gif",
|
||||
"webp",
|
||||
"svg",
|
||||
"ico",
|
||||
"bmp",
|
||||
"tiff",
|
||||
"psd",
|
||||
"jpg", "jpeg", "png", "gif", "webp", "svg", "ico", "bmp", "tiff", "psd",
|
||||
# Fonts
|
||||
"woff",
|
||||
"woff2",
|
||||
"ttf",
|
||||
"otf",
|
||||
"eot",
|
||||
# Styles
|
||||
# 'css', 'less', 'scss', 'sass',
|
||||
"woff", "woff2", "ttf", "otf", "eot",
|
||||
# Media
|
||||
"mp4",
|
||||
"webm",
|
||||
"ogg",
|
||||
"avi",
|
||||
"mov",
|
||||
"wmv",
|
||||
"flv",
|
||||
"m4v",
|
||||
"mp3",
|
||||
"wav",
|
||||
"aac",
|
||||
"m4a",
|
||||
"opus",
|
||||
"flac",
|
||||
"mp4", "webm", "ogg", "avi", "mov", "wmv", "flv", "m4v",
|
||||
"mp3", "wav", "aac", "m4a", "opus", "flac",
|
||||
# Documents
|
||||
"pdf",
|
||||
"doc",
|
||||
"docx",
|
||||
"xls",
|
||||
"xlsx",
|
||||
"ppt",
|
||||
"pptx",
|
||||
"pdf", "doc", "docx", "xls", "xlsx", "ppt", "pptx",
|
||||
# Archives
|
||||
"zip",
|
||||
"rar",
|
||||
"7z",
|
||||
"tar",
|
||||
"gz",
|
||||
"zip", "rar", "7z", "tar", "gz",
|
||||
# Scripts and data
|
||||
"xml",
|
||||
"swf",
|
||||
"wasm",
|
||||
"xml", "swf", "wasm",
|
||||
]
|
||||
|
||||
# Ad and tracker domain patterns (curated from uBlock/EasyList sources)
|
||||
ad_tracker_patterns = [
|
||||
"**/google-analytics.com/**",
|
||||
"**/googletagmanager.com/**",
|
||||
"**/googlesyndication.com/**",
|
||||
"**/doubleclick.net/**",
|
||||
"**/adservice.google.com/**",
|
||||
"**/adsystem.com/**",
|
||||
"**/adzerk.net/**",
|
||||
"**/adnxs.com/**",
|
||||
"**/ads.linkedin.com/**",
|
||||
"**/facebook.net/**",
|
||||
"**/analytics.twitter.com/**",
|
||||
"**/ads-twitter.com/**",
|
||||
"**/hotjar.com/**",
|
||||
"**/clarity.ms/**",
|
||||
"**/scorecardresearch.com/**",
|
||||
"**/pixel.wp.com/**",
|
||||
"**/amazon-adsystem.com/**",
|
||||
"**/mixpanel.com/**",
|
||||
"**/segment.com/**",
|
||||
]
|
||||
|
||||
# Common context settings
|
||||
@@ -1364,11 +1352,21 @@ class BrowserManager:
|
||||
# Create and return the context with all settings
|
||||
context = await self.browser.new_context(**context_settings)
|
||||
|
||||
# Apply text mode settings if enabled
|
||||
# Build dynamic blocking list based on config flags
|
||||
to_block = []
|
||||
if self.config.avoid_css:
|
||||
to_block.extend(css_extensions)
|
||||
if self.config.text_mode:
|
||||
# Create and apply route patterns for each extension
|
||||
for ext in blocked_extensions:
|
||||
to_block.extend(static_extensions)
|
||||
|
||||
if to_block:
|
||||
for ext in to_block:
|
||||
await context.route(f"**/*.{ext}", lambda route: route.abort())
|
||||
|
||||
if self.config.avoid_ads:
|
||||
for pattern in ad_tracker_patterns:
|
||||
await context.route(pattern, lambda route: route.abort())
|
||||
|
||||
return context
|
||||
|
||||
def _make_config_signature(self, crawlerRunConfig: CrawlerRunConfig) -> str:
|
||||
|
||||
@@ -67,7 +67,8 @@ async def handle_llm_qa(
|
||||
config: dict
|
||||
) -> str:
|
||||
"""Process QA using LLM with crawled content as context."""
|
||||
from crawler_pool import get_crawler
|
||||
from crawler_pool import get_crawler, release_crawler
|
||||
crawler = None
|
||||
try:
|
||||
if not url.startswith(('http://', 'https://')) and not url.startswith(("raw:", "raw://")):
|
||||
url = 'https://' + url
|
||||
@@ -121,6 +122,9 @@ async def handle_llm_qa(
|
||||
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
||||
detail=str(e)
|
||||
)
|
||||
finally:
|
||||
if crawler:
|
||||
await release_crawler(crawler)
|
||||
|
||||
async def process_llm_extraction(
|
||||
redis: aioredis.Redis,
|
||||
@@ -249,6 +253,7 @@ async def handle_markdown_request(
|
||||
base_url: Optional[str] = None
|
||||
) -> str:
|
||||
"""Handle markdown generation requests."""
|
||||
crawler = None
|
||||
try:
|
||||
# Validate provider if using LLM filter
|
||||
if filter_type == FilterType.LLM:
|
||||
@@ -282,7 +287,7 @@ async def handle_markdown_request(
|
||||
|
||||
cache_mode = CacheMode.ENABLED if cache == "1" else CacheMode.WRITE_ONLY
|
||||
|
||||
from crawler_pool import get_crawler
|
||||
from crawler_pool import get_crawler, release_crawler
|
||||
from utils import load_config as _load_config
|
||||
_cfg = _load_config()
|
||||
browser_cfg = BrowserConfig(
|
||||
@@ -315,6 +320,9 @@ async def handle_markdown_request(
|
||||
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
||||
detail=str(e)
|
||||
)
|
||||
finally:
|
||||
if crawler:
|
||||
await release_crawler(crawler)
|
||||
|
||||
async def handle_llm_request(
|
||||
redis: aioredis.Redis,
|
||||
@@ -481,6 +489,7 @@ async def stream_results(crawler: AsyncWebCrawler, results_gen: AsyncGenerator)
|
||||
"""Stream results with heartbeats and completion markers."""
|
||||
import json
|
||||
from utils import datetime_handler
|
||||
from crawler_pool import release_crawler
|
||||
|
||||
try:
|
||||
async for result in results_gen:
|
||||
@@ -507,11 +516,8 @@ async def stream_results(crawler: AsyncWebCrawler, results_gen: AsyncGenerator)
|
||||
except asyncio.CancelledError:
|
||||
logger.warning("Client disconnected during streaming")
|
||||
finally:
|
||||
# try:
|
||||
# await crawler.close()
|
||||
# except Exception as e:
|
||||
# logger.error(f"Crawler cleanup error: {e}")
|
||||
pass
|
||||
if crawler:
|
||||
await release_crawler(crawler)
|
||||
|
||||
async def handle_crawl_request(
|
||||
urls: List[str],
|
||||
@@ -523,6 +529,7 @@ async def handle_crawl_request(
|
||||
"""Handle non-streaming crawl requests with optional hooks."""
|
||||
# Track request start
|
||||
request_id = f"req_{uuid4().hex[:8]}"
|
||||
crawler = None
|
||||
try:
|
||||
from monitor import get_monitor
|
||||
await get_monitor().track_request_start(
|
||||
@@ -549,11 +556,8 @@ async def handle_crawl_request(
|
||||
) if config["crawler"]["rate_limiter"]["enabled"] else None
|
||||
)
|
||||
|
||||
from crawler_pool import get_crawler
|
||||
from crawler_pool import get_crawler, release_crawler
|
||||
crawler = await get_crawler(browser_config)
|
||||
|
||||
# crawler: AsyncWebCrawler = AsyncWebCrawler(config=browser_config)
|
||||
# await crawler.start()
|
||||
|
||||
# Attach hooks if provided
|
||||
hooks_status = {}
|
||||
@@ -589,8 +593,6 @@ async def handle_crawl_request(
|
||||
if not isinstance(results, list):
|
||||
results = [results]
|
||||
|
||||
# await crawler.close()
|
||||
|
||||
end_mem_mb = _get_memory_mb() # <--- Get memory after
|
||||
end_time = time.time()
|
||||
|
||||
@@ -689,13 +691,6 @@ async def handle_crawl_request(
|
||||
except:
|
||||
pass
|
||||
|
||||
if 'crawler' in locals() and crawler.ready: # Check if crawler was initialized and started
|
||||
# try:
|
||||
# await crawler.close()
|
||||
# except Exception as close_e:
|
||||
# logger.error(f"Error closing crawler during exception handling: {close_e}")
|
||||
logger.error(f"Error closing crawler during exception handling: {str(e)}")
|
||||
|
||||
# Measure memory even on error if possible
|
||||
end_mem_mb_error = _get_memory_mb()
|
||||
if start_mem_mb is not None and end_mem_mb_error is not None:
|
||||
@@ -709,6 +704,9 @@ async def handle_crawl_request(
|
||||
"server_peak_memory_mb": max(peak_mem_mb if peak_mem_mb else 0, end_mem_mb_error or 0)
|
||||
})
|
||||
)
|
||||
finally:
|
||||
if crawler:
|
||||
await release_crawler(crawler)
|
||||
|
||||
async def handle_stream_crawl_request(
|
||||
urls: List[str],
|
||||
@@ -719,6 +717,7 @@ async def handle_stream_crawl_request(
|
||||
) -> Tuple[AsyncWebCrawler, AsyncGenerator, Optional[Dict]]:
|
||||
"""Handle streaming crawl requests with optional hooks."""
|
||||
hooks_info = None
|
||||
crawler = None
|
||||
try:
|
||||
browser_config = BrowserConfig.load(browser_config)
|
||||
# browser_config.verbose = True # Set to False or remove for production stress testing
|
||||
@@ -734,11 +733,8 @@ async def handle_stream_crawl_request(
|
||||
)
|
||||
)
|
||||
|
||||
from crawler_pool import get_crawler
|
||||
from crawler_pool import get_crawler, release_crawler
|
||||
crawler = await get_crawler(browser_config)
|
||||
|
||||
# crawler = AsyncWebCrawler(config=browser_config)
|
||||
# await crawler.start()
|
||||
|
||||
# Attach hooks if provided
|
||||
if hooks_config:
|
||||
@@ -763,13 +759,10 @@ async def handle_stream_crawl_request(
|
||||
return crawler, results_gen, hooks_info
|
||||
|
||||
except Exception as e:
|
||||
# Make sure to close crawler if started during an error here
|
||||
if 'crawler' in locals() and crawler.ready:
|
||||
# try:
|
||||
# await crawler.close()
|
||||
# except Exception as close_e:
|
||||
# logger.error(f"Error closing crawler during stream setup exception: {close_e}")
|
||||
logger.error(f"Error closing crawler during stream setup exception: {str(e)}")
|
||||
# Release crawler on setup error (for successful streams,
|
||||
# release happens in stream_results finally block)
|
||||
if crawler:
|
||||
await release_crawler(crawler)
|
||||
logger.error(f"Stream crawl error: {str(e)}", exc_info=True)
|
||||
# Raising HTTPException here will prevent streaming response
|
||||
raise HTTPException(
|
||||
|
||||
@@ -39,6 +39,9 @@ async def get_crawler(cfg: BrowserConfig) -> AsyncWebCrawler:
|
||||
if PERMANENT and _is_default_config(sig):
|
||||
LAST_USED[sig] = time.time()
|
||||
USAGE_COUNT[sig] = USAGE_COUNT.get(sig, 0) + 1
|
||||
if not hasattr(PERMANENT, 'active_requests'):
|
||||
PERMANENT.active_requests = 0
|
||||
PERMANENT.active_requests += 1
|
||||
logger.info("🔥 Using permanent browser")
|
||||
return PERMANENT
|
||||
|
||||
@@ -46,13 +49,21 @@ async def get_crawler(cfg: BrowserConfig) -> AsyncWebCrawler:
|
||||
if sig in HOT_POOL:
|
||||
LAST_USED[sig] = time.time()
|
||||
USAGE_COUNT[sig] = USAGE_COUNT.get(sig, 0) + 1
|
||||
logger.info(f"♨️ Using hot pool browser (sig={sig[:8]})")
|
||||
return HOT_POOL[sig]
|
||||
crawler = HOT_POOL[sig]
|
||||
if not hasattr(crawler, 'active_requests'):
|
||||
crawler.active_requests = 0
|
||||
crawler.active_requests += 1
|
||||
logger.info(f"♨️ Using hot pool browser (sig={sig[:8]}, active={crawler.active_requests})")
|
||||
return crawler
|
||||
|
||||
# Check cold pool (promote to hot if used 3+ times)
|
||||
if sig in COLD_POOL:
|
||||
LAST_USED[sig] = time.time()
|
||||
USAGE_COUNT[sig] = USAGE_COUNT.get(sig, 0) + 1
|
||||
crawler = COLD_POOL[sig]
|
||||
if not hasattr(crawler, 'active_requests'):
|
||||
crawler.active_requests = 0
|
||||
crawler.active_requests += 1
|
||||
|
||||
if USAGE_COUNT[sig] >= 3:
|
||||
logger.info(f"⬆️ Promoting to hot pool (sig={sig[:8]}, count={USAGE_COUNT[sig]})")
|
||||
@@ -68,7 +79,7 @@ async def get_crawler(cfg: BrowserConfig) -> AsyncWebCrawler:
|
||||
return HOT_POOL[sig]
|
||||
|
||||
logger.info(f"❄️ Using cold pool browser (sig={sig[:8]})")
|
||||
return COLD_POOL[sig]
|
||||
return crawler
|
||||
|
||||
# Memory check before creating new
|
||||
mem_pct = get_container_memory_percent()
|
||||
@@ -80,11 +91,23 @@ async def get_crawler(cfg: BrowserConfig) -> AsyncWebCrawler:
|
||||
logger.info(f"🆕 Creating new browser in cold pool (sig={sig[:8]}, mem={mem_pct:.1f}%)")
|
||||
crawler = AsyncWebCrawler(config=cfg, thread_safe=False)
|
||||
await crawler.start()
|
||||
crawler.active_requests = 1
|
||||
COLD_POOL[sig] = crawler
|
||||
LAST_USED[sig] = time.time()
|
||||
USAGE_COUNT[sig] = 1
|
||||
return crawler
|
||||
|
||||
async def release_crawler(crawler: AsyncWebCrawler):
|
||||
"""Decrement active request count for a pooled crawler.
|
||||
|
||||
Call this in a finally block after finishing work with a crawler
|
||||
obtained via get_crawler() so the janitor knows when it's safe
|
||||
to close idle browsers.
|
||||
"""
|
||||
async with LOCK:
|
||||
if hasattr(crawler, 'active_requests'):
|
||||
crawler.active_requests = max(0, crawler.active_requests - 1)
|
||||
|
||||
async def init_permanent(cfg: BrowserConfig):
|
||||
"""Initialize permanent default browser."""
|
||||
global PERMANENT, DEFAULT_CONFIG_SIG
|
||||
@@ -132,10 +155,13 @@ async def janitor():
|
||||
# Clean cold pool
|
||||
for sig in list(COLD_POOL.keys()):
|
||||
if now - LAST_USED.get(sig, now) > cold_ttl:
|
||||
crawler = COLD_POOL[sig]
|
||||
if getattr(crawler, 'active_requests', 0) > 0:
|
||||
continue # still serving requests, skip
|
||||
idle_time = now - LAST_USED[sig]
|
||||
logger.info(f"🧹 Closing cold browser (sig={sig[:8]}, idle={idle_time:.0f}s)")
|
||||
with suppress(Exception):
|
||||
await COLD_POOL[sig].close()
|
||||
await crawler.close()
|
||||
COLD_POOL.pop(sig, None)
|
||||
LAST_USED.pop(sig, None)
|
||||
USAGE_COUNT.pop(sig, None)
|
||||
@@ -150,10 +176,13 @@ async def janitor():
|
||||
# Clean hot pool (more conservative)
|
||||
for sig in list(HOT_POOL.keys()):
|
||||
if now - LAST_USED.get(sig, now) > hot_ttl:
|
||||
crawler = HOT_POOL[sig]
|
||||
if getattr(crawler, 'active_requests', 0) > 0:
|
||||
continue # still serving requests, skip
|
||||
idle_time = now - LAST_USED[sig]
|
||||
logger.info(f"🧹 Closing hot browser (sig={sig[:8]}, idle={idle_time:.0f}s)")
|
||||
with suppress(Exception):
|
||||
await HOT_POOL[sig].close()
|
||||
await crawler.close()
|
||||
HOT_POOL.pop(sig, None)
|
||||
LAST_USED.pop(sig, None)
|
||||
USAGE_COUNT.pop(sig, None)
|
||||
|
||||
@@ -7,7 +7,7 @@ Crawl4AI FastAPI entry‑point
|
||||
"""
|
||||
|
||||
# ── stdlib & 3rd‑party imports ───────────────────────────────
|
||||
from crawler_pool import get_crawler, close_all, janitor
|
||||
from crawler_pool import get_crawler, release_crawler, close_all, janitor
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
||||
from crawl4ai.__version__ import __version__
|
||||
from auth import create_access_token, get_token_dependency, TokenRequest
|
||||
@@ -367,8 +367,8 @@ async def generate_html(
|
||||
Use when you need sanitized HTML structures for building schemas or further processing.
|
||||
"""
|
||||
validate_url_scheme(body.url, allow_raw=True)
|
||||
from crawler_pool import get_crawler
|
||||
cfg = CrawlerRunConfig()
|
||||
crawler = None
|
||||
try:
|
||||
crawler = await get_crawler(get_default_browser_config())
|
||||
results = await crawler.arun(url=body.url, config=cfg)
|
||||
@@ -381,6 +381,9 @@ async def generate_html(
|
||||
return JSONResponse({"html": processed_html, "url": body.url, "success": True})
|
||||
except Exception as e:
|
||||
raise HTTPException(500, detail=str(e))
|
||||
finally:
|
||||
if crawler:
|
||||
await release_crawler(crawler)
|
||||
|
||||
# Screenshot endpoint
|
||||
|
||||
@@ -399,7 +402,7 @@ async def generate_screenshot(
|
||||
Then in result instead of the screenshot you will get a path to the saved file.
|
||||
"""
|
||||
validate_url_scheme(body.url)
|
||||
from crawler_pool import get_crawler
|
||||
crawler = None
|
||||
try:
|
||||
cfg = CrawlerRunConfig(screenshot=True, screenshot_wait_for=body.screenshot_wait_for)
|
||||
crawler = await get_crawler(get_default_browser_config())
|
||||
@@ -416,6 +419,9 @@ async def generate_screenshot(
|
||||
return {"success": True, "screenshot": screenshot_data}
|
||||
except Exception as e:
|
||||
raise HTTPException(500, detail=str(e))
|
||||
finally:
|
||||
if crawler:
|
||||
await release_crawler(crawler)
|
||||
|
||||
# PDF endpoint
|
||||
|
||||
@@ -434,7 +440,7 @@ async def generate_pdf(
|
||||
Then in result instead of the PDF you will get a path to the saved file.
|
||||
"""
|
||||
validate_url_scheme(body.url)
|
||||
from crawler_pool import get_crawler
|
||||
crawler = None
|
||||
try:
|
||||
cfg = CrawlerRunConfig(pdf=True)
|
||||
crawler = await get_crawler(get_default_browser_config())
|
||||
@@ -451,6 +457,9 @@ async def generate_pdf(
|
||||
return {"success": True, "pdf": base64.b64encode(pdf_data).decode()}
|
||||
except Exception as e:
|
||||
raise HTTPException(500, detail=str(e))
|
||||
finally:
|
||||
if crawler:
|
||||
await release_crawler(crawler)
|
||||
|
||||
|
||||
@app.post("/execute_js")
|
||||
@@ -507,7 +516,7 @@ async def execute_js(
|
||||
|
||||
"""
|
||||
validate_url_scheme(body.url)
|
||||
from crawler_pool import get_crawler
|
||||
crawler = None
|
||||
try:
|
||||
cfg = CrawlerRunConfig(js_code=body.scripts)
|
||||
crawler = await get_crawler(get_default_browser_config())
|
||||
@@ -518,6 +527,9 @@ async def execute_js(
|
||||
return JSONResponse(data)
|
||||
except Exception as e:
|
||||
raise HTTPException(500, detail=str(e))
|
||||
finally:
|
||||
if crawler:
|
||||
await release_crawler(crawler)
|
||||
|
||||
|
||||
@app.get("/llm/{url:path}")
|
||||
|
||||
@@ -49,6 +49,8 @@ browser_cfg = BrowserConfig(
|
||||
| **`user_agent_generator_config`** | `dict` (default: `{}`) | Configuration dict for user agent generation when `user_agent_mode="random"`. |
|
||||
| **`text_mode`** | `bool` (default: `False`) | If `True`, tries to disable images/other heavy content for speed. |
|
||||
| **`light_mode`** | `bool` (default: `False`) | Disables some background features for performance gains. |
|
||||
| **`avoid_ads`** | `bool` (default: `False`) | If `True`, blocks requests to common ad/tracker domains (Google Analytics, DoubleClick, Facebook, Hotjar, etc.) at the browser context level. |
|
||||
| **`avoid_css`** | `bool` (default: `False`) | If `True`, blocks loading of CSS files (`.css`, `.less`, `.scss`, `.sass`) for faster, leaner crawls when only text content is needed. |
|
||||
| **`extra_args`** | `list` (default: `[]`) | Additional flags for the underlying browser process, e.g. `["--disable-extensions"]`. |
|
||||
| **`enable_stealth`** | `bool` (default: `False`) | Enable playwright-stealth mode to bypass bot detection. Cannot be used with `browser_mode="builtin"`. |
|
||||
|
||||
|
||||
@@ -1402,6 +1402,8 @@ class BrowserConfig:
|
||||
user_agent=None,
|
||||
text_mode=False,
|
||||
light_mode=False,
|
||||
avoid_ads=False,
|
||||
avoid_css=False,
|
||||
extra_args=None,
|
||||
enable_stealth=False,
|
||||
# ... other advanced parameters omitted here
|
||||
@@ -1440,15 +1442,19 @@ class BrowserConfig:
|
||||
8. **`user_agent`**:
|
||||
- Custom User-Agent string. If `None`, a default is used.
|
||||
- You can also set `user_agent_mode="random"` for randomization (if you want to fight bot detection).
|
||||
9. **`text_mode`** & **`light_mode`**:
|
||||
- `text_mode=True` disables images, possibly speeding up text-only crawls.
|
||||
- `light_mode=True` turns off certain background features for performance.
|
||||
10. **`extra_args`**:
|
||||
- Additional flags for the underlying browser.
|
||||
9. **`text_mode`** & **`light_mode`**:
|
||||
- `text_mode=True` disables images, possibly speeding up text-only crawls.
|
||||
- `light_mode=True` turns off certain background features for performance.
|
||||
10. **`avoid_ads`** & **`avoid_css`**:
|
||||
- `avoid_ads=True` blocks requests to common ad and tracker domains (Google Analytics, DoubleClick, Facebook, Hotjar, etc.) at the browser context level. Reduces network overhead and memory usage.
|
||||
- `avoid_css=True` blocks loading of CSS files (`.css`, `.less`, `.scss`, `.sass`), useful when you only need text content and want faster, leaner crawls.
|
||||
- Both default to `False` (opt-in). Can be combined with each other and with `text_mode`.
|
||||
11. **`extra_args`**:
|
||||
- Additional flags for the underlying browser.
|
||||
- E.g. `["--disable-extensions"]`.
|
||||
11. **`enable_stealth`**:
|
||||
- If `True`, enables stealth mode using playwright-stealth.
|
||||
- Modifies browser fingerprints to avoid basic bot detection.
|
||||
12. **`enable_stealth`**:
|
||||
- If `True`, enables stealth mode using playwright-stealth.
|
||||
- Modifies browser fingerprints to avoid basic bot detection.
|
||||
- Default is `False`. Recommended for sites with bot protection.
|
||||
### Helper Methods
|
||||
Both configuration classes provide a `clone()` method to create modified copies:
|
||||
|
||||
@@ -109,17 +109,22 @@ class BrowserConfig:
|
||||
- `user_agent`: Custom User-Agent string. If `None`, a default is used.
|
||||
- `user_agent_mode`: Set to `"random"` for randomization (helps fight bot detection).
|
||||
|
||||
12.⠀**`text_mode`** & **`light_mode`**
|
||||
- `text_mode=True` disables images, possibly speeding up text-only crawls.
|
||||
- `light_mode=True` turns off certain background features for performance.
|
||||
12.⠀**`text_mode`** & **`light_mode`**
|
||||
- `text_mode=True` disables images, possibly speeding up text-only crawls.
|
||||
- `light_mode=True` turns off certain background features for performance.
|
||||
|
||||
13.⠀**`extra_args`**
|
||||
13.⠀**`avoid_ads`** & **`avoid_css`**
|
||||
- `avoid_ads=True` blocks requests to common ad and tracker domains (Google Analytics, DoubleClick, Facebook, Hotjar, etc.) at the browser context level. Reduces network overhead and memory usage.
|
||||
- `avoid_css=True` blocks loading of CSS files (`.css`, `.less`, `.scss`, `.sass`), useful when you only need text content and want faster, leaner crawls.
|
||||
- Both default to `False` (opt-in). Can be combined with each other and with `text_mode`.
|
||||
|
||||
14.⠀**`extra_args`**
|
||||
- Additional flags for the underlying browser.
|
||||
- E.g. `["--disable-extensions"]`.
|
||||
|
||||
14.⠀**`enable_stealth`**
|
||||
- If `True`, enables stealth mode using playwright-stealth.
|
||||
- Modifies browser fingerprints to avoid basic bot detection.
|
||||
15.⠀**`enable_stealth`**
|
||||
- If `True`, enables stealth mode using playwright-stealth.
|
||||
- Modifies browser fingerprints to avoid basic bot detection.
|
||||
- Default is `False`. Recommended for sites with bot protection.
|
||||
|
||||
### Helper Methods
|
||||
|
||||
178
tests/browser/test_resource_filtering.py
Normal file
178
tests/browser/test_resource_filtering.py
Normal file
@@ -0,0 +1,178 @@
|
||||
"""E2E tests for avoid_ads / avoid_css resource filtering.
|
||||
|
||||
These tests launch real browsers and crawl real websites to verify
|
||||
that route-based resource blocking actually works.
|
||||
|
||||
Domains used:
|
||||
- books.toscrape.com (CSS-heavy practice site, designed for scraping)
|
||||
- quotes.toscrape.com (simple practice site)
|
||||
- httpbin.org/html (static HTML, no trackers)
|
||||
- en.wikipedia.org (real site with analytics)
|
||||
"""
|
||||
|
||||
import pytest
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Basic success tests — flags should not break crawling
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_crawl_with_avoid_css_succeeds():
|
||||
"""Crawl books.toscrape.com with avoid_css=True — page should load fine."""
|
||||
browser_config = BrowserConfig(headless=True, avoid_css=True)
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://books.toscrape.com",
|
||||
config=CrawlerRunConfig(cache_mode="bypass"),
|
||||
)
|
||||
assert result.success, f"Crawl failed: {result.error_message}"
|
||||
assert len(result.html) > 500, "Page HTML is suspiciously short"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_crawl_with_avoid_ads_succeeds():
|
||||
"""Crawl Wikipedia with avoid_ads=True — content should be intact."""
|
||||
browser_config = BrowserConfig(headless=True, avoid_ads=True)
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://en.wikipedia.org/wiki/Web_scraping",
|
||||
config=CrawlerRunConfig(cache_mode="bypass"),
|
||||
)
|
||||
assert result.success, f"Crawl failed: {result.error_message}"
|
||||
# Wikipedia article content must be present
|
||||
html_lower = result.html.lower()
|
||||
assert "web scraping" in html_lower, "Wikipedia content missing"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_crawl_with_both_flags_succeeds():
|
||||
"""Both avoid_css and avoid_ads enabled simultaneously."""
|
||||
browser_config = BrowserConfig(headless=True, avoid_css=True, avoid_ads=True)
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://quotes.toscrape.com",
|
||||
config=CrawlerRunConfig(cache_mode="bypass"),
|
||||
)
|
||||
assert result.success, f"Crawl failed: {result.error_message}"
|
||||
html_lower = result.html.lower()
|
||||
assert "quote" in html_lower or "toscrape" in html_lower
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_avoid_ads_does_not_block_page_content():
|
||||
"""avoid_ads must not interfere with first-party page content."""
|
||||
browser_config = BrowserConfig(headless=True, avoid_ads=True)
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://httpbin.org/html",
|
||||
config=CrawlerRunConfig(cache_mode="bypass"),
|
||||
)
|
||||
assert result.success, f"Crawl failed: {result.error_message}"
|
||||
# httpbin.org/html serves a Moby Dick excerpt
|
||||
assert "Herman Melville" in result.html, "First-party content missing"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Network-level verification — prove routes actually block requests
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_without_flags_css_loads_normally():
|
||||
"""Baseline: without avoid_css, CSS responses should appear in network log."""
|
||||
browser_config = BrowserConfig(headless=True)
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://books.toscrape.com",
|
||||
config=CrawlerRunConfig(
|
||||
cache_mode="bypass",
|
||||
capture_network_requests=True,
|
||||
),
|
||||
)
|
||||
assert result.success
|
||||
assert result.network_requests is not None, "Network requests not captured"
|
||||
|
||||
# There should be successful CSS responses
|
||||
css_responses = [
|
||||
r
|
||||
for r in result.network_requests
|
||||
if r.get("event_type") == "response" and ".css" in r.get("url", "")
|
||||
]
|
||||
assert (
|
||||
len(css_responses) > 0
|
||||
), "CSS should load normally without avoid_css flag"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_avoid_css_blocks_css_requests():
|
||||
"""With avoid_css=True, CSS requests must be aborted (no successful responses)."""
|
||||
browser_config = BrowserConfig(headless=True, avoid_css=True)
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://books.toscrape.com",
|
||||
config=CrawlerRunConfig(
|
||||
cache_mode="bypass",
|
||||
capture_network_requests=True,
|
||||
),
|
||||
)
|
||||
assert result.success
|
||||
assert result.network_requests is not None, "Network requests not captured"
|
||||
|
||||
# No CSS should have gotten a successful response
|
||||
css_responses = [
|
||||
r
|
||||
for r in result.network_requests
|
||||
if r.get("event_type") == "response" and ".css" in r.get("url", "")
|
||||
]
|
||||
assert (
|
||||
len(css_responses) == 0
|
||||
), f"CSS responses should be blocked, but found: {[r['url'] for r in css_responses]}"
|
||||
|
||||
# There SHOULD be request_failed events for CSS (proves blocking happened)
|
||||
css_failures = [
|
||||
r
|
||||
for r in result.network_requests
|
||||
if r.get("event_type") == "request_failed"
|
||||
and ".css" in r.get("url", "")
|
||||
]
|
||||
assert (
|
||||
len(css_failures) > 0
|
||||
), "Expected request_failed events for blocked CSS files"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_avoid_css_with_text_mode_combines():
|
||||
"""Both avoid_css and text_mode should combine their blocking rules."""
|
||||
browser_config = BrowserConfig(
|
||||
headless=True, avoid_css=True, text_mode=True
|
||||
)
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://books.toscrape.com",
|
||||
config=CrawlerRunConfig(
|
||||
cache_mode="bypass",
|
||||
capture_network_requests=True,
|
||||
),
|
||||
)
|
||||
assert result.success
|
||||
assert result.network_requests is not None
|
||||
|
||||
successful = [
|
||||
r for r in result.network_requests if r.get("event_type") == "response"
|
||||
]
|
||||
|
||||
# CSS should be blocked (via avoid_css)
|
||||
css_hits = [r for r in successful if ".css" in r.get("url", "")]
|
||||
assert len(css_hits) == 0, "CSS should be blocked by avoid_css"
|
||||
|
||||
# Images should be blocked (via text_mode)
|
||||
img_exts = (".jpg", ".jpeg", ".png", ".gif", ".webp")
|
||||
img_hits = [
|
||||
r
|
||||
for r in successful
|
||||
if any(r.get("url", "").lower().endswith(ext) for ext in img_exts)
|
||||
]
|
||||
assert len(img_hits) == 0, "Images should be blocked by text_mode"
|
||||
155
tests/docker/test_pool_release.py
Normal file
155
tests/docker/test_pool_release.py
Normal file
@@ -0,0 +1,155 @@
|
||||
"""Tests for crawler pool release_crawler() and active_requests tracking.
|
||||
|
||||
These tests validate the pool lifecycle without requiring Docker or a running
|
||||
server. They test the release logic directly using mock crawler objects.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import pytest
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Standalone release_crawler implementation for testing
|
||||
# (mirrors the logic of release_crawler() in deploy/docker/crawler_pool.py)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Module-level fallback lock: _release_crawler() uses it when no lock is passed.
_TEST_LOCK = asyncio.Lock()
|
||||
|
||||
|
||||
async def _release_crawler(crawler, lock=None):
|
||||
"""Standalone release logic matching crawler_pool.release_crawler()."""
|
||||
lock = lock or _TEST_LOCK
|
||||
async with lock:
|
||||
if hasattr(crawler, "active_requests"):
|
||||
crawler.active_requests = max(0, crawler.active_requests - 1)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Tests
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestReleaseCrawler:
    """Checks the counter arithmetic performed by ``_release_crawler``."""

    @pytest.mark.asyncio
    async def test_release_decrements_active_requests(self):
        """One release lowers the counter from 3 to 2."""
        pooled = MagicMock(active_requests=3)

        await _release_crawler(pooled)

        assert pooled.active_requests == 2

    @pytest.mark.asyncio
    async def test_release_floors_at_zero(self):
        """Releasing at 0 leaves the counter at 0."""
        pooled = MagicMock(active_requests=0)

        await _release_crawler(pooled)

        assert pooled.active_requests == 0

    @pytest.mark.asyncio
    async def test_release_from_one_to_zero(self):
        """The common case: the only in-flight request completes."""
        pooled = MagicMock(active_requests=1)

        await _release_crawler(pooled)

        assert pooled.active_requests == 0

    @pytest.mark.asyncio
    async def test_release_handles_missing_attribute(self):
        """An object lacking ``active_requests`` is released without error."""
        # spec=[] yields a mock that exposes no attributes.
        attributeless = MagicMock(spec=[])

        # Passing means no exception was raised.
        await _release_crawler(attributeless)

    @pytest.mark.asyncio
    async def test_multiple_releases_decrement_correctly(self):
        """Back-to-back releases each subtract one until the floor is hit."""
        pooled = MagicMock(active_requests=5)

        # Six releases from 5: the final one stays clamped at 0.
        for want in (4, 3, 2, 1, 0, 0):
            await _release_crawler(pooled)
            assert pooled.active_requests == want

    @pytest.mark.asyncio
    async def test_concurrent_releases_are_safe(self):
        """Many tasks releasing through one shared lock end at exactly 0."""
        pooled = MagicMock(active_requests=100)
        shared_lock = asyncio.Lock()

        async def drain(count):
            for _ in range(count):
                await _release_crawler(pooled, lock=shared_lock)

        # 10 tasks x 10 releases each = 100 releases in total.
        workers = [asyncio.create_task(drain(10)) for _ in range(10)]
        await asyncio.gather(*workers)

        assert pooled.active_requests == 0
|
||||
|
||||
|
||||
class TestActiveRequestsTracking:
    """Walks through the acquire/release counter pattern on a mock crawler."""

    @pytest.mark.asyncio
    async def test_get_sets_active_requests(self):
        """Mimics acquiring a fresh crawler: the counter starts at 1."""
        # Stand-in for the assignment get_crawler is simulated to make.
        pooled = MagicMock(active_requests=1)

        assert pooled.active_requests == 1

    @pytest.mark.asyncio
    async def test_get_increments_existing(self):
        """Mimics re-acquiring a pooled crawler: the counter goes up by one."""
        pooled = MagicMock(active_requests=2)

        # A further simulated acquisition of the same browser.
        pooled.active_requests += 1

        assert pooled.active_requests == 3

    @pytest.mark.asyncio
    async def test_full_get_release_lifecycle(self):
        """Two simulated acquisitions followed by two releases end at 0."""
        # Request #1 acquires the crawler.
        pooled = MagicMock(active_requests=1)

        # Request #2 acquires the same crawler while #1 is still running.
        pooled.active_requests += 1
        assert pooled.active_requests == 2

        # Request #1 completes.
        await _release_crawler(pooled)
        assert pooled.active_requests == 1

        # Request #2 completes.
        await _release_crawler(pooled)
        assert pooled.active_requests == 0

    @pytest.mark.asyncio
    async def test_janitor_safety_check(self):
        """Simulated idle check (counter == 0) flips only after the release."""

        def is_idle(candidate):
            return getattr(candidate, "active_requests", 0) == 0

        pooled = MagicMock(active_requests=1)

        # One request in flight: the check must say "do not close".
        assert is_idle(pooled) is False

        # The request completes.
        await _release_crawler(pooled)

        # Counter is back at 0: the check now allows closing.
        assert is_idle(pooled) is True
|
||||
100
tests/unit/test_resource_filtering_config.py
Normal file
100
tests/unit/test_resource_filtering_config.py
Normal file
@@ -0,0 +1,100 @@
|
||||
"""Unit tests for BrowserConfig avoid_ads / avoid_css flags.
|
||||
|
||||
Tests the config plumbing: defaults, serialization, cloning, roundtrips.
|
||||
No browser or network required.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
from crawl4ai.async_configs import BrowserConfig
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
def _reset_defaults():
    """Call ``BrowserConfig.reset_defaults()`` before and after every test."""
    # Setup: run before the test body.
    BrowserConfig.reset_defaults()
    yield
    # Teardown: run again once the test has finished.
    BrowserConfig.reset_defaults()
|
||||
|
||||
|
||||
class TestResourceFilteringDefaults:
    """Constructor behaviour of avoid_ads / avoid_css: off unless requested."""

    def test_default_values_are_false(self):
        """A config built with no arguments has both flags set to False."""
        cfg = BrowserConfig()

        assert cfg.avoid_ads is False
        assert cfg.avoid_css is False

    def test_custom_values(self):
        """Passing True for both flags stores True on both attributes."""
        cfg = BrowserConfig(avoid_ads=True, avoid_css=True)

        assert cfg.avoid_ads is True
        assert cfg.avoid_css is True

    def test_mixed_values(self):
        """Each flag is stored independently of the other."""
        ads_only = BrowserConfig(avoid_ads=True, avoid_css=False)
        assert ads_only.avoid_ads is True
        assert ads_only.avoid_css is False

        css_only = BrowserConfig(avoid_ads=False, avoid_css=True)
        assert css_only.avoid_ads is False
        assert css_only.avoid_css is True
|
||||
|
||||
|
||||
class TestResourceFilteringSerialization:
    """avoid_ads / avoid_css through to_dict, from_kwargs, dump and load."""

    def test_to_dict_includes_flags(self):
        """to_dict() carries both keys, each with the value True."""
        as_dict = BrowserConfig(avoid_ads=True, avoid_css=True).to_dict()

        assert "avoid_ads" in as_dict
        assert "avoid_css" in as_dict
        assert as_dict["avoid_ads"] is True
        assert as_dict["avoid_css"] is True

    def test_to_dict_includes_false_values(self):
        """to_dict() on a default config reports False for both keys."""
        as_dict = BrowserConfig().to_dict()

        assert as_dict["avoid_ads"] is False
        assert as_dict["avoid_css"] is False

    def test_from_kwargs_roundtrip(self):
        """to_dict() output fed to from_kwargs() reproduces both flag values."""
        source = BrowserConfig(avoid_ads=True, avoid_css=False)

        rebuilt = BrowserConfig.from_kwargs(source.to_dict())

        assert rebuilt.avoid_ads is True
        assert rebuilt.avoid_css is False

    def test_from_kwargs_with_true_values(self):
        """from_kwargs() accepts a plain dict that sets both flags to True."""
        flags = {"avoid_ads": True, "avoid_css": True}

        rebuilt = BrowserConfig.from_kwargs(flags)

        assert rebuilt.avoid_ads is True
        assert rebuilt.avoid_css is True

    def test_dump_load_roundtrip(self):
        """dump() followed by load() keeps both flags True."""
        source = BrowserConfig(avoid_ads=True, avoid_css=True)

        rebuilt = BrowserConfig.load(source.dump())

        assert rebuilt.avoid_ads is True
        assert rebuilt.avoid_css is True
|
||||
|
||||
|
||||
class TestResourceFilteringClone:
    """clone() carries avoid_ads / avoid_css over and accepts overrides."""

    def test_clone_preserves_flags(self):
        """A clone made without arguments keeps both flags True."""
        source = BrowserConfig(avoid_ads=True, avoid_css=True)

        copy = source.clone()

        assert copy.avoid_ads is True
        assert copy.avoid_css is True

    def test_clone_allows_override(self):
        """clone(avoid_css=True) turns the flag on in the copy only."""
        source = BrowserConfig(avoid_ads=True, avoid_css=False)

        copy = source.clone(avoid_css=True)

        assert copy.avoid_ads is True
        assert copy.avoid_css is True
        # The config that was cloned keeps its own value.
        assert source.avoid_css is False

    def test_clone_can_disable_flag(self):
        """clone(avoid_ads=False) turns one flag off and leaves the other on."""
        source = BrowserConfig(avoid_ads=True, avoid_css=True)

        copy = source.clone(avoid_ads=False)

        assert copy.avoid_ads is False
        assert copy.avoid_css is True
|
||||
Reference in New Issue
Block a user