From 11b45760da9ef4e8d2e9a5a5b6faf95d692d73ec Mon Sep 17 00:00:00 2001
From: unclecode
Date: Mon, 9 Mar 2026 14:52:58 +0000
Subject: [PATCH] fix: anti-bot false positive on browser JSON, URLPatternFilter prefix match, PDF deserialization

- antibot_detector: add <pre> to content elements regex, detect
  browser-wrapped JSON in _looks_like_data() so httpbin-style
  responses are not flagged as blocked
- deep_crawling/filters: use urlparse().path for path-only prefix
  patterns (/docs/*) instead of matching against full URL, which
  always failed; full-URL prefixes still match correctly
- async_configs: add PDFContentScrapingStrategy to
  ALLOWED_DESERIALIZE_TYPES so /crawl API can deserialize it
- __init__: export PDFContentScrapingStrategy for type resolution
- tests: add 86-test suite covering all three fixes with adversarial
  and edge cases
---
 crawl4ai/__init__.py              |   1 +
 crawl4ai/antibot_detector.py      |  13 +-
 crawl4ai/async_configs.py         |   2 +-
 crawl4ai/deep_crawling/filters.py |  23 +-
 tests/test_cloud_bugs_batch.py    | 479 ++++++++++++++++++++++++++++++
 5 files changed, 500 insertions(+), 18 deletions(-)
 create mode 100644 tests/test_cloud_bugs_batch.py

diff --git a/crawl4ai/__init__.py b/crawl4ai/__init__.py
index af35e6a0..03e734de 100644
--- a/crawl4ai/__init__.py
+++ b/crawl4ai/__init__.py
@@ -10,6 +10,7 @@ from .content_scraping_strategy import (
     LXMLWebScrapingStrategy,
     WebScrapingStrategy,  # Backward compatibility alias
 )
+from .processors.pdf import PDFContentScrapingStrategy
 from .async_logger import (
     AsyncLoggerBase,
     AsyncLogger,
diff --git a/crawl4ai/antibot_detector.py b/crawl4ai/antibot_detector.py
index f2e8478e..228c1b25 100644
--- a/crawl4ai/antibot_detector.py
+++ b/crawl4ai/antibot_detector.py
@@ -103,7 +103,7 @@ _TIER2_MAX_SIZE = 10000  # Only check tier 2 patterns on pages under 10KB
 # ---------------------------------------------------------------------------
 _STRUCTURAL_MAX_SIZE = 50000  # Only check pages under 50KB
 _CONTENT_ELEMENTS_RE = re.compile(
-    r'<(?:p|h[1-6]|article|section|li|td|a)\b', re.IGNORECASE
+    r'<(?:p|h[1-6]|article|section|li|td|a|pre)\b', re.IGNORECASE
 )
 _SCRIPT_TAG_RE = re.compile(r'', re.IGNORECASE)
@@ -123,7 +123,16 @@ def _looks_like_data(html: str) -> bool:
     stripped = html.strip()
     if not stripped:
         return False
-    return stripped[0] in ('{', '[', '<' ) and not stripped.startswith('
{...}
+ if stripped[:10].lower().startswith((']*>\s*]*>\s*[{\[]', stripped[:500], re.IGNORECASE): + return True + return False + # Other XML-like content + return stripped[0] == '<' def _structural_integrity_check(html: str) -> Tuple[bool, str]: diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index da3df876..d7171559 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -135,7 +135,7 @@ ALLOWED_DESERIALIZE_TYPES = { "DefaultMarkdownGenerator", "PruningContentFilter", "BM25ContentFilter", "LLMContentFilter", # Scraping - "LXMLWebScrapingStrategy", + "LXMLWebScrapingStrategy", "PDFContentScrapingStrategy", # Chunking "RegexChunking", # Deep crawl diff --git a/crawl4ai/deep_crawling/filters.py b/crawl4ai/deep_crawling/filters.py index 2865767f..2fb819ec 100644 --- a/crawl4ai/deep_crawling/filters.py +++ b/crawl4ai/deep_crawling/filters.py @@ -216,10 +216,11 @@ class URLPatternFilter(URLFilter): @lru_cache(maxsize=10000) def apply(self, url: str) -> bool: + url_path = urlparse(url).path + # Quick suffix check (*.html) if self._simple_suffixes: - path = url.split("?")[0] - if path.split("/")[-1].split(".")[-1] in self._simple_suffixes: + if url_path.split("/")[-1].split(".")[-1] in self._simple_suffixes: result = True self._update_stats(result) return not result if self._reverse else result @@ -232,21 +233,13 @@ class URLPatternFilter(URLFilter): self._update_stats(result) return not result if self._reverse else result - # Prefix check (/foo/*) + # Prefix check (/foo/* or https://domain/foo/*) if self._simple_prefixes: - path = url.split("?")[0] - # if any(path.startswith(p) for p in self._simple_prefixes): - # result = True - # self._update_stats(result) - # return not result if self._reverse else result - #### - # Modified the prefix matching logic to ensure path boundary checking: - # - Check if the matched prefix is followed by a path separator (`/`), query parameter (`?`), fragment (`#`), or is at the end of the path - # - This 
ensures `/api/` only matches complete path segments, not substrings like `/apiv2/` - #### for prefix in self._simple_prefixes: - if path.startswith(prefix): - if len(path) == len(prefix) or path[len(prefix)] in ['/', '?', '#']: + # Use url_path for path-only prefixes, full URL for absolute prefixes + match_against = url if '://' in prefix else url_path + if match_against.startswith(prefix): + if len(match_against) == len(prefix) or match_against[len(prefix)] in ['/', '?', '#']: result = True self._update_stats(result) return not result if self._reverse else result diff --git a/tests/test_cloud_bugs_batch.py b/tests/test_cloud_bugs_batch.py new file mode 100644 index 00000000..de61609f --- /dev/null +++ b/tests/test_cloud_bugs_batch.py @@ -0,0 +1,479 @@ +""" +Comprehensive test suite for cloud-reported bug fixes: + - Bug 1: Anti-bot false positives on browser-rendered JSON + - Bug 2: URLPatternFilter PREFIX match using full URL instead of path + - Bug 3: PDFContentScrapingStrategy not in ALLOWED_DESERIALIZE_TYPES + +Tests include: unit, edge case, adversarial, and regression checks. 
+""" + +import sys +import os +import re +import json + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) + +from crawl4ai.antibot_detector import is_blocked, _looks_like_data, _structural_integrity_check +from crawl4ai.deep_crawling.filters import URLPatternFilter +from crawl4ai.async_configs import ALLOWED_DESERIALIZE_TYPES, to_serializable_dict, from_serializable_dict + +PASS = 0 +FAIL = 0 + +def check(name, actual, expected, detail=""): + global PASS, FAIL + ok = actual == expected + if ok: + PASS += 1 + else: + FAIL += 1 + print(f" FAIL: {name}") + print(f" got: {actual!r}") + print(f" expected: {expected!r}") + if detail: + print(f" detail: {detail}") + + +# ===================================================================== +# BUG 1: Anti-bot false positives on browser-rendered JSON +# ===================================================================== +print("\n" + "=" * 70) +print("BUG 1: Anti-bot false positives on browser-rendered JSON") +print("=" * 70) + +# --- 1A: _looks_like_data() unit tests --- +print("\n--- _looks_like_data() ---") + +check("raw JSON object", _looks_like_data('{"origin": "1.2.3.4"}'), True) +check("raw JSON array", _looks_like_data('[1, 2, 3]'), True) +check("raw XML", _looks_like_data(''), True) +check("empty string", _looks_like_data(''), False) +check("whitespace only", _looks_like_data(' \n '), False) +check("normal HTML page", _looks_like_data('

Hello

'), False) +check("", _looks_like_data(''), False) +check("", _looks_like_data(''), False) + +# Browser-wrapped JSON (the core bug) +check("browser-wrapped JSON object", + _looks_like_data('
{"origin": "1.2.3.4"}
'), + True) + +check("browser-wrapped JSON array", + _looks_like_data('
[{"id": 1}, {"id": 2}]
'), + True) + +check("browser-wrapped JSON (uppercase HTML)", + _looks_like_data('
{"key": "val"}
'), + True) + +check("browser-wrapped JSON (DOCTYPE)", + _looks_like_data('
{"x": 1}
'), + True) + +check("browser-wrapped JSON with whitespace before pre", + _looks_like_data(' \n
{"x": 1}
'), + True) + +# Should NOT detect as data — normal HTML with
+check("HTML page with code block (not JSON in pre)",
+    _looks_like_data('

Tutorial

def hello():\n    print("hi")
'), + False) + +check("HTML with
 but text content, not JSON",
+    _looks_like_data('
This is just preformatted text, not JSON.
'), + False) + +# --- 1B: is_blocked() integration tests for browser-wrapped JSON --- +print("\n--- is_blocked() with browser-rendered JSON ---") + +# httpbin.org /ip response (tiny — ~90 bytes HTML) +httpbin_ip = '
{"origin": "203.0.113.42"}
' +blocked, reason = is_blocked(200, httpbin_ip) +check("httpbin /ip (200, small browser-wrapped JSON)", blocked, False, reason) + +# httpbin.org /anything response (medium) +httpbin_anything = '
{"args": {}, "data": "", "files": {}, "form": {}, "headers": {"Accept": "*/*", "Host": "httpbin.org", "User-Agent": "Mozilla/5.0"}, "json": null, "method": "GET", "origin": "203.0.113.42", "url": "https://httpbin.org/anything"}
' +blocked, reason = is_blocked(200, httpbin_anything) +check("httpbin /anything (200, medium browser-wrapped JSON)", blocked, False, reason) + +# httpbin.org /delay/N response +httpbin_delay = '
{"args": {}, "data": "", "headers": {"Host": "httpbin.org"}, "origin": "1.2.3.4", "url": "https://httpbin.org/delay/2"}
' +blocked, reason = is_blocked(200, httpbin_delay) +check("httpbin /delay/2 (200, browser-wrapped JSON)", blocked, False, reason) + +# httpbin.org /headers response +httpbin_headers = '
{"headers": {"Accept": "text/html", "Accept-Encoding": "gzip", "Host": "httpbin.org", "User-Agent": "Mozilla/5.0 (X11; Linux x86_64)", "X-Forwarded-For": "1.2.3.4", "X-Forwarded-Proto": "https"}}
' +blocked, reason = is_blocked(200, httpbin_headers) +check("httpbin /headers (200, browser-wrapped JSON)", blocked, False, reason) + +# Browser-wrapped JSON array +json_array_page = '
[{"id": 1, "name": "foo"}, {"id": 2, "name": "bar"}]
' +blocked, reason = is_blocked(200, json_array_page) +check("browser-wrapped JSON array (200)", blocked, False, reason) + +# --- 1C: Ensure real block pages still detected --- +print("\n--- Regression: real block pages still detected ---") + +# Empty shell (should still be blocked) +blocked, reason = is_blocked(200, '') +check("empty shell (200, no content)", blocked, True, reason) + +# Anti-bot redirect page +blocked, reason = is_blocked(200, '') +check("script-only redirect (200)", blocked, True, reason) + +# Small page with no content elements (not JSON) +blocked, reason = is_blocked(200, '
x
') +check("tiny div-only page (200)", blocked, True, reason) + +# 403 with browser-wrapped JSON should NOT be blocked (it's data) +blocked, reason = is_blocked(403, '{"error": "forbidden", "code": 403}') +check("403 raw JSON (data response)", blocked, False, reason) + +# 403 with HTML should still be blocked +blocked, reason = is_blocked(403, '

Forbidden

') +check("403 HTML page (blocked)", blocked, True, reason) + +# --- 1D:
 now counts as content element ---
+print("\n--- 
 as content element ---")
+
+# Page with only <pre> (code block) should not be flagged as "no content elements"
+html_with_pre = '
function hello() {\n  console.log("world");\n}\n\nThis is a code example that demonstrates JavaScript functions. It shows how to define and use basic functions with console output for debugging purposes.
' +blocked, reason = is_blocked(200, html_with_pre) +check("page with
 code block (200)", blocked, False, reason)
+
+# Page with <pre> containing log output
+html_pre_logs = '
2024-01-15 10:30:45 INFO  Starting server on port 8080\n2024-01-15 10:30:46 INFO  Database connected successfully\n2024-01-15 10:30:47 INFO  Application ready to accept connections on all interfaces
' +blocked, reason = is_blocked(200, html_pre_logs) +check("page with
 log output (200)", blocked, False, reason)
+
+# --- 1E: Adversarial: attacker tries to bypass detection using <pre> ---
+print("\n--- Adversarial: <pre> shouldn't defeat real block detection ---")
+
+# Tier 1 pattern in page with <pre> should still be detected
+blocked, reason = is_blocked(403, '
Reference #18.2d351ab8.1557333295.a4e16ab
') +check("Akamai ref in
 (403)", blocked, True, reason)
+
+blocked, reason = is_blocked(200, '
window._pxAppId = "PX123";
') +check("PerimeterX in
 (200)", blocked, True, reason)
+
+# Empty <pre> should still trigger
+blocked, reason = is_blocked(200, '
')
+check("empty 
 (200, minimal text)", blocked, True, reason)
+
+# <pre> with whitespace only
+blocked, reason = is_blocked(200, '
   
') +check("
 with only whitespace (200)", blocked, True, reason)
+
+
+# =====================================================================
+# BUG 2: URLPatternFilter PREFIX match
+# =====================================================================
+print("\n" + "=" * 70)
+print("BUG 2: URLPatternFilter PREFIX match")
+print("=" * 70)
+
+# --- 2A: Path-only prefix patterns (the original bug) ---
+print("\n--- Path-only prefix patterns ---")
+
+f = URLPatternFilter(patterns=["/docs/*"])
+check("/docs/* matches /docs/page1", f.apply("https://example.com/docs/page1"), True)
+check("/docs/* matches /docs/", f.apply("https://example.com/docs/"), True)
+check("/docs/* matches /docs/sub/page", f.apply("https://example.com/docs/sub/page"), True)
+f2 = URLPatternFilter(patterns=["/docs/*"])  # fresh filter (lru_cache)
+check("/docs/* no match /api/docs", f2.apply("https://example.com/api/docs"), False)
+f3 = URLPatternFilter(patterns=["/docs/*"])
+check("/docs/* no match /other", f3.apply("https://example.com/other"), False)
+
+# --- 2B: Full-URL prefix patterns (must still work) ---
+print("\n--- Full-URL prefix patterns ---")
+
+f4 = URLPatternFilter(patterns=["https://example.com/blog/*"])
+check("full URL prefix matches", f4.apply("https://example.com/blog/post-1"), True)
+check("full URL prefix matches subpath", f4.apply("https://example.com/blog/2024/post-1"), True)
+f5 = URLPatternFilter(patterns=["https://example.com/blog/*"])
+check("full URL prefix no match different domain", f5.apply("https://other.com/blog/post-1"), False)
+f6 = URLPatternFilter(patterns=["https://example.com/blog/*"])
+check("full URL prefix no match blogxx", f6.apply("https://example.com/blogxx/post-1"), False)
+
+# --- 2C: Path prefix with query strings ---
+print("\n--- Prefix with query strings ---")
+
+f7 = URLPatternFilter(patterns=["/api/*"])
+check("/api/* matches /api/v1", f7.apply("https://example.com/api/v1"), True)
+check("/api/* matches /api/v1?key=123", f7.apply("https://example.com/api/v1?key=123"), True)
+f8 = URLPatternFilter(patterns=["/api/*"])
+check("/api/* no match /apiv2/", f8.apply("https://example.com/apiv2/"), False)
+
+# --- 2D: Suffix patterns still work ---
+print("\n--- Suffix patterns ---")
+
+f9 = URLPatternFilter(patterns=["*.pdf"])
+check("*.pdf matches report.pdf", f9.apply("https://example.com/report.pdf"), True)
+check("*.pdf matches nested pdf", f9.apply("https://example.com/docs/report.pdf"), True)
+f10 = URLPatternFilter(patterns=["*.pdf"])
+check("*.pdf no match .html", f10.apply("https://example.com/page.html"), False)
+
+# --- 2E: Reverse mode ---
+print("\n--- Reverse mode ---")
+
+f11 = URLPatternFilter(patterns=["/private/*"], reverse=True)
+check("reverse: /private/* excludes /private/page", f11.apply("https://example.com/private/page"), False)
+f12 = URLPatternFilter(patterns=["/private/*"], reverse=True)
+check("reverse: /private/* allows /public/page", f12.apply("https://example.com/public/page"), True)
+
+# --- 2F: Adversarial URL patterns ---
+print("\n--- Adversarial URL edge cases ---")
+
+f13 = URLPatternFilter(patterns=["/docs/*"])
+check("prefix with port in URL", f13.apply("https://example.com:8080/docs/page"), True)
+f14 = URLPatternFilter(patterns=["/docs/*"])
+check("prefix with auth in URL", f14.apply("https://user:pass@example.com/docs/page"), True)
+f15 = URLPatternFilter(patterns=["/docs/*"])
+check("prefix with fragment", f15.apply("https://example.com/docs#section"), True)
+
+# URL-encoded path
+f16 = URLPatternFilter(patterns=["/docs/*"])
+check("prefix with encoded path", f16.apply("https://example.com/docs/my%20page"), True)
+
+# Multiple prefix patterns
+f17 = URLPatternFilter(patterns=["/docs/*", "/api/*"])
+check("multi-prefix: /docs/ matches", f17.apply("https://example.com/docs/page"), True)
+check("multi-prefix: /api/ matches", f17.apply("https://example.com/api/v1"), True)
+f18 = URLPatternFilter(patterns=["/docs/*", "/api/*"])
+check("multi-prefix: /other/ no match", f18.apply("https://example.com/other/page"), False)
+
+# --- 2G: Complex (PATH) patterns still work ---
+print("\n--- Complex glob patterns ---")
+
+f19 = URLPatternFilter(patterns=["*/docs/*/guide"])
+check("glob */docs/*/guide matches", f19.apply("https://example.com/docs/v2/guide"), True)
+f20 = URLPatternFilter(patterns=["*/docs/*/guide"])
+check("glob */docs/*/guide no match", f20.apply("https://example.com/docs/v2/tutorial"), False)
+
+# --- 2H: Domain patterns still work ---
+print("\n--- Domain patterns ---")
+
+# Note: *.example.com (without ://) is classified as SUFFIX, not DOMAIN.
+# Use http://*.example.com for domain matching.
+f21 = URLPatternFilter(patterns=["http://*.example.com/*"])
+check("domain http://*.example.com/* matches sub.example.com", f21.apply("http://sub.example.com/page"), True)
+f22 = URLPatternFilter(patterns=["http://*.example.com/*"])
+check("domain http://*.example.com/* no match other.com", f22.apply("http://other.com/page"), False)
+
+# --- 2I: Regex patterns still work ---
+print("\n--- Regex patterns ---")
+
+f23 = URLPatternFilter(patterns=[r"^https://example\.com/v\d+/"])
+check("regex matches /v1/", f23.apply("https://example.com/v1/page"), True)
+f24 = URLPatternFilter(patterns=[r"^https://example\.com/v\d+/"])
+check("regex no match /vx/", f24.apply("https://example.com/vx/page"), False)
+
+
+# =====================================================================
+# BUG 3: PDFContentScrapingStrategy deserialization
+# =====================================================================
+print("\n" + "=" * 70)
+print("BUG 3: PDFContentScrapingStrategy deserialization")
+print("=" * 70)
+
+# --- 3A: Type is in allowlist ---
+print("\n--- Allowlist check ---")
+
+check("PDFContentScrapingStrategy in ALLOWED_DESERIALIZE_TYPES",
+    "PDFContentScrapingStrategy" in ALLOWED_DESERIALIZE_TYPES, True)
+
+# --- 3B: Roundtrip serialization ---
+print("\n--- Serialization roundtrip ---")
+
+try:
+    from crawl4ai.processors.pdf import PDFContentScrapingStrategy
+
+    strategy = PDFContentScrapingStrategy(extract_images=False, batch_size=8)
+    serialized = to_serializable_dict(strategy)
+    check("serialization type field", serialized.get("type"), "PDFContentScrapingStrategy")
+    check("serialization has params", "params" in serialized, True)
+
+    # Deserialize back — verifies it resolves to the correct class (the original bug)
+    deserialized = from_serializable_dict(serialized)
+    check("deserialization type", type(deserialized).__name__, "PDFContentScrapingStrategy")
+    # The strategy passes params to NaivePDFProcessorStrategy internally,
+    # so verify via the inner processor
+    check("deserialization creates valid processor",
+        hasattr(deserialized, 'pdf_processor'), True)
+
+    print("  (roundtrip OK)")
+except ImportError as e:
+    print(f"  SKIP: PDFContentScrapingStrategy import failed: {e}")
+except Exception as e:
+    FAIL += 1
+    print(f"  FAIL: Serialization roundtrip failed: {e}")
+
+# --- 3C: CrawlerRunConfig with PDFContentScrapingStrategy ---
+print("\n--- CrawlerRunConfig with PDFContentScrapingStrategy ---")
+
+try:
+    from crawl4ai import CrawlerRunConfig
+    from crawl4ai.processors.pdf import PDFContentScrapingStrategy
+
+    config = CrawlerRunConfig(
+        scraping_strategy=PDFContentScrapingStrategy(extract_images=False, batch_size=4)
+    )
+    serialized = to_serializable_dict(config)
+    deserialized = from_serializable_dict(serialized)
+    check("CrawlerRunConfig roundtrip with PDF strategy",
+        type(deserialized.scraping_strategy).__name__, "PDFContentScrapingStrategy")
+    check("PDF strategy has processor after roundtrip",
+        hasattr(deserialized.scraping_strategy, 'pdf_processor'), True)
+
+    print("  (config roundtrip OK)")
+except ImportError as e:
+    print(f"  SKIP: Import failed: {e}")
+except Exception as e:
+    FAIL += 1
+    print(f"  FAIL: Config roundtrip failed: {e}")
+
+# --- 3D: Other types still deserialize correctly (regression) ---
+print("\n--- Regression: other types still work ---")
+
+try:
+    from crawl4ai import CrawlerRunConfig, CacheMode, DefaultMarkdownGenerator
+    from crawl4ai import LXMLWebScrapingStrategy, RegexChunking
+
+    config = CrawlerRunConfig(
+        cache_mode=CacheMode.BYPASS,
+        markdown_generator=DefaultMarkdownGenerator(),
+        scraping_strategy=LXMLWebScrapingStrategy(),
+        chunking_strategy=RegexChunking(),
+    )
+    serialized = to_serializable_dict(config)
+    deserialized = from_serializable_dict(serialized)
+    check("CrawlerRunConfig basic roundtrip type", type(deserialized).__name__, "CrawlerRunConfig")
+    check("CrawlerRunConfig cache_mode preserved", deserialized.cache_mode, CacheMode.BYPASS)
+
+    print("  (regression OK)")
+except Exception as e:
+    FAIL += 1
+    print(f"  FAIL: Regression roundtrip failed: {e}")
+
+
+# =====================================================================
+# ADVERSARIAL / EDGE CASES — Cross-cutting
+# =====================================================================
+print("\n" + "=" * 70)
+print("ADVERSARIAL / EDGE CASES")
+print("=" * 70)
+
+# --- Antibot: various browser JSON wrapping styles ---
+print("\n--- Browser JSON wrapping variants ---")
+
+# Chrome style
+chrome_json = '
{"origin": "1.2.3.4"}
' +blocked, reason = is_blocked(200, chrome_json) +check("Chrome-style JSON wrap", blocked, False, reason) + +# Firefox style (no inline style on pre) +firefox_json = '
{"origin": "1.2.3.4"}
' +blocked, reason = is_blocked(200, firefox_json) +check("Firefox-style JSON wrap", blocked, False, reason) + +# With extra whitespace/newlines +whitespace_json = '\n\n\n\n
\n{"origin": "1.2.3.4"}
\n\n' +blocked, reason = is_blocked(200, whitespace_json) +check("JSON wrap with newlines", blocked, False, reason) + +# Deeply nested JSON +big_json = '
' + json.dumps({"data": [{"id": i, "name": f"item_{i}", "values": list(range(10))} for i in range(100)]}) + '
' +blocked, reason = is_blocked(200, big_json) +check("large nested JSON in browser wrap", blocked, False, reason) + +# JSON with special chars +special_json = '
{"html": "

hello

", "url": "https://example.com?a=1&b=2"}
' +blocked, reason = is_blocked(200, special_json) +check("JSON with embedded HTML/URL", blocked, False, reason) + +# --- Antibot: responses that look similar but should still be blocked --- +print("\n--- Similar-looking pages that SHOULD be blocked ---") + +# HTML page that happens to have
 but isn't JSON
+blocked, reason = is_blocked(200, '
Access Denied
') +check("
Access Denied
(200, small)", blocked, True, reason) + +# Empty body with
 but no JSON
+blocked, reason = is_blocked(200, '
   
') +check("
 with whitespace (200)", blocked, True, reason)
+
+# <pre> with non-JSON that starts with { but invalid
+blocked, reason = is_blocked(200, '
{not valid json at all, this is just text
') +# This is ambiguous — looks like it could be data. Our check just looks at { or [ prefix. +# It will be detected as data and NOT blocked, which is the safer choice. +check("
{non-json text} treated as data (200)", blocked, False, reason)
+
+# --- URLPatternFilter: empty and edge-case inputs ---
+print("\n--- URLPatternFilter edge cases ---")
+
+# Empty URL
+f_edge = URLPatternFilter(patterns=["/docs/*"])
+check("empty URL no match", f_edge.apply(""), False)
+
+# URL with no path
+f_edge2 = URLPatternFilter(patterns=["/docs/*"])
+check("domain-only URL no match", f_edge2.apply("https://example.com"), False)
+
+# Root path
+f_edge3 = URLPatternFilter(patterns=["/*"])
+check("/* matches any path", f_edge3.apply("https://example.com/anything"), True)
+
+# Exact prefix match (path equals prefix exactly)
+f_edge4 = URLPatternFilter(patterns=["/docs/*"])
+check("/docs/* matches /docs exactly (prefix == path)", f_edge4.apply("https://example.com/docs"), True)
+# /docs without trailing / matches because len(url_path) == len(prefix) is the exact-match case
+
+# Very long URL
+long_path = "/docs/" + "a" * 2000
+f_edge5 = URLPatternFilter(patterns=["/docs/*"])
+check("very long path matches", f_edge5.apply(f"https://example.com{long_path}"), True)
+
+# Unicode in path
+f_edge6 = URLPatternFilter(patterns=["/docs/*"])
+check("unicode path matches", f_edge6.apply("https://example.com/docs/页面"), True)
+
+# --- Deserialization: security edge cases ---
+print("\n--- Deserialization security ---")
+
+# Ensure disallowed types still raise
+try:
+    from_serializable_dict({"type": "os.system", "params": {"command": "whoami"}})
+    FAIL += 1
+    print("  FAIL: should have raised ValueError for disallowed type")
+except (ValueError, AttributeError):
+    PASS += 1
+    print("  PASS: disallowed type 'os.system' correctly rejected")
+except Exception as e:
+    PASS += 1  # Any error is fine, as long as it doesn't execute
+    print(f"  PASS: disallowed type rejected with {type(e).__name__}")
+
+try:
+    from_serializable_dict({"type": "__import__", "params": {"name": "os"}})
+    FAIL += 1
+    print("  FAIL: should have raised for __import__")
+except (ValueError, AttributeError):
+    PASS += 1
+    print("  PASS: disallowed type '__import__' correctly rejected")
+except Exception as e:
+    PASS += 1
+    print(f"  PASS: disallowed type rejected with {type(e).__name__}")
+
+
+# =====================================================================
+# SUMMARY
+# =====================================================================
+print(f"\n{'=' * 70}")
+print(f"RESULTS: {PASS} passed, {FAIL} failed out of {PASS + FAIL} tests")
+print(f"{'=' * 70}")
+if FAIL > 0:
+    print("SOME TESTS FAILED!")
+    sys.exit(1)
+else:
+    print("ALL TESTS PASSED!")