"""Unit tests for antibot_detector.is_blocked().

The cases are grouped into three sections:

- TRUE POSITIVES: real block pages that MUST be detected
- TRUE NEGATIVES: legitimate pages that MUST NOT be flagged
- EDGE CASES: boundary conditions

This is a plain script: every case runs at import time, a summary is
printed at the end, and the process exits with status 1 if any case failed.
"""

import os
import sys

# Make the repository root importable before pulling in the package under test.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../.."))

from crawl4ai.antibot_detector import is_blocked

# Running totals, updated by check().
PASS = 0
FAIL = 0


def check(name, result, expected_blocked, expected_substr=None):
    """Compare one is_blocked() result with its expectation and report it.

    Args:
        name: Human-readable label printed with the outcome.
        result: The ``(blocked, reason)`` pair returned by is_blocked().
        expected_blocked: The ``blocked`` value the case must produce.
        expected_substr: Optional text that must appear (case-insensitively)
            in ``reason``. It is only checked when the page was flagged.

    Side effects:
        Increments the module-level PASS or FAIL counter and prints a report.
    """
    global PASS, FAIL
    blocked, reason = result

    matched = blocked == expected_blocked
    # The reason text is only inspected for pages that were actually flagged.
    if matched and expected_substr and blocked:
        matched = expected_substr.lower() in reason.lower()

    if matched:
        PASS += 1
        outcome = reason if blocked else "not blocked"
        print(f" PASS: {name} -> {outcome}")
        return

    FAIL += 1
    print(f" FAIL: {name}")
    print(f" got blocked={blocked}, reason={reason!r}")
    expectation = f" expected blocked={expected_blocked}"
    if expected_substr:
        expectation += f", substr={expected_substr!r}"
    print(expectation)


# NOTE(review): several fixture literals below are empty or bare text even
# though their test names describe scripts, byte counts or content elements
# (e.g. "HTTP 403 near-empty (10 bytes)" passes ''), and one case pairs an
# Akamai name with a "Cloudflare firewall" expectation. This looks like markup
# was lost from the fixtures -- confirm against the upstream copy of this file
# before relying on these results.

# =========================================================================
# TRUE POSITIVES — real block pages that MUST be detected
# =========================================================================
print("\n=== TRUE POSITIVES (must detect as blocked) ===\n")

# --- Akamai ---
check(
    "Akamai Reference #",
    is_blocked(403, '\nAccess Denied\nYour request was blocked.\nReference #18.2d351ab8.1557333295.a4e16ab'),
    True,
    "Akamai",
)
check(
    "Akamai Pardon Our Interruption",
    is_blocked(403, 'Please verify you are human\n'),
    True,
    "Pardon",
)
check(
    "Akamai 403 short Access Denied",
    is_blocked(403, 'Access denied\n' ''),
    True,
    "Cloudflare firewall",
)
check(
    "Cloudflare IUAM script",
    is_blocked(403, ''),
    True,
    "Cloudflare JS challenge",
)
check(
    "Cloudflare Just a moment",
    is_blocked(403, 'blocked by network security\n'),
    True,
    "Network security block",
)

# --- HTTP 429 ---
check("HTTP 429 rate limit", is_blocked(429, 'Rate limit exceeded'), True, "429")
check("HTTP 429 empty body", is_blocked(429, ''), True, "429")

# --- Empty 200 ---
check("HTTP 200 empty page", is_blocked(200, ''), True, "empty")
check("HTTP 200 whitespace only", is_blocked(200, ' \n\n '), True, "empty")

# --- 403 near-empty ---
check("HTTP 403 near-empty (10 bytes)", is_blocked(403, ''), True, "403")

# =========================================================================
# TRUE NEGATIVES — legitimate pages that MUST NOT be flagged
# =========================================================================
print("\n=== TRUE NEGATIVES (must NOT detect as blocked) ===\n")

# --- Normal pages ---
check(
    "Normal 200 page (example.com size)",
    is_blocked(200, '' + 'x' * 500 + '\n'),
    False,
)
check(
    "Normal 200 large page",
    is_blocked(200, '' + 'Some content here.\n\n' * 5000 + ''),
    False,
)

# --- Security articles (false positive trap!) ---
check(
    "Article about bot detection (large page)",
    is_blocked(
        200,
        'Anti-bot solutions like DataDome, PerimeterX, and Cloudflare '
        + 'help detect and block bot traffic. When a bot is detected, '
        + 'services show a CAPTCHA or Access Denied page. '
        + 'Common signals include blocked by security warnings.\n'
        + 'The g-recaptcha and h-captcha widgets are used for challenges.\n'
        + ''
        + 'More article content. ' * 500
        + '\n'
        + '',
    ),
    False,
)
check(
    "DataDome marketing page (large)",
    is_blocked(
        200,
        'DataDome protects websites from bot attacks. '
        + 'Our solution detects automated traffic using advanced fingerprinting. '
        + 'Competitors like PerimeterX use window._pxAppId for tracking.\n'
        + ''
        + 'Marketing content. ' * 1000
        + '\n'
        + '',
    ),
    False,
)

# --- Login pages with CAPTCHA (not a block!) ---
check(
    "Login page with reCAPTCHA (large page)",
    is_blocked(200, '' + 'Page content. ' * 500 + '\n' + ''),
    False,
)
check(
    "Signup page with hCaptcha (large page)",
    is_blocked(200, '' + '' + 'Registration info. ' * 500 + '\n' + ''),
    False,
)

# --- 403 pages — ALL non-data 403 HTML is now treated as blocked ---
# Rationale: 403 is never the content the user wants. Even for legitimate
# auth errors (Apache/Nginx), the fallback will also get 403 and we report
# failure correctly. False positives are cheap; false negatives are catastrophic.
check(
    "Apache directory listing denied (403, large-ish)",
    is_blocked(
        403,
        'You don\'t have permission to access this resource on this server.\n'
        + ''
        + 'Server info. ' * 500
        + '\n'
        + '',
    ),
    True,
    "403",
)
check(
    "Nginx 403 (large page)",
    is_blocked(403, '' + 'Content. ' * 500 + '\n' + ''),
    True,
    "403",
)
check(
    "API 403 auth required (JSON)",
    is_blocked(403, '{"error": "Forbidden", "message": "Invalid API key", "code": 403}'),
    False,
)

# --- Cloudflare-served normal pages (not blocked!) ---
check(
    "Cloudflare-served normal page with footer",
    is_blocked(
        200,
        ''
        + 'This is a normal page served through Cloudflare CDN.\n'
        + ''
        + ''
        + 'Normal content. ' * 500
        + '\n'
        + '',
    ),
    False,
)

# --- Small but legitimate pages ---
check(
    "Small valid 200 page (with content element)",
    is_blocked(200, 'Your request was processed successfully. Everything is fine.\n'),
    False,
)
check(
    "Small JSON 200 response",
    is_blocked(200, '{"status": "ok", "data": {"id": 123, "name": "test"}, "timestamp": "2024-01-01T00:00:00Z"}'),
    False,
)
check(
    "Redirect page 200",
    is_blocked(200, 'Redirecting to your dashboard. Please wait while we prepare your personalized experience.\n'),
    False,
)

# --- 503 pages — ALL non-data 503 HTML is now treated as blocked ---
# Same rationale as 403: 503 is never desired content. Fallback rescues false positives.
check(
    "503 maintenance page (treated as blocked)",
    is_blocked(
        503,
        'We are performing scheduled maintenance. Please try again later.\n'
        + ''
        + 'Maintenance info. ' * 500
        + '\n'
        + '',
    ),
    True,
    "503",
)

# --- 200 with short but real content ---
check(
    "Short thank you page (200, 120 bytes)",
    is_blocked(200, 'Your order has been placed. Confirmation email sent.\n'),
    False,
)

# =========================================================================
# EDGE CASES
# =========================================================================
print("\n=== EDGE CASES ===\n")

check("None status code + empty html", is_blocked(None, ''), True, "no ")
check(
    "None status code + block content",
    is_blocked(None, 'Reference #18.2d351ab8.1557333295.a4e16ab'),
    True,
    "Akamai",
)
check(
    "200 + tier1 pattern (Imperva deceptive 200)",
    is_blocked(200, 'Request unsuccessful. Incapsula incident ID: 555-999'),
    True,
    "Incapsula",
)
check(
    "403 + 4999 bytes (just under threshold)",
    is_blocked(403, 'Access Denied' + 'x' * 4950 + ''),
    True,
    "Access Denied",
)
check(
    "403 + 5001 bytes (over old threshold, now blocked)",
    is_blocked(403, 'Some error page' + 'x' * 4960 + ''),
    True,
    "403",
)
check(
    "403 + 9999 bytes with generic block text",
    is_blocked(403, 'blocked by security' + 'x' * 9950 + ''),
    True,
    "Blocked by security",
)
check(
    "403 + 10001 bytes with generic block text (now detected regardless of size)",
    is_blocked(403, 'blocked by security' + 'x' * 9970 + ''),
    True,
    "Blocked by security",
)
check(
    "200 + whitespace-padded but 89 bytes content (above threshold for meaningful)",
    is_blocked(200, ' ' * 10 + 'x' * 89 + ' ' * 10),
    True,
    "empty",
)
check(
    "200 + exactly 100 bytes stripped (at threshold, no body = structural fail)",
    is_blocked(200, 'x' * 100),
    True,
    "no ",
)

# =========================================================================
# SUMMARY
# =========================================================================
print(f"\n{'=' * 60}")
print(f"RESULTS: {PASS} passed, {FAIL} failed out of {PASS + FAIL} tests")
print(f"{'=' * 60}")

if FAIL > 0:
    print("SOME TESTS FAILED!")
    # Non-zero exit status so callers of the script can detect the failure.
    sys.exit(1)
else:
    print("ALL TESTS PASSED!")