From 9b571bb947c5ce0750dff783296afcb0efcd2bd0 Mon Sep 17 00:00:00 2001 From: unclecode Date: Mon, 16 Mar 2026 14:03:43 +0000 Subject: [PATCH] feat: HTTP strategy detects and saves file downloads (CSV, PDF, etc.) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The HTTP crawler strategy now checks Content-Type and Content-Disposition headers to detect non-HTML file responses. When a file download is detected, raw bytes are saved to disk and the path is returned via downloaded_files. Text-based files (CSV, JSON, XML) also populate the html field for backward compatibility. Binary files (PDF, images) set html to empty string — content is only available via downloaded_files. Adds downloads_path to HTTPCrawlerConfig (defaults to ~/.crawl4ai/downloads/). --- .context/PR-TODOLIST.md | 10 +- crawl4ai/async_configs.py | 5 + crawl4ai/async_crawler_strategy.py | 115 ++++++- tests/async/test_http_file_download.py | 423 +++++++++++++++++++++++++ 4 files changed, 539 insertions(+), 14 deletions(-) create mode 100644 tests/async/test_http_file_download.py diff --git a/.context/PR-TODOLIST.md b/.context/PR-TODOLIST.md index c75fa993..c11b0178 100644 --- a/.context/PR-TODOLIST.md +++ b/.context/PR-TODOLIST.md @@ -1,6 +1,6 @@ # PR Review Todolist -> Last updated: 2026-03-07 | Total open PRs: 6 +> Last updated: 2026-03-13 | Total open PRs: 6 --- @@ -94,7 +94,13 @@ --- -## Resolved This Session (batch 5) +## Resolved This Session (batch 6) + +| PR | Author | Description | Date | +|----|--------|-------------|------| +| #1834 | ntohidi | fix: remove shared LOCK contention in monitor to prevent pod deadlock (#1754) | 2026-03-13 | + +## Resolved (batch 5) | PR | Author | Description | Date | |----|--------|-------------|------| diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index d7171559..de07f10c 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -1068,6 +1068,7 @@ class HTTPCrawlerConfig: json: 
Optional[Dict[str, Any]] = None
     follow_redirects: bool = True
     verify_ssl: bool = True
+    downloads_path: Optional[str] = None
 
     def __init__(
         self,
@@ -1077,6 +1078,7 @@ class HTTPCrawlerConfig:
         json: Optional[Dict[str, Any]] = None,
         follow_redirects: bool = True,
         verify_ssl: bool = True,
+        downloads_path: Optional[str] = None,
     ):
         self.method = method
         self.headers = headers
@@ -1084,6 +1086,7 @@ class HTTPCrawlerConfig:
         self.json = json
         self.follow_redirects = follow_redirects
         self.verify_ssl = verify_ssl
+        self.downloads_path = downloads_path
 
     @staticmethod
     def from_kwargs(kwargs: dict) -> "HTTPCrawlerConfig":
@@ -1094,6 +1097,7 @@ class HTTPCrawlerConfig:
             json=kwargs.get("json"),
             follow_redirects=kwargs.get("follow_redirects", True),
             verify_ssl=kwargs.get("verify_ssl", True),
+            downloads_path=kwargs.get("downloads_path"),
         )
 
     def to_dict(self):
@@ -1104,6 +1108,7 @@ class HTTPCrawlerConfig:
             "json": self.json,
             "follow_redirects": self.follow_redirects,
             "verify_ssl": self.verify_ssl,
+            "downloads_path": self.downloads_path,
         }
 
     def clone(self, **kwargs):
diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py
index 04434b58..0ec76dbc 100644
--- a/crawl4ai/async_crawler_strategy.py
+++ b/crawl4ai/async_crawler_strategy.py
@@ -2573,6 +2573,95 @@ class AsyncHTTPCrawlerStrategy(AsyncCrawlerStrategy):
 
         return server
 
+    # Content types treated as text (decoded into html field)
+    _TEXT_CONTENT_TYPES: Final = frozenset({
+        'text/csv', 'text/plain', 'text/tab-separated-values', 'text/xml',
+        'application/json', 'application/xml', 'application/xhtml+xml',
+        'application/rss+xml', 'application/atom+xml', 'application/ld+json',
+        'application/x-ndjson', 'text/calendar', 'text/vcard',
+    })
+
+    def _is_file_download(self, content_type: str, content_disposition: str) -> bool:
+        """Detect if the HTTP response is a file download rather than an HTML page.
+
+        Args:
+            content_type: Bare media type, lower-cased, without parameters.
+            content_disposition: Raw Content-Disposition header value, or ''.
+
+        Returns:
+            True for an ``attachment`` disposition or any non-HTML media type.
+        """
+        # Compare the disposition *type* only, case-insensitively (RFC 6266);
+        # a substring test would also fire on inline; filename="attachment.pdf".
+        if content_disposition.split(';', 1)[0].strip().lower() == 'attachment':
+            return True
+        if not content_type or content_type == 'text/html':
+            return False
+        # Anything that isn't text/html is a file download
+        return True
+
+    def _is_text_content(self, content_type: str) -> bool:
+        """Check if content type is text-based (safe to decode and put in html field)."""
+        if content_type in self._TEXT_CONTENT_TYPES:
+            return True
+        # Catch-all for text/* subtypes not in the explicit set
+        return content_type.startswith('text/')
+
+    def _extract_filename(self, content_disposition: str, url: str, content_type: str) -> str:
+        """Pick a safe on-disk filename for a downloaded response.
+
+        Candidates, in order: RFC 5987 ``filename*=``, plain ``filename=``,
+        then the last URL path segment if it has an extension. Header and URL
+        are remote-controlled, so the result is reduced to a bare basename;
+        otherwise ``filename="../../x"`` would escape the downloads directory.
+
+        Args:
+            content_disposition: Raw Content-Disposition header value, or ''.
+            url: Request URL, used for the path fallback and the hash name.
+            content_type: Bare media type, used for the fallback extension.
+
+        Returns:
+            A filename without directory components, at most 255 characters.
+        """
+        import re
+        from urllib.parse import unquote
+
+        candidate = ''
+        if content_disposition:
+            # filename*=UTF-8''encoded_name (RFC 5987) wins over filename=
+            match = re.search(r"filename\*=UTF-8''(.+?)(?:;|$)", content_disposition, re.IGNORECASE)
+            if match:
+                candidate = unquote(match.group(1).strip())
+            else:
+                # filename="name" or filename=name
+                match = re.search(r'filename="?([^";]+)"?', content_disposition, re.IGNORECASE)
+                if match:
+                    candidate = match.group(1).strip()
+
+        if not candidate:
+            # Fall back to URL path
+            path = urlparse(url).path
+            if path and '/' in path:
+                basename = path.rsplit('/', 1)[-1]
+                if '.' in basename:
+                    candidate = basename
+
+        # Drop directory parts (either separator style) and NUL bytes
+        candidate = candidate.replace('\\', '/').rsplit('/', 1)[-1]
+        candidate = candidate.replace('\x00', '').strip()
+        if candidate and candidate not in ('.', '..') and len(candidate) <= 255:
+            return candidate
+
+        # Last resort: hash-based name with extension from content type
+        ext_map = {
+            'text/csv': '.csv', 'application/pdf': '.pdf',
+            'application/zip': '.zip', 'image/png': '.png',
+            'image/jpeg': '.jpg', 'application/json': '.json',
+            'text/plain': '.txt', 'application/xml': '.xml',
+        }
+        ext = ext_map.get(content_type, '')
+        return f"download_{hashlib.md5(url.encode()).hexdigest()[:10]}{ext}"
+
     async def _handle_http(
         self,
         url: str,
@@ -2612,26 +2701,57 @@ class AsyncHTTPCrawlerStrategy(AsyncCrawlerStrategy):
 
             try:
                 async with session.request(self.browser_config.method, url, **request_kwargs) as response:
-                    content = memoryview(await response.read())
-                    
+                    raw_bytes = await response.read()
+
                     if not (200 <= response.status < 300):
                         raise HTTPStatusError(
                             response.status,
                             f"Unexpected status code for {url}"
                         )
-                    
-                    encoding = response.charset
-                    if not encoding:
-                        detection_result = await asyncio.to_thread(chardet.detect, content.tobytes())
-                        encoding = detection_result['encoding'] or 'utf-8'
-                    
+
+                    response_headers = dict(response.headers)
+                    content_type = response.content_type or 'text/html'
+                    content_type = content_type.split(';')[0].strip().lower()
+                    # Look the header up on aiohttp's case-insensitive mapping;
+                    # the plain-dict copy only matches the exact casing sent.
+                    content_disposition = response.headers.get('Content-Disposition', '')
+
+                    downloaded_files = None
+                    is_download = self._is_file_download(content_type, content_disposition)
+
+                    if is_download:
+                        # Save file to disk
+                        downloads_path = self.browser_config.downloads_path or os.path.join(
+                            os.path.expanduser("~"), ".crawl4ai", "downloads"
+                        )
+                        os.makedirs(downloads_path, exist_ok=True)
+
+                        filename = self._extract_filename(content_disposition, url, content_type)
+                        filepath = os.path.join(downloads_path, filename)
+
+                        async with aiofiles.open(filepath, 'wb') as f:
+                            await f.write(raw_bytes)
+
+                        downloaded_files = [filepath]
+
+                    # HTML pages and text-based downloads are decoded into html;
+                    # binary downloads leave it empty (bytes live in downloaded_files).
+                    html = ""
+                    if not is_download or self._is_text_content(content_type):
+                        encoding = response.charset
+                        if not encoding:
+                            detection_result = await asyncio.to_thread(chardet.detect, raw_bytes)
+                            encoding = detection_result['encoding'] or 'utf-8'
+                        html = raw_bytes.decode(encoding, errors='replace')
+
                     result = AsyncCrawlResponse(
-                        html=content.tobytes().decode(encoding, errors='replace'),
-                        response_headers=dict(response.headers),
+                        html=html,
+                        response_headers=response_headers,
                         status_code=response.status,
-                        redirected_url=str(response.url)
+                        redirected_url=str(response.url),
+                        downloaded_files=downloaded_files,
                     )
-                    
+
                     await self.hooks['after_request'](result)
                     return result
 
diff --git a/tests/async/test_http_file_download.py b/tests/async/test_http_file_download.py
new file mode 100644
index 00000000..6a9d7357
--- /dev/null
+++ b/tests/async/test_http_file_download.py
@@ -0,0 +1,423 @@
+"""
+Tests for HTTP strategy file download detection and handling.
+
+Tests the Content-Type/Content-Disposition detection logic in AsyncHTTPCrawlerStrategy
+that saves non-HTML responses to disk and populates downloaded_files.
+"""
+
+import os
+import sys
+import asyncio
+import tempfile
+import shutil
+import json
+import socket
+
+import pytest
+from aiohttp import web
+
+# Add the repo root (tests/async/ -> tests/ -> root) so crawl4ai is importable
+parent_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+sys.path.insert(0, parent_dir)
+
+from crawl4ai.async_crawler_strategy import AsyncHTTPCrawlerStrategy
+from crawl4ai.async_configs import HTTPCrawlerConfig, CrawlerRunConfig
+
+
+# ---------------------------------------------------------------------------
+# Test HTTP server
+# ---------------------------------------------------------------------------
+
+async def handle_html(request):
+    return web.Response(text="Hello", content_type="text/html")
+
+async def handle_csv(request):
+    csv_data = "id,name,value\n1,alpha,100\n2,beta,200\n3,gamma,300\n"
+    return web.Response(
+        text=csv_data,
+        content_type="text/csv",
+        headers={"Content-Disposition": 'attachment; filename="data.csv"'},
+    )
+
+async def handle_csv_no_disposition(request):
+    return web.Response(text="col1,col2\na,b\nc,d\n", content_type="text/csv")
+
+async def handle_json(request):
+    return web.Response(
+        text=json.dumps({"key": "value", "items": [1, 2, 3]}),
+        content_type="application/json",
+    )
+
+async def handle_pdf(request):
+    pdf_bytes = b"%PDF-1.4 fake pdf content " + (b"\x00\xff" * 500)
+    return web.Response(
+        body=pdf_bytes,
+        content_type="application/pdf",
+        headers={"Content-Disposition": 'attachment; filename="report.pdf"'},
+    )
+
+async def handle_binary_no_name(request):
+    return web.Response(body=b"\x89PNG\r\n" + b"\x00" * 100, content_type="image/png")
+
+async def handle_plain_text(request):
+    return web.Response(text="Just plain text content.", content_type="text/plain")
+
+async def handle_xml(request):
+    return web.Response(
+        text='test',
+        content_type="application/xml",
+    )
+
+async def handle_attachment_html(request):
+    return web.Response(
+        text="download me",
+        content_type="text/html",
+        headers={"Content-Disposition": 'attachment; filename="page.html"'},
+    )
+
+async def handle_csv_url_filename(request):
+    return web.Response(text="x,y\n1,2\n", content_type="text/csv")
+
+
+def _find_free_port():
+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+        s.bind(("127.0.0.1", 0))
+        return s.getsockname()[1]
+
+
+class _TestServer:
+    """Minimal test server lifecycle manager."""
+
+    def __init__(self):
+        self.port = _find_free_port()
+        self.runner = None
+
+    @property
+    def base_url(self):
+        return f"http://127.0.0.1:{self.port}"
+
+    def url(self, path):
+        return f"{self.base_url}{path}"
+
+    async def start(self):
+        app = web.Application()
+        app.router.add_get("/page.html", handle_html)
+        app.router.add_get("/data.csv", handle_csv)
+        app.router.add_get("/inline.csv", handle_csv_no_disposition)
+        app.router.add_get("/api/data.json", handle_json)
+        app.router.add_get("/report.pdf", handle_pdf)
+        app.router.add_get("/image", handle_binary_no_name)
+        app.router.add_get("/readme.txt", handle_plain_text)
+        app.router.add_get("/feed.xml", handle_xml)
+        app.router.add_get("/attachment.html", handle_attachment_html)
+        app.router.add_get("/files/export.csv", handle_csv_url_filename)
+
+        self.runner = web.AppRunner(app)
+        await self.runner.setup()
+        site = web.TCPSite(self.runner, "127.0.0.1", self.port)
+        await site.start()
+
+    async def stop(self):
+        if self.runner:
+            await self.runner.cleanup()
+
+
+# ---------------------------------------------------------------------------
+# Helper to run an async crawl test
+# ---------------------------------------------------------------------------
+
+async def _crawl(server, path, downloads_dir=None):
+    config = HTTPCrawlerConfig(downloads_path=downloads_dir)
+    strategy = AsyncHTTPCrawlerStrategy(browser_config=config)
+    return await strategy.crawl(server.url(path))
+
+
+# ---------------------------------------------------------------------------
+# Tests
+# ---------------------------------------------------------------------------
+
+class TestHTMLPassthrough:
+    """Normal HTML responses should behave exactly as before."""
+
+    def test_html_response_unchanged(self, tmp_path):
+        async def _test():
+            srv = _TestServer()
+            await srv.start()
+            try:
+                dl_dir = str(tmp_path / "dl")
+                os.makedirs(dl_dir)
+                result = await _crawl(srv, "/page.html", dl_dir)
+
+                assert "Hello" in result.html  # body served by handle_html
+                assert result.downloaded_files is None
+                assert result.status_code == 200
+                assert len(os.listdir(dl_dir)) == 0
+            finally:
+                await srv.stop()
+
+        asyncio.run(_test())  # fresh loop per test; get_event_loop() is deprecated here
+
+
+class TestTextFileDownloads:
+    """Text-based file downloads (CSV, JSON, XML, plain text)."""
+
+    def test_csv_with_disposition(self, tmp_path):
+        async def _test():
+            srv = _TestServer()
+            await srv.start()
+            try:
+                dl_dir = str(tmp_path / "dl")
+                os.makedirs(dl_dir)
+                result = await _crawl(srv, "/data.csv", dl_dir)
+
+                assert result.downloaded_files is not None
+                assert len(result.downloaded_files) == 1
+                filepath = result.downloaded_files[0]
+                assert filepath.endswith("data.csv")
+                assert os.path.isfile(filepath)
+                assert "alpha" in result.html
+                assert "id,name,value" in result.html
+                with open(filepath, encoding="utf-8") as f:
+                    assert "alpha" in f.read()
+            finally:
+                await srv.stop()
+
+        asyncio.run(_test())
+
+    def test_csv_without_disposition(self, tmp_path):
+        async def _test():
+            srv = _TestServer()
+            await srv.start()
+            try:
+                dl_dir = str(tmp_path / "dl")
+                os.makedirs(dl_dir)
+                result = await _crawl(srv, "/inline.csv", dl_dir)
+
+                assert result.downloaded_files is not None
+                assert len(result.downloaded_files) == 1
+                assert result.downloaded_files[0].endswith("inline.csv")
+                assert "col1,col2" in result.html
+            finally:
+                await srv.stop()
+
+        asyncio.run(_test())
+
+    def test_json_download(self, tmp_path):
+        async def _test():
+            srv = _TestServer()
+            await srv.start()
+            try:
+                dl_dir = str(tmp_path / "dl")
+                os.makedirs(dl_dir)
+                result = await _crawl(srv, "/api/data.json", dl_dir)
+
+                assert result.downloaded_files is not None
+                filepath = result.downloaded_files[0]
+                assert filepath.endswith("data.json")
+                assert '"key"' in result.html
+            finally:
+                await srv.stop()
+
+        asyncio.run(_test())
+
+    def test_plain_text(self, tmp_path):
+        async def _test():
+            srv = _TestServer()
+            await srv.start()
+            try:
+                dl_dir = str(tmp_path / "dl")
+                os.makedirs(dl_dir)
+                result = await _crawl(srv, "/readme.txt", dl_dir)
+
+                assert result.downloaded_files is not None
+                assert "Just plain text content." in result.html
+            finally:
+                await srv.stop()
+
+        asyncio.run(_test())
+
+    def test_xml_download(self, tmp_path):
+        async def _test():
+            srv = _TestServer()
+            await srv.start()
+            try:
+                dl_dir = str(tmp_path / "dl")
+                os.makedirs(dl_dir)
+                result = await _crawl(srv, "/feed.xml", dl_dir)
+
+                assert result.downloaded_files is not None
+                assert "test" in result.html  # body served by handle_xml
+            finally:
+                await srv.stop()
+
+        asyncio.run(_test())
+
+    def test_csv_filename_from_url(self, tmp_path):
+        async def _test():
+            srv = _TestServer()
+            await srv.start()
+            try:
+                dl_dir = str(tmp_path / "dl")
+                os.makedirs(dl_dir)
+                result = await _crawl(srv, "/files/export.csv", dl_dir)
+
+                assert result.downloaded_files is not None
+                assert result.downloaded_files[0].endswith("export.csv")
+            finally:
+                await srv.stop()
+
+        asyncio.run(_test())
+
+
+class TestBinaryFileDownloads:
+    """Binary file downloads (PDF, images) — html should be empty."""
+
+    def test_pdf_download(self, tmp_path):
+        async def _test():
+            srv = _TestServer()
+            await srv.start()
+            try:
+                dl_dir = str(tmp_path / "dl")
+                os.makedirs(dl_dir)
+                result = await _crawl(srv, "/report.pdf", dl_dir)
+
+                assert result.downloaded_files is not None
+                filepath = result.downloaded_files[0]
+                assert filepath.endswith("report.pdf")
+                assert os.path.isfile(filepath)
+                assert result.html == ""
+                with open(filepath, "rb") as f:
+                    data = f.read()
+                assert data.startswith(b"%PDF")
+            finally:
+                await srv.stop()
+
+        asyncio.run(_test())
+
+    def test_binary_no_filename(self, tmp_path):
+        async def _test():
+            srv = _TestServer()
+            await srv.start()
+            try:
+                dl_dir = str(tmp_path / "dl")
+                os.makedirs(dl_dir)
+                result = await _crawl(srv, "/image", dl_dir)
+
+                assert result.downloaded_files is not None
+                filepath = result.downloaded_files[0]
+                assert filepath.endswith(".png")
+                assert os.path.isfile(filepath)
+                assert result.html == ""
+            finally:
+                await srv.stop()
+
+        asyncio.run(_test())
+
+
+class TestEdgeCases:
+    """Edge cases and backward compatibility."""
+
+    def test_attachment_html_treated_as_download(self, tmp_path):
+        async def _test():
+            srv = _TestServer()
+            await srv.start()
+            try:
+                dl_dir = str(tmp_path / "dl")
+                os.makedirs(dl_dir)
+                result = await _crawl(srv, "/attachment.html", dl_dir)
+
+                assert result.downloaded_files is not None
+                assert result.downloaded_files[0].endswith("page.html")
+                assert "download me" in result.html
+            finally:
+                await srv.stop()
+
+        asyncio.run(_test())
+
+    def test_default_downloads_path(self, tmp_path):
+        async def _test():
+            srv = _TestServer()
+            await srv.start()
+            try:
+                config = HTTPCrawlerConfig()  # no downloads_path
+                strategy = AsyncHTTPCrawlerStrategy(browser_config=config)
+                result = await strategy.crawl(srv.url("/data.csv"))
+
+                assert result.downloaded_files is not None
+                filepath = result.downloaded_files[0]
+                assert os.path.join(".crawl4ai", "downloads") in filepath  # OS-native separator
+                if os.path.isfile(filepath):
+                    os.unlink(filepath)
+            finally:
+                await srv.stop()
+
+        asyncio.run(_test())
+
+    def test_response_headers_contain_content_type(self, tmp_path):
+        async def _test():
+            srv = _TestServer()
+            await srv.start()
+            try:
+                dl_dir = str(tmp_path / "dl")
+                os.makedirs(dl_dir)
+                result = await _crawl(srv, "/data.csv", dl_dir)
+                assert "text/csv" in result.response_headers.get("Content-Type", "")
+            finally:
+                await srv.stop()
+
+        asyncio.run(_test())
+
+    def test_status_code_preserved(self, tmp_path):
+        async def _test():
+            srv = _TestServer()
+            await srv.start()
+            try:
+                dl_dir = str(tmp_path / "dl")
+                os.makedirs(dl_dir)
+                result = await _crawl(srv, "/report.pdf", dl_dir)
+                assert result.status_code == 200
+            finally:
+                await srv.stop()
+
+        asyncio.run(_test())
+
+
+class TestDetectionHelpers:
+    """Unit tests for the detection helper methods."""
+
+    def test_is_file_download(self):
+        s = AsyncHTTPCrawlerStrategy()
+        assert s._is_file_download("text/csv", "") is True
+        assert s._is_file_download("application/pdf", "") is True
+        assert s._is_file_download("image/png", "") is True
+        assert s._is_file_download("text/html", "") is False
+        assert s._is_file_download("text/html", "attachment; filename=x") is True
+        assert s._is_file_download("", "") is False
+
+    def test_is_text_content(self):
+        s = AsyncHTTPCrawlerStrategy()
+        assert s._is_text_content("text/csv") is True
+        assert s._is_text_content("text/plain") is True
+        assert s._is_text_content("application/json") is True
+        assert s._is_text_content("application/pdf") is False
+        assert s._is_text_content("image/png") is False
+        assert s._is_text_content("text/tab-separated-values") is True
+
+    def test_extract_filename_from_disposition(self):
+        s = AsyncHTTPCrawlerStrategy()
+        assert s._extract_filename('attachment; filename="data.csv"', "http://x/y", "text/csv") == "data.csv"
+        assert s._extract_filename("attachment; filename=report.pdf", "http://x/y", "application/pdf") == "report.pdf"
+
+    def test_extract_filename_from_url(self):
+        s = AsyncHTTPCrawlerStrategy()
+        assert s._extract_filename("", "http://example.com/files/export.csv", "text/csv") == "export.csv"
+
+    def test_extract_filename_fallback(self):
+        s = AsyncHTTPCrawlerStrategy()
+        name = s._extract_filename("", "http://example.com/download", "application/pdf")
+        assert name.startswith("download_")
+        assert name.endswith(".pdf")
+
+
+if __name__ == "__main__":
+    pytest.main([__file__, "-v"])