From f1188f1d5f91811ce156d52435b3bf9484997a0f Mon Sep 17 00:00:00 2001 From: Luke Mino-Altherr Date: Wed, 4 Mar 2026 16:49:44 -0800 Subject: [PATCH] Fix five code review issues 1. Seeder pause/resume: only resume after prompt execution if pause() returned True, preventing undo of user-initiated pauses. 2. Missing rollback in enrich_assets_batch: add sess.rollback() in exception handler to prevent broken session state for subsequent batch operations. 3. Hash checkpoint validation: store mtime_ns/file_size in HashCheckpoint and re-stat on resume instead of comparing the same stat result to itself. 4. Scan progress preserved: save _last_progress before clearing _progress in finally blocks so wait=true endpoint returns final stats instead of zeros. 5. Download XSS hardening: block dangerous MIME types (matching server.py) and add X-Content-Type-Options: nosniff header to asset content endpoint. Amp-Thread-ID: https://ampcode.com/threads/T-019cbb6b-e97b-776d-8c43-2de8acd0d09e Co-authored-by: Amp --- app/assets/api/routes.py | 8 ++++++++ app/assets/scanner.py | 23 ++++++++++++++++------- app/assets/seeder.py | 26 ++++++++++++++++++++------ app/assets/services/hashing.py | 7 +++++-- main.py | 5 +++-- 5 files changed, 52 insertions(+), 17 deletions(-) diff --git a/app/assets/api/routes.py b/app/assets/api/routes.py index d8a94ae96..40dee9f46 100644 --- a/app/assets/api/routes.py +++ b/app/assets/api/routes.py @@ -257,6 +257,13 @@ async def download_asset_content(request: web.Request) -> web.Response: 404, "FILE_NOT_FOUND", "Underlying file not found on disk." ) + _DANGEROUS_MIME_TYPES = { + "text/html", "text/html-sandboxed", "application/xhtml+xml", + "text/javascript", "text/css", + } + if content_type in _DANGEROUS_MIME_TYPES: + content_type = "application/octet-stream" + safe_name = (filename or "").replace("\r", "").replace("\n", "") encoded = urllib.parse.quote(safe_name) cd = f"{disposition}; filename*=UTF-8''{encoded}" @@ -287,6 +294,7 @@ async def download_asset_content(request: web.Request) -> web.Response: headers={ "Content-Disposition": cd, "Content-Length": str(file_size), + "X-Content-Type-Options": "nosniff", }, ) diff --git a/app/assets/scanner.py b/app/assets/scanner.py index 8ca38652a..2d2e63e38 100644 --- a/app/assets/scanner.py +++ b/app/assets/scanner.py @@ -412,8 +412,8 @@ def enrich_asset( asset_id: ID of the asset to update (for mime_type and hash) extract_metadata: If True, extract safetensors header and mime type compute_hash: If True, compute blake3 hash - interrupt_check: Optional callable that may block (e.g. while paused) - and returns True if the operation should be cancelled + interrupt_check: Optional non-blocking callable that returns True if + the operation should be interrupted (e.g. paused or cancelled) hash_checkpoints: Optional dict for saving/restoring hash progress across interruptions, keyed by file path @@ -446,14 +446,20 @@ def enrich_asset( if compute_hash: try: mtime_before = get_mtime_ns(stat_p) + size_before = stat_p.st_size # Restore checkpoint if available and file unchanged checkpoint = None if hash_checkpoints is not None: checkpoint = hash_checkpoints.get(file_path) - if checkpoint is not None and mtime_before != get_mtime_ns(stat_p): - checkpoint = None - hash_checkpoints.pop(file_path, None) + if checkpoint is not None: + cur_stat = os.stat(file_path, follow_symlinks=True) + if (checkpoint.mtime_ns != get_mtime_ns(cur_stat) + or checkpoint.file_size != cur_stat.st_size): + checkpoint = None + hash_checkpoints.pop(file_path, None) + else: + mtime_before = get_mtime_ns(cur_stat) digest, new_checkpoint = compute_blake3_hash( file_path, @@ -464,6 +470,8 @@ def enrich_asset( if digest is None: # Interrupted — save checkpoint for later resumption if hash_checkpoints is not None and new_checkpoint is not None: + new_checkpoint.mtime_ns = mtime_before + new_checkpoint.file_size = size_before hash_checkpoints[file_path] = new_checkpoint return new_level @@ -522,8 +530,8 @@ def enrich_assets_batch( rows: List of UnenrichedReferenceRow from get_unenriched_assets_for_roots extract_metadata: If True, extract metadata for each asset compute_hash: If True, compute hash for each asset - interrupt_check: Optional callable that may block (e.g. while paused) - and returns True if the operation should be cancelled + interrupt_check: Optional non-blocking callable that returns True if + the operation should be interrupted (e.g. paused or cancelled) hash_checkpoints: Optional dict for saving/restoring hash progress across interruptions, keyed by file path @@ -555,6 +563,7 @@ def enrich_assets_batch( failed_ids.append(row.reference_id) except Exception as e: logging.warning("Failed to enrich %s: %s", row.file_path, e) + sess.rollback() failed_ids.append(row.reference_id) return enriched, failed_ids diff --git a/app/assets/seeder.py b/app/assets/seeder.py index 0e3d4811e..029448464 100644 --- a/app/assets/seeder.py +++ b/app/assets/seeder.py @@ -80,6 +80,7 @@ class _AssetSeeder: self._lock = threading.Lock() self._state = State.IDLE self._progress: Progress | None = None + self._last_progress: Progress | None = None self._errors: list[str] = [] self._thread: threading.Thread | None = None self._cancel_event = threading.Event() @@ -315,15 +316,16 @@ class _AssetSeeder: def get_status(self) -> ScanStatus: """Get the current status and progress of the seeder.""" with self._lock: + src = self._progress or self._last_progress return ScanStatus( state=self._state, progress=Progress( - scanned=self._progress.scanned, - total=self._progress.total, - created=self._progress.created, - skipped=self._progress.skipped, + scanned=src.scanned, + total=src.total, + created=src.created, + skipped=src.skipped, ) - if self._progress + if src else None, errors=list(self._errors), ) @@ -379,6 +381,7 @@ class _AssetSeeder: return marked finally: with self._lock: + self._last_progress = self._progress self._state = State.IDLE self._progress = None @@ -386,6 +389,16 @@ class _AssetSeeder: """Check if cancellation has been requested.""" return self._cancel_event.is_set() + def _is_paused_or_cancelled(self) -> bool: + """Non-blocking check: True if paused or cancelled. + + Use as interrupt_check for I/O-bound work (e.g. hashing) so that + file handles are released immediately on pause rather than held + open while blocked. The caller is responsible for blocking on + _check_pause_and_cancel() afterward. + """ + return not self._run_gate.is_set() or self._cancel_event.is_set() + def _check_pause_and_cancel(self) -> bool: """Block while paused, then check if cancelled. @@ -581,6 +594,7 @@ class _AssetSeeder: }, ) with self._lock: + self._last_progress = self._progress self._state = State.IDLE self._progress = None @@ -745,7 +759,7 @@ class _AssetSeeder: unenriched, extract_metadata=True, compute_hash=self._compute_hashes, - interrupt_check=self._check_pause_and_cancel, + interrupt_check=self._is_paused_or_cancelled, hash_checkpoints=hash_checkpoints, ) total_enriched += enriched diff --git a/app/assets/services/hashing.py b/app/assets/services/hashing.py index fe1e3a502..f31f3a006 100644 --- a/app/assets/services/hashing.py +++ b/app/assets/services/hashing.py @@ -16,6 +16,8 @@ class HashCheckpoint: bytes_processed: int hasher: Any # blake3 hasher instance + mtime_ns: int = 0 + file_size: int = 0 def compute_blake3_hash( @@ -29,8 +31,9 @@ def compute_blake3_hash( Args: fp: File path or file-like object chunk_size: Size of chunks to read at a time - interrupt_check: Optional callable that may block (e.g. while paused) - and returns True if the operation should be cancelled. Checked + interrupt_check: Optional callable that returns True if the operation + should be interrupted (e.g. paused or cancelled). Must be + non-blocking so file handles are released immediately. Checked between chunk reads. checkpoint: Optional checkpoint to resume from (file paths only) diff --git a/main.py b/main.py index 2ce1f0662..2aba54e14 100644 --- a/main.py +++ b/main.py @@ -261,11 +261,12 @@ def prompt_worker(q, server_instance): for k in sensitive: extra_data[k] = sensitive[k] - asset_seeder.pause() + was_paused = asset_seeder.pause() try: e.execute(item[2], prompt_id, extra_data, item[4]) finally: - asset_seeder.resume() + if was_paused: + asset_seeder.resume() need_gc = True remove_sensitive = lambda prompt: prompt[:5] + prompt[6:]