Fix asset API security and correctness issues

- Content-Disposition: drop raw filename= parameter, use only RFC 5987
  filename*=UTF-8'' to prevent header injection via ; and special chars
- delete_asset: default delete_content to False (non-destructive) when
  query parameter is omitted
- create_asset_from_hash: return 400 MISSING_INPUT instead of 404 when
  hash not found and no file uploaded (client input error, not missing resource)
- seeder: clear _progress when returning to IDLE so get_status() does not
  return stale progress after scan completion
- hashing: handle non-seekable streams in _hash_file_obj by checking
  seekable() before attempting tell/seek
- bulk_ingest: filter lost_paths to only include paths tied to actually
  inserted asset IDs, preventing inflated counts from ON CONFLICT drops

Amp-Thread-ID: https://ampcode.com/threads/T-019cb67a-9822-7438-ab05-d09991a9f7f3
Co-authored-by: Amp <amp@ampcode.com>
This commit is contained in:
Luke Mino-Altherr
2026-03-03 18:47:48 -08:00
parent f26384f371
commit d653b86bd7
4 changed files with 53 additions and 30 deletions

View File

@@ -187,16 +187,18 @@ def batch_insert_seed_assets(
inserted_asset_ids = get_existing_asset_ids(
session, [r["asset_id"] for r in reference_rows]
)
reference_rows = [
r for r in reference_rows if r["asset_id"] in inserted_asset_ids
]
reference_rows = [r for r in reference_rows if r["asset_id"] in inserted_asset_ids]
bulk_insert_references_ignore_conflicts(session, reference_rows)
restore_references_by_paths(session, absolute_path_list)
winning_paths = get_references_by_paths_and_asset_ids(session, path_to_asset_id)
all_paths_set = set(absolute_path_list)
losing_paths = all_paths_set - winning_paths
inserted_paths = {
path
for path in absolute_path_list
if path_to_asset_id[path] in inserted_asset_ids
}
losing_paths = inserted_paths - winning_paths
lost_asset_ids = [path_to_asset_id[path] for path in losing_paths]
if lost_asset_ids:

View File

@@ -1,3 +1,4 @@
import io
import os
from typing import IO
@@ -21,12 +22,19 @@ def _hash_file_obj(file_obj: IO, chunk_size: int = DEFAULT_CHUNK) -> str:
if chunk_size <= 0:
chunk_size = DEFAULT_CHUNK
orig_pos = file_obj.tell()
seekable = getattr(file_obj, "seekable", lambda: False)()
orig_pos = None
if seekable:
try:
orig_pos = file_obj.tell()
if orig_pos != 0:
file_obj.seek(0)
except io.UnsupportedOperation:
seekable = False
orig_pos = None
try:
if orig_pos != 0:
file_obj.seek(0)
h = blake3()
while True:
chunk = file_obj.read(chunk_size)
@@ -35,4 +43,5 @@ def _hash_file_obj(file_obj: IO, chunk_size: int = DEFAULT_CHUNK) -> str:
h.update(chunk)
return h.hexdigest()
finally:
file_obj.seek(orig_pos)
if seekable and orig_pos is not None:
file_obj.seek(orig_pos)