feat: soft-delete for AssetReference with scanner persistence

- Add deleted_at column to AssetReference model and migration
- soft_delete_reference_by_id sets deleted_at instead of removing rows
- DELETE /api/assets/{id} defaults to soft-delete; delete_content=true
  for hard-delete
- Add deleted_at IS NULL filters to read queries, tag queries, and
  scanner queries so soft-deleted refs are invisible
- restore_references_by_paths skips soft-deleted refs
- upsert_reference clears deleted_at on explicit re-ingest
- Add tests for soft-delete API behavior, scanner persistence, bulk
  insert, enrichment exclusion, and seed asset garbage collection

Amp-Thread-ID: https://ampcode.com/threads/T-019cb6fc-c05c-761f-b855-6d5d1c9defa2
Co-authored-by: Amp <amp@ampcode.com>
This commit is contained in:
Luke Mino-Altherr
2026-03-03 20:04:01 -08:00
parent ce17e303fc
commit d5cac66405
12 changed files with 247 additions and 17 deletions

View File

@@ -212,7 +212,7 @@ def asset_factory(http: requests.Session, api_base: str):
for aid in created:
with contextlib.suppress(Exception):
http.delete(f"{api_base}/api/assets/{aid}", timeout=30)
http.delete(f"{api_base}/api/assets/{aid}?delete_content=true", timeout=30)
@pytest.fixture
@@ -258,4 +258,4 @@ def autoclean_unit_test_assets(http: requests.Session, api_base: str):
break
for aid in ids:
with contextlib.suppress(Exception):
http.delete(f"{api_base}/api/assets/{aid}", timeout=30)
http.delete(f"{api_base}/api/assets/{aid}?delete_content=true", timeout=30)

View File

@@ -140,7 +140,7 @@ class TestUpdateAssetMetadata:
class TestDeleteAssetReference:
def test_deletes_reference(self, mock_create_session, session: Session):
def test_soft_deletes_reference(self, mock_create_session, session: Session):
asset = _make_asset(session)
ref = _make_reference(session, asset)
ref_id = ref.id
@@ -153,7 +153,11 @@ class TestDeleteAssetReference:
)
assert result is True
assert session.get(AssetReference, ref_id) is None
# Row still exists but is marked as soft-deleted
session.expire_all()
row = session.get(AssetReference, ref_id)
assert row is not None
assert row.deleted_at is not None
def test_returns_false_for_nonexistent(self, mock_create_session):
result = delete_asset_reference(

View File

@@ -42,8 +42,8 @@ def test_get_and_delete_asset(http: requests.Session, api_base: str, seeded_asse
assert "user_metadata" in detail
assert "filename" in detail["user_metadata"]
# DELETE
rd = http.delete(f"{api_base}/api/assets/{aid}", timeout=120)
# DELETE (hard delete to also remove underlying asset and file)
rd = http.delete(f"{api_base}/api/assets/{aid}?delete_content=true", timeout=120)
assert rd.status_code == 204
# GET again -> 404
@@ -53,6 +53,35 @@ def test_get_and_delete_asset(http: requests.Session, api_base: str, seeded_asse
assert body["error"]["code"] == "ASSET_NOT_FOUND"
def test_soft_delete_hides_from_get(http: requests.Session, api_base: str, seeded_asset: dict):
    """Default DELETE soft-deletes the reference: hidden from reads, content kept.

    Checks, in order, that after a DELETE without ``delete_content``:
    the reference ID answers 404, the asset hash still answers 200 on HEAD,
    and the reference is absent from the tag-filtered listing.

    Args:
        http: Shared HTTP session fixture.
        api_base: Base URL of the server under test.
        seeded_asset: Fixture dict providing at least ``id`` and ``asset_hash``.
    """
    aid = seeded_asset["id"]
    asset_hash = seeded_asset["asset_hash"]

    try:
        # Soft-delete (default, no delete_content param)
        rd = http.delete(f"{api_base}/api/assets/{aid}", timeout=120)
        assert rd.status_code == 204

        # GET by reference ID -> 404 (soft-deleted references are hidden)
        rg = http.get(f"{api_base}/api/assets/{aid}", timeout=120)
        assert rg.status_code == 404

        # Asset identity is preserved (underlying content still exists)
        rh = http.head(f"{api_base}/api/assets/hash/{asset_hash}", timeout=120)
        assert rh.status_code == 200

        # Soft-deleted reference should not appear in listings
        rl = http.get(
            f"{api_base}/api/assets",
            params={"include_tags": "unit-tests", "limit": "500"},
            timeout=120,
        )
        # Without this check an error response would produce an empty id list
        # and the "not in" assertion below would pass vacuously.
        assert rl.status_code == 200, rl.text
        ids = [a["id"] for a in rl.json().get("assets", [])]
        assert aid not in ids
    finally:
        # Clean up: hard-delete the soft-deleted reference and orphaned asset.
        # Done in ``finally`` so a failed assertion above does not leave the
        # soft-deleted row behind.
        http.delete(f"{api_base}/api/assets/{aid}?delete_content=true", timeout=120)
def test_delete_upon_reference_count(
http: requests.Session, api_base: str, seeded_asset: dict
):
@@ -70,21 +99,32 @@ def test_delete_upon_reference_count(
assert copy["asset_hash"] == src_hash
assert copy["created_new"] is False
# Delete original reference -> asset identity must remain
# Soft-delete original reference (default) -> asset identity must remain
aid1 = seeded_asset["id"]
rd1 = http.delete(f"{api_base}/api/assets/{aid1}", timeout=120)
assert rd1.status_code == 204
rh1 = http.head(f"{api_base}/api/assets/hash/{src_hash}", timeout=120)
assert rh1.status_code == 200 # identity still present
assert rh1.status_code == 200 # identity still present (second ref exists)
# Delete the last reference with default semantics -> identity and cached files removed
# Soft-delete the last reference -> asset identity preserved (no hard delete)
aid2 = copy["id"]
rd2 = http.delete(f"{api_base}/api/assets/{aid2}", timeout=120)
assert rd2.status_code == 204
rh2 = http.head(f"{api_base}/api/assets/hash/{src_hash}", timeout=120)
assert rh2.status_code == 404 # orphan content removed
assert rh2.status_code == 200 # asset identity preserved (soft delete)
# Re-associate via from-hash, then hard-delete -> orphan content removed
r3 = http.post(f"{api_base}/api/assets/from-hash", json=payload, timeout=120)
assert r3.status_code == 201, r3.json()
aid3 = r3.json()["id"]
rd3 = http.delete(f"{api_base}/api/assets/{aid3}?delete_content=true", timeout=120)
assert rd3.status_code == 204
rh3 = http.head(f"{api_base}/api/assets/hash/{src_hash}", timeout=120)
assert rh3.status_code == 404 # orphan content removed
def test_update_asset_fields(http: requests.Session, api_base: str, seeded_asset: dict):

View File

@@ -117,7 +117,7 @@ def test_download_missing_file_returns_404(
assert body["error"]["code"] == "FILE_NOT_FOUND"
finally:
# We created the asset without the "unit-tests" tag (see `autoclean_unit_test_assets`), so we need to clear it manually.
dr = http.delete(f"{api_base}/api/assets/{aid}", timeout=120)
dr = http.delete(f"{api_base}/api/assets/{aid}?delete_content=true", timeout=120)
dr.content

View File

@@ -2,6 +2,7 @@
import os
import tempfile
from datetime import datetime
from pathlib import Path
from unittest.mock import patch
@@ -16,6 +17,12 @@ from app.assets.database.models import (
Base,
Tag,
)
from app.assets.database.queries.asset_reference import (
bulk_insert_references_ignore_conflicts,
get_references_for_prefixes,
get_unenriched_references,
restore_references_by_paths,
)
from app.assets.scanner import sync_references_with_filesystem
from app.assets.services.file_utils import get_mtime_ns
@@ -348,3 +355,128 @@ def test_no_references_is_noop(session, temp_dir):
session.commit()
assert survivors == set()
# ---------------------------------------------------------------------------
# Soft-delete persistence across scanner operations
# ---------------------------------------------------------------------------
def _soft_delete_ref(session: Session, ref_id: str) -> None:
    """Mark a reference as soft-deleted (mimics the API DELETE behaviour).

    Sets ``deleted_at`` to a fixed naive timestamp and flushes the session;
    committing is left to the caller.

    Args:
        session: Session used to load and flush the reference.
        ref_id: Primary key of the ``AssetReference`` to mark.
    """
    ref = session.get(AssetReference, ref_id)
    # Fail with a readable message instead of an AttributeError on None when
    # the test forgot to create the reference first.
    assert ref is not None, f"AssetReference {ref_id!r} not found"
    ref.deleted_at = datetime(2025, 1, 1)
    session.flush()
def test_soft_deleted_ref_excluded_from_get_references_for_prefixes(session, temp_dir):
    """A soft-deleted reference is not returned by get_references_for_prefixes."""
    # Arrange: one asset whose only reference is soft-deleted.
    model_path = _create_file(temp_dir, "model.bin")
    _make_asset(
        session,
        "a1",
        model_path,
        "r1",
        asset_hash="blake3:abc",
        mtime_ns=_stat_mtime_ns(model_path),
    )
    _soft_delete_ref(session, "r1")
    session.commit()

    # Act: query the prefix that contains the file, missing rows included.
    prefixes = [str(temp_dir)]
    found = get_references_for_prefixes(session, prefixes, include_missing=True)

    # Assert: nothing comes back.
    assert len(found) == 0
def test_sync_does_not_resurrect_soft_deleted_ref(session, temp_dir):
    """A scanner sync must keep a soft-deleted ref deleted, even with the file on disk."""
    # Arrange: file exists on disk, but its reference is soft-deleted.
    model_path = _create_file(temp_dir, "model.bin")
    _make_asset(
        session,
        "a1",
        model_path,
        "r1",
        asset_hash="blake3:abc",
        mtime_ns=_stat_mtime_ns(model_path),
    )
    _soft_delete_ref(session, "r1")
    session.commit()

    # Act: run the sync with the scanner pointed at the temp directory.
    scan_prefixes = [str(temp_dir)]
    with patch("app.assets.scanner.get_prefixes_for_root", return_value=scan_prefixes):
        sync_references_with_filesystem(session, "models")
        session.commit()

    # Assert: reload from the database and confirm the marker is still set.
    session.expire_all()
    reloaded = session.get(AssetReference, "r1")
    assert reloaded.deleted_at is not None, "soft-deleted ref must stay deleted after sync"
def test_bulk_insert_does_not_overwrite_soft_deleted_ref(session, temp_dir):
    """A bulk insert for the same file_path leaves the soft-deleted row in place."""
    # Arrange: an existing reference for the file, already soft-deleted.
    model_path = _create_file(temp_dir, "model.bin")
    file_mtime = _stat_mtime_ns(model_path)
    _make_asset(
        session, "a1", model_path, "r1", asset_hash="blake3:abc", mtime_ns=file_mtime
    )
    _soft_delete_ref(session, "r1")
    session.commit()

    # Act: try to insert a second reference that points at the same file.
    stamp = datetime.now(tz=None)
    conflicting_row = {
        "id": "r_new",
        "asset_id": "a1",
        "file_path": model_path,
        "name": "model.bin",
        "owner_id": "",
        "mtime_ns": file_mtime,
        "preview_id": None,
        "user_metadata": None,
        "created_at": stamp,
        "updated_at": stamp,
        "last_access_time": stamp,
    }
    bulk_insert_references_ignore_conflicts(session, [conflicting_row])
    session.commit()
    session.expire_all()

    # Original row is still the soft-deleted one
    original = session.get(AssetReference, "r1")
    assert original is not None
    assert original.deleted_at is not None

    # The new row was not inserted (conflict on file_path)
    assert session.get(AssetReference, "r_new") is None
def test_restore_references_by_paths_skips_soft_deleted(session, temp_dir):
    """Restoring by path must not touch is_missing on a soft-deleted reference."""
    # Arrange: a reference flagged missing and then soft-deleted.
    model_path = _create_file(temp_dir, "model.bin")
    _make_asset(
        session,
        "a1",
        model_path,
        "r1",
        asset_hash="blake3:abc",
        mtime_ns=_stat_mtime_ns(model_path),
        is_missing=True,
    )
    _soft_delete_ref(session, "r1")
    session.commit()

    # Act: ask for the path to be restored.
    restored_count = restore_references_by_paths(session, [model_path])
    session.commit()

    # Assert: nothing was restored and the row is unchanged.
    assert restored_count == 0
    session.expire_all()
    reloaded = session.get(AssetReference, "r1")
    assert reloaded.is_missing is True, "is_missing must not be cleared on soft-deleted ref"
    assert reloaded.deleted_at is not None
def test_get_unenriched_references_excludes_soft_deleted(session, temp_dir):
    """The unenriched-reference query returns nothing for a soft-deleted reference."""
    # Arrange: a single reference that has been soft-deleted.
    model_path = _create_file(temp_dir, "model.bin")
    _make_asset(
        session,
        "a1",
        model_path,
        "r1",
        asset_hash="blake3:abc",
        mtime_ns=_stat_mtime_ns(model_path),
    )
    _soft_delete_ref(session, "r1")
    session.commit()

    # Act: look for enrichment candidates under the temp directory.
    candidates = get_unenriched_references(session, [str(temp_dir)], max_level=2)

    # Assert: the soft-deleted reference is not among them.
    assert len(candidates) == 0
def test_sync_ignores_soft_deleted_seed_asset(session, temp_dir):
    """A sync keeps a soft-deleted seed ref and its asset although the file is absent."""
    # Arrange: hash-less asset whose file was never created, ref soft-deleted.
    absent_path = str(temp_dir / "gone.bin")  # file does not exist
    _make_asset(session, "seed1", absent_path, "r1", asset_hash=None, mtime_ns=999)
    _soft_delete_ref(session, "r1")
    session.commit()

    # Act: run the sync with the scanner pointed at the temp directory.
    scan_prefixes = [str(temp_dir)]
    with patch("app.assets.scanner.get_prefixes_for_root", return_value=scan_prefixes):
        sync_references_with_filesystem(session, "models")
        session.commit()

    session.expire_all()
    # Asset and ref must still exist — scanner did not see the soft-deleted row
    surviving_asset = session.get(Asset, "seed1")
    surviving_ref = session.get(AssetReference, "r1")
    assert surviving_asset is not None
    assert surviving_ref is not None

View File

@@ -69,8 +69,8 @@ def test_tags_empty_usage(http: requests.Session, api_base: str, asset_factory,
used_names = [t["name"] for t in body2["tags"]]
assert custom_tag in used_names
# Delete the asset so the tag usage drops to zero
rd = http.delete(f"{api_base}/api/assets/{_asset['id']}", timeout=120)
# Hard-delete the asset so the tag usage drops to zero
rd = http.delete(f"{api_base}/api/assets/{_asset['id']}?delete_content=true", timeout=120)
assert rd.status_code == 204
# Now the custom tag must not be returned when include_zero=false