feat: non-destructive asset pruning with is_missing flag

- Add is_missing column to AssetCacheState for soft-delete
- Replace hard-delete pruning with mark_cache_states_missing_outside_prefixes
- Auto-restore missing cache states when files are re-scanned
- Filter out missing cache states from queries by default
- Rename functions for clarity:
  - mark_cache_states_missing_outside_prefixes (was delete_cache_states_outside_prefixes)
  - get_unreferenced_unhashed_asset_ids (was get_orphaned_seed_asset_ids)
  - mark_assets_missing_outside_prefixes (was prune_orphaned_assets)
  - mark_missing_outside_prefixes_safely (was prune_orphans_safely)
- Add restore_cache_states_by_paths for explicit restoration
- Add cleanup_unreferenced_assets for explicit hard-delete when needed
- Update API endpoint /api/assets/prune to use new soft-delete behavior

This preserves user metadata (tags, etc.) when base directories change,
allowing assets to be restored when the original paths become available again.

Amp-Thread-ID: https://ampcode.com/threads/T-019c3114-bf28-73a9-a4d2-85b208fd5462
Co-authored-by: Amp <amp@ampcode.com>
This commit is contained in:
Luke Mino-Altherr
2026-02-05 21:21:46 -08:00
parent 6443cf016e
commit f13b7aeba2
11 changed files with 282 additions and 103 deletions

View File

@@ -6,8 +6,7 @@ from app.assets.database.models import Asset, AssetCacheState, AssetInfo
from app.assets.database.queries import (
list_cache_states_by_asset_id,
upsert_cache_state,
delete_cache_states_outside_prefixes,
get_orphaned_seed_asset_ids,
get_unreferenced_unhashed_asset_ids,
delete_assets_by_ids,
get_cache_states_for_prefixes,
bulk_set_needs_verify,
@@ -15,6 +14,8 @@ from app.assets.database.queries import (
delete_orphaned_seed_asset,
bulk_insert_cache_states_ignore_conflicts,
get_cache_states_by_paths_and_asset_ids,
mark_cache_states_missing_outside_prefixes,
restore_cache_states_by_paths,
)
from app.assets.helpers import select_best_live_path, get_utc_now
@@ -168,9 +169,51 @@ class TestUpsertCacheState:
state = session.query(AssetCacheState).filter_by(file_path=file_path).one()
assert state.mtime_ns == final_mtime
def test_upsert_restores_missing_state(self, session: Session):
"""Upserting a cache state that was marked missing should restore it."""
asset = _make_asset(session, "hash1")
file_path = "/restored/file.bin"
class TestDeleteCacheStatesOutsidePrefixes:
def test_deletes_states_outside_prefixes(self, session: Session, tmp_path):
state = _make_cache_state(session, asset, file_path, mtime_ns=100)
state.is_missing = True
session.commit()
created, updated = upsert_cache_state(
session, asset_id=asset.id, file_path=file_path, mtime_ns=100
)
session.commit()
assert created is False
assert updated is True
restored_state = session.query(AssetCacheState).filter_by(file_path=file_path).one()
assert restored_state.is_missing is False
class TestRestoreCacheStatesByPaths:
def test_restores_missing_states(self, session: Session):
asset = _make_asset(session, "hash1")
missing_path = "/missing/file.bin"
active_path = "/active/file.bin"
missing_state = _make_cache_state(session, asset, missing_path)
missing_state.is_missing = True
_make_cache_state(session, asset, active_path)
session.commit()
restored = restore_cache_states_by_paths(session, [missing_path])
session.commit()
assert restored == 1
state = session.query(AssetCacheState).filter_by(file_path=missing_path).one()
assert state.is_missing is False
def test_empty_list_restores_nothing(self, session: Session):
restored = restore_cache_states_by_paths(session, [])
assert restored == 0
class TestMarkCacheStatesMissingOutsidePrefixes:
def test_marks_states_missing_outside_prefixes(self, session: Session, tmp_path):
asset = _make_asset(session, "hash1")
valid_dir = tmp_path / "valid"
valid_dir.mkdir()
@@ -184,39 +227,48 @@ class TestDeleteCacheStatesOutsidePrefixes:
_make_cache_state(session, asset, invalid_path)
session.commit()
deleted = delete_cache_states_outside_prefixes(session, [str(valid_dir)])
marked = mark_cache_states_missing_outside_prefixes(session, [str(valid_dir)])
session.commit()
assert deleted == 1
remaining = session.query(AssetCacheState).all()
assert len(remaining) == 1
assert remaining[0].file_path == valid_path
assert marked == 1
all_states = session.query(AssetCacheState).all()
assert len(all_states) == 2
def test_empty_prefixes_deletes_nothing(self, session: Session):
valid_state = next(s for s in all_states if s.file_path == valid_path)
invalid_state = next(s for s in all_states if s.file_path == invalid_path)
assert valid_state.is_missing is False
assert invalid_state.is_missing is True
def test_empty_prefixes_marks_nothing(self, session: Session):
asset = _make_asset(session, "hash1")
_make_cache_state(session, asset, "/some/path.bin")
session.commit()
deleted = delete_cache_states_outside_prefixes(session, [])
marked = mark_cache_states_missing_outside_prefixes(session, [])
assert deleted == 0
assert marked == 0
class TestGetOrphanedSeedAssetIds:
def test_returns_orphaned_seed_assets(self, session: Session):
# Seed asset (hash=None) with no cache states
orphan = _make_asset(session, hash_val=None)
# Seed asset with cache state (not orphaned)
with_state = _make_asset(session, hash_val=None)
_make_cache_state(session, with_state, "/has/state.bin")
class TestGetUnreferencedUnhashedAssetIds:
def test_returns_unreferenced_unhashed_assets(self, session: Session):
# Unhashed asset (hash=None) with no cache states
no_states = _make_asset(session, hash_val=None)
# Unhashed asset with active cache state (not unreferenced)
with_active_state = _make_asset(session, hash_val=None)
_make_cache_state(session, with_active_state, "/has/state.bin")
# Unhashed asset with only missing cache state (should be unreferenced)
with_missing_state = _make_asset(session, hash_val=None)
missing_state = _make_cache_state(session, with_missing_state, "/missing/state.bin")
missing_state.is_missing = True
# Regular asset (hash not None) - should not be returned
_make_asset(session, hash_val="blake3:regular")
session.commit()
orphaned = get_orphaned_seed_asset_ids(session)
unreferenced = get_unreferenced_unhashed_asset_ids(session)
assert orphan.id in orphaned
assert with_state.id not in orphaned
assert no_states.id in unreferenced
assert with_missing_state.id in unreferenced
assert with_active_state.id not in unreferenced
class TestDeleteAssetsByIds:

View File

@@ -349,10 +349,10 @@ class TestSeederThreadSafety:
)
class TestSeederPruneOrphans:
"""Test prune_orphans behavior."""
class TestSeederMarkMissing:
"""Test mark_missing_outside_prefixes behavior."""
def test_prune_orphans_when_idle(self, fresh_seeder: AssetSeeder):
def test_mark_missing_when_idle(self, fresh_seeder: AssetSeeder):
with (
patch("app.assets.seeder.dependencies_available", return_value=True),
patch(
@@ -360,14 +360,14 @@ class TestSeederPruneOrphans:
return_value=["/models", "/input", "/output"],
),
patch(
"app.assets.seeder.prune_orphans_safely", return_value=5
) as mock_prune,
"app.assets.seeder.mark_missing_outside_prefixes_safely", return_value=5
) as mock_mark,
):
result = fresh_seeder.prune_orphans()
result = fresh_seeder.mark_missing_outside_prefixes()
assert result == 5
mock_prune.assert_called_once_with(["/models", "/input", "/output"])
mock_mark.assert_called_once_with(["/models", "/input", "/output"])
def test_prune_orphans_returns_zero_when_running(
def test_mark_missing_returns_zero_when_running(
self, fresh_seeder: AssetSeeder, mock_dependencies
):
barrier = threading.Event()
@@ -382,35 +382,35 @@ class TestSeederPruneOrphans:
fresh_seeder.start(roots=("models",))
time.sleep(0.05)
result = fresh_seeder.prune_orphans()
result = fresh_seeder.mark_missing_outside_prefixes()
assert result == 0
barrier.set()
def test_prune_orphans_returns_zero_when_dependencies_unavailable(
def test_mark_missing_returns_zero_when_dependencies_unavailable(
self, fresh_seeder: AssetSeeder
):
with patch("app.assets.seeder.dependencies_available", return_value=False):
result = fresh_seeder.prune_orphans()
result = fresh_seeder.mark_missing_outside_prefixes()
assert result == 0
def test_prune_first_flag_triggers_pruning_before_scan(
def test_prune_first_flag_triggers_mark_missing_before_scan(
self, fresh_seeder: AssetSeeder
):
prune_call_order = []
call_order = []
def track_prune(prefixes):
prune_call_order.append("prune")
def track_mark(prefixes):
call_order.append("mark_missing")
return 3
def track_sync(root):
prune_call_order.append(f"sync_{root}")
call_order.append(f"sync_{root}")
return set()
with (
patch("app.assets.seeder.dependencies_available", return_value=True),
patch("app.assets.seeder.get_all_known_prefixes", return_value=["/models"]),
patch("app.assets.seeder.prune_orphans_safely", side_effect=track_prune),
patch("app.assets.seeder.mark_missing_outside_prefixes_safely", side_effect=track_mark),
patch("app.assets.seeder.sync_root_safely", side_effect=track_sync),
patch("app.assets.seeder.collect_paths_for_roots", return_value=[]),
patch("app.assets.seeder.build_asset_specs", return_value=([], set(), 0)),
@@ -419,5 +419,5 @@ class TestSeederPruneOrphans:
fresh_seeder.start(roots=("models",), prune_first=True)
fresh_seeder.wait(timeout=5.0)
assert prune_call_order[0] == "prune"
assert "sync_models" in prune_call_order
assert call_order[0] == "mark_missing"
assert "sync_models" in call_order