refactor(assets): modular architecture + async two-phase scanner & background seeder (#12621)

This commit is contained in:
Luke Mino-Altherr
2026-03-07 17:37:25 -08:00
committed by GitHub
parent a7a6335be5
commit 29b24cb517
62 changed files with 10737 additions and 2878 deletions

View File

@@ -108,7 +108,7 @@ def comfy_url_and_proc(comfy_tmp_base_dir: Path, request: pytest.FixtureRequest)
"main.py",
f"--base-directory={str(comfy_tmp_base_dir)}",
f"--database-url={db_url}",
"--disable-assets-autoscan",
"--enable-assets",
"--listen",
"127.0.0.1",
"--port",
@@ -212,7 +212,7 @@ def asset_factory(http: requests.Session, api_base: str):
for aid in created:
with contextlib.suppress(Exception):
http.delete(f"{api_base}/api/assets/{aid}", timeout=30)
http.delete(f"{api_base}/api/assets/{aid}?delete_content=true", timeout=30)
@pytest.fixture
@@ -258,14 +258,4 @@ def autoclean_unit_test_assets(http: requests.Session, api_base: str):
break
for aid in ids:
with contextlib.suppress(Exception):
http.delete(f"{api_base}/api/assets/{aid}", timeout=30)
def trigger_sync_seed_assets(session: requests.Session, base_url: str) -> None:
"""Force a fast sync/seed pass by calling the seed endpoint."""
session.post(base_url + "/api/assets/seed", json={"roots": ["models", "input", "output"]}, timeout=30)
time.sleep(0.2)
def get_asset_filename(asset_hash: str, extension: str) -> str:
return asset_hash.removeprefix("blake3:") + extension
http.delete(f"{api_base}/api/assets/{aid}?delete_content=true", timeout=30)

View File

@@ -0,0 +1,28 @@
"""Helper functions for assets integration tests."""
import time
import requests
def trigger_sync_seed_assets(session: requests.Session, base_url: str) -> None:
"""Force a synchronous sync/seed pass by calling the seed endpoint with wait=true.
Retries on 409 (already running) until the previous scan finishes.
"""
deadline = time.monotonic() + 60
while True:
r = session.post(
base_url + "/api/assets/seed?wait=true",
json={"roots": ["models", "input", "output"]},
timeout=60,
)
if r.status_code != 409:
assert r.status_code == 200, f"seed endpoint returned {r.status_code}: {r.text}"
return
if time.monotonic() > deadline:
raise TimeoutError("seed endpoint stuck in 409 (already running)")
time.sleep(0.25)
def get_asset_filename(asset_hash: str, extension: str) -> str:
return asset_hash.removeprefix("blake3:") + extension

View File

@@ -0,0 +1,20 @@
import pytest
from sqlalchemy import create_engine
from sqlalchemy.orm import Session
from app.assets.database.models import Base
@pytest.fixture
def session():
"""In-memory SQLite session for fast unit tests."""
engine = create_engine("sqlite:///:memory:")
Base.metadata.create_all(engine)
with Session(engine) as sess:
yield sess
@pytest.fixture(autouse=True)
def autoclean_unit_test_assets():
"""Override parent autouse fixture - query tests don't need server cleanup."""
yield

View File

@@ -0,0 +1,144 @@
import uuid
import pytest
from sqlalchemy.orm import Session
from app.assets.helpers import get_utc_now
from app.assets.database.models import Asset
from app.assets.database.queries import (
asset_exists_by_hash,
get_asset_by_hash,
upsert_asset,
bulk_insert_assets,
)
class TestAssetExistsByHash:
@pytest.mark.parametrize(
"setup_hash,query_hash,expected",
[
(None, "nonexistent", False), # No asset exists
("blake3:abc123", "blake3:abc123", True), # Asset exists with matching hash
(None, "", False), # Null hash in DB doesn't match empty string
],
ids=["nonexistent", "existing", "null_hash_no_match"],
)
def test_exists_by_hash(self, session: Session, setup_hash, query_hash, expected):
if setup_hash is not None or query_hash == "":
asset = Asset(hash=setup_hash, size_bytes=100)
session.add(asset)
session.commit()
assert asset_exists_by_hash(session, asset_hash=query_hash) is expected
class TestGetAssetByHash:
@pytest.mark.parametrize(
"setup_hash,query_hash,should_find",
[
(None, "nonexistent", False),
("blake3:def456", "blake3:def456", True),
],
ids=["nonexistent", "existing"],
)
def test_get_by_hash(self, session: Session, setup_hash, query_hash, should_find):
if setup_hash is not None:
asset = Asset(hash=setup_hash, size_bytes=200, mime_type="image/png")
session.add(asset)
session.commit()
result = get_asset_by_hash(session, asset_hash=query_hash)
if should_find:
assert result is not None
assert result.size_bytes == 200
assert result.mime_type == "image/png"
else:
assert result is None
class TestUpsertAsset:
@pytest.mark.parametrize(
"first_size,first_mime,second_size,second_mime,expect_created,expect_updated,final_size,final_mime",
[
# New asset creation
(None, None, 1024, "application/octet-stream", True, False, 1024, "application/octet-stream"),
# Existing asset, same values - no update
(500, "text/plain", 500, "text/plain", False, False, 500, "text/plain"),
# Existing asset with size 0, update with new values
(0, None, 2048, "image/png", False, True, 2048, "image/png"),
# Existing asset, second call with size 0 - no update
(1000, None, 0, None, False, False, 1000, None),
],
ids=["new_asset", "existing_no_change", "update_from_zero", "zero_size_no_update"],
)
def test_upsert_scenarios(
self,
session: Session,
first_size,
first_mime,
second_size,
second_mime,
expect_created,
expect_updated,
final_size,
final_mime,
):
asset_hash = f"blake3:test_{first_size}_{second_size}"
# First upsert (if first_size is not None, we're testing the second call)
if first_size is not None:
upsert_asset(
session,
asset_hash=asset_hash,
size_bytes=first_size,
mime_type=first_mime,
)
session.commit()
# The upsert call we're testing
asset, created, updated = upsert_asset(
session,
asset_hash=asset_hash,
size_bytes=second_size,
mime_type=second_mime,
)
session.commit()
assert created is expect_created
assert updated is expect_updated
assert asset.size_bytes == final_size
assert asset.mime_type == final_mime
class TestBulkInsertAssets:
def test_inserts_multiple_assets(self, session: Session):
now = get_utc_now()
rows = [
{"id": str(uuid.uuid4()), "hash": "blake3:bulk1", "size_bytes": 100, "mime_type": "text/plain", "created_at": now},
{"id": str(uuid.uuid4()), "hash": "blake3:bulk2", "size_bytes": 200, "mime_type": "image/png", "created_at": now},
{"id": str(uuid.uuid4()), "hash": "blake3:bulk3", "size_bytes": 300, "mime_type": None, "created_at": now},
]
bulk_insert_assets(session, rows)
session.commit()
assets = session.query(Asset).all()
assert len(assets) == 3
hashes = {a.hash for a in assets}
assert hashes == {"blake3:bulk1", "blake3:bulk2", "blake3:bulk3"}
def test_empty_list_is_noop(self, session: Session):
bulk_insert_assets(session, [])
session.commit()
assert session.query(Asset).count() == 0
def test_handles_large_batch(self, session: Session):
"""Test chunking logic with more rows than MAX_BIND_PARAMS allows."""
now = get_utc_now()
rows = [
{"id": str(uuid.uuid4()), "hash": f"blake3:large{i}", "size_bytes": i, "mime_type": None, "created_at": now}
for i in range(200)
]
bulk_insert_assets(session, rows)
session.commit()
assert session.query(Asset).count() == 200

View File

@@ -0,0 +1,517 @@
import time
import uuid
import pytest
from sqlalchemy.orm import Session
from app.assets.database.models import Asset, AssetReference, AssetReferenceMeta
from app.assets.database.queries import (
reference_exists_for_asset_id,
get_reference_by_id,
insert_reference,
get_or_create_reference,
update_reference_timestamps,
list_references_page,
fetch_reference_asset_and_tags,
fetch_reference_and_asset,
update_reference_access_time,
set_reference_metadata,
delete_reference_by_id,
set_reference_preview,
bulk_insert_references_ignore_conflicts,
get_reference_ids_by_ids,
ensure_tags_exist,
add_tags_to_reference,
)
from app.assets.helpers import get_utc_now
def _make_asset(session: Session, hash_val: str | None = None, size: int = 1024) -> Asset:
asset = Asset(hash=hash_val, size_bytes=size, mime_type="application/octet-stream")
session.add(asset)
session.flush()
return asset
def _make_reference(
session: Session,
asset: Asset,
name: str = "test",
owner_id: str = "",
) -> AssetReference:
now = get_utc_now()
ref = AssetReference(
owner_id=owner_id,
name=name,
asset_id=asset.id,
created_at=now,
updated_at=now,
last_access_time=now,
)
session.add(ref)
session.flush()
return ref
class TestReferenceExistsForAssetId:
def test_returns_false_when_no_reference(self, session: Session):
asset = _make_asset(session, "hash1")
assert reference_exists_for_asset_id(session, asset_id=asset.id) is False
def test_returns_true_when_reference_exists(self, session: Session):
asset = _make_asset(session, "hash1")
_make_reference(session, asset)
assert reference_exists_for_asset_id(session, asset_id=asset.id) is True
class TestGetReferenceById:
def test_returns_none_for_nonexistent(self, session: Session):
assert get_reference_by_id(session, reference_id="nonexistent") is None
def test_returns_reference(self, session: Session):
asset = _make_asset(session, "hash1")
ref = _make_reference(session, asset, name="myfile.txt")
result = get_reference_by_id(session, reference_id=ref.id)
assert result is not None
assert result.name == "myfile.txt"
class TestListReferencesPage:
def test_empty_db(self, session: Session):
refs, tag_map, total = list_references_page(session)
assert refs == []
assert tag_map == {}
assert total == 0
def test_returns_references_with_tags(self, session: Session):
asset = _make_asset(session, "hash1")
ref = _make_reference(session, asset, name="test.bin")
ensure_tags_exist(session, ["alpha", "beta"])
add_tags_to_reference(session, reference_id=ref.id, tags=["alpha", "beta"])
session.commit()
refs, tag_map, total = list_references_page(session)
assert len(refs) == 1
assert refs[0].id == ref.id
assert set(tag_map[ref.id]) == {"alpha", "beta"}
assert total == 1
def test_name_contains_filter(self, session: Session):
asset = _make_asset(session, "hash1")
_make_reference(session, asset, name="model_v1.safetensors")
_make_reference(session, asset, name="config.json")
session.commit()
refs, _, total = list_references_page(session, name_contains="model")
assert total == 1
assert refs[0].name == "model_v1.safetensors"
def test_owner_visibility(self, session: Session):
asset = _make_asset(session, "hash1")
_make_reference(session, asset, name="public", owner_id="")
_make_reference(session, asset, name="private", owner_id="user1")
session.commit()
# Empty owner sees only public
refs, _, total = list_references_page(session, owner_id="")
assert total == 1
assert refs[0].name == "public"
# Owner sees both
refs, _, total = list_references_page(session, owner_id="user1")
assert total == 2
def test_include_tags_filter(self, session: Session):
asset = _make_asset(session, "hash1")
ref1 = _make_reference(session, asset, name="tagged")
_make_reference(session, asset, name="untagged")
ensure_tags_exist(session, ["wanted"])
add_tags_to_reference(session, reference_id=ref1.id, tags=["wanted"])
session.commit()
refs, _, total = list_references_page(session, include_tags=["wanted"])
assert total == 1
assert refs[0].name == "tagged"
def test_exclude_tags_filter(self, session: Session):
asset = _make_asset(session, "hash1")
_make_reference(session, asset, name="keep")
ref_exclude = _make_reference(session, asset, name="exclude")
ensure_tags_exist(session, ["bad"])
add_tags_to_reference(session, reference_id=ref_exclude.id, tags=["bad"])
session.commit()
refs, _, total = list_references_page(session, exclude_tags=["bad"])
assert total == 1
assert refs[0].name == "keep"
def test_sorting(self, session: Session):
asset = _make_asset(session, "hash1", size=100)
asset2 = _make_asset(session, "hash2", size=500)
_make_reference(session, asset, name="small")
_make_reference(session, asset2, name="large")
session.commit()
refs, _, _ = list_references_page(session, sort="size", order="desc")
assert refs[0].name == "large"
refs, _, _ = list_references_page(session, sort="name", order="asc")
assert refs[0].name == "large"
class TestFetchReferenceAssetAndTags:
def test_returns_none_for_nonexistent(self, session: Session):
result = fetch_reference_asset_and_tags(session, "nonexistent")
assert result is None
def test_returns_tuple(self, session: Session):
asset = _make_asset(session, "hash1")
ref = _make_reference(session, asset, name="test.bin")
ensure_tags_exist(session, ["tag1"])
add_tags_to_reference(session, reference_id=ref.id, tags=["tag1"])
session.commit()
result = fetch_reference_asset_and_tags(session, ref.id)
assert result is not None
ret_ref, ret_asset, ret_tags = result
assert ret_ref.id == ref.id
assert ret_asset.id == asset.id
assert ret_tags == ["tag1"]
class TestFetchReferenceAndAsset:
def test_returns_none_for_nonexistent(self, session: Session):
result = fetch_reference_and_asset(session, reference_id="nonexistent")
assert result is None
def test_returns_tuple(self, session: Session):
asset = _make_asset(session, "hash1")
ref = _make_reference(session, asset)
session.commit()
result = fetch_reference_and_asset(session, reference_id=ref.id)
assert result is not None
ret_ref, ret_asset = result
assert ret_ref.id == ref.id
assert ret_asset.id == asset.id
class TestUpdateReferenceAccessTime:
def test_updates_last_access_time(self, session: Session):
asset = _make_asset(session, "hash1")
ref = _make_reference(session, asset)
original_time = ref.last_access_time
session.commit()
import time
time.sleep(0.01)
update_reference_access_time(session, reference_id=ref.id)
session.commit()
session.refresh(ref)
assert ref.last_access_time > original_time
class TestDeleteReferenceById:
def test_deletes_existing(self, session: Session):
asset = _make_asset(session, "hash1")
ref = _make_reference(session, asset)
session.commit()
result = delete_reference_by_id(session, reference_id=ref.id, owner_id="")
assert result is True
assert get_reference_by_id(session, reference_id=ref.id) is None
def test_returns_false_for_nonexistent(self, session: Session):
result = delete_reference_by_id(session, reference_id="nonexistent", owner_id="")
assert result is False
def test_respects_owner_visibility(self, session: Session):
asset = _make_asset(session, "hash1")
ref = _make_reference(session, asset, owner_id="user1")
session.commit()
result = delete_reference_by_id(session, reference_id=ref.id, owner_id="user2")
assert result is False
assert get_reference_by_id(session, reference_id=ref.id) is not None
class TestSetReferencePreview:
def test_sets_preview(self, session: Session):
asset = _make_asset(session, "hash1")
preview_asset = _make_asset(session, "preview_hash")
ref = _make_reference(session, asset)
session.commit()
set_reference_preview(session, reference_id=ref.id, preview_asset_id=preview_asset.id)
session.commit()
session.refresh(ref)
assert ref.preview_id == preview_asset.id
def test_clears_preview(self, session: Session):
asset = _make_asset(session, "hash1")
preview_asset = _make_asset(session, "preview_hash")
ref = _make_reference(session, asset)
ref.preview_id = preview_asset.id
session.commit()
set_reference_preview(session, reference_id=ref.id, preview_asset_id=None)
session.commit()
session.refresh(ref)
assert ref.preview_id is None
def test_raises_for_nonexistent_reference(self, session: Session):
with pytest.raises(ValueError, match="not found"):
set_reference_preview(session, reference_id="nonexistent", preview_asset_id=None)
def test_raises_for_nonexistent_preview(self, session: Session):
asset = _make_asset(session, "hash1")
ref = _make_reference(session, asset)
session.commit()
with pytest.raises(ValueError, match="Preview Asset"):
set_reference_preview(session, reference_id=ref.id, preview_asset_id="nonexistent")
class TestInsertReference:
def test_creates_new_reference(self, session: Session):
asset = _make_asset(session, "hash1")
ref = insert_reference(
session, asset_id=asset.id, owner_id="user1", name="test.bin"
)
session.commit()
assert ref is not None
assert ref.name == "test.bin"
assert ref.owner_id == "user1"
def test_allows_duplicate_names(self, session: Session):
asset = _make_asset(session, "hash1")
ref1 = insert_reference(session, asset_id=asset.id, owner_id="user1", name="dup.bin")
session.commit()
# Duplicate names are now allowed
ref2 = insert_reference(
session, asset_id=asset.id, owner_id="user1", name="dup.bin"
)
session.commit()
assert ref1 is not None
assert ref2 is not None
assert ref1.id != ref2.id
class TestGetOrCreateReference:
def test_creates_new_reference(self, session: Session):
asset = _make_asset(session, "hash1")
ref, created = get_or_create_reference(
session, asset_id=asset.id, owner_id="user1", name="new.bin"
)
session.commit()
assert created is True
assert ref.name == "new.bin"
def test_always_creates_new_reference(self, session: Session):
asset = _make_asset(session, "hash1")
ref1, created1 = get_or_create_reference(
session, asset_id=asset.id, owner_id="user1", name="existing.bin"
)
session.commit()
# Duplicate names are allowed, so always creates new
ref2, created2 = get_or_create_reference(
session, asset_id=asset.id, owner_id="user1", name="existing.bin"
)
session.commit()
assert created1 is True
assert created2 is True
assert ref1.id != ref2.id
class TestUpdateReferenceTimestamps:
def test_updates_timestamps(self, session: Session):
asset = _make_asset(session, "hash1")
ref = _make_reference(session, asset)
original_updated_at = ref.updated_at
session.commit()
time.sleep(0.01)
update_reference_timestamps(session, ref)
session.commit()
session.refresh(ref)
assert ref.updated_at > original_updated_at
def test_updates_preview_id(self, session: Session):
asset = _make_asset(session, "hash1")
preview_asset = _make_asset(session, "preview_hash")
ref = _make_reference(session, asset)
session.commit()
update_reference_timestamps(session, ref, preview_id=preview_asset.id)
session.commit()
session.refresh(ref)
assert ref.preview_id == preview_asset.id
class TestSetReferenceMetadata:
def test_sets_metadata(self, session: Session):
asset = _make_asset(session, "hash1")
ref = _make_reference(session, asset)
session.commit()
set_reference_metadata(
session, reference_id=ref.id, user_metadata={"key": "value"}
)
session.commit()
session.refresh(ref)
assert ref.user_metadata == {"key": "value"}
# Check metadata table
meta = session.query(AssetReferenceMeta).filter_by(asset_reference_id=ref.id).all()
assert len(meta) == 1
assert meta[0].key == "key"
assert meta[0].val_str == "value"
def test_replaces_existing_metadata(self, session: Session):
asset = _make_asset(session, "hash1")
ref = _make_reference(session, asset)
session.commit()
set_reference_metadata(
session, reference_id=ref.id, user_metadata={"old": "data"}
)
session.commit()
set_reference_metadata(
session, reference_id=ref.id, user_metadata={"new": "data"}
)
session.commit()
meta = session.query(AssetReferenceMeta).filter_by(asset_reference_id=ref.id).all()
assert len(meta) == 1
assert meta[0].key == "new"
def test_clears_metadata_with_empty_dict(self, session: Session):
asset = _make_asset(session, "hash1")
ref = _make_reference(session, asset)
session.commit()
set_reference_metadata(
session, reference_id=ref.id, user_metadata={"key": "value"}
)
session.commit()
set_reference_metadata(
session, reference_id=ref.id, user_metadata={}
)
session.commit()
session.refresh(ref)
assert ref.user_metadata == {}
meta = session.query(AssetReferenceMeta).filter_by(asset_reference_id=ref.id).all()
assert len(meta) == 0
def test_raises_for_nonexistent(self, session: Session):
with pytest.raises(ValueError, match="not found"):
set_reference_metadata(
session, reference_id="nonexistent", user_metadata={"key": "value"}
)
class TestBulkInsertReferencesIgnoreConflicts:
def test_inserts_multiple_references(self, session: Session):
asset = _make_asset(session, "hash1")
now = get_utc_now()
rows = [
{
"id": str(uuid.uuid4()),
"owner_id": "",
"name": "bulk1.bin",
"asset_id": asset.id,
"preview_id": None,
"user_metadata": {},
"created_at": now,
"updated_at": now,
"last_access_time": now,
},
{
"id": str(uuid.uuid4()),
"owner_id": "",
"name": "bulk2.bin",
"asset_id": asset.id,
"preview_id": None,
"user_metadata": {},
"created_at": now,
"updated_at": now,
"last_access_time": now,
},
]
bulk_insert_references_ignore_conflicts(session, rows)
session.commit()
refs = session.query(AssetReference).all()
assert len(refs) == 2
def test_allows_duplicate_names(self, session: Session):
asset = _make_asset(session, "hash1")
_make_reference(session, asset, name="existing.bin", owner_id="")
session.commit()
now = get_utc_now()
rows = [
{
"id": str(uuid.uuid4()),
"owner_id": "",
"name": "existing.bin",
"asset_id": asset.id,
"preview_id": None,
"user_metadata": {},
"created_at": now,
"updated_at": now,
"last_access_time": now,
},
{
"id": str(uuid.uuid4()),
"owner_id": "",
"name": "new.bin",
"asset_id": asset.id,
"preview_id": None,
"user_metadata": {},
"created_at": now,
"updated_at": now,
"last_access_time": now,
},
]
bulk_insert_references_ignore_conflicts(session, rows)
session.commit()
# Duplicate names allowed, so all 3 rows exist
refs = session.query(AssetReference).all()
assert len(refs) == 3
def test_empty_list_is_noop(self, session: Session):
bulk_insert_references_ignore_conflicts(session, [])
assert session.query(AssetReference).count() == 0
class TestGetReferenceIdsByIds:
def test_returns_existing_ids(self, session: Session):
asset = _make_asset(session, "hash1")
ref1 = _make_reference(session, asset, name="a.bin")
ref2 = _make_reference(session, asset, name="b.bin")
session.commit()
found = get_reference_ids_by_ids(session, [ref1.id, ref2.id, "nonexistent"])
assert found == {ref1.id, ref2.id}
def test_empty_list_returns_empty(self, session: Session):
found = get_reference_ids_by_ids(session, [])
assert found == set()

View File

@@ -0,0 +1,499 @@
"""Tests for cache_state (AssetReference file path) query functions."""
import pytest
from sqlalchemy.orm import Session
from app.assets.database.models import Asset, AssetReference
from app.assets.database.queries import (
list_references_by_asset_id,
upsert_reference,
get_unreferenced_unhashed_asset_ids,
delete_assets_by_ids,
get_references_for_prefixes,
bulk_update_needs_verify,
delete_references_by_ids,
delete_orphaned_seed_asset,
bulk_insert_references_ignore_conflicts,
get_references_by_paths_and_asset_ids,
mark_references_missing_outside_prefixes,
restore_references_by_paths,
)
from app.assets.helpers import select_best_live_path, get_utc_now
def _make_asset(session: Session, hash_val: str | None = None, size: int = 1024) -> Asset:
asset = Asset(hash=hash_val, size_bytes=size)
session.add(asset)
session.flush()
return asset
def _make_reference(
session: Session,
asset: Asset,
file_path: str,
name: str = "test",
mtime_ns: int | None = None,
needs_verify: bool = False,
) -> AssetReference:
now = get_utc_now()
ref = AssetReference(
asset_id=asset.id,
file_path=file_path,
name=name,
mtime_ns=mtime_ns,
needs_verify=needs_verify,
created_at=now,
updated_at=now,
last_access_time=now,
)
session.add(ref)
session.flush()
return ref
class TestListReferencesByAssetId:
def test_returns_empty_for_no_references(self, session: Session):
asset = _make_asset(session, "hash1")
refs = list_references_by_asset_id(session, asset_id=asset.id)
assert list(refs) == []
def test_returns_references_for_asset(self, session: Session):
asset = _make_asset(session, "hash1")
_make_reference(session, asset, "/path/a.bin", name="a")
_make_reference(session, asset, "/path/b.bin", name="b")
session.commit()
refs = list_references_by_asset_id(session, asset_id=asset.id)
paths = [r.file_path for r in refs]
assert set(paths) == {"/path/a.bin", "/path/b.bin"}
def test_does_not_return_other_assets_references(self, session: Session):
asset1 = _make_asset(session, "hash1")
asset2 = _make_asset(session, "hash2")
_make_reference(session, asset1, "/path/asset1.bin", name="a1")
_make_reference(session, asset2, "/path/asset2.bin", name="a2")
session.commit()
refs = list_references_by_asset_id(session, asset_id=asset1.id)
paths = [r.file_path for r in refs]
assert paths == ["/path/asset1.bin"]
class TestSelectBestLivePath:
def test_returns_empty_for_empty_list(self):
result = select_best_live_path([])
assert result == ""
def test_returns_empty_when_no_files_exist(self, session: Session):
asset = _make_asset(session, "hash1")
ref = _make_reference(session, asset, "/nonexistent/path.bin")
session.commit()
result = select_best_live_path([ref])
assert result == ""
def test_prefers_verified_path(self, session: Session, tmp_path):
"""needs_verify=False should be preferred."""
asset = _make_asset(session, "hash1")
verified_file = tmp_path / "verified.bin"
verified_file.write_bytes(b"data")
unverified_file = tmp_path / "unverified.bin"
unverified_file.write_bytes(b"data")
ref_verified = _make_reference(
session, asset, str(verified_file), name="verified", needs_verify=False
)
ref_unverified = _make_reference(
session, asset, str(unverified_file), name="unverified", needs_verify=True
)
session.commit()
refs = [ref_unverified, ref_verified]
result = select_best_live_path(refs)
assert result == str(verified_file)
def test_falls_back_to_existing_unverified(self, session: Session, tmp_path):
"""If all references need verification, return first existing path."""
asset = _make_asset(session, "hash1")
existing_file = tmp_path / "exists.bin"
existing_file.write_bytes(b"data")
ref = _make_reference(session, asset, str(existing_file), needs_verify=True)
session.commit()
result = select_best_live_path([ref])
assert result == str(existing_file)
class TestSelectBestLivePathWithMocking:
def test_handles_missing_file_path_attr(self):
"""Gracefully handle references with None file_path."""
class MockRef:
file_path = None
needs_verify = False
result = select_best_live_path([MockRef()])
assert result == ""
class TestUpsertReference:
@pytest.mark.parametrize(
"initial_mtime,second_mtime,expect_created,expect_updated,final_mtime",
[
# New reference creation
(None, 12345, True, False, 12345),
# Existing reference, same mtime - no update
(100, 100, False, False, 100),
# Existing reference, different mtime - update
(100, 200, False, True, 200),
],
ids=["new_reference", "existing_no_change", "existing_update_mtime"],
)
def test_upsert_scenarios(
self, session: Session, initial_mtime, second_mtime, expect_created, expect_updated, final_mtime
):
asset = _make_asset(session, "hash1")
file_path = f"/path_{initial_mtime}_{second_mtime}.bin"
name = f"file_{initial_mtime}_{second_mtime}"
# Create initial reference if needed
if initial_mtime is not None:
upsert_reference(session, asset_id=asset.id, file_path=file_path, name=name, mtime_ns=initial_mtime)
session.commit()
# The upsert call we're testing
created, updated = upsert_reference(
session, asset_id=asset.id, file_path=file_path, name=name, mtime_ns=second_mtime
)
session.commit()
assert created is expect_created
assert updated is expect_updated
ref = session.query(AssetReference).filter_by(file_path=file_path).one()
assert ref.mtime_ns == final_mtime
def test_upsert_restores_missing_reference(self, session: Session):
"""Upserting a reference that was marked missing should restore it."""
asset = _make_asset(session, "hash1")
file_path = "/restored/file.bin"
ref = _make_reference(session, asset, file_path, mtime_ns=100)
ref.is_missing = True
session.commit()
created, updated = upsert_reference(
session, asset_id=asset.id, file_path=file_path, name="restored", mtime_ns=100
)
session.commit()
assert created is False
assert updated is True
restored_ref = session.query(AssetReference).filter_by(file_path=file_path).one()
assert restored_ref.is_missing is False
class TestRestoreReferencesByPaths:
def test_restores_missing_references(self, session: Session):
asset = _make_asset(session, "hash1")
missing_path = "/missing/file.bin"
active_path = "/active/file.bin"
missing_ref = _make_reference(session, asset, missing_path, name="missing")
missing_ref.is_missing = True
_make_reference(session, asset, active_path, name="active")
session.commit()
restored = restore_references_by_paths(session, [missing_path])
session.commit()
assert restored == 1
ref = session.query(AssetReference).filter_by(file_path=missing_path).one()
assert ref.is_missing is False
def test_empty_list_restores_nothing(self, session: Session):
restored = restore_references_by_paths(session, [])
assert restored == 0
class TestMarkReferencesMissingOutsidePrefixes:
def test_marks_references_missing_outside_prefixes(self, session: Session, tmp_path):
asset = _make_asset(session, "hash1")
valid_dir = tmp_path / "valid"
valid_dir.mkdir()
invalid_dir = tmp_path / "invalid"
invalid_dir.mkdir()
valid_path = str(valid_dir / "file.bin")
invalid_path = str(invalid_dir / "file.bin")
_make_reference(session, asset, valid_path, name="valid")
_make_reference(session, asset, invalid_path, name="invalid")
session.commit()
marked = mark_references_missing_outside_prefixes(session, [str(valid_dir)])
session.commit()
assert marked == 1
all_refs = session.query(AssetReference).all()
assert len(all_refs) == 2
valid_ref = next(r for r in all_refs if r.file_path == valid_path)
invalid_ref = next(r for r in all_refs if r.file_path == invalid_path)
assert valid_ref.is_missing is False
assert invalid_ref.is_missing is True
def test_empty_prefixes_marks_nothing(self, session: Session):
asset = _make_asset(session, "hash1")
_make_reference(session, asset, "/some/path.bin")
session.commit()
marked = mark_references_missing_outside_prefixes(session, [])
assert marked == 0
class TestGetUnreferencedUnhashedAssetIds:
def test_returns_unreferenced_unhashed_assets(self, session: Session):
# Unhashed asset (hash=None) with no references (no file_path)
no_refs = _make_asset(session, hash_val=None)
# Unhashed asset with active reference (not unreferenced)
with_active_ref = _make_asset(session, hash_val=None)
_make_reference(session, with_active_ref, "/has/ref.bin", name="has_ref")
# Unhashed asset with only missing reference (should be unreferenced)
with_missing_ref = _make_asset(session, hash_val=None)
missing_ref = _make_reference(session, with_missing_ref, "/missing/ref.bin", name="missing_ref")
missing_ref.is_missing = True
# Regular asset (hash not None) - should not be returned
_make_asset(session, hash_val="blake3:regular")
session.commit()
unreferenced = get_unreferenced_unhashed_asset_ids(session)
assert no_refs.id in unreferenced
assert with_missing_ref.id in unreferenced
assert with_active_ref.id not in unreferenced
class TestDeleteAssetsByIds:
def test_deletes_assets_and_references(self, session: Session):
asset = _make_asset(session, "hash1")
_make_reference(session, asset, "/test/path.bin", name="test")
session.commit()
deleted = delete_assets_by_ids(session, [asset.id])
session.commit()
assert deleted == 1
assert session.query(Asset).count() == 0
assert session.query(AssetReference).count() == 0
def test_empty_list_deletes_nothing(self, session: Session):
_make_asset(session, "hash1")
session.commit()
deleted = delete_assets_by_ids(session, [])
assert deleted == 0
assert session.query(Asset).count() == 1
class TestGetReferencesForPrefixes:
def test_returns_references_matching_prefix(self, session: Session, tmp_path):
asset = _make_asset(session, "hash1")
dir1 = tmp_path / "dir1"
dir1.mkdir()
dir2 = tmp_path / "dir2"
dir2.mkdir()
path1 = str(dir1 / "file.bin")
path2 = str(dir2 / "file.bin")
_make_reference(session, asset, path1, name="file1", mtime_ns=100)
_make_reference(session, asset, path2, name="file2", mtime_ns=200)
session.commit()
rows = get_references_for_prefixes(session, [str(dir1)])
assert len(rows) == 1
assert rows[0].file_path == path1
def test_empty_prefixes_returns_empty(self, session: Session):
asset = _make_asset(session, "hash1")
_make_reference(session, asset, "/some/path.bin")
session.commit()
rows = get_references_for_prefixes(session, [])
assert rows == []
class TestBulkSetNeedsVerify:
def test_sets_needs_verify_flag(self, session: Session):
asset = _make_asset(session, "hash1")
ref1 = _make_reference(session, asset, "/path1.bin", needs_verify=False)
ref2 = _make_reference(session, asset, "/path2.bin", needs_verify=False)
session.commit()
updated = bulk_update_needs_verify(session, [ref1.id, ref2.id], True)
session.commit()
assert updated == 2
session.refresh(ref1)
session.refresh(ref2)
assert ref1.needs_verify is True
assert ref2.needs_verify is True
def test_empty_list_updates_nothing(self, session: Session):
updated = bulk_update_needs_verify(session, [], True)
assert updated == 0
class TestDeleteReferencesByIds:
def test_deletes_references_by_id(self, session: Session):
asset = _make_asset(session, "hash1")
ref1 = _make_reference(session, asset, "/path1.bin")
_make_reference(session, asset, "/path2.bin")
session.commit()
deleted = delete_references_by_ids(session, [ref1.id])
session.commit()
assert deleted == 1
assert session.query(AssetReference).count() == 1
def test_empty_list_deletes_nothing(self, session: Session):
deleted = delete_references_by_ids(session, [])
assert deleted == 0
class TestDeleteOrphanedSeedAsset:
@pytest.mark.parametrize(
"create_asset,expected_deleted,expected_count",
[
(True, True, 0), # Existing asset gets deleted
(False, False, 0), # Nonexistent returns False
],
ids=["deletes_existing", "nonexistent_returns_false"],
)
def test_delete_orphaned_seed_asset(
self, session: Session, create_asset, expected_deleted, expected_count
):
asset_id = "nonexistent-id"
if create_asset:
asset = _make_asset(session, hash_val=None)
asset_id = asset.id
_make_reference(session, asset, "/test/path.bin", name="test")
session.commit()
deleted = delete_orphaned_seed_asset(session, asset_id)
if create_asset:
session.commit()
assert deleted is expected_deleted
assert session.query(Asset).count() == expected_count
class TestBulkInsertReferencesIgnoreConflicts:
def test_inserts_multiple_references(self, session: Session):
asset = _make_asset(session, "hash1")
now = get_utc_now()
rows = [
{
"asset_id": asset.id,
"file_path": "/bulk1.bin",
"name": "bulk1",
"mtime_ns": 100,
"created_at": now,
"updated_at": now,
"last_access_time": now,
},
{
"asset_id": asset.id,
"file_path": "/bulk2.bin",
"name": "bulk2",
"mtime_ns": 200,
"created_at": now,
"updated_at": now,
"last_access_time": now,
},
]
bulk_insert_references_ignore_conflicts(session, rows)
session.commit()
assert session.query(AssetReference).count() == 2
def test_ignores_conflicts(self, session: Session):
asset = _make_asset(session, "hash1")
_make_reference(session, asset, "/existing.bin", mtime_ns=100)
session.commit()
now = get_utc_now()
rows = [
{
"asset_id": asset.id,
"file_path": "/existing.bin",
"name": "existing",
"mtime_ns": 999,
"created_at": now,
"updated_at": now,
"last_access_time": now,
},
{
"asset_id": asset.id,
"file_path": "/new.bin",
"name": "new",
"mtime_ns": 200,
"created_at": now,
"updated_at": now,
"last_access_time": now,
},
]
bulk_insert_references_ignore_conflicts(session, rows)
session.commit()
assert session.query(AssetReference).count() == 2
existing = session.query(AssetReference).filter_by(file_path="/existing.bin").one()
assert existing.mtime_ns == 100 # Original value preserved
def test_empty_list_is_noop(self, session: Session):
bulk_insert_references_ignore_conflicts(session, [])
assert session.query(AssetReference).count() == 0
class TestGetReferencesByPathsAndAssetIds:
def test_returns_matching_paths(self, session: Session):
asset1 = _make_asset(session, "hash1")
asset2 = _make_asset(session, "hash2")
_make_reference(session, asset1, "/path1.bin")
_make_reference(session, asset2, "/path2.bin")
session.commit()
path_to_asset = {
"/path1.bin": asset1.id,
"/path2.bin": asset2.id,
}
winners = get_references_by_paths_and_asset_ids(session, path_to_asset)
assert winners == {"/path1.bin", "/path2.bin"}
def test_excludes_non_matching_asset_ids(self, session: Session):
asset1 = _make_asset(session, "hash1")
asset2 = _make_asset(session, "hash2")
_make_reference(session, asset1, "/path1.bin")
session.commit()
# Path exists but with different asset_id
path_to_asset = {"/path1.bin": asset2.id}
winners = get_references_by_paths_and_asset_ids(session, path_to_asset)
assert winners == set()
def test_empty_dict_returns_empty(self, session: Session):
winners = get_references_by_paths_and_asset_ids(session, {})
assert winners == set()

View File

@@ -0,0 +1,184 @@
"""Tests for metadata filtering logic in asset_reference queries."""
import pytest
from sqlalchemy.orm import Session
from app.assets.database.models import Asset, AssetReference, AssetReferenceMeta
from app.assets.database.queries import list_references_page
from app.assets.database.queries.asset_reference import convert_metadata_to_rows
from app.assets.helpers import get_utc_now
def _make_asset(session: Session, hash_val: str) -> Asset:
asset = Asset(hash=hash_val, size_bytes=1024)
session.add(asset)
session.flush()
return asset
def _make_reference(
session: Session,
asset: Asset,
name: str,
metadata: dict | None = None,
) -> AssetReference:
now = get_utc_now()
ref = AssetReference(
owner_id="",
name=name,
asset_id=asset.id,
user_metadata=metadata,
created_at=now,
updated_at=now,
last_access_time=now,
)
session.add(ref)
session.flush()
if metadata:
for key, val in metadata.items():
for row in convert_metadata_to_rows(key, val):
meta_row = AssetReferenceMeta(
asset_reference_id=ref.id,
key=row["key"],
ordinal=row.get("ordinal", 0),
val_str=row.get("val_str"),
val_num=row.get("val_num"),
val_bool=row.get("val_bool"),
val_json=row.get("val_json"),
)
session.add(meta_row)
session.flush()
return ref
class TestMetadataFilterByType:
"""Table-driven tests for metadata filtering by different value types."""
@pytest.mark.parametrize(
"match_meta,nomatch_meta,filter_key,filter_val",
[
# String matching
({"category": "models"}, {"category": "images"}, "category", "models"),
# Integer matching
({"epoch": 5}, {"epoch": 10}, "epoch", 5),
# Float matching
({"score": 0.95}, {"score": 0.5}, "score", 0.95),
# Boolean True matching
({"enabled": True}, {"enabled": False}, "enabled", True),
# Boolean False matching
({"enabled": False}, {"enabled": True}, "enabled", False),
],
ids=["string", "int", "float", "bool_true", "bool_false"],
)
def test_filter_matches_correct_value(
self, session: Session, match_meta, nomatch_meta, filter_key, filter_val
):
asset = _make_asset(session, "hash1")
_make_reference(session, asset, "match", match_meta)
_make_reference(session, asset, "nomatch", nomatch_meta)
session.commit()
refs, _, total = list_references_page(
session, metadata_filter={filter_key: filter_val}
)
assert total == 1
assert refs[0].name == "match"
@pytest.mark.parametrize(
"stored_meta,filter_key,filter_val",
[
# String no match
({"category": "models"}, "category", "other"),
# Int no match
({"epoch": 5}, "epoch", 99),
# Float no match
({"score": 0.5}, "score", 0.99),
],
ids=["string_no_match", "int_no_match", "float_no_match"],
)
def test_filter_returns_empty_when_no_match(
self, session: Session, stored_meta, filter_key, filter_val
):
asset = _make_asset(session, "hash1")
_make_reference(session, asset, "item", stored_meta)
session.commit()
refs, _, total = list_references_page(
session, metadata_filter={filter_key: filter_val}
)
assert total == 0
class TestMetadataFilterNull:
"""Tests for null/missing key filtering."""
@pytest.mark.parametrize(
"match_name,match_meta,nomatch_name,nomatch_meta,filter_key",
[
# Null matches missing key
("missing_key", {}, "has_key", {"optional": "value"}, "optional"),
# Null matches explicit null
("explicit_null", {"nullable": None}, "has_value", {"nullable": "present"}, "nullable"),
],
ids=["missing_key", "explicit_null"],
)
def test_null_filter_matches(
self, session: Session, match_name, match_meta, nomatch_name, nomatch_meta, filter_key
):
asset = _make_asset(session, "hash1")
_make_reference(session, asset, match_name, match_meta)
_make_reference(session, asset, nomatch_name, nomatch_meta)
session.commit()
refs, _, total = list_references_page(session, metadata_filter={filter_key: None})
assert total == 1
assert refs[0].name == match_name
class TestMetadataFilterList:
"""Tests for list-based (OR) filtering."""
def test_filter_by_list_matches_any(self, session: Session):
"""List values should match ANY of the values (OR)."""
asset = _make_asset(session, "hash1")
_make_reference(session, asset, "cat_a", {"category": "a"})
_make_reference(session, asset, "cat_b", {"category": "b"})
_make_reference(session, asset, "cat_c", {"category": "c"})
session.commit()
refs, _, total = list_references_page(session, metadata_filter={"category": ["a", "b"]})
assert total == 2
names = {r.name for r in refs}
assert names == {"cat_a", "cat_b"}
class TestMetadataFilterMultipleKeys:
"""Tests for multiple filter keys (AND semantics)."""
def test_multiple_keys_must_all_match(self, session: Session):
"""Multiple keys should ALL match (AND)."""
asset = _make_asset(session, "hash1")
_make_reference(session, asset, "match", {"type": "model", "version": 2})
_make_reference(session, asset, "wrong_type", {"type": "config", "version": 2})
_make_reference(session, asset, "wrong_version", {"type": "model", "version": 1})
session.commit()
refs, _, total = list_references_page(
session, metadata_filter={"type": "model", "version": 2}
)
assert total == 1
assert refs[0].name == "match"
class TestMetadataFilterEmptyDict:
"""Tests for empty filter behavior."""
def test_empty_filter_returns_all(self, session: Session):
asset = _make_asset(session, "hash1")
_make_reference(session, asset, "a", {"key": "val"})
_make_reference(session, asset, "b", {})
session.commit()
refs, _, total = list_references_page(session, metadata_filter={})
assert total == 2

View File

@@ -0,0 +1,366 @@
import pytest
from sqlalchemy.orm import Session
from app.assets.database.models import Asset, AssetReference, AssetReferenceTag, AssetReferenceMeta, Tag
from app.assets.database.queries import (
ensure_tags_exist,
get_reference_tags,
set_reference_tags,
add_tags_to_reference,
remove_tags_from_reference,
add_missing_tag_for_asset_id,
remove_missing_tag_for_asset_id,
list_tags_with_usage,
bulk_insert_tags_and_meta,
)
from app.assets.helpers import get_utc_now
def _make_asset(session: Session, hash_val: str | None = None) -> Asset:
asset = Asset(hash=hash_val, size_bytes=1024)
session.add(asset)
session.flush()
return asset
def _make_reference(session: Session, asset: Asset, name: str = "test", owner_id: str = "") -> AssetReference:
now = get_utc_now()
ref = AssetReference(
owner_id=owner_id,
name=name,
asset_id=asset.id,
created_at=now,
updated_at=now,
last_access_time=now,
)
session.add(ref)
session.flush()
return ref
class TestEnsureTagsExist:
def test_creates_new_tags(self, session: Session):
ensure_tags_exist(session, ["alpha", "beta"], tag_type="user")
session.commit()
tags = session.query(Tag).all()
assert {t.name for t in tags} == {"alpha", "beta"}
def test_is_idempotent(self, session: Session):
ensure_tags_exist(session, ["alpha"], tag_type="user")
ensure_tags_exist(session, ["alpha"], tag_type="user")
session.commit()
assert session.query(Tag).count() == 1
def test_normalizes_tags(self, session: Session):
ensure_tags_exist(session, [" ALPHA ", "Beta", "alpha"])
session.commit()
tags = session.query(Tag).all()
assert {t.name for t in tags} == {"alpha", "beta"}
def test_empty_list_is_noop(self, session: Session):
ensure_tags_exist(session, [])
session.commit()
assert session.query(Tag).count() == 0
def test_tag_type_is_set(self, session: Session):
ensure_tags_exist(session, ["system-tag"], tag_type="system")
session.commit()
tag = session.query(Tag).filter_by(name="system-tag").one()
assert tag.tag_type == "system"
class TestGetReferenceTags:
def test_returns_empty_for_no_tags(self, session: Session):
asset = _make_asset(session, "hash1")
ref = _make_reference(session, asset)
tags = get_reference_tags(session, reference_id=ref.id)
assert tags == []
def test_returns_tags_for_reference(self, session: Session):
asset = _make_asset(session, "hash1")
ref = _make_reference(session, asset)
ensure_tags_exist(session, ["tag1", "tag2"])
session.add_all([
AssetReferenceTag(asset_reference_id=ref.id, tag_name="tag1", origin="manual", added_at=get_utc_now()),
AssetReferenceTag(asset_reference_id=ref.id, tag_name="tag2", origin="manual", added_at=get_utc_now()),
])
session.flush()
tags = get_reference_tags(session, reference_id=ref.id)
assert set(tags) == {"tag1", "tag2"}
class TestSetReferenceTags:
def test_adds_new_tags(self, session: Session):
asset = _make_asset(session, "hash1")
ref = _make_reference(session, asset)
result = set_reference_tags(session, reference_id=ref.id, tags=["a", "b"])
session.commit()
assert set(result.added) == {"a", "b"}
assert result.removed == []
assert set(result.total) == {"a", "b"}
def test_removes_old_tags(self, session: Session):
asset = _make_asset(session, "hash1")
ref = _make_reference(session, asset)
set_reference_tags(session, reference_id=ref.id, tags=["a", "b", "c"])
result = set_reference_tags(session, reference_id=ref.id, tags=["a"])
session.commit()
assert result.added == []
assert set(result.removed) == {"b", "c"}
assert result.total == ["a"]
def test_replaces_tags(self, session: Session):
asset = _make_asset(session, "hash1")
ref = _make_reference(session, asset)
set_reference_tags(session, reference_id=ref.id, tags=["a", "b"])
result = set_reference_tags(session, reference_id=ref.id, tags=["b", "c"])
session.commit()
assert result.added == ["c"]
assert result.removed == ["a"]
assert set(result.total) == {"b", "c"}
class TestAddTagsToReference:
def test_adds_tags(self, session: Session):
asset = _make_asset(session, "hash1")
ref = _make_reference(session, asset)
result = add_tags_to_reference(session, reference_id=ref.id, tags=["x", "y"])
session.commit()
assert set(result.added) == {"x", "y"}
assert result.already_present == []
def test_reports_already_present(self, session: Session):
asset = _make_asset(session, "hash1")
ref = _make_reference(session, asset)
add_tags_to_reference(session, reference_id=ref.id, tags=["x"])
result = add_tags_to_reference(session, reference_id=ref.id, tags=["x", "y"])
session.commit()
assert result.added == ["y"]
assert result.already_present == ["x"]
def test_raises_for_missing_reference(self, session: Session):
with pytest.raises(ValueError, match="not found"):
add_tags_to_reference(session, reference_id="nonexistent", tags=["x"])
class TestRemoveTagsFromReference:
def test_removes_tags(self, session: Session):
asset = _make_asset(session, "hash1")
ref = _make_reference(session, asset)
add_tags_to_reference(session, reference_id=ref.id, tags=["a", "b", "c"])
result = remove_tags_from_reference(session, reference_id=ref.id, tags=["a", "b"])
session.commit()
assert set(result.removed) == {"a", "b"}
assert result.not_present == []
assert result.total_tags == ["c"]
def test_reports_not_present(self, session: Session):
asset = _make_asset(session, "hash1")
ref = _make_reference(session, asset)
add_tags_to_reference(session, reference_id=ref.id, tags=["a"])
result = remove_tags_from_reference(session, reference_id=ref.id, tags=["a", "x"])
session.commit()
assert result.removed == ["a"]
assert result.not_present == ["x"]
def test_raises_for_missing_reference(self, session: Session):
with pytest.raises(ValueError, match="not found"):
remove_tags_from_reference(session, reference_id="nonexistent", tags=["x"])
class TestMissingTagFunctions:
def test_add_missing_tag_for_asset_id(self, session: Session):
asset = _make_asset(session, "hash1")
ref = _make_reference(session, asset)
ensure_tags_exist(session, ["missing"], tag_type="system")
add_missing_tag_for_asset_id(session, asset_id=asset.id)
session.commit()
tags = get_reference_tags(session, reference_id=ref.id)
assert "missing" in tags
def test_add_missing_tag_is_idempotent(self, session: Session):
asset = _make_asset(session, "hash1")
ref = _make_reference(session, asset)
ensure_tags_exist(session, ["missing"], tag_type="system")
add_missing_tag_for_asset_id(session, asset_id=asset.id)
add_missing_tag_for_asset_id(session, asset_id=asset.id)
session.commit()
links = session.query(AssetReferenceTag).filter_by(asset_reference_id=ref.id, tag_name="missing").all()
assert len(links) == 1
def test_remove_missing_tag_for_asset_id(self, session: Session):
asset = _make_asset(session, "hash1")
ref = _make_reference(session, asset)
ensure_tags_exist(session, ["missing"], tag_type="system")
add_missing_tag_for_asset_id(session, asset_id=asset.id)
remove_missing_tag_for_asset_id(session, asset_id=asset.id)
session.commit()
tags = get_reference_tags(session, reference_id=ref.id)
assert "missing" not in tags
class TestListTagsWithUsage:
def test_returns_tags_with_counts(self, session: Session):
ensure_tags_exist(session, ["used", "unused"])
asset = _make_asset(session, "hash1")
ref = _make_reference(session, asset)
add_tags_to_reference(session, reference_id=ref.id, tags=["used"])
session.commit()
rows, total = list_tags_with_usage(session)
tag_dict = {name: count for name, _, count in rows}
assert tag_dict["used"] == 1
assert tag_dict["unused"] == 0
assert total == 2
def test_exclude_zero_counts(self, session: Session):
ensure_tags_exist(session, ["used", "unused"])
asset = _make_asset(session, "hash1")
ref = _make_reference(session, asset)
add_tags_to_reference(session, reference_id=ref.id, tags=["used"])
session.commit()
rows, total = list_tags_with_usage(session, include_zero=False)
tag_names = {name for name, _, _ in rows}
assert "used" in tag_names
assert "unused" not in tag_names
def test_prefix_filter(self, session: Session):
ensure_tags_exist(session, ["alpha", "beta", "alphabet"])
session.commit()
rows, total = list_tags_with_usage(session, prefix="alph")
tag_names = {name for name, _, _ in rows}
assert tag_names == {"alpha", "alphabet"}
def test_order_by_name(self, session: Session):
ensure_tags_exist(session, ["zebra", "alpha", "middle"])
session.commit()
rows, _ = list_tags_with_usage(session, order="name_asc")
names = [name for name, _, _ in rows]
assert names == ["alpha", "middle", "zebra"]
def test_owner_visibility(self, session: Session):
ensure_tags_exist(session, ["shared-tag", "owner-tag"])
asset = _make_asset(session, "hash1")
shared_ref = _make_reference(session, asset, name="shared", owner_id="")
owner_ref = _make_reference(session, asset, name="owned", owner_id="user1")
add_tags_to_reference(session, reference_id=shared_ref.id, tags=["shared-tag"])
add_tags_to_reference(session, reference_id=owner_ref.id, tags=["owner-tag"])
session.commit()
# Empty owner sees only shared
rows, _ = list_tags_with_usage(session, owner_id="", include_zero=False)
tag_dict = {name: count for name, _, count in rows}
assert tag_dict.get("shared-tag", 0) == 1
assert tag_dict.get("owner-tag", 0) == 0
# User1 sees both
rows, _ = list_tags_with_usage(session, owner_id="user1", include_zero=False)
tag_dict = {name: count for name, _, count in rows}
assert tag_dict.get("shared-tag", 0) == 1
assert tag_dict.get("owner-tag", 0) == 1
class TestBulkInsertTagsAndMeta:
def test_inserts_tags(self, session: Session):
asset = _make_asset(session, "hash1")
ref = _make_reference(session, asset)
ensure_tags_exist(session, ["bulk-tag1", "bulk-tag2"])
session.commit()
now = get_utc_now()
tag_rows = [
{"asset_reference_id": ref.id, "tag_name": "bulk-tag1", "origin": "manual", "added_at": now},
{"asset_reference_id": ref.id, "tag_name": "bulk-tag2", "origin": "manual", "added_at": now},
]
bulk_insert_tags_and_meta(session, tag_rows=tag_rows, meta_rows=[])
session.commit()
tags = get_reference_tags(session, reference_id=ref.id)
assert set(tags) == {"bulk-tag1", "bulk-tag2"}
def test_inserts_meta(self, session: Session):
asset = _make_asset(session, "hash1")
ref = _make_reference(session, asset)
session.commit()
meta_rows = [
{
"asset_reference_id": ref.id,
"key": "meta-key",
"ordinal": 0,
"val_str": "meta-value",
"val_num": None,
"val_bool": None,
"val_json": None,
},
]
bulk_insert_tags_and_meta(session, tag_rows=[], meta_rows=meta_rows)
session.commit()
meta = session.query(AssetReferenceMeta).filter_by(asset_reference_id=ref.id).all()
assert len(meta) == 1
assert meta[0].key == "meta-key"
assert meta[0].val_str == "meta-value"
def test_ignores_conflicts(self, session: Session):
asset = _make_asset(session, "hash1")
ref = _make_reference(session, asset)
ensure_tags_exist(session, ["existing-tag"])
add_tags_to_reference(session, reference_id=ref.id, tags=["existing-tag"])
session.commit()
now = get_utc_now()
tag_rows = [
{"asset_reference_id": ref.id, "tag_name": "existing-tag", "origin": "duplicate", "added_at": now},
]
bulk_insert_tags_and_meta(session, tag_rows=tag_rows, meta_rows=[])
session.commit()
# Should still have only one tag link
links = session.query(AssetReferenceTag).filter_by(asset_reference_id=ref.id, tag_name="existing-tag").all()
assert len(links) == 1
# Origin should be original, not overwritten
assert links[0].origin == "manual"
def test_empty_lists_is_noop(self, session: Session):
bulk_insert_tags_and_meta(session, tag_rows=[], meta_rows=[])
assert session.query(AssetReferenceTag).count() == 0
assert session.query(AssetReferenceMeta).count() == 0

View File

@@ -0,0 +1 @@
# Service layer tests

View File

@@ -0,0 +1,54 @@
import tempfile
from pathlib import Path
from unittest.mock import patch
import pytest
from sqlalchemy import create_engine
from sqlalchemy.orm import Session
from app.assets.database.models import Base
@pytest.fixture(autouse=True)
def autoclean_unit_test_assets():
"""Override parent autouse fixture - service unit tests don't need server cleanup."""
yield
@pytest.fixture
def db_engine():
"""In-memory SQLite engine for fast unit tests."""
engine = create_engine("sqlite:///:memory:")
Base.metadata.create_all(engine)
return engine
@pytest.fixture
def session(db_engine):
"""Session fixture for tests that need direct DB access."""
with Session(db_engine) as sess:
yield sess
@pytest.fixture
def mock_create_session(db_engine):
"""Patch create_session to use our in-memory database."""
from contextlib import contextmanager
from sqlalchemy.orm import Session as SASession
@contextmanager
def _create_session():
with SASession(db_engine) as sess:
yield sess
with patch("app.assets.services.ingest.create_session", _create_session), \
patch("app.assets.services.asset_management.create_session", _create_session), \
patch("app.assets.services.tagging.create_session", _create_session):
yield _create_session
@pytest.fixture
def temp_dir():
"""Temporary directory for file operations."""
with tempfile.TemporaryDirectory() as tmpdir:
yield Path(tmpdir)

View File

@@ -0,0 +1,268 @@
"""Tests for asset_management services."""
import pytest
from sqlalchemy.orm import Session
from app.assets.database.models import Asset, AssetReference
from app.assets.database.queries import ensure_tags_exist, add_tags_to_reference
from app.assets.helpers import get_utc_now
from app.assets.services import (
get_asset_detail,
update_asset_metadata,
delete_asset_reference,
set_asset_preview,
)
def _make_asset(session: Session, hash_val: str = "blake3:test", size: int = 1024) -> Asset:
asset = Asset(hash=hash_val, size_bytes=size, mime_type="application/octet-stream")
session.add(asset)
session.flush()
return asset
def _make_reference(
session: Session,
asset: Asset,
name: str = "test",
owner_id: str = "",
) -> AssetReference:
now = get_utc_now()
ref = AssetReference(
owner_id=owner_id,
name=name,
asset_id=asset.id,
created_at=now,
updated_at=now,
last_access_time=now,
)
session.add(ref)
session.flush()
return ref
class TestGetAssetDetail:
def test_returns_none_for_nonexistent(self, mock_create_session):
result = get_asset_detail(reference_id="nonexistent")
assert result is None
def test_returns_asset_with_tags(self, mock_create_session, session: Session):
asset = _make_asset(session)
ref = _make_reference(session, asset, name="test.bin")
ensure_tags_exist(session, ["alpha", "beta"])
add_tags_to_reference(session, reference_id=ref.id, tags=["alpha", "beta"])
session.commit()
result = get_asset_detail(reference_id=ref.id)
assert result is not None
assert result.ref.id == ref.id
assert result.asset.hash == asset.hash
assert set(result.tags) == {"alpha", "beta"}
def test_respects_owner_visibility(self, mock_create_session, session: Session):
asset = _make_asset(session)
ref = _make_reference(session, asset, owner_id="user1")
session.commit()
# Wrong owner cannot see
result = get_asset_detail(reference_id=ref.id, owner_id="user2")
assert result is None
# Correct owner can see
result = get_asset_detail(reference_id=ref.id, owner_id="user1")
assert result is not None
class TestUpdateAssetMetadata:
def test_updates_name(self, mock_create_session, session: Session):
asset = _make_asset(session)
ref = _make_reference(session, asset, name="old_name.bin")
ref_id = ref.id
session.commit()
update_asset_metadata(
reference_id=ref_id,
name="new_name.bin",
)
# Verify by re-fetching from DB
session.expire_all()
updated_ref = session.get(AssetReference, ref_id)
assert updated_ref.name == "new_name.bin"
def test_updates_tags(self, mock_create_session, session: Session):
asset = _make_asset(session)
ref = _make_reference(session, asset)
ensure_tags_exist(session, ["old"])
add_tags_to_reference(session, reference_id=ref.id, tags=["old"])
session.commit()
result = update_asset_metadata(
reference_id=ref.id,
tags=["new1", "new2"],
)
assert set(result.tags) == {"new1", "new2"}
assert "old" not in result.tags
def test_updates_user_metadata(self, mock_create_session, session: Session):
asset = _make_asset(session)
ref = _make_reference(session, asset)
ref_id = ref.id
session.commit()
update_asset_metadata(
reference_id=ref_id,
user_metadata={"key": "value", "num": 42},
)
# Verify by re-fetching from DB
session.expire_all()
updated_ref = session.get(AssetReference, ref_id)
assert updated_ref.user_metadata["key"] == "value"
assert updated_ref.user_metadata["num"] == 42
def test_raises_for_nonexistent(self, mock_create_session):
with pytest.raises(ValueError, match="not found"):
update_asset_metadata(reference_id="nonexistent", name="fail")
def test_raises_for_wrong_owner(self, mock_create_session, session: Session):
asset = _make_asset(session)
ref = _make_reference(session, asset, owner_id="user1")
session.commit()
with pytest.raises(PermissionError, match="not owner"):
update_asset_metadata(
reference_id=ref.id,
name="new",
owner_id="user2",
)
class TestDeleteAssetReference:
def test_soft_deletes_reference(self, mock_create_session, session: Session):
asset = _make_asset(session)
ref = _make_reference(session, asset)
ref_id = ref.id
session.commit()
result = delete_asset_reference(
reference_id=ref_id,
owner_id="",
delete_content_if_orphan=False,
)
assert result is True
# Row still exists but is marked as soft-deleted
session.expire_all()
row = session.get(AssetReference, ref_id)
assert row is not None
assert row.deleted_at is not None
def test_returns_false_for_nonexistent(self, mock_create_session):
result = delete_asset_reference(
reference_id="nonexistent",
owner_id="",
)
assert result is False
def test_returns_false_for_wrong_owner(self, mock_create_session, session: Session):
asset = _make_asset(session)
ref = _make_reference(session, asset, owner_id="user1")
ref_id = ref.id
session.commit()
result = delete_asset_reference(
reference_id=ref_id,
owner_id="user2",
)
assert result is False
assert session.get(AssetReference, ref_id) is not None
def test_keeps_asset_if_other_references_exist(self, mock_create_session, session: Session):
asset = _make_asset(session)
ref1 = _make_reference(session, asset, name="ref1")
_make_reference(session, asset, name="ref2") # Second ref keeps asset alive
asset_id = asset.id
session.commit()
delete_asset_reference(
reference_id=ref1.id,
owner_id="",
delete_content_if_orphan=True,
)
# Asset should still exist
assert session.get(Asset, asset_id) is not None
def test_deletes_orphaned_asset(self, mock_create_session, session: Session):
asset = _make_asset(session)
ref = _make_reference(session, asset)
asset_id = asset.id
ref_id = ref.id
session.commit()
delete_asset_reference(
reference_id=ref_id,
owner_id="",
delete_content_if_orphan=True,
)
# Both ref and asset should be gone
assert session.get(AssetReference, ref_id) is None
assert session.get(Asset, asset_id) is None
class TestSetAssetPreview:
def test_sets_preview(self, mock_create_session, session: Session):
asset = _make_asset(session, hash_val="blake3:main")
preview_asset = _make_asset(session, hash_val="blake3:preview")
ref = _make_reference(session, asset)
ref_id = ref.id
preview_id = preview_asset.id
session.commit()
set_asset_preview(
reference_id=ref_id,
preview_asset_id=preview_id,
)
# Verify by re-fetching from DB
session.expire_all()
updated_ref = session.get(AssetReference, ref_id)
assert updated_ref.preview_id == preview_id
def test_clears_preview(self, mock_create_session, session: Session):
asset = _make_asset(session)
preview_asset = _make_asset(session, hash_val="blake3:preview")
ref = _make_reference(session, asset)
ref.preview_id = preview_asset.id
ref_id = ref.id
session.commit()
set_asset_preview(
reference_id=ref_id,
preview_asset_id=None,
)
# Verify by re-fetching from DB
session.expire_all()
updated_ref = session.get(AssetReference, ref_id)
assert updated_ref.preview_id is None
def test_raises_for_nonexistent_ref(self, mock_create_session):
with pytest.raises(ValueError, match="not found"):
set_asset_preview(reference_id="nonexistent")
def test_raises_for_wrong_owner(self, mock_create_session, session: Session):
asset = _make_asset(session)
ref = _make_reference(session, asset, owner_id="user1")
session.commit()
with pytest.raises(PermissionError, match="not owner"):
set_asset_preview(
reference_id=ref.id,
preview_asset_id=None,
owner_id="user2",
)

View File

@@ -0,0 +1,137 @@
"""Tests for bulk ingest services."""
from pathlib import Path
from sqlalchemy.orm import Session
from app.assets.database.models import Asset, AssetReference
from app.assets.services.bulk_ingest import SeedAssetSpec, batch_insert_seed_assets
class TestBatchInsertSeedAssets:
def test_populates_mime_type_for_model_files(self, session: Session, temp_dir: Path):
"""Verify mime_type is stored in the Asset table for model files."""
file_path = temp_dir / "model.safetensors"
file_path.write_bytes(b"fake safetensors content")
specs: list[SeedAssetSpec] = [
{
"abs_path": str(file_path),
"size_bytes": 24,
"mtime_ns": 1234567890000000000,
"info_name": "Test Model",
"tags": ["models"],
"fname": "model.safetensors",
"metadata": None,
"hash": None,
"mime_type": "application/safetensors",
}
]
result = batch_insert_seed_assets(session, specs=specs, owner_id="")
assert result.inserted_refs == 1
# Verify Asset has mime_type populated
assets = session.query(Asset).all()
assert len(assets) == 1
assert assets[0].mime_type == "application/safetensors"
def test_mime_type_none_when_not_provided(self, session: Session, temp_dir: Path):
"""Verify mime_type is None when not provided in spec."""
file_path = temp_dir / "unknown.bin"
file_path.write_bytes(b"binary data")
specs: list[SeedAssetSpec] = [
{
"abs_path": str(file_path),
"size_bytes": 11,
"mtime_ns": 1234567890000000000,
"info_name": "Unknown File",
"tags": [],
"fname": "unknown.bin",
"metadata": None,
"hash": None,
"mime_type": None,
}
]
result = batch_insert_seed_assets(session, specs=specs, owner_id="")
assert result.inserted_refs == 1
assets = session.query(Asset).all()
assert len(assets) == 1
assert assets[0].mime_type is None
def test_various_model_mime_types(self, session: Session, temp_dir: Path):
"""Verify various model file types get correct mime_type."""
test_cases = [
("model.safetensors", "application/safetensors"),
("model.pt", "application/pytorch"),
("model.ckpt", "application/pickle"),
("model.gguf", "application/gguf"),
]
specs: list[SeedAssetSpec] = []
for filename, mime_type in test_cases:
file_path = temp_dir / filename
file_path.write_bytes(b"content")
specs.append(
{
"abs_path": str(file_path),
"size_bytes": 7,
"mtime_ns": 1234567890000000000,
"info_name": filename,
"tags": [],
"fname": filename,
"metadata": None,
"hash": None,
"mime_type": mime_type,
}
)
result = batch_insert_seed_assets(session, specs=specs, owner_id="")
assert result.inserted_refs == len(test_cases)
for filename, expected_mime in test_cases:
ref = session.query(AssetReference).filter_by(name=filename).first()
assert ref is not None
asset = session.query(Asset).filter_by(id=ref.asset_id).first()
assert asset.mime_type == expected_mime, f"Expected {expected_mime} for {filename}, got {asset.mime_type}"
class TestMetadataExtraction:
def test_extracts_mime_type_for_model_files(self, temp_dir: Path):
"""Verify metadata extraction returns correct mime_type for model files."""
from app.assets.services.metadata_extract import extract_file_metadata
file_path = temp_dir / "model.safetensors"
file_path.write_bytes(b"fake safetensors content")
meta = extract_file_metadata(str(file_path))
assert meta.content_type == "application/safetensors"
def test_mime_type_for_various_model_formats(self, temp_dir: Path):
"""Verify various model file types get correct mime_type from metadata."""
from app.assets.services.metadata_extract import extract_file_metadata
test_cases = [
("model.safetensors", "application/safetensors"),
("model.sft", "application/safetensors"),
("model.pt", "application/pytorch"),
("model.pth", "application/pytorch"),
("model.ckpt", "application/pickle"),
("model.pkl", "application/pickle"),
("model.gguf", "application/gguf"),
]
for filename, expected_mime in test_cases:
file_path = temp_dir / filename
file_path.write_bytes(b"content")
meta = extract_file_metadata(str(file_path))
assert meta.content_type == expected_mime, f"Expected {expected_mime} for {filename}, got {meta.content_type}"

View File

@@ -0,0 +1,207 @@
"""Tests for asset enrichment (mime_type and hash population)."""
from pathlib import Path
from sqlalchemy.orm import Session
from app.assets.database.models import Asset, AssetReference
from app.assets.scanner import (
ENRICHMENT_HASHED,
ENRICHMENT_METADATA,
ENRICHMENT_STUB,
enrich_asset,
)
def _create_stub_asset(
session: Session,
file_path: str,
asset_id: str = "test-asset-id",
reference_id: str = "test-ref-id",
name: str | None = None,
) -> tuple[Asset, AssetReference]:
"""Create a stub asset with reference for testing enrichment."""
asset = Asset(
id=asset_id,
hash=None,
size_bytes=100,
mime_type=None,
)
session.add(asset)
session.flush()
ref = AssetReference(
id=reference_id,
asset_id=asset_id,
name=name or f"test-asset-{asset_id}",
owner_id="system",
file_path=file_path,
mtime_ns=1234567890000000000,
enrichment_level=ENRICHMENT_STUB,
)
session.add(ref)
session.flush()
return asset, ref
class TestEnrichAsset:
def test_extracts_mime_type_and_updates_asset(
self, db_engine, temp_dir: Path, session: Session
):
"""Verify mime_type is written to the Asset table during enrichment."""
file_path = temp_dir / "model.safetensors"
file_path.write_bytes(b"\x00" * 100)
asset, ref = _create_stub_asset(
session, str(file_path), "asset-1", "ref-1"
)
session.commit()
new_level = enrich_asset(
session,
file_path=str(file_path),
reference_id=ref.id,
asset_id=asset.id,
extract_metadata=True,
compute_hash=False,
)
assert new_level == ENRICHMENT_METADATA
session.expire_all()
updated_asset = session.get(Asset, "asset-1")
assert updated_asset is not None
assert updated_asset.mime_type == "application/safetensors"
def test_computes_hash_and_updates_asset(
self, db_engine, temp_dir: Path, session: Session
):
"""Verify hash is written to the Asset table during enrichment."""
file_path = temp_dir / "data.bin"
file_path.write_bytes(b"test content for hashing")
asset, ref = _create_stub_asset(
session, str(file_path), "asset-2", "ref-2"
)
session.commit()
new_level = enrich_asset(
session,
file_path=str(file_path),
reference_id=ref.id,
asset_id=asset.id,
extract_metadata=True,
compute_hash=True,
)
assert new_level == ENRICHMENT_HASHED
session.expire_all()
updated_asset = session.get(Asset, "asset-2")
assert updated_asset is not None
assert updated_asset.hash is not None
assert updated_asset.hash.startswith("blake3:")
def test_enrichment_updates_both_mime_and_hash(
self, db_engine, temp_dir: Path, session: Session
):
"""Verify both mime_type and hash are set when full enrichment runs."""
file_path = temp_dir / "model.safetensors"
file_path.write_bytes(b"\x00" * 50)
asset, ref = _create_stub_asset(
session, str(file_path), "asset-3", "ref-3"
)
session.commit()
enrich_asset(
session,
file_path=str(file_path),
reference_id=ref.id,
asset_id=asset.id,
extract_metadata=True,
compute_hash=True,
)
session.expire_all()
updated_asset = session.get(Asset, "asset-3")
assert updated_asset is not None
assert updated_asset.mime_type == "application/safetensors"
assert updated_asset.hash is not None
assert updated_asset.hash.startswith("blake3:")
def test_missing_file_returns_stub_level(
self, db_engine, temp_dir: Path, session: Session
):
"""Verify missing files don't cause errors and return STUB level."""
file_path = temp_dir / "nonexistent.bin"
asset, ref = _create_stub_asset(
session, str(file_path), "asset-4", "ref-4"
)
session.commit()
new_level = enrich_asset(
session,
file_path=str(file_path),
reference_id=ref.id,
asset_id=asset.id,
extract_metadata=True,
compute_hash=True,
)
assert new_level == ENRICHMENT_STUB
session.expire_all()
updated_asset = session.get(Asset, "asset-4")
assert updated_asset.mime_type is None
assert updated_asset.hash is None
def test_duplicate_hash_merges_into_existing_asset(
self, db_engine, temp_dir: Path, session: Session
):
"""Verify duplicate files merge into existing asset instead of failing."""
file_path_1 = temp_dir / "file1.bin"
file_path_2 = temp_dir / "file2.bin"
content = b"identical content"
file_path_1.write_bytes(content)
file_path_2.write_bytes(content)
asset1, ref1 = _create_stub_asset(
session, str(file_path_1), "asset-dup-1", "ref-dup-1"
)
asset2, ref2 = _create_stub_asset(
session, str(file_path_2), "asset-dup-2", "ref-dup-2"
)
session.commit()
enrich_asset(
session,
file_path=str(file_path_1),
reference_id=ref1.id,
asset_id=asset1.id,
extract_metadata=True,
compute_hash=True,
)
enrich_asset(
session,
file_path=str(file_path_2),
reference_id=ref2.id,
asset_id=asset2.id,
extract_metadata=True,
compute_hash=True,
)
session.expire_all()
updated_asset1 = session.get(Asset, "asset-dup-1")
assert updated_asset1 is not None
assert updated_asset1.hash is not None
updated_asset2 = session.get(Asset, "asset-dup-2")
assert updated_asset2 is None
updated_ref2 = session.get(AssetReference, "ref-dup-2")
assert updated_ref2 is not None
assert updated_ref2.asset_id == "asset-dup-1"

View File

@@ -0,0 +1,229 @@
"""Tests for ingest services."""
from pathlib import Path
import pytest
from sqlalchemy.orm import Session
from app.assets.database.models import Asset, AssetReference, Tag
from app.assets.database.queries import get_reference_tags
from app.assets.services.ingest import _ingest_file_from_path, _register_existing_asset
class TestIngestFileFromPath:
def test_creates_asset_and_reference(self, mock_create_session, temp_dir: Path, session: Session):
file_path = temp_dir / "test_file.bin"
file_path.write_bytes(b"test content")
result = _ingest_file_from_path(
abs_path=str(file_path),
asset_hash="blake3:abc123",
size_bytes=12,
mtime_ns=1234567890000000000,
mime_type="application/octet-stream",
)
assert result.asset_created is True
assert result.ref_created is True
assert result.reference_id is not None
# Verify DB state
assets = session.query(Asset).all()
assert len(assets) == 1
assert assets[0].hash == "blake3:abc123"
refs = session.query(AssetReference).all()
assert len(refs) == 1
assert refs[0].file_path == str(file_path)
def test_creates_reference_when_name_provided(self, mock_create_session, temp_dir: Path, session: Session):
file_path = temp_dir / "model.safetensors"
file_path.write_bytes(b"model data")
result = _ingest_file_from_path(
abs_path=str(file_path),
asset_hash="blake3:def456",
size_bytes=10,
mtime_ns=1234567890000000000,
mime_type="application/octet-stream",
info_name="My Model",
owner_id="user1",
)
assert result.asset_created is True
assert result.reference_id is not None
ref = session.query(AssetReference).first()
assert ref is not None
assert ref.name == "My Model"
assert ref.owner_id == "user1"
def test_creates_tags_when_provided(self, mock_create_session, temp_dir: Path, session: Session):
file_path = temp_dir / "tagged.bin"
file_path.write_bytes(b"data")
result = _ingest_file_from_path(
abs_path=str(file_path),
asset_hash="blake3:ghi789",
size_bytes=4,
mtime_ns=1234567890000000000,
info_name="Tagged Asset",
tags=["models", "checkpoints"],
)
assert result.reference_id is not None
# Verify tags were created and linked
tags = session.query(Tag).all()
tag_names = {t.name for t in tags}
assert "models" in tag_names
assert "checkpoints" in tag_names
ref_tags = get_reference_tags(session, reference_id=result.reference_id)
assert set(ref_tags) == {"models", "checkpoints"}
def test_idempotent_upsert(self, mock_create_session, temp_dir: Path, session: Session):
file_path = temp_dir / "dup.bin"
file_path.write_bytes(b"content")
# First ingest
r1 = _ingest_file_from_path(
abs_path=str(file_path),
asset_hash="blake3:repeat",
size_bytes=7,
mtime_ns=1234567890000000000,
)
assert r1.asset_created is True
# Second ingest with same hash - should update, not create
r2 = _ingest_file_from_path(
abs_path=str(file_path),
asset_hash="blake3:repeat",
size_bytes=7,
mtime_ns=1234567890000000001, # different mtime
)
assert r2.asset_created is False
assert r2.ref_created is False
assert r2.ref_updated is True
# Still only one asset
assets = session.query(Asset).all()
assert len(assets) == 1
def test_validates_preview_id(self, mock_create_session, temp_dir: Path, session: Session):
file_path = temp_dir / "with_preview.bin"
file_path.write_bytes(b"data")
# Create a preview asset first
preview_asset = Asset(hash="blake3:preview", size_bytes=100)
session.add(preview_asset)
session.commit()
preview_id = preview_asset.id
result = _ingest_file_from_path(
abs_path=str(file_path),
asset_hash="blake3:main",
size_bytes=4,
mtime_ns=1234567890000000000,
info_name="With Preview",
preview_id=preview_id,
)
assert result.reference_id is not None
ref = session.query(AssetReference).filter_by(id=result.reference_id).first()
assert ref.preview_id == preview_id
def test_invalid_preview_id_is_cleared(self, mock_create_session, temp_dir: Path, session: Session):
file_path = temp_dir / "bad_preview.bin"
file_path.write_bytes(b"data")
result = _ingest_file_from_path(
abs_path=str(file_path),
asset_hash="blake3:badpreview",
size_bytes=4,
mtime_ns=1234567890000000000,
info_name="Bad Preview",
preview_id="nonexistent-uuid",
)
assert result.reference_id is not None
ref = session.query(AssetReference).filter_by(id=result.reference_id).first()
assert ref.preview_id is None
class TestRegisterExistingAsset:
def test_creates_reference_for_existing_asset(self, mock_create_session, session: Session):
# Create existing asset
asset = Asset(hash="blake3:existing", size_bytes=1024, mime_type="image/png")
session.add(asset)
session.commit()
result = _register_existing_asset(
asset_hash="blake3:existing",
name="Registered Asset",
user_metadata={"key": "value"},
tags=["models"],
)
assert result.created is True
assert "models" in result.tags
# Verify by re-fetching from DB
session.expire_all()
refs = session.query(AssetReference).filter_by(name="Registered Asset").all()
assert len(refs) == 1
def test_creates_new_reference_even_with_same_name(self, mock_create_session, session: Session):
# Create asset and reference
asset = Asset(hash="blake3:withref", size_bytes=512)
session.add(asset)
session.flush()
from app.assets.helpers import get_utc_now
ref = AssetReference(
owner_id="",
name="Existing Ref",
asset_id=asset.id,
created_at=get_utc_now(),
updated_at=get_utc_now(),
last_access_time=get_utc_now(),
)
session.add(ref)
session.flush()
ref_id = ref.id
session.commit()
result = _register_existing_asset(
asset_hash="blake3:withref",
name="Existing Ref",
owner_id="",
)
# Multiple files with same name are allowed
assert result.created is True
# Verify two AssetReferences exist for this name
session.expire_all()
refs = session.query(AssetReference).filter_by(name="Existing Ref").all()
assert len(refs) == 2
assert ref_id in [r.id for r in refs]
def test_raises_for_nonexistent_hash(self, mock_create_session):
with pytest.raises(ValueError, match="No asset with hash"):
_register_existing_asset(
asset_hash="blake3:doesnotexist",
name="Fail",
)
def test_applies_tags_to_new_reference(self, mock_create_session, session: Session):
asset = Asset(hash="blake3:tagged", size_bytes=256)
session.add(asset)
session.commit()
result = _register_existing_asset(
asset_hash="blake3:tagged",
name="Tagged Ref",
tags=["alpha", "beta"],
)
assert result.created is True
assert set(result.tags) == {"alpha", "beta"}

View File

@@ -0,0 +1,197 @@
"""Tests for tagging services."""
import pytest
from sqlalchemy.orm import Session
from app.assets.database.models import Asset, AssetReference
from app.assets.database.queries import ensure_tags_exist, add_tags_to_reference
from app.assets.helpers import get_utc_now
from app.assets.services import apply_tags, remove_tags, list_tags
def _make_asset(session: Session, hash_val: str = "blake3:test") -> Asset:
asset = Asset(hash=hash_val, size_bytes=1024)
session.add(asset)
session.flush()
return asset
def _make_reference(
session: Session,
asset: Asset,
name: str = "test",
owner_id: str = "",
) -> AssetReference:
now = get_utc_now()
ref = AssetReference(
owner_id=owner_id,
name=name,
asset_id=asset.id,
created_at=now,
updated_at=now,
last_access_time=now,
)
session.add(ref)
session.flush()
return ref
class TestApplyTags:
def test_adds_new_tags(self, mock_create_session, session: Session):
asset = _make_asset(session)
ref = _make_reference(session, asset)
session.commit()
result = apply_tags(
reference_id=ref.id,
tags=["alpha", "beta"],
)
assert set(result.added) == {"alpha", "beta"}
assert result.already_present == []
assert set(result.total_tags) == {"alpha", "beta"}
def test_reports_already_present(self, mock_create_session, session: Session):
asset = _make_asset(session)
ref = _make_reference(session, asset)
ensure_tags_exist(session, ["existing"])
add_tags_to_reference(session, reference_id=ref.id, tags=["existing"])
session.commit()
result = apply_tags(
reference_id=ref.id,
tags=["existing", "new"],
)
assert result.added == ["new"]
assert result.already_present == ["existing"]
def test_raises_for_nonexistent_ref(self, mock_create_session):
with pytest.raises(ValueError, match="not found"):
apply_tags(reference_id="nonexistent", tags=["x"])
def test_raises_for_wrong_owner(self, mock_create_session, session: Session):
asset = _make_asset(session)
ref = _make_reference(session, asset, owner_id="user1")
session.commit()
with pytest.raises(PermissionError, match="not owner"):
apply_tags(
reference_id=ref.id,
tags=["new"],
owner_id="user2",
)
class TestRemoveTags:
def test_removes_tags(self, mock_create_session, session: Session):
asset = _make_asset(session)
ref = _make_reference(session, asset)
ensure_tags_exist(session, ["a", "b", "c"])
add_tags_to_reference(session, reference_id=ref.id, tags=["a", "b", "c"])
session.commit()
result = remove_tags(
reference_id=ref.id,
tags=["a", "b"],
)
assert set(result.removed) == {"a", "b"}
assert result.not_present == []
assert result.total_tags == ["c"]
def test_reports_not_present(self, mock_create_session, session: Session):
asset = _make_asset(session)
ref = _make_reference(session, asset)
ensure_tags_exist(session, ["present"])
add_tags_to_reference(session, reference_id=ref.id, tags=["present"])
session.commit()
result = remove_tags(
reference_id=ref.id,
tags=["present", "absent"],
)
assert result.removed == ["present"]
assert result.not_present == ["absent"]
def test_raises_for_nonexistent_ref(self, mock_create_session):
with pytest.raises(ValueError, match="not found"):
remove_tags(reference_id="nonexistent", tags=["x"])
def test_raises_for_wrong_owner(self, mock_create_session, session: Session):
asset = _make_asset(session)
ref = _make_reference(session, asset, owner_id="user1")
session.commit()
with pytest.raises(PermissionError, match="not owner"):
remove_tags(
reference_id=ref.id,
tags=["x"],
owner_id="user2",
)
class TestListTags:
def test_returns_tags_with_counts(self, mock_create_session, session: Session):
ensure_tags_exist(session, ["used", "unused"])
asset = _make_asset(session)
ref = _make_reference(session, asset)
add_tags_to_reference(session, reference_id=ref.id, tags=["used"])
session.commit()
rows, total = list_tags()
tag_dict = {name: count for name, _, count in rows}
assert tag_dict["used"] == 1
assert tag_dict["unused"] == 0
assert total == 2
def test_excludes_zero_counts(self, mock_create_session, session: Session):
ensure_tags_exist(session, ["used", "unused"])
asset = _make_asset(session)
ref = _make_reference(session, asset)
add_tags_to_reference(session, reference_id=ref.id, tags=["used"])
session.commit()
rows, total = list_tags(include_zero=False)
tag_names = {name for name, _, _ in rows}
assert "used" in tag_names
assert "unused" not in tag_names
def test_prefix_filter(self, mock_create_session, session: Session):
ensure_tags_exist(session, ["alpha", "beta", "alphabet"])
session.commit()
rows, _ = list_tags(prefix="alph")
tag_names = {name for name, _, _ in rows}
assert tag_names == {"alpha", "alphabet"}
def test_order_by_name(self, mock_create_session, session: Session):
ensure_tags_exist(session, ["zebra", "alpha", "middle"])
session.commit()
rows, _ = list_tags(order="name_asc")
names = [name for name, _, _ in rows]
assert names == ["alpha", "middle", "zebra"]
def test_pagination(self, mock_create_session, session: Session):
ensure_tags_exist(session, ["a", "b", "c", "d", "e"])
session.commit()
rows, total = list_tags(limit=2, offset=1, order="name_asc")
assert total == 5
assert len(rows) == 2
names = [name for name, _, _ in rows]
assert names == ["b", "c"]
def test_clamps_limit(self, mock_create_session, session: Session):
ensure_tags_exist(session, ["a"])
session.commit()
# Service should clamp limit to max 1000
rows, _ = list_tags(limit=2000)
assert len(rows) <= 1000

View File

@@ -4,7 +4,7 @@ from pathlib import Path
import pytest
import requests
from conftest import get_asset_filename, trigger_sync_seed_assets
from helpers import get_asset_filename, trigger_sync_seed_assets

View File

@@ -4,7 +4,7 @@ from pathlib import Path
import pytest
import requests
from conftest import get_asset_filename, trigger_sync_seed_assets
from helpers import get_asset_filename, trigger_sync_seed_assets
def test_create_from_hash_success(
@@ -24,11 +24,11 @@ def test_create_from_hash_success(
assert b1["created_new"] is False
aid = b1["id"]
# Calling again with the same name should return the same AssetInfo id
# Calling again with the same name creates a new AssetReference (duplicates allowed)
r2 = http.post(f"{api_base}/api/assets/from-hash", json=payload, timeout=120)
b2 = r2.json()
assert r2.status_code == 201, b2
assert b2["id"] == aid
assert b2["id"] != aid # new reference, not the same one
def test_get_and_delete_asset(http: requests.Session, api_base: str, seeded_asset: dict):
@@ -42,8 +42,8 @@ def test_get_and_delete_asset(http: requests.Session, api_base: str, seeded_asse
assert "user_metadata" in detail
assert "filename" in detail["user_metadata"]
# DELETE
rd = http.delete(f"{api_base}/api/assets/{aid}", timeout=120)
# DELETE (hard delete to also remove underlying asset and file)
rd = http.delete(f"{api_base}/api/assets/{aid}?delete_content=true", timeout=120)
assert rd.status_code == 204
# GET again -> 404
@@ -53,6 +53,35 @@ def test_get_and_delete_asset(http: requests.Session, api_base: str, seeded_asse
assert body["error"]["code"] == "ASSET_NOT_FOUND"
def test_soft_delete_hides_from_get(http: requests.Session, api_base: str, seeded_asset: dict):
aid = seeded_asset["id"]
asset_hash = seeded_asset["asset_hash"]
# Soft-delete (default, no delete_content param)
rd = http.delete(f"{api_base}/api/assets/{aid}", timeout=120)
assert rd.status_code == 204
# GET by reference ID -> 404 (soft-deleted references are hidden)
rg = http.get(f"{api_base}/api/assets/{aid}", timeout=120)
assert rg.status_code == 404
# Asset identity is preserved (underlying content still exists)
rh = http.head(f"{api_base}/api/assets/hash/{asset_hash}", timeout=120)
assert rh.status_code == 200
# Soft-deleted reference should not appear in listings
rl = http.get(
f"{api_base}/api/assets",
params={"include_tags": "unit-tests", "limit": "500"},
timeout=120,
)
ids = [a["id"] for a in rl.json().get("assets", [])]
assert aid not in ids
# Clean up: hard-delete the soft-deleted reference and orphaned asset
http.delete(f"{api_base}/api/assets/{aid}?delete_content=true", timeout=120)
def test_delete_upon_reference_count(
http: requests.Session, api_base: str, seeded_asset: dict
):
@@ -70,21 +99,32 @@ def test_delete_upon_reference_count(
assert copy["asset_hash"] == src_hash
assert copy["created_new"] is False
# Delete original reference -> asset identity must remain
# Soft-delete original reference (default) -> asset identity must remain
aid1 = seeded_asset["id"]
rd1 = http.delete(f"{api_base}/api/assets/{aid1}", timeout=120)
assert rd1.status_code == 204
rh1 = http.head(f"{api_base}/api/assets/hash/{src_hash}", timeout=120)
assert rh1.status_code == 200 # identity still present
assert rh1.status_code == 200 # identity still present (second ref exists)
# Delete the last reference with default semantics -> identity and cached files removed
# Soft-delete the last reference -> asset identity preserved (no hard delete)
aid2 = copy["id"]
rd2 = http.delete(f"{api_base}/api/assets/{aid2}", timeout=120)
assert rd2.status_code == 204
rh2 = http.head(f"{api_base}/api/assets/hash/{src_hash}", timeout=120)
assert rh2.status_code == 404 # orphan content removed
assert rh2.status_code == 200 # asset identity preserved (soft delete)
# Re-associate via from-hash, then hard-delete -> orphan content removed
r3 = http.post(f"{api_base}/api/assets/from-hash", json=payload, timeout=120)
assert r3.status_code == 201, r3.json()
aid3 = r3.json()["id"]
rd3 = http.delete(f"{api_base}/api/assets/{aid3}?delete_content=true", timeout=120)
assert rd3.status_code == 204
rh3 = http.head(f"{api_base}/api/assets/hash/{src_hash}", timeout=120)
assert rh3.status_code == 404 # orphan content removed
def test_update_asset_fields(http: requests.Session, api_base: str, seeded_asset: dict):
@@ -126,42 +166,52 @@ def test_head_asset_bad_hash_returns_400_and_no_body(http: requests.Session, api
assert body == b""
def test_delete_nonexistent_returns_404(http: requests.Session, api_base: str):
bogus = str(uuid.uuid4())
r = http.delete(f"{api_base}/api/assets/{bogus}", timeout=120)
@pytest.mark.parametrize(
"method,endpoint_template,payload,expected_status,error_code",
[
# Delete nonexistent asset
("delete", "/api/assets/{uuid}", None, 404, "ASSET_NOT_FOUND"),
# Bad hash algorithm in from-hash
(
"post",
"/api/assets/from-hash",
{"hash": "sha256:" + "0" * 64, "name": "x.bin", "tags": ["models", "checkpoints", "unit-tests"]},
400,
"INVALID_BODY",
),
# Get with bad UUID format
("get", "/api/assets/not-a-uuid", None, 404, None),
# Get content with bad UUID format
("get", "/api/assets/not-a-uuid/content", None, 404, None),
],
ids=["delete_nonexistent", "bad_hash_algorithm", "get_bad_uuid", "content_bad_uuid"],
)
def test_error_responses(
http: requests.Session, api_base: str, method, endpoint_template, payload, expected_status, error_code
):
# Replace {uuid} placeholder with a random UUID for delete test
endpoint = endpoint_template.replace("{uuid}", str(uuid.uuid4()))
url = f"{api_base}{endpoint}"
if method == "get":
r = http.get(url, timeout=120)
elif method == "post":
r = http.post(url, json=payload, timeout=120)
elif method == "delete":
r = http.delete(url, timeout=120)
assert r.status_code == expected_status
if error_code:
body = r.json()
assert body["error"]["code"] == error_code
def test_create_from_hash_invalid_json(http: requests.Session, api_base: str):
"""Invalid JSON body requires special handling (data= instead of json=)."""
r = http.post(f"{api_base}/api/assets/from-hash", data=b"{not json}", timeout=120)
body = r.json()
assert r.status_code == 404
assert body["error"]["code"] == "ASSET_NOT_FOUND"
def test_create_from_hash_invalids(http: requests.Session, api_base: str):
# Bad hash algorithm
bad = {
"hash": "sha256:" + "0" * 64,
"name": "x.bin",
"tags": ["models", "checkpoints", "unit-tests"],
}
r1 = http.post(f"{api_base}/api/assets/from-hash", json=bad, timeout=120)
b1 = r1.json()
assert r1.status_code == 400
assert b1["error"]["code"] == "INVALID_BODY"
# Invalid JSON body
r2 = http.post(f"{api_base}/api/assets/from-hash", data=b"{not json}", timeout=120)
b2 = r2.json()
assert r2.status_code == 400
assert b2["error"]["code"] == "INVALID_JSON"
def test_get_update_download_bad_ids(http: requests.Session, api_base: str):
# All endpoints should be not found, as we UUID regex directly in the route definition.
bad_id = "not-a-uuid"
r1 = http.get(f"{api_base}/api/assets/{bad_id}", timeout=120)
assert r1.status_code == 404
r3 = http.get(f"{api_base}/api/assets/{bad_id}/content", timeout=120)
assert r3.status_code == 404
assert r.status_code == 400
assert body["error"]["code"] == "INVALID_JSON"
def test_update_requires_at_least_one_field(http: requests.Session, api_base: str, seeded_asset: dict):

View File

@@ -6,7 +6,7 @@ from typing import Optional
import pytest
import requests
from conftest import get_asset_filename, trigger_sync_seed_assets
from helpers import get_asset_filename, trigger_sync_seed_assets
def test_download_attachment_and_inline(http: requests.Session, api_base: str, seeded_asset: dict):
@@ -117,7 +117,7 @@ def test_download_missing_file_returns_404(
assert body["error"]["code"] == "FILE_NOT_FOUND"
finally:
# We created asset without the "unit-tests" tag(see `autoclean_unit_test_assets`), we need to clear it manually.
dr = http.delete(f"{api_base}/api/assets/{aid}", timeout=120)
dr = http.delete(f"{api_base}/api/assets/{aid}?delete_content=true", timeout=120)
dr.content

View File

@@ -0,0 +1,121 @@
import os
import sys
import pytest
from app.assets.services.file_utils import is_visible, list_files_recursively
class TestIsVisible:
def test_visible_file(self):
assert is_visible("file.txt") is True
def test_hidden_file(self):
assert is_visible(".hidden") is False
def test_hidden_directory(self):
assert is_visible(".git") is False
def test_visible_directory(self):
assert is_visible("src") is True
def test_dotdot_is_hidden(self):
assert is_visible("..") is False
def test_dot_is_hidden(self):
assert is_visible(".") is False
class TestListFilesRecursively:
def test_skips_hidden_files(self, tmp_path):
(tmp_path / "visible.txt").write_text("a")
(tmp_path / ".hidden").write_text("b")
result = list_files_recursively(str(tmp_path))
assert len(result) == 1
assert result[0].endswith("visible.txt")
def test_skips_hidden_directories(self, tmp_path):
hidden_dir = tmp_path / ".hidden_dir"
hidden_dir.mkdir()
(hidden_dir / "file.txt").write_text("a")
visible_dir = tmp_path / "visible_dir"
visible_dir.mkdir()
(visible_dir / "file.txt").write_text("b")
result = list_files_recursively(str(tmp_path))
assert len(result) == 1
assert "visible_dir" in result[0]
assert ".hidden_dir" not in result[0]
def test_empty_directory(self, tmp_path):
result = list_files_recursively(str(tmp_path))
assert result == []
def test_nonexistent_directory(self, tmp_path):
result = list_files_recursively(str(tmp_path / "nonexistent"))
assert result == []
@pytest.mark.skipif(sys.platform == "win32", reason="symlinks need privileges on Windows")
def test_follows_symlinked_directories(self, tmp_path):
target = tmp_path / "real_dir"
target.mkdir()
(target / "model.safetensors").write_text("data")
root = tmp_path / "root"
root.mkdir()
(root / "link").symlink_to(target)
result = list_files_recursively(str(root))
assert len(result) == 1
assert result[0].endswith("model.safetensors")
assert "link" in result[0]
@pytest.mark.skipif(sys.platform == "win32", reason="symlinks need privileges on Windows")
def test_follows_symlinked_files(self, tmp_path):
real_file = tmp_path / "real.txt"
real_file.write_text("content")
root = tmp_path / "root"
root.mkdir()
(root / "link.txt").symlink_to(real_file)
result = list_files_recursively(str(root))
assert len(result) == 1
assert result[0].endswith("link.txt")
@pytest.mark.skipif(sys.platform == "win32", reason="symlinks need privileges on Windows")
def test_circular_symlinks_do_not_loop(self, tmp_path):
dir_a = tmp_path / "a"
dir_a.mkdir()
(dir_a / "file.txt").write_text("a")
# a/b -> a (circular)
(dir_a / "b").symlink_to(dir_a)
result = list_files_recursively(str(dir_a))
assert len(result) == 1
assert result[0].endswith("file.txt")
@pytest.mark.skipif(sys.platform == "win32", reason="symlinks need privileges on Windows")
def test_mutual_circular_symlinks(self, tmp_path):
dir_a = tmp_path / "a"
dir_b = tmp_path / "b"
dir_a.mkdir()
dir_b.mkdir()
(dir_a / "file_a.txt").write_text("a")
(dir_b / "file_b.txt").write_text("b")
# a/link_b -> b and b/link_a -> a
(dir_a / "link_b").symlink_to(dir_b)
(dir_b / "link_a").symlink_to(dir_a)
result = list_files_recursively(str(dir_a))
basenames = sorted(os.path.basename(p) for p in result)
assert "file_a.txt" in basenames
assert "file_b.txt" in basenames

View File

@@ -1,6 +1,7 @@
import time
import uuid
import pytest
import requests
@@ -283,30 +284,21 @@ def test_list_assets_offset_beyond_total_and_limit_boundary(http, api_base, asse
assert b2["has_more"] is False
def test_list_assets_offset_negative_and_limit_nonint_rejected(http, api_base):
r1 = http.get(api_base + "/api/assets", params={"offset": "-1"}, timeout=120)
b1 = r1.json()
assert r1.status_code == 400
assert b1["error"]["code"] == "INVALID_QUERY"
r2 = http.get(api_base + "/api/assets", params={"limit": "abc"}, timeout=120)
b2 = r2.json()
assert r2.status_code == 400
assert b2["error"]["code"] == "INVALID_QUERY"
def test_list_assets_invalid_query_rejected(http: requests.Session, api_base: str):
# limit too small
r1 = http.get(api_base + "/api/assets", params={"limit": "0"}, timeout=120)
b1 = r1.json()
assert r1.status_code == 400
assert b1["error"]["code"] == "INVALID_QUERY"
# bad metadata JSON
r2 = http.get(api_base + "/api/assets", params={"metadata_filter": "{not json"}, timeout=120)
b2 = r2.json()
assert r2.status_code == 400
assert b2["error"]["code"] == "INVALID_QUERY"
@pytest.mark.parametrize(
"params,error_code",
[
({"offset": "-1"}, "INVALID_QUERY"),
({"limit": "abc"}, "INVALID_QUERY"),
({"limit": "0"}, "INVALID_QUERY"),
({"metadata_filter": "{not json"}, "INVALID_QUERY"),
],
ids=["negative_offset", "non_int_limit", "zero_limit", "invalid_metadata_json"],
)
def test_list_assets_invalid_query_rejected(http: requests.Session, api_base: str, params, error_code):
r = http.get(api_base + "/api/assets", params=params, timeout=120)
body = r.json()
assert r.status_code == 400
assert body["error"]["code"] == error_code
def test_list_assets_name_contains_literal_underscore(

View File

@@ -3,7 +3,7 @@ from pathlib import Path
import pytest
import requests
from conftest import get_asset_filename, trigger_sync_seed_assets
from helpers import get_asset_filename, trigger_sync_seed_assets
@pytest.fixture

View File

@@ -0,0 +1,482 @@
"""Tests for sync_references_with_filesystem in scanner.py."""
import os
import tempfile
from datetime import datetime
from pathlib import Path
from unittest.mock import patch
import pytest
from sqlalchemy import create_engine
from sqlalchemy.orm import Session
from app.assets.database.models import (
Asset,
AssetReference,
AssetReferenceTag,
Base,
Tag,
)
from app.assets.database.queries.asset_reference import (
bulk_insert_references_ignore_conflicts,
get_references_for_prefixes,
get_unenriched_references,
restore_references_by_paths,
)
from app.assets.scanner import sync_references_with_filesystem
from app.assets.services.file_utils import get_mtime_ns
@pytest.fixture
def db_engine():
engine = create_engine("sqlite:///:memory:")
Base.metadata.create_all(engine)
return engine
@pytest.fixture
def session(db_engine):
with Session(db_engine) as sess:
yield sess
@pytest.fixture
def temp_dir():
with tempfile.TemporaryDirectory() as tmpdir:
yield Path(tmpdir)
def _create_file(temp_dir: Path, name: str, content: bytes = b"\x00" * 100) -> str:
"""Create a file and return its absolute path (no symlink resolution)."""
p = temp_dir / name
p.parent.mkdir(parents=True, exist_ok=True)
p.write_bytes(content)
return os.path.abspath(str(p))
def _stat_mtime_ns(path: str) -> int:
return get_mtime_ns(os.stat(path, follow_symlinks=True))
def _make_asset(
session: Session,
asset_id: str,
file_path: str,
ref_id: str,
*,
asset_hash: str | None = None,
size_bytes: int = 100,
mtime_ns: int | None = None,
needs_verify: bool = False,
is_missing: bool = False,
) -> tuple[Asset, AssetReference]:
"""Insert an Asset + AssetReference and flush."""
asset = session.get(Asset, asset_id)
if asset is None:
asset = Asset(id=asset_id, hash=asset_hash, size_bytes=size_bytes)
session.add(asset)
session.flush()
ref = AssetReference(
id=ref_id,
asset_id=asset_id,
name=f"test-{ref_id}",
owner_id="system",
file_path=file_path,
mtime_ns=mtime_ns,
needs_verify=needs_verify,
is_missing=is_missing,
)
session.add(ref)
session.flush()
return asset, ref
def _ensure_missing_tag(session: Session):
"""Ensure the 'missing' tag exists."""
if not session.get(Tag, "missing"):
session.add(Tag(name="missing", tag_type="system"))
session.flush()
class _VerifyCase:
def __init__(self, id, stat_unchanged, needs_verify_before, expect_needs_verify):
self.id = id
self.stat_unchanged = stat_unchanged
self.needs_verify_before = needs_verify_before
self.expect_needs_verify = expect_needs_verify
VERIFY_CASES = [
_VerifyCase(
id="unchanged_clears_verify",
stat_unchanged=True,
needs_verify_before=True,
expect_needs_verify=False,
),
_VerifyCase(
id="unchanged_keeps_clear",
stat_unchanged=True,
needs_verify_before=False,
expect_needs_verify=False,
),
_VerifyCase(
id="changed_sets_verify",
stat_unchanged=False,
needs_verify_before=False,
expect_needs_verify=True,
),
_VerifyCase(
id="changed_keeps_verify",
stat_unchanged=False,
needs_verify_before=True,
expect_needs_verify=True,
),
]
@pytest.mark.parametrize("case", VERIFY_CASES, ids=lambda c: c.id)
def test_needs_verify_toggling(session, temp_dir, case):
"""needs_verify is set/cleared based on mtime+size match."""
fp = _create_file(temp_dir, "model.bin")
real_mtime = _stat_mtime_ns(fp)
mtime_for_db = real_mtime if case.stat_unchanged else real_mtime + 1
_make_asset(
session, "a1", fp, "r1",
asset_hash="blake3:abc",
mtime_ns=mtime_for_db,
needs_verify=case.needs_verify_before,
)
session.commit()
with patch("app.assets.scanner.get_prefixes_for_root", return_value=[str(temp_dir)]):
sync_references_with_filesystem(session, "models")
session.commit()
session.expire_all()
ref = session.get(AssetReference, "r1")
assert ref.needs_verify is case.expect_needs_verify
class _MissingCase:
def __init__(self, id, file_exists, expect_is_missing):
self.id = id
self.file_exists = file_exists
self.expect_is_missing = expect_is_missing
MISSING_CASES = [
_MissingCase(id="existing_file_not_missing", file_exists=True, expect_is_missing=False),
_MissingCase(id="missing_file_marked_missing", file_exists=False, expect_is_missing=True),
]
@pytest.mark.parametrize("case", MISSING_CASES, ids=lambda c: c.id)
def test_is_missing_flag(session, temp_dir, case):
"""is_missing reflects whether the file exists on disk."""
if case.file_exists:
fp = _create_file(temp_dir, "model.bin")
mtime = _stat_mtime_ns(fp)
else:
fp = str(temp_dir / "gone.bin")
mtime = 999
_make_asset(session, "a1", fp, "r1", asset_hash="blake3:abc", mtime_ns=mtime)
session.commit()
with patch("app.assets.scanner.get_prefixes_for_root", return_value=[str(temp_dir)]):
sync_references_with_filesystem(session, "models")
session.commit()
session.expire_all()
ref = session.get(AssetReference, "r1")
assert ref.is_missing is case.expect_is_missing
def test_seed_asset_all_missing_deletes_asset(session, temp_dir):
"""Seed asset with all refs missing gets deleted entirely."""
fp = str(temp_dir / "gone.bin")
_make_asset(session, "seed1", fp, "r1", asset_hash=None, mtime_ns=999)
session.commit()
with patch("app.assets.scanner.get_prefixes_for_root", return_value=[str(temp_dir)]):
sync_references_with_filesystem(session, "models")
session.commit()
assert session.get(Asset, "seed1") is None
assert session.get(AssetReference, "r1") is None
def test_seed_asset_some_exist_returns_survivors(session, temp_dir):
"""Seed asset with at least one existing ref survives and is returned."""
fp = _create_file(temp_dir, "model.bin")
mtime = _stat_mtime_ns(fp)
_make_asset(session, "seed1", fp, "r1", asset_hash=None, mtime_ns=mtime)
session.commit()
with patch("app.assets.scanner.get_prefixes_for_root", return_value=[str(temp_dir)]):
survivors = sync_references_with_filesystem(
session, "models", collect_existing_paths=True,
)
session.commit()
assert session.get(Asset, "seed1") is not None
assert os.path.abspath(fp) in survivors
def test_hashed_asset_prunes_missing_refs_when_one_is_ok(session, temp_dir):
"""Hashed asset with one stat-unchanged ref deletes missing refs."""
fp_ok = _create_file(temp_dir, "good.bin")
fp_gone = str(temp_dir / "gone.bin")
mtime = _stat_mtime_ns(fp_ok)
_make_asset(session, "h1", fp_ok, "r_ok", asset_hash="blake3:aaa", mtime_ns=mtime)
# Second ref on same asset, file missing
ref_gone = AssetReference(
id="r_gone", asset_id="h1", name="gone",
owner_id="system", file_path=fp_gone, mtime_ns=999,
)
session.add(ref_gone)
session.commit()
with patch("app.assets.scanner.get_prefixes_for_root", return_value=[str(temp_dir)]):
sync_references_with_filesystem(session, "models")
session.commit()
session.expire_all()
assert session.get(AssetReference, "r_ok") is not None
assert session.get(AssetReference, "r_gone") is None
def test_hashed_asset_all_missing_keeps_refs(session, temp_dir):
"""Hashed asset with all refs missing keeps refs (no pruning)."""
fp = str(temp_dir / "gone.bin")
_make_asset(session, "h1", fp, "r1", asset_hash="blake3:aaa", mtime_ns=999)
session.commit()
with patch("app.assets.scanner.get_prefixes_for_root", return_value=[str(temp_dir)]):
sync_references_with_filesystem(session, "models")
session.commit()
session.expire_all()
assert session.get(AssetReference, "r1") is not None
ref = session.get(AssetReference, "r1")
assert ref.is_missing is True
def test_missing_tag_added_when_all_refs_gone(session, temp_dir):
"""Missing tag is added to hashed asset when all refs are missing."""
_ensure_missing_tag(session)
fp = str(temp_dir / "gone.bin")
_make_asset(session, "h1", fp, "r1", asset_hash="blake3:aaa", mtime_ns=999)
session.commit()
with patch("app.assets.scanner.get_prefixes_for_root", return_value=[str(temp_dir)]):
sync_references_with_filesystem(
session, "models", update_missing_tags=True,
)
session.commit()
session.expire_all()
tag_link = session.get(AssetReferenceTag, ("r1", "missing"))
assert tag_link is not None
def test_missing_tag_removed_when_ref_ok(session, temp_dir):
"""Missing tag is removed from hashed asset when a ref is stat-unchanged."""
_ensure_missing_tag(session)
fp = _create_file(temp_dir, "model.bin")
mtime = _stat_mtime_ns(fp)
_make_asset(session, "h1", fp, "r1", asset_hash="blake3:aaa", mtime_ns=mtime)
# Pre-add a stale missing tag
session.add(AssetReferenceTag(
asset_reference_id="r1", tag_name="missing", origin="automatic",
))
session.commit()
with patch("app.assets.scanner.get_prefixes_for_root", return_value=[str(temp_dir)]):
sync_references_with_filesystem(
session, "models", update_missing_tags=True,
)
session.commit()
session.expire_all()
tag_link = session.get(AssetReferenceTag, ("r1", "missing"))
assert tag_link is None
def test_missing_tags_not_touched_when_flag_false(session, temp_dir):
"""Missing tags are not modified when update_missing_tags=False."""
_ensure_missing_tag(session)
fp = str(temp_dir / "gone.bin")
_make_asset(session, "h1", fp, "r1", asset_hash="blake3:aaa", mtime_ns=999)
session.commit()
with patch("app.assets.scanner.get_prefixes_for_root", return_value=[str(temp_dir)]):
sync_references_with_filesystem(
session, "models", update_missing_tags=False,
)
session.commit()
tag_link = session.get(AssetReferenceTag, ("r1", "missing"))
assert tag_link is None # tag was never added
def test_returns_none_when_collect_false(session, temp_dir):
fp = _create_file(temp_dir, "model.bin")
mtime = _stat_mtime_ns(fp)
_make_asset(session, "a1", fp, "r1", asset_hash="blake3:abc", mtime_ns=mtime)
session.commit()
with patch("app.assets.scanner.get_prefixes_for_root", return_value=[str(temp_dir)]):
result = sync_references_with_filesystem(
session, "models", collect_existing_paths=False,
)
assert result is None
def test_returns_empty_set_for_no_prefixes(session):
with patch("app.assets.scanner.get_prefixes_for_root", return_value=[]):
result = sync_references_with_filesystem(
session, "models", collect_existing_paths=True,
)
assert result == set()
def test_no_references_is_noop(session, temp_dir):
"""No crash and no side effects when there are no references."""
with patch("app.assets.scanner.get_prefixes_for_root", return_value=[str(temp_dir)]):
survivors = sync_references_with_filesystem(
session, "models", collect_existing_paths=True,
)
session.commit()
assert survivors == set()
# ---------------------------------------------------------------------------
# Soft-delete persistence across scanner operations
# ---------------------------------------------------------------------------
def _soft_delete_ref(session: Session, ref_id: str) -> None:
"""Mark a reference as soft-deleted (mimics the API DELETE behaviour)."""
ref = session.get(AssetReference, ref_id)
ref.deleted_at = datetime(2025, 1, 1)
session.flush()
def test_soft_deleted_ref_excluded_from_get_references_for_prefixes(session, temp_dir):
"""get_references_for_prefixes skips soft-deleted references."""
fp = _create_file(temp_dir, "model.bin")
mtime = _stat_mtime_ns(fp)
_make_asset(session, "a1", fp, "r1", asset_hash="blake3:abc", mtime_ns=mtime)
_soft_delete_ref(session, "r1")
session.commit()
rows = get_references_for_prefixes(session, [str(temp_dir)], include_missing=True)
assert len(rows) == 0
def test_sync_does_not_resurrect_soft_deleted_ref(session, temp_dir):
"""Scanner sync leaves soft-deleted refs untouched even when file exists on disk."""
fp = _create_file(temp_dir, "model.bin")
mtime = _stat_mtime_ns(fp)
_make_asset(session, "a1", fp, "r1", asset_hash="blake3:abc", mtime_ns=mtime)
_soft_delete_ref(session, "r1")
session.commit()
with patch("app.assets.scanner.get_prefixes_for_root", return_value=[str(temp_dir)]):
sync_references_with_filesystem(session, "models")
session.commit()
session.expire_all()
ref = session.get(AssetReference, "r1")
assert ref.deleted_at is not None, "soft-deleted ref must stay deleted after sync"
def test_bulk_insert_does_not_overwrite_soft_deleted_ref(session, temp_dir):
"""bulk_insert_references_ignore_conflicts cannot replace a soft-deleted row."""
fp = _create_file(temp_dir, "model.bin")
mtime = _stat_mtime_ns(fp)
_make_asset(session, "a1", fp, "r1", asset_hash="blake3:abc", mtime_ns=mtime)
_soft_delete_ref(session, "r1")
session.commit()
now = datetime.now(tz=None)
bulk_insert_references_ignore_conflicts(session, [
{
"id": "r_new",
"asset_id": "a1",
"file_path": fp,
"name": "model.bin",
"owner_id": "",
"mtime_ns": mtime,
"preview_id": None,
"user_metadata": None,
"created_at": now,
"updated_at": now,
"last_access_time": now,
}
])
session.commit()
session.expire_all()
# Original row is still the soft-deleted one
ref = session.get(AssetReference, "r1")
assert ref is not None
assert ref.deleted_at is not None
# The new row was not inserted (conflict on file_path)
assert session.get(AssetReference, "r_new") is None
def test_restore_references_by_paths_skips_soft_deleted(session, temp_dir):
"""restore_references_by_paths does not clear is_missing on soft-deleted refs."""
fp = _create_file(temp_dir, "model.bin")
mtime = _stat_mtime_ns(fp)
_make_asset(
session, "a1", fp, "r1",
asset_hash="blake3:abc", mtime_ns=mtime, is_missing=True,
)
_soft_delete_ref(session, "r1")
session.commit()
restored = restore_references_by_paths(session, [fp])
session.commit()
assert restored == 0
session.expire_all()
ref = session.get(AssetReference, "r1")
assert ref.is_missing is True, "is_missing must not be cleared on soft-deleted ref"
assert ref.deleted_at is not None
def test_get_unenriched_references_excludes_soft_deleted(session, temp_dir):
"""Enrichment queries do not pick up soft-deleted references."""
fp = _create_file(temp_dir, "model.bin")
mtime = _stat_mtime_ns(fp)
_make_asset(session, "a1", fp, "r1", asset_hash="blake3:abc", mtime_ns=mtime)
_soft_delete_ref(session, "r1")
session.commit()
rows = get_unenriched_references(session, [str(temp_dir)], max_level=2)
assert len(rows) == 0
def test_sync_ignores_soft_deleted_seed_asset(session, temp_dir):
"""Soft-deleted seed ref is not garbage-collected even when file is missing."""
fp = str(temp_dir / "gone.bin") # file does not exist
_make_asset(session, "seed1", fp, "r1", asset_hash=None, mtime_ns=999)
_soft_delete_ref(session, "r1")
session.commit()
with patch("app.assets.scanner.get_prefixes_for_root", return_value=[str(temp_dir)]):
sync_references_with_filesystem(session, "models")
session.commit()
session.expire_all()
# Asset and ref must still exist — scanner did not see the soft-deleted row
assert session.get(Asset, "seed1") is not None
assert session.get(AssetReference, "r1") is not None

View File

@@ -69,8 +69,8 @@ def test_tags_empty_usage(http: requests.Session, api_base: str, asset_factory,
used_names = [t["name"] for t in body2["tags"]]
assert custom_tag in used_names
# Delete the asset so the tag usage drops to zero
rd = http.delete(f"{api_base}/api/assets/{_asset['id']}", timeout=120)
# Hard-delete the asset so the tag usage drops to zero
rd = http.delete(f"{api_base}/api/assets/{_asset['id']}?delete_content=true", timeout=120)
assert rd.status_code == 204
# Now the custom tag must not be returned when include_zero=false

View File

@@ -18,25 +18,25 @@ def test_upload_ok_duplicate_reference(http: requests.Session, api_base: str, ma
assert r1.status_code == 201, a1
assert a1["created_new"] is True
# Second upload with the same data and name should return created_new == False and the same asset
# Second upload with the same data and name creates a new AssetReference (duplicates allowed)
# Returns 200 because Asset already exists, but a new AssetReference is created
files = {"file": (name, data, "application/octet-stream")}
form = {"tags": json.dumps(tags), "name": name, "user_metadata": json.dumps(meta)}
r2 = http.post(api_base + "/api/assets", data=form, files=files, timeout=120)
a2 = r2.json()
assert r2.status_code == 200, a2
assert a2["created_new"] is False
assert r2.status_code in (200, 201), a2
assert a2["asset_hash"] == a1["asset_hash"]
assert a2["id"] == a1["id"] # old reference
assert a2["id"] != a1["id"] # new reference with same content
# Third upload with the same data but new name should return created_new == False and the new AssetReference
# Third upload with the same data but different name also creates new AssetReference
files = {"file": (name, data, "application/octet-stream")}
form = {"tags": json.dumps(tags), "name": name + "_d", "user_metadata": json.dumps(meta)}
r2 = http.post(api_base + "/api/assets", data=form, files=files, timeout=120)
a3 = r2.json()
assert r2.status_code == 200, a3
assert a3["created_new"] is False
r3 = http.post(api_base + "/api/assets", data=form, files=files, timeout=120)
a3 = r3.json()
assert r3.status_code in (200, 201), a3
assert a3["asset_hash"] == a1["asset_hash"]
assert a3["id"] != a1["id"] # old reference
assert a3["id"] != a1["id"]
assert a3["id"] != a2["id"]
def test_upload_fastpath_from_existing_hash_no_file(http: requests.Session, api_base: str):
@@ -116,7 +116,7 @@ def test_concurrent_upload_identical_bytes_different_names(
):
"""
Two concurrent uploads of identical bytes but different names.
Expect a single Asset (same hash), two AssetInfo rows, and exactly one created_new=True.
Expect a single Asset (same hash), two AssetReference rows, and exactly one created_new=True.
"""
scope = f"concupload-{uuid.uuid4().hex[:6]}"
name1, name2 = "cu_a.bin", "cu_b.bin"

View File

@@ -2,4 +2,3 @@ pytest>=7.8.0
pytest-aiohttp
pytest-asyncio
websocket-client
blake3

View File

@@ -0,0 +1,900 @@
"""Unit tests for the _AssetSeeder background scanning class."""
import threading
from unittest.mock import patch
import pytest
from app.assets.database.queries.asset_reference import UnenrichedReferenceRow
from app.assets.seeder import _AssetSeeder, Progress, ScanInProgressError, ScanPhase, State
@pytest.fixture
def fresh_seeder():
"""Create a fresh _AssetSeeder instance for testing."""
seeder = _AssetSeeder()
yield seeder
seeder.shutdown(timeout=1.0)
@pytest.fixture
def mock_dependencies():
"""Mock all external dependencies for isolated testing."""
with (
patch("app.assets.seeder.dependencies_available", return_value=True),
patch("app.assets.seeder.sync_root_safely", return_value=set()),
patch("app.assets.seeder.collect_paths_for_roots", return_value=[]),
patch("app.assets.seeder.build_asset_specs", return_value=([], set(), 0)),
patch("app.assets.seeder.insert_asset_specs", return_value=0),
patch("app.assets.seeder.get_unenriched_assets_for_roots", return_value=[]),
patch("app.assets.seeder.enrich_assets_batch", return_value=(0, 0)),
):
yield
class TestSeederStateTransitions:
"""Test state machine transitions."""
def test_initial_state_is_idle(self, fresh_seeder: _AssetSeeder):
assert fresh_seeder.get_status().state == State.IDLE
def test_start_transitions_to_running(
self, fresh_seeder: _AssetSeeder, mock_dependencies
):
barrier = threading.Event()
reached = threading.Event()
def slow_collect(*args):
reached.set()
barrier.wait(timeout=5.0)
return []
with patch(
"app.assets.seeder.collect_paths_for_roots", side_effect=slow_collect
):
started = fresh_seeder.start(roots=("models",))
assert started is True
assert reached.wait(timeout=2.0)
assert fresh_seeder.get_status().state == State.RUNNING
barrier.set()
def test_start_while_running_returns_false(
self, fresh_seeder: _AssetSeeder, mock_dependencies
):
barrier = threading.Event()
reached = threading.Event()
def slow_collect(*args):
reached.set()
barrier.wait(timeout=5.0)
return []
with patch(
"app.assets.seeder.collect_paths_for_roots", side_effect=slow_collect
):
fresh_seeder.start(roots=("models",))
assert reached.wait(timeout=2.0)
second_start = fresh_seeder.start(roots=("models",))
assert second_start is False
barrier.set()
def test_cancel_transitions_to_cancelling(
self, fresh_seeder: _AssetSeeder, mock_dependencies
):
barrier = threading.Event()
reached = threading.Event()
def slow_collect(*args):
reached.set()
barrier.wait(timeout=5.0)
return []
with patch(
"app.assets.seeder.collect_paths_for_roots", side_effect=slow_collect
):
fresh_seeder.start(roots=("models",))
assert reached.wait(timeout=2.0)
cancelled = fresh_seeder.cancel()
assert cancelled is True
assert fresh_seeder.get_status().state == State.CANCELLING
barrier.set()
def test_cancel_when_idle_returns_false(self, fresh_seeder: _AssetSeeder):
cancelled = fresh_seeder.cancel()
assert cancelled is False
def test_state_returns_to_idle_after_completion(
self, fresh_seeder: _AssetSeeder, mock_dependencies
):
fresh_seeder.start(roots=("models",))
completed = fresh_seeder.wait(timeout=5.0)
assert completed is True
assert fresh_seeder.get_status().state == State.IDLE
class TestSeederWait:
"""Test wait() behavior."""
def test_wait_blocks_until_complete(
self, fresh_seeder: _AssetSeeder, mock_dependencies
):
fresh_seeder.start(roots=("models",))
completed = fresh_seeder.wait(timeout=5.0)
assert completed is True
assert fresh_seeder.get_status().state == State.IDLE
def test_wait_returns_false_on_timeout(
self, fresh_seeder: _AssetSeeder, mock_dependencies
):
barrier = threading.Event()
def slow_collect(*args):
barrier.wait(timeout=10.0)
return []
with patch(
"app.assets.seeder.collect_paths_for_roots", side_effect=slow_collect
):
fresh_seeder.start(roots=("models",))
completed = fresh_seeder.wait(timeout=0.1)
assert completed is False
barrier.set()
def test_wait_when_idle_returns_true(self, fresh_seeder: _AssetSeeder):
completed = fresh_seeder.wait(timeout=1.0)
assert completed is True
class TestSeederProgress:
"""Test progress tracking."""
def test_get_status_returns_progress_during_scan(
self, fresh_seeder: _AssetSeeder
):
barrier = threading.Event()
reached = threading.Event()
def slow_build(*args, **kwargs):
reached.set()
barrier.wait(timeout=5.0)
return ([], set(), 0)
paths = ["/path/file1.safetensors", "/path/file2.safetensors"]
with (
patch("app.assets.seeder.dependencies_available", return_value=True),
patch("app.assets.seeder.sync_root_safely", return_value=set()),
patch("app.assets.seeder.collect_paths_for_roots", return_value=paths),
patch("app.assets.seeder.build_asset_specs", side_effect=slow_build),
patch("app.assets.seeder.insert_asset_specs", return_value=0),
patch("app.assets.seeder.get_unenriched_assets_for_roots", return_value=[]),
patch("app.assets.seeder.enrich_assets_batch", return_value=(0, 0)),
):
fresh_seeder.start(roots=("models",))
assert reached.wait(timeout=2.0)
status = fresh_seeder.get_status()
assert status.state == State.RUNNING
assert status.progress is not None
assert status.progress.total == 2
barrier.set()
def test_progress_callback_is_invoked(
self, fresh_seeder: _AssetSeeder, mock_dependencies
):
progress_updates: list[Progress] = []
def callback(p: Progress):
progress_updates.append(p)
with patch(
"app.assets.seeder.collect_paths_for_roots",
return_value=[f"/path/file{i}.safetensors" for i in range(10)],
):
fresh_seeder.start(roots=("models",), progress_callback=callback)
fresh_seeder.wait(timeout=5.0)
assert len(progress_updates) > 0
class TestSeederCancellation:
"""Test cancellation behavior."""
def test_scan_commits_partial_progress_on_cancellation(
self, fresh_seeder: _AssetSeeder
):
insert_count = 0
barrier = threading.Event()
first_insert_done = threading.Event()
def slow_insert(specs, tags):
nonlocal insert_count
insert_count += 1
if insert_count == 1:
first_insert_done.set()
if insert_count >= 2:
barrier.wait(timeout=5.0)
return len(specs)
paths = [f"/path/file{i}.safetensors" for i in range(1500)]
specs = [
{
"abs_path": p,
"size_bytes": 100,
"mtime_ns": 0,
"info_name": f"file{i}",
"tags": [],
"fname": f"file{i}",
"metadata": None,
"hash": None,
"mime_type": None,
}
for i, p in enumerate(paths)
]
with (
patch("app.assets.seeder.dependencies_available", return_value=True),
patch("app.assets.seeder.sync_root_safely", return_value=set()),
patch("app.assets.seeder.collect_paths_for_roots", return_value=paths),
patch(
"app.assets.seeder.build_asset_specs", return_value=(specs, set(), 0)
),
patch("app.assets.seeder.insert_asset_specs", side_effect=slow_insert),
patch("app.assets.seeder.get_unenriched_assets_for_roots", return_value=[]),
patch("app.assets.seeder.enrich_assets_batch", return_value=(0, 0)),
):
fresh_seeder.start(roots=("models",))
assert first_insert_done.wait(timeout=2.0)
fresh_seeder.cancel()
barrier.set()
fresh_seeder.wait(timeout=5.0)
assert 1 <= insert_count < 3 # 1500 paths / 500 batch = 3; cancel stopped early
class TestSeederErrorHandling:
"""Test error handling behavior."""
def test_database_errors_captured_in_status(self, fresh_seeder: _AssetSeeder):
with (
patch("app.assets.seeder.dependencies_available", return_value=True),
patch("app.assets.seeder.sync_root_safely", return_value=set()),
patch(
"app.assets.seeder.collect_paths_for_roots",
return_value=["/path/file.safetensors"],
),
patch(
"app.assets.seeder.build_asset_specs",
return_value=(
[
{
"abs_path": "/path/file.safetensors",
"size_bytes": 100,
"mtime_ns": 0,
"info_name": "file",
"tags": [],
"fname": "file",
"metadata": None,
"hash": None,
"mime_type": None,
}
],
set(),
0,
),
),
patch(
"app.assets.seeder.insert_asset_specs",
side_effect=Exception("DB connection failed"),
),
patch("app.assets.seeder.get_unenriched_assets_for_roots", return_value=[]),
patch("app.assets.seeder.enrich_assets_batch", return_value=(0, 0)),
):
fresh_seeder.start(roots=("models",))
fresh_seeder.wait(timeout=5.0)
status = fresh_seeder.get_status()
assert len(status.errors) > 0
assert "DB connection failed" in status.errors[0]
def test_dependencies_unavailable_captured_in_errors(
self, fresh_seeder: _AssetSeeder
):
with patch("app.assets.seeder.dependencies_available", return_value=False):
fresh_seeder.start(roots=("models",))
fresh_seeder.wait(timeout=5.0)
status = fresh_seeder.get_status()
assert len(status.errors) > 0
assert "dependencies" in status.errors[0].lower()
def test_thread_crash_resets_state_to_idle(self, fresh_seeder: _AssetSeeder):
with (
patch("app.assets.seeder.dependencies_available", return_value=True),
patch(
"app.assets.seeder.sync_root_safely",
side_effect=RuntimeError("Unexpected crash"),
),
):
fresh_seeder.start(roots=("models",))
fresh_seeder.wait(timeout=5.0)
status = fresh_seeder.get_status()
assert status.state == State.IDLE
assert len(status.errors) > 0
class TestSeederThreadSafety:
"""Test thread safety of concurrent operations."""
def test_concurrent_start_calls_spawn_only_one_thread(
self, fresh_seeder: _AssetSeeder, mock_dependencies
):
barrier = threading.Event()
def slow_collect(*args):
barrier.wait(timeout=5.0)
return []
with patch(
"app.assets.seeder.collect_paths_for_roots", side_effect=slow_collect
):
results = []
def try_start():
results.append(fresh_seeder.start(roots=("models",)))
threads = [threading.Thread(target=try_start) for _ in range(10)]
for t in threads:
t.start()
for t in threads:
t.join()
barrier.set()
assert sum(results) == 1
def test_get_status_safe_during_scan(
self, fresh_seeder: _AssetSeeder, mock_dependencies
):
barrier = threading.Event()
reached = threading.Event()
def slow_collect(*args):
reached.set()
barrier.wait(timeout=5.0)
return []
with patch(
"app.assets.seeder.collect_paths_for_roots", side_effect=slow_collect
):
fresh_seeder.start(roots=("models",))
assert reached.wait(timeout=2.0)
statuses = []
for _ in range(100):
statuses.append(fresh_seeder.get_status())
barrier.set()
assert all(
s.state in (State.RUNNING, State.IDLE, State.CANCELLING)
for s in statuses
)
class TestSeederMarkMissing:
"""Test mark_missing_outside_prefixes behavior."""
def test_mark_missing_when_idle(self, fresh_seeder: _AssetSeeder):
with (
patch("app.assets.seeder.dependencies_available", return_value=True),
patch(
"app.assets.seeder.get_all_known_prefixes",
return_value=["/models", "/input", "/output"],
),
patch(
"app.assets.seeder.mark_missing_outside_prefixes_safely", return_value=5
) as mock_mark,
):
result = fresh_seeder.mark_missing_outside_prefixes()
assert result == 5
mock_mark.assert_called_once_with(["/models", "/input", "/output"])
def test_mark_missing_raises_when_running(
self, fresh_seeder: _AssetSeeder, mock_dependencies
):
barrier = threading.Event()
reached = threading.Event()
def slow_collect(*args):
reached.set()
barrier.wait(timeout=5.0)
return []
with patch(
"app.assets.seeder.collect_paths_for_roots", side_effect=slow_collect
):
fresh_seeder.start(roots=("models",))
assert reached.wait(timeout=2.0)
with pytest.raises(ScanInProgressError):
fresh_seeder.mark_missing_outside_prefixes()
barrier.set()
def test_mark_missing_returns_zero_when_dependencies_unavailable(
self, fresh_seeder: _AssetSeeder
):
with patch("app.assets.seeder.dependencies_available", return_value=False):
result = fresh_seeder.mark_missing_outside_prefixes()
assert result == 0
def test_prune_first_flag_triggers_mark_missing_before_scan(
self, fresh_seeder: _AssetSeeder
):
call_order = []
def track_mark(prefixes):
call_order.append("mark_missing")
return 3
def track_sync(root):
call_order.append(f"sync_{root}")
return set()
with (
patch("app.assets.seeder.dependencies_available", return_value=True),
patch("app.assets.seeder.get_all_known_prefixes", return_value=["/models"]),
patch("app.assets.seeder.mark_missing_outside_prefixes_safely", side_effect=track_mark),
patch("app.assets.seeder.sync_root_safely", side_effect=track_sync),
patch("app.assets.seeder.collect_paths_for_roots", return_value=[]),
patch("app.assets.seeder.build_asset_specs", return_value=([], set(), 0)),
patch("app.assets.seeder.insert_asset_specs", return_value=0),
patch("app.assets.seeder.get_unenriched_assets_for_roots", return_value=[]),
patch("app.assets.seeder.enrich_assets_batch", return_value=(0, 0)),
):
fresh_seeder.start(roots=("models",), prune_first=True)
fresh_seeder.wait(timeout=5.0)
assert call_order[0] == "mark_missing"
assert "sync_models" in call_order
class TestSeederPhases:
"""Test phased scanning behavior."""
def test_start_fast_only_runs_fast_phase(self, fresh_seeder: _AssetSeeder):
"""Verify start_fast only runs the fast phase."""
fast_called = []
enrich_called = []
def track_fast(*args, **kwargs):
fast_called.append(True)
return ([], set(), 0)
def track_enrich(*args, **kwargs):
enrich_called.append(True)
return []
with (
patch("app.assets.seeder.dependencies_available", return_value=True),
patch("app.assets.seeder.sync_root_safely", return_value=set()),
patch("app.assets.seeder.collect_paths_for_roots", return_value=[]),
patch("app.assets.seeder.build_asset_specs", side_effect=track_fast),
patch("app.assets.seeder.insert_asset_specs", return_value=0),
patch("app.assets.seeder.get_unenriched_assets_for_roots", side_effect=track_enrich),
patch("app.assets.seeder.enrich_assets_batch", return_value=(0, 0)),
):
fresh_seeder.start_fast(roots=("models",))
fresh_seeder.wait(timeout=5.0)
assert len(fast_called) == 1
assert len(enrich_called) == 0
def test_start_enrich_only_runs_enrich_phase(self, fresh_seeder: _AssetSeeder):
"""Verify start_enrich only runs the enrich phase."""
fast_called = []
enrich_called = []
def track_fast(*args, **kwargs):
fast_called.append(True)
return ([], set(), 0)
def track_enrich(*args, **kwargs):
enrich_called.append(True)
return []
with (
patch("app.assets.seeder.dependencies_available", return_value=True),
patch("app.assets.seeder.sync_root_safely", return_value=set()),
patch("app.assets.seeder.collect_paths_for_roots", return_value=[]),
patch("app.assets.seeder.build_asset_specs", side_effect=track_fast),
patch("app.assets.seeder.insert_asset_specs", return_value=0),
patch("app.assets.seeder.get_unenriched_assets_for_roots", side_effect=track_enrich),
patch("app.assets.seeder.enrich_assets_batch", return_value=(0, 0)),
):
fresh_seeder.start_enrich(roots=("models",))
fresh_seeder.wait(timeout=5.0)
assert len(fast_called) == 0
assert len(enrich_called) == 1
def test_full_scan_runs_both_phases(self, fresh_seeder: _AssetSeeder):
"""Verify full scan runs both fast and enrich phases."""
fast_called = []
enrich_called = []
def track_fast(*args, **kwargs):
fast_called.append(True)
return ([], set(), 0)
def track_enrich(*args, **kwargs):
enrich_called.append(True)
return []
with (
patch("app.assets.seeder.dependencies_available", return_value=True),
patch("app.assets.seeder.sync_root_safely", return_value=set()),
patch("app.assets.seeder.collect_paths_for_roots", return_value=[]),
patch("app.assets.seeder.build_asset_specs", side_effect=track_fast),
patch("app.assets.seeder.insert_asset_specs", return_value=0),
patch("app.assets.seeder.get_unenriched_assets_for_roots", side_effect=track_enrich),
patch("app.assets.seeder.enrich_assets_batch", return_value=(0, 0)),
):
fresh_seeder.start(roots=("models",), phase=ScanPhase.FULL)
fresh_seeder.wait(timeout=5.0)
assert len(fast_called) == 1
assert len(enrich_called) == 1
class TestSeederPauseResume:
"""Test pause/resume behavior."""
def test_pause_transitions_to_paused(
self, fresh_seeder: _AssetSeeder, mock_dependencies
):
barrier = threading.Event()
reached = threading.Event()
def slow_collect(*args):
reached.set()
barrier.wait(timeout=5.0)
return []
with patch(
"app.assets.seeder.collect_paths_for_roots", side_effect=slow_collect
):
fresh_seeder.start(roots=("models",))
assert reached.wait(timeout=2.0)
paused = fresh_seeder.pause()
assert paused is True
assert fresh_seeder.get_status().state == State.PAUSED
barrier.set()
def test_pause_when_idle_returns_false(self, fresh_seeder: _AssetSeeder):
paused = fresh_seeder.pause()
assert paused is False
def test_resume_returns_to_running(
self, fresh_seeder: _AssetSeeder, mock_dependencies
):
barrier = threading.Event()
reached = threading.Event()
def slow_collect(*args):
reached.set()
barrier.wait(timeout=5.0)
return []
with patch(
"app.assets.seeder.collect_paths_for_roots", side_effect=slow_collect
):
fresh_seeder.start(roots=("models",))
assert reached.wait(timeout=2.0)
fresh_seeder.pause()
assert fresh_seeder.get_status().state == State.PAUSED
resumed = fresh_seeder.resume()
assert resumed is True
assert fresh_seeder.get_status().state == State.RUNNING
barrier.set()
def test_resume_when_not_paused_returns_false(
self, fresh_seeder: _AssetSeeder, mock_dependencies
):
barrier = threading.Event()
reached = threading.Event()
def slow_collect(*args):
reached.set()
barrier.wait(timeout=5.0)
return []
with patch(
"app.assets.seeder.collect_paths_for_roots", side_effect=slow_collect
):
fresh_seeder.start(roots=("models",))
assert reached.wait(timeout=2.0)
resumed = fresh_seeder.resume()
assert resumed is False
barrier.set()
def test_cancel_while_paused_works(
self, fresh_seeder: _AssetSeeder, mock_dependencies
):
barrier = threading.Event()
reached_checkpoint = threading.Event()
def slow_collect(*args):
reached_checkpoint.set()
barrier.wait(timeout=5.0)
return []
with patch(
"app.assets.seeder.collect_paths_for_roots", side_effect=slow_collect
):
fresh_seeder.start(roots=("models",))
assert reached_checkpoint.wait(timeout=2.0)
fresh_seeder.pause()
assert fresh_seeder.get_status().state == State.PAUSED
cancelled = fresh_seeder.cancel()
assert cancelled is True
barrier.set()
fresh_seeder.wait(timeout=5.0)
assert fresh_seeder.get_status().state == State.IDLE
class TestSeederStopRestart:
"""Test stop and restart behavior."""
def test_stop_is_alias_for_cancel(
self, fresh_seeder: _AssetSeeder, mock_dependencies
):
barrier = threading.Event()
reached = threading.Event()
def slow_collect(*args):
reached.set()
barrier.wait(timeout=5.0)
return []
with patch(
"app.assets.seeder.collect_paths_for_roots", side_effect=slow_collect
):
fresh_seeder.start(roots=("models",))
assert reached.wait(timeout=2.0)
stopped = fresh_seeder.stop()
assert stopped is True
assert fresh_seeder.get_status().state == State.CANCELLING
barrier.set()
def test_restart_cancels_and_starts_new_scan(
self, fresh_seeder: _AssetSeeder, mock_dependencies
):
barrier = threading.Event()
reached = threading.Event()
start_count = 0
def slow_collect(*args):
nonlocal start_count
start_count += 1
if start_count == 1:
reached.set()
barrier.wait(timeout=5.0)
return []
with patch(
"app.assets.seeder.collect_paths_for_roots", side_effect=slow_collect
):
fresh_seeder.start(roots=("models",))
assert reached.wait(timeout=2.0)
barrier.set()
restarted = fresh_seeder.restart()
assert restarted is True
fresh_seeder.wait(timeout=5.0)
assert start_count == 2
def test_restart_preserves_previous_params(self, fresh_seeder: _AssetSeeder):
"""Verify restart uses previous params when not overridden."""
collected_roots = []
def track_collect(roots):
collected_roots.append(roots)
return []
with (
patch("app.assets.seeder.dependencies_available", return_value=True),
patch("app.assets.seeder.sync_root_safely", return_value=set()),
patch("app.assets.seeder.collect_paths_for_roots", side_effect=track_collect),
patch("app.assets.seeder.build_asset_specs", return_value=([], set(), 0)),
patch("app.assets.seeder.insert_asset_specs", return_value=0),
patch("app.assets.seeder.get_unenriched_assets_for_roots", return_value=[]),
patch("app.assets.seeder.enrich_assets_batch", return_value=(0, 0)),
):
fresh_seeder.start(roots=("input", "output"))
fresh_seeder.wait(timeout=5.0)
fresh_seeder.restart()
fresh_seeder.wait(timeout=5.0)
assert len(collected_roots) == 2
assert collected_roots[0] == ("input", "output")
assert collected_roots[1] == ("input", "output")
def test_restart_can_override_params(self, fresh_seeder: _AssetSeeder):
"""Verify restart can override previous params."""
collected_roots = []
def track_collect(roots):
collected_roots.append(roots)
return []
with (
patch("app.assets.seeder.dependencies_available", return_value=True),
patch("app.assets.seeder.sync_root_safely", return_value=set()),
patch("app.assets.seeder.collect_paths_for_roots", side_effect=track_collect),
patch("app.assets.seeder.build_asset_specs", return_value=([], set(), 0)),
patch("app.assets.seeder.insert_asset_specs", return_value=0),
patch("app.assets.seeder.get_unenriched_assets_for_roots", return_value=[]),
patch("app.assets.seeder.enrich_assets_batch", return_value=(0, 0)),
):
fresh_seeder.start(roots=("models",))
fresh_seeder.wait(timeout=5.0)
fresh_seeder.restart(roots=("input",))
fresh_seeder.wait(timeout=5.0)
assert len(collected_roots) == 2
assert collected_roots[0] == ("models",)
assert collected_roots[1] == ("input",)
def _make_row(ref_id: str, asset_id: str = "a1") -> UnenrichedReferenceRow:
return UnenrichedReferenceRow(
reference_id=ref_id, asset_id=asset_id,
file_path=f"/fake/{ref_id}.bin", enrichment_level=0,
)
class TestEnrichPhaseDefensiveLogic:
"""Test skip_ids filtering and consecutive_empty termination."""
def test_failed_refs_are_skipped_on_subsequent_batches(
self, fresh_seeder: _AssetSeeder,
):
"""References that fail enrichment are filtered out of future batches."""
row_a = _make_row("r1")
row_b = _make_row("r2")
call_count = 0
def fake_get_unenriched(*args, **kwargs):
nonlocal call_count
call_count += 1
if call_count <= 2:
return [row_a, row_b]
return []
enriched_refs: list[list[str]] = []
def fake_enrich(rows, **kwargs):
ref_ids = [r.reference_id for r in rows]
enriched_refs.append(ref_ids)
# r1 always fails, r2 succeeds
failed = [r.reference_id for r in rows if r.reference_id == "r1"]
enriched = len(rows) - len(failed)
return enriched, failed
with (
patch("app.assets.seeder.dependencies_available", return_value=True),
patch("app.assets.seeder.sync_root_safely", return_value=set()),
patch("app.assets.seeder.collect_paths_for_roots", return_value=[]),
patch("app.assets.seeder.build_asset_specs", return_value=([], set(), 0)),
patch("app.assets.seeder.insert_asset_specs", return_value=0),
patch("app.assets.seeder.get_unenriched_assets_for_roots", side_effect=fake_get_unenriched),
patch("app.assets.seeder.enrich_assets_batch", side_effect=fake_enrich),
):
fresh_seeder.start(roots=("models",), phase=ScanPhase.ENRICH)
fresh_seeder.wait(timeout=5.0)
# First batch: both refs attempted
assert "r1" in enriched_refs[0]
assert "r2" in enriched_refs[0]
# Second batch: r1 filtered out
assert "r1" not in enriched_refs[1]
assert "r2" in enriched_refs[1]
def test_stops_after_consecutive_empty_batches(
self, fresh_seeder: _AssetSeeder,
):
"""Enrich phase terminates after 3 consecutive batches with zero progress."""
row = _make_row("r1")
batch_count = 0
def fake_get_unenriched(*args, **kwargs):
nonlocal batch_count
batch_count += 1
# Always return the same row (simulating a permanently failing ref)
return [row]
def fake_enrich(rows, **kwargs):
# Always fail — zero enriched, all failed
return 0, [r.reference_id for r in rows]
with (
patch("app.assets.seeder.dependencies_available", return_value=True),
patch("app.assets.seeder.sync_root_safely", return_value=set()),
patch("app.assets.seeder.collect_paths_for_roots", return_value=[]),
patch("app.assets.seeder.build_asset_specs", return_value=([], set(), 0)),
patch("app.assets.seeder.insert_asset_specs", return_value=0),
patch("app.assets.seeder.get_unenriched_assets_for_roots", side_effect=fake_get_unenriched),
patch("app.assets.seeder.enrich_assets_batch", side_effect=fake_enrich),
):
fresh_seeder.start(roots=("models",), phase=ScanPhase.ENRICH)
fresh_seeder.wait(timeout=5.0)
# Should stop after exactly 3 consecutive empty batches
# Batch 1: returns row, enrich fails → filtered out in batch 2+
# But get_unenriched keeps returning it, filter removes it → empty → break
# Actually: batch 1 has row, fails. Batch 2 get_unenriched returns [row],
# skip_ids filters it → empty list → breaks via `if not unenriched: break`
# So it terminates in 2 calls to get_unenriched.
assert batch_count == 2
def test_consecutive_empty_counter_resets_on_success(
self, fresh_seeder: _AssetSeeder,
):
"""A successful batch resets the consecutive empty counter."""
call_count = 0
def fake_get_unenriched(*args, **kwargs):
nonlocal call_count
call_count += 1
if call_count <= 6:
return [_make_row(f"r{call_count}", f"a{call_count}")]
return []
def fake_enrich(rows, **kwargs):
ref_id = rows[0].reference_id
# Fail batches 1-2, succeed batch 3, fail batches 4-5, succeed batch 6
if ref_id in ("r1", "r2", "r4", "r5"):
return 0, [ref_id]
return 1, []
with (
patch("app.assets.seeder.dependencies_available", return_value=True),
patch("app.assets.seeder.sync_root_safely", return_value=set()),
patch("app.assets.seeder.collect_paths_for_roots", return_value=[]),
patch("app.assets.seeder.build_asset_specs", return_value=([], set(), 0)),
patch("app.assets.seeder.insert_asset_specs", return_value=0),
patch("app.assets.seeder.get_unenriched_assets_for_roots", side_effect=fake_get_unenriched),
patch("app.assets.seeder.enrich_assets_batch", side_effect=fake_enrich),
):
fresh_seeder.start(roots=("models",), phase=ScanPhase.ENRICH)
fresh_seeder.wait(timeout=5.0)
# All 6 batches should run + 1 final call returning empty
assert call_count == 7
status = fresh_seeder.get_status()
assert status.state == State.IDLE