feat: implement two-phase scanning architecture (fast + enrich)

Phase 1 (FAST): Creates stub records with filesystem metadata only
- path, size, mtime
- no file content reading
- Populates asset database quickly on startup

Phase 2 (ENRICH): Extracts metadata and computes hashes
- Safetensors header parsing, MIME types
- Optional blake3 hash computation
- Updates existing stub records

Changes:
- Add ScanPhase enum (FAST, ENRICH, FULL)
- Add enrichment_level column to AssetCacheState (0=stub, 1=metadata, 2=hashed)
- Add build_stub_specs() for fast scanning without metadata extraction
- Add get_unenriched_cache_states(), enrich_asset(), enrich_assets_batch()
- Add start_fast(), start_enrich() convenience methods to AssetSeeder
- Update start() to accept phase parameter (defaults to FULL)
- Split _run_scan() into _run_fast_phase() and _run_enrich_phase()
- Add migration 0003_add_enrichment_level.py
- Update tests for new architecture

Amp-Thread-ID: https://ampcode.com/threads/T-019c4eef-1568-778f-aede-38254728f848
Co-authored-by: Amp <amp@ampcode.com>
2026-03-08 06:39:56 +00:00 · 2026-02-11 16:04:12 -08:00
parent bd17ee3dc9
commit c7368205e3
7 changed files with 675 additions and 74 deletions
--- a/app/assets/scanner.py
+++ b/app/assets/scanner.py
@@ -7,13 +7,16 @@ from typing import Literal, TypedDict
 import folder_paths
 from app.assets.database.queries import (
    add_missing_tag_for_asset_id,
+    bulk_update_enrichment_level,
    bulk_update_is_missing,
    bulk_update_needs_verify,
    delete_cache_states_by_ids,
    delete_orphaned_seed_asset,
    ensure_tags_exist,
    get_cache_states_for_prefixes,
+    get_unenriched_cache_states,
    remove_missing_tag_for_asset_id,
+    set_asset_info_metadata,
 )
 from app.assets.services.bulk_ingest import (
    SeedAssetSpec,
@@ -341,6 +344,59 @@ def build_asset_specs(
    return specs, tag_pool, skipped


+def build_stub_specs(
+    paths: list[str],
+    existing_paths: set[str],
+) -> tuple[list[SeedAssetSpec], set[str], int]:
+    """Create bare-bones seed specs from ``os.stat`` results alone.
+
+    No file is opened here: each spec is built from the path and its stat
+    result, with ``metadata``, ``hash`` and ``mime_type`` left as ``None``.
+
+    Args:
+        paths: Candidate file paths; each one is made absolute first.
+        existing_paths: Paths to skip; matched against the absolute form.
+
+    Returns:
+        Tuple of (specs, tag_pool, skipped_count); files that fail to
+        stat or report a zero size are dropped without being counted.
+    """
+    stub_specs: list[SeedAssetSpec] = []
+    all_tags: set[str] = set()
+    already_known = 0
+
+    for raw_path in paths:
+        full_path = os.path.abspath(raw_path)
+        if full_path in existing_paths:
+            already_known += 1
+            continue
+        try:
+            # A symlink is described by its own stat, not its target's.
+            info = os.stat(full_path, follow_symlinks=False)
+        except OSError:
+            continue
+        if info.st_size == 0:
+            continue
+
+        display_name, path_tags = get_name_and_tags_from_asset_path(full_path)
+        relative_name = compute_relative_filename(full_path)
+        stub = {
+            "abs_path": full_path,
+            "size_bytes": info.st_size,
+            "mtime_ns": get_mtime_ns(info),
+            "info_name": display_name,
+            "tags": path_tags,
+            "fname": relative_name,
+            "metadata": None,
+            "hash": None,
+            "mime_type": None,
+        }
+        stub_specs.append(stub)
+        all_tags.update(path_tags)
+
+    return stub_specs, all_tags, already_known
+
+
 def insert_asset_specs(specs: list[SeedAssetSpec], tag_pool: set[str]) -> int:
    """Insert asset specs into database, returning count of created infos."""
    if not specs:
@@ -394,3 +450,129 @@ def seed_assets(
            skipped_existing,
            len(paths),
        )
+
+
+# Enrichment level constants; compared numerically (higher = further enriched)
+ENRICHMENT_STUB = 0  # Fast scan: path, size, mtime only
+ENRICHMENT_METADATA = 1  # Metadata extracted (safetensors header, mime type)
+ENRICHMENT_HASHED = 2  # Hash computed (blake3)
+
+
+def get_unenriched_assets_for_roots(
+    roots: tuple[RootType, ...],
+    max_level: int = ENRICHMENT_STUB,
+    limit: int = 1000,
+) -> list:
+    """Fetch cache-state rows under the given roots that still need enriching.
+
+    Args:
+        roots: Root types whose path prefixes are searched
+        max_level: Passed to the query as the highest level to include
+        limit: Passed to the query as the cap on returned rows
+
+    Returns:
+        Rows as returned by get_unenriched_cache_states, or an empty list
+        when the roots resolve to no prefixes (no session is opened then)
+    """
+    search_prefixes = [pfx for root in roots for pfx in get_prefixes_for_root(root)]
+    if not search_prefixes:
+        return []
+
+    with create_session() as session:
+        return get_unenriched_cache_states(
+            session, search_prefixes, max_level=max_level, limit=limit
+        )
+
+
+def enrich_asset(
+    file_path: str,
+    cache_state_id: int,
+    asset_info_id: str,
+    extract_metadata: bool = True,
+    compute_hash: bool = False,
+) -> int:
+    """Enrich a single asset with metadata and/or hash.
+
+    Args:
+        file_path: Absolute path to the file
+        cache_state_id: ID of the cache state to update
+        asset_info_id: ID of the asset info to update
+        extract_metadata: If True, extract safetensors header and mime type
+        compute_hash: If True, compute blake3 hash
+
+    Returns:
+        New enrichment level achieved. ENRICHMENT_STUB means nothing was
+        gained; nothing is written then, so a stored level is not reset to 0.
+    """
+    try:
+        stat_p = os.stat(file_path, follow_symlinks=True)
+    except OSError:
+        return ENRICHMENT_STUB
+
+    new_level = ENRICHMENT_STUB
+    metadata = None
+    if extract_metadata:
+        metadata = extract_file_metadata(
+            file_path,
+            stat_result=stat_p,
+            enable_safetensors=True,
+            relative_filename=compute_relative_filename(file_path),
+        )
+        if metadata:
+            new_level = ENRICHMENT_METADATA
+
+    if compute_hash:
+        try:
+            # TODO: Update asset.hash field; the digest is discarded for now.
+            compute_blake3_hash(file_path)
+            new_level = ENRICHMENT_HASHED
+        except Exception as e:
+            logging.warning("Failed to hash %s: %s", file_path, e)
+
+    if new_level != ENRICHMENT_STUB:
+        # Opened only after the file reads above, so they never hold a transaction.
+        with create_session() as sess:
+            if metadata:
+                set_asset_info_metadata(sess, asset_info_id, metadata.to_user_metadata())
+            bulk_update_enrichment_level(sess, [cache_state_id], new_level)
+            sess.commit()
+
+    return new_level
+
+
+def enrich_assets_batch(
+    rows: list,
+    extract_metadata: bool = True,
+    compute_hash: bool = False,
+) -> tuple[int, int]:
+    """Enrich every row via enrich_asset and tally the outcomes.
+
+    Args:
+        rows: Rows carrying file_path, cache_state_id, asset_info_id, enrichment_level
+        extract_metadata: Forwarded to enrich_asset for every row
+        compute_hash: Forwarded to enrich_asset for every row
+
+    Returns:
+        Tuple of (enriched_count, failed_count)
+    """
+    ok_count = fail_count = 0
+
+    for row in rows:
+        try:
+            reached = enrich_asset(
+                file_path=row.file_path,
+                cache_state_id=row.cache_state_id,
+                asset_info_id=row.asset_info_id,
+                extract_metadata=extract_metadata,
+                compute_hash=compute_hash,
+            )
+            advanced = reached > row.enrichment_level
+        except Exception as e:
+            logging.warning("Failed to enrich %s: %s", row.file_path, e)
+            advanced = False  # an exception counts as a failure
+        if advanced:  # level rose above the row's current one
+            ok_count += 1
+        else:
+            fail_count += 1
+
+    return ok_count, fail_count