refactor: move bulk_ops to queries and scanner service

- Delete bulk_ops.py, moving logic to appropriate layers
- Add bulk insert query functions:
  - queries/asset.bulk_insert_assets
  - queries/cache_state.bulk_insert_cache_states_ignore_conflicts
  - queries/cache_state.get_cache_states_by_paths_and_asset_ids
  - queries/asset_info.bulk_insert_asset_infos_ignore_conflicts
  - queries/asset_info.get_asset_info_ids_by_ids
  - queries/tags.bulk_insert_tags_and_meta
- Move seed_from_paths_batch orchestration to scanner._seed_from_paths_batch

Amp-Thread-ID: https://ampcode.com/threads/T-019c24fd-157d-776a-ad24-4f19cf5d3afe
Co-authored-by: Amp <amp@ampcode.com>
2026-02-24 00:44:03 +00:00 · 2026-02-03 11:50:39 -08:00
parent 48bfd29fb6
commit ef97ea8880
7 changed files with 343 additions and 205 deletions
--- a/app/assets/services/scanner.py
+++ b/app/assets/services/scanner.py
@@ -2,9 +2,11 @@ import contextlib
 import logging
 import os
 import time
+import uuid

 import folder_paths
-from app.assets.database.bulk_ops import seed_from_paths_batch
+from sqlalchemy.orm import Session
+
 from app.assets.database.queries import (
    add_missing_tag_for_asset_id,
    ensure_tags_exist,
@@ -16,6 +18,12 @@ from app.assets.database.queries import (
    bulk_set_needs_verify,
    delete_cache_states_by_ids,
    delete_orphaned_seed_asset,
+    bulk_insert_assets,
+    bulk_insert_cache_states_ignore_conflicts,
+    get_cache_states_by_paths_and_asset_ids,
+    bulk_insert_asset_infos_ignore_conflicts,
+    get_asset_info_ids_by_ids,
+    bulk_insert_tags_and_meta,
 )
 from app.assets.helpers import (
    collect_models_files,
@@ -25,10 +33,157 @@ from app.assets.helpers import (
    list_tree,
    prefixes_for_root,
    RootType,
+    utcnow,
 )
 from app.database.db import create_session, dependencies_available


+def _seed_from_paths_batch(
+    session: Session,
+    specs: list[dict],
+    owner_id: str = "",
+) -> dict:
+    """Seed assets from filesystem specs in batch.
+
+    Each spec is a dict with keys:
+      - abs_path: str
+      - size_bytes: int
+      - mtime_ns: int
+      - info_name: str
+      - tags: list[str]
+      - fname: Optional[str]
+
+    Specs are keyed by os.path.abspath(abs_path); when several specs resolve
+    to the same path only the first is seeded, so no Asset row is created
+    that the cleanup in step 4 could not find again.
+
+    This function orchestrates:
+    1. Insert seed Assets (hash=NULL)
+    2. Claim cache states with ON CONFLICT DO NOTHING
+    3. Query to find winners (paths where our asset_id was inserted)
+    4. Delete Assets for losers (path already claimed by another asset)
+    5. Insert AssetInfo for winners
+    6. Insert tags and metadata for successfully inserted AssetInfos
+
+    Args:
+        session: SQLAlchemy session used for every query; not committed here.
+        specs: File descriptions to seed, shaped as listed above.
+        owner_id: Value stored as owner_id on each new AssetInfo row.
+
+    Returns:
+        dict with keys: inserted_infos, won_states, lost_states
+    """
+    if not specs:
+        return {"inserted_infos": 0, "won_states": 0, "lost_states": 0}
+
+    now = utcnow()
+    asset_rows: list[dict] = []
+    state_rows: list[dict] = []
+    path_to_asset: dict[str, str] = {}
+    asset_to_info: dict[str, dict] = {}
+
+    for sp in specs:
+        ap = os.path.abspath(sp["abs_path"])
+        # First spec per path wins; a repeat would strand the earlier Asset row.
+        if ap in path_to_asset:
+            continue
+        aid = str(uuid.uuid4())
+        iid = str(uuid.uuid4())
+        path_to_asset[ap] = aid
+
+        asset_rows.append({
+            "id": aid,
+            "hash": None,
+            "size_bytes": sp["size_bytes"],
+            "mime_type": None,
+            "created_at": now,
+        })
+        state_rows.append({
+            "asset_id": aid,
+            "file_path": ap,
+            "mtime_ns": sp["mtime_ns"],
+        })
+        asset_to_info[aid] = {
+            "id": iid,
+            "owner_id": owner_id,
+            "name": sp["info_name"],
+            "asset_id": aid,
+            "preview_id": None,
+            "user_metadata": {"filename": sp["fname"]} if sp["fname"] else None,
+            "created_at": now,
+            "updated_at": now,
+            "last_access_time": now,
+            "_tags": sp["tags"],
+            "_filename": sp["fname"],
+        }
+
+    # 1. Insert all seed Assets (hash=NULL)
+    bulk_insert_assets(session, asset_rows)
+
+    # 2. Try to claim cache states (file_path unique)
+    bulk_insert_cache_states_ignore_conflicts(session, state_rows)
+
+    # 3. Query to find which paths we won
+    winners_by_path = get_cache_states_by_paths_and_asset_ids(session, path_to_asset)
+
+    losers_by_path = set(path_to_asset) - winners_by_path
+    lost_assets = [path_to_asset[p] for p in losers_by_path]
+
+    # 4. Delete Assets for losers
+    if lost_assets:
+        delete_assets_by_ids(session, lost_assets)
+
+    if not winners_by_path:
+        return {"inserted_infos": 0, "won_states": 0, "lost_states": len(losers_by_path)}
+
+    # 5. Insert AssetInfo for winners
+    winner_info_rows = [asset_to_info[path_to_asset[p]] for p in winners_by_path]
+    # "_"-prefixed keys are local bookkeeping for step 6, not row columns.
+    db_info_rows = [
+        {k: v for k, v in row.items() if not k.startswith("_")}
+        for row in winner_info_rows
+    ]
+    bulk_insert_asset_infos_ignore_conflicts(session, db_info_rows)
+
+    # 6a. Find which info rows were actually inserted
+    all_info_ids = [row["id"] for row in winner_info_rows]
+    inserted_info_ids = get_asset_info_ids_by_ids(session, all_info_ids)
+
+    # 6b. Build and insert tag + meta rows
+    tag_rows: list[dict] = []
+    meta_rows: list[dict] = []
+    if inserted_info_ids:
+        for row in winner_info_rows:
+            iid = row["id"]
+            if iid not in inserted_info_ids:
+                continue
+            for t in row["_tags"]:
+                tag_rows.append({
+                    "asset_info_id": iid,
+                    "tag_name": t,
+                    "origin": "automatic",
+                    "added_at": now,
+                })
+            if row["_filename"]:
+                meta_rows.append({
+                    "asset_info_id": iid,
+                    "key": "filename",
+                    "ordinal": 0,
+                    "val_str": row["_filename"],
+                    "val_num": None,
+                    "val_bool": None,
+                    "val_json": None,
+                })
+
+    bulk_insert_tags_and_meta(session, tag_rows=tag_rows, meta_rows=meta_rows)
+
+    return {
+        "inserted_infos": len(inserted_info_ids),
+        "won_states": len(winners_by_path),
+        "lost_states": len(losers_by_path),
+    }
+
+
 def prune_orphaned_assets(session, valid_prefixes: list[str]) -> int:
    """Prune cache states outside valid prefixes, then delete orphaned seed assets.

@@ -229,7 +384,7 @@ def seed_assets(roots: tuple[RootType, ...], enable_logging: bool = False) -> No
        with create_session() as sess:
            if tag_pool:
                ensure_tags_exist(sess, tag_pool, tag_type="user")
-            result = seed_from_paths_batch(sess, specs=specs, owner_id="")
+            result = _seed_from_paths_batch(sess, specs=specs, owner_id="")
            created += result["inserted_infos"]
            sess.commit()