feat(isolation): DynamicVRAM compatibility for process isolation

DynamicVRAM's on-demand model loading/offloading conflicted with process isolation in three ways: RPC tensor transport stalls from mid-call GPU offload, race conditions between model lifecycle and active RPC operations, and false positive memory leak detection from changed finalizer patterns. - Marshal CUDA tensors to CPU before RPC transport for dynamic models - Add operation state tracking + quiescence waits at workflow boundaries - Distinguish proxy reference release from actual leaks in cleanup_models_gc - Fix init order: DynamicVRAM must initialize before isolation proxies - Add RPC timeouts to prevent indefinite hangs on model unavailability - Prevent proxy-of-proxy chains from DynamicVRAM model reload cycles - Add torch.device/torch.dtype serializers for new DynamicVRAM RPC paths - Guard isolation overhead so non-isolated workflows are unaffected - Migrate env var to PYISOLATE_CHILD
2026-03-13 00:59:59 +00:00 · 2026-03-04 23:48:02 -06:00
parent a0f8784e9f
commit 9250191c65
38 changed files with 94595 additions and 307 deletions
--- a/execution.py
+++ b/execution.py
@@ -43,6 +43,8 @@ from comfy_execution.utils import CurrentNodeContext
 from comfy_api.internal import _ComfyNodeInternal, _NodeOutputInternal, first_real_override, is_class, make_locked_method_func
 from comfy_api.latest import io, _io

+_AIMDO_VBAR_RESET_UNAVAILABLE_LOGGED = False
+

 class ExecutionResult(Enum):
    SUCCESS = 0
@@ -540,7 +542,17 @@ async def execute(server, dynprompt, caches, current_item, extra_data, executed,
                    if args.verbose == "DEBUG":
                        comfy_aimdo.control.analyze()
                    comfy.model_management.reset_cast_buffers()
-                    comfy_aimdo.model_vbar.vbars_reset_watermark_limits()
+                    vbar_lib = getattr(comfy_aimdo.model_vbar, "lib", None)
+                    if vbar_lib is not None:
+                        comfy_aimdo.model_vbar.vbars_reset_watermark_limits()
+                    else:
+                        global _AIMDO_VBAR_RESET_UNAVAILABLE_LOGGED
+                        if not _AIMDO_VBAR_RESET_UNAVAILABLE_LOGGED:
+                            logging.warning(
+                                "DynamicVRAM backend unavailable for watermark reset; "
+                                "skipping vbar reset for this process."
+                            )
+                            _AIMDO_VBAR_RESET_UNAVAILABLE_LOGGED = True

            if has_pending_tasks:
                pending_async_nodes[unique_id] = output_data
@@ -669,6 +681,8 @@ class PromptExecutor:
        self.success = True

    async def _notify_execution_graph_safe(self, class_types: set[str], *, fail_loud: bool = False) -> None:
+        if not args.use_process_isolation:
+            return
        try:
            from comfy.isolation import notify_execution_graph
            await notify_execution_graph(class_types)
@@ -678,12 +692,34 @@ class PromptExecutor:
            logging.debug("][ EX:notify_execution_graph failed", exc_info=True)

    async def _flush_running_extensions_transport_state_safe(self) -> None:
+        if not args.use_process_isolation:
+            return
        try:
            from comfy.isolation import flush_running_extensions_transport_state
            await flush_running_extensions_transport_state()
        except Exception:
            logging.debug("][ EX:flush_running_extensions_transport_state failed", exc_info=True)

+    async def _wait_model_patcher_quiescence_safe(
+        self,
+        *,
+        fail_loud: bool = False,
+        timeout_ms: int = 120000,
+        marker: str = "EX:wait_model_patcher_idle",
+    ) -> None:
+        if not args.use_process_isolation:
+            return
+        try:
+            from comfy.isolation import wait_for_model_patcher_quiescence
+
+            await wait_for_model_patcher_quiescence(
+                timeout_ms=timeout_ms, fail_loud=fail_loud, marker=marker
+            )
+        except Exception:
+            if fail_loud:
+                raise
+            logging.debug("][ EX:wait_model_patcher_quiescence failed", exc_info=True)
+
    def add_message(self, event, data: dict, broadcast: bool):
        data = {
            **data,
@@ -725,16 +761,17 @@ class PromptExecutor:
        asyncio.run(self.execute_async(prompt, prompt_id, extra_data, execute_outputs))

    async def execute_async(self, prompt, prompt_id, extra_data={}, execute_outputs=[]):
-        # Update RPC event loops for all isolated extensions
-        # This is critical for serial workflow execution - each asyncio.run() creates
-        # a new event loop, and RPC instances must be updated to use it
-        try:
-            from comfy.isolation import update_rpc_event_loops
-            update_rpc_event_loops()
-        except ImportError:
-            pass  # Isolation not available
-        except Exception as e:
-            logging.getLogger(__name__).warning(f"Failed to update RPC event loops: {e}")
+        if args.use_process_isolation:
+            # Update RPC event loops for all isolated extensions.
+            # This is critical for serial workflow execution - each asyncio.run() creates
+            # a new event loop, and RPC instances must be updated to use it.
+            try:
+                from comfy.isolation import update_rpc_event_loops
+                update_rpc_event_loops()
+            except ImportError:
+                pass  # Isolation not available
+            except Exception as e:
+                logging.getLogger(__name__).warning(f"Failed to update RPC event loops: {e}")

        set_preview_method(extra_data.get("preview_method"))

@@ -754,6 +791,11 @@ class PromptExecutor:
                    # Boundary cleanup runs at the start of the next workflow in
                    # isolation mode, matching non-isolated "next prompt" timing.
                    self.caches = CacheSet(cache_type=self.cache_type, cache_args=self.cache_args)
+                    await self._wait_model_patcher_quiescence_safe(
+                        fail_loud=False,
+                        timeout_ms=120000,
+                        marker="EX:boundary_cleanup_wait_idle",
+                    )
                    await self._flush_running_extensions_transport_state_safe()
                    comfy.model_management.unload_all_models()
                    comfy.model_management.cleanup_models_gc()
@@ -794,6 +836,11 @@ class PromptExecutor:
                for node_id in execution_list.pendingNodes.keys():
                    class_type = dynamic_prompt.get_node(node_id)["class_type"]
                    pending_class_types.add(class_type)
+                await self._wait_model_patcher_quiescence_safe(
+                    fail_loud=True,
+                    timeout_ms=120000,
+                    marker="EX:notify_graph_wait_idle",
+                )
                await self._notify_execution_graph_safe(pending_class_types, fail_loud=True)

            while not execution_list.is_empty():