Made multigpu deepclone load the model from disk to avoid deepcloning the actual model object, fixed issues from the merge, and turned off the cuda backend as it causes a device mismatch issue with rope (and potentially other ops); will investigate

Jedrzej Kosinski
2026-02-17 04:55:00 -08:00
parent df2fd4c869
commit f4b99bc623
4 changed files with 15 additions and 4 deletions

View File

@@ -23,6 +23,7 @@ import inspect
 import logging
 import math
 import uuid
+import copy
 from typing import Callable, Optional
 import torch
@@ -274,6 +275,7 @@ class ModelPatcher:
         self.is_clip = False
         self.hook_mode = comfy.hooks.EnumHookMode.MaxSpeed
+        self.cached_patcher_init: tuple[Callable, tuple] | tuple[Callable, tuple, int] | None = None
         self.is_multigpu_base_clone = False
         self.clone_base_uuid = uuid.uuid4()
@@ -368,6 +370,7 @@ class ModelPatcher:
         n.is_clip = self.is_clip
         n.hook_mode = self.hook_mode
+        n.cached_patcher_init = self.cached_patcher_init
         n.is_multigpu_base_clone = self.is_multigpu_base_clone
         n.clone_base_uuid = self.clone_base_uuid
@@ -382,12 +385,18 @@ class ModelPatcher:
         # set load device, if present
         if new_load_device is not None:
             n.load_device = new_load_device
+        if self.cached_patcher_init is not None:
+            temp_model_patcher: ModelPatcher | list[ModelPatcher] = self.cached_patcher_init[0](*self.cached_patcher_init[1])
+            if len(self.cached_patcher_init) > 2:
+                temp_model_patcher = temp_model_patcher[self.cached_patcher_init[2]]
+            n.model = temp_model_patcher.model
+        else:
+            n.model = copy.deepcopy(n.model)
         # unlike for a normal clone, backup dicts that shared the same ref should not stay shared;
         # otherwise, patchers that hold deep copies of base models would erroneously influence each other.
         n.backup = copy.deepcopy(n.backup)
         n.object_patches_backup = copy.deepcopy(n.object_patches_backup)
         n.hook_backup = copy.deepcopy(n.hook_backup)
-        n.model = copy.deepcopy(n.model)
         # multigpu clone should not have a multigpu additional_models entry
         n.remove_additional_models("multigpu")
         # multigpu_clone all stored additional_models; make sure circular references are properly handled
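
For readers, a minimal sketch of what the new branch does during a multigpu clone; rebuild_model_for_clone is a hypothetical condensation, not ComfyUI API — the real logic lives inline in the hunk above:

    import copy

    def rebuild_model_for_clone(n, cached_patcher_init):
        # Rebuild the model by re-running the original loader from disk instead
        # of deepcopying a live model object (slow and memory-hungry).
        if cached_patcher_init is not None:
            loader, args = cached_patcher_init[0], cached_patcher_init[1]
            result = loader(*args)
            if len(cached_patcher_init) > 2:
                # The loader returned a tuple (e.g. model, clip, vae); the third
                # element of the recipe picks the patcher out of it.
                result = result[cached_patcher_init[2]]
            n.model = result.model
        else:
            # Fallback: the old behavior, a full in-memory deep copy.
            n.model = copy.deepcopy(n.model)
        return n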

View File

@@ -20,7 +20,7 @@ try:
     if cuda_version < (13,):
-        ck.registry.disable("cuda")
         logging.warning("WARNING: You need pytorch with cu130 or higher to use optimized CUDA operations.")
+    ck.registry.disable("cuda")  # multigpu will not work rn with comfy-kitchen on cuda backend
     ck.registry.disable("triton")
     for k, v in ck.list_backends().items():
         logging.info(f"Found comfy_kitchen backend {k}: {v}")

View File

@@ -418,7 +418,7 @@ def _calc_cond_batch_multigpu(model: BaseModel, conds: list[list[dict]], x_in: t
         to_batch_temp.reverse()
         to_batch = to_batch_temp[:1]
-        free_memory = model_management.get_free_memory(current_device)
+        free_memory = comfy.model_management.get_free_memory(current_device)
         for i in range(1, len(to_batch_temp) + 1):
             batch_amount = to_batch_temp[:len(to_batch_temp)//i]
             input_shape = [len(batch_amount) * first_shape[0]] + list(first_shape)[1:]
@@ -487,7 +487,7 @@ def _calc_cond_batch_multigpu(model: BaseModel, conds: list[list[dict]], x_in: t
             transformer_options["cond_or_uncond"] = cond_or_uncond[:]
             transformer_options["uuids"] = uuids[:]
-            transformer_options["sigmas"] = timestep
+            transformer_options["sigmas"] = timestep.to(device)
             transformer_options["sample_sigmas"] = transformer_options["sample_sigmas"].to(device)
             transformer_options["multigpu_thread_device"] = device

View File

@@ -1510,6 +1510,7 @@ def load_checkpoint_guess_config(ckpt_path, output_vae=True, output_clip=True, o
     out = load_state_dict_guess_config(sd, output_vae, output_clip, output_clipvision, embedding_directory, output_model, model_options, te_model_options=te_model_options, metadata=metadata)
     if out is None:
         raise RuntimeError("ERROR: Could not detect model type of: {}\n{}".format(ckpt_path, model_detection_error_hint(ckpt_path, sd)))
+    out[0].cached_patcher_init = (load_checkpoint_guess_config, (ckpt_path, False, False, False, embedding_directory, output_model, model_options, te_model_options), 0)
     return out
 
 def load_state_dict_guess_config(sd, output_vae=True, output_clip=True, output_clipvision=False, embedding_directory=None, output_model=True, model_options={}, te_model_options={}, metadata=None):
@@ -1711,6 +1712,7 @@ def load_diffusion_model(unet_path, model_options={}):
     if model is None:
         logging.error("ERROR UNSUPPORTED DIFFUSION MODEL {}".format(unet_path))
         raise RuntimeError("ERROR: Could not detect model type of: {}\n{}".format(unet_path, model_detection_error_hint(unet_path, sd)))
+    model.cached_patcher_init = (load_diffusion_model, (unet_path, model_options))
     return model
 
 def load_unet(unet_path, dtype=None):
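
Both hunks record the same "reload recipe" convention; a hedged summary (record_reload_recipe is hypothetical, written only to document the two tuple shapes):

    def record_reload_recipe(loader, args, index=None):
        # (callable, args)        -> callable(*args) returns the patcher itself
        # (callable, args, index) -> callable(*args) returns a tuple; element
        #                            `index` is the patcher (0 above: the model
        #                            patcher in `out`).
        out = loader(*args)
        patcher = out if index is None else out[index]
        patcher.cached_patcher_init = (loader, args) if index is None else (loader, args, index)
        return out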