From f4b99bc62389af315013dda85f24f2bbd262b686 Mon Sep 17 00:00:00 2001
From: Jedrzej Kosinski
Date: Tue, 17 Feb 2026 04:55:00 -0800
Subject: [PATCH] Made multigpu deepclone load model from disk to avoid needing to deepclone actual model object, fixed issues with merge, turn off cuda backend as it causes device mismatch issue with rope (and potentially other ops), will investigate

---
 comfy/model_patcher.py | 11 ++++++++++-
 comfy/quant_ops.py     |  2 +-
 comfy/samplers.py      |  4 ++--
 comfy/sd.py            |  2 ++
 4 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py
index d0110c7c6..aa7b862e7 100644
--- a/comfy/model_patcher.py
+++ b/comfy/model_patcher.py
@@ -23,6 +23,7 @@ import inspect
 import logging
 import math
 import uuid
+import copy
 from typing import Callable, Optional
 
 import torch
@@ -274,6 +275,7 @@ class ModelPatcher:
         self.is_clip = False
         self.hook_mode = comfy.hooks.EnumHookMode.MaxSpeed
 
+        self.cached_patcher_init: tuple[Callable, tuple] | tuple[Callable, tuple, int] | None = None
         self.is_multigpu_base_clone = False
         self.clone_base_uuid = uuid.uuid4()
 
@@ -368,6 +370,7 @@ class ModelPatcher:
         n.is_clip = self.is_clip
         n.hook_mode = self.hook_mode
 
+        n.cached_patcher_init = self.cached_patcher_init
         n.is_multigpu_base_clone = self.is_multigpu_base_clone
         n.clone_base_uuid = self.clone_base_uuid
 
@@ -382,12 +385,18 @@ class ModelPatcher:
         # set load device, if present
         if new_load_device is not None:
             n.load_device = new_load_device
+        if self.cached_patcher_init is not None:
+            temp_model_patcher: ModelPatcher | list[ModelPatcher] = self.cached_patcher_init[0](*self.cached_patcher_init[1])
+            if len(self.cached_patcher_init) > 2:
+                temp_model_patcher = temp_model_patcher[self.cached_patcher_init[2]]
+            n.model = temp_model_patcher.model
+        else:
+            n.model = copy.deepcopy(n.model)
         # unlike for normal clone, backup dicts that shared same ref should not;
         # otherwise, patchers that have deep copies of base models will erroneously influence each other.
         n.backup = copy.deepcopy(n.backup)
         n.object_patches_backup = copy.deepcopy(n.object_patches_backup)
         n.hook_backup = copy.deepcopy(n.hook_backup)
-        n.model = copy.deepcopy(n.model)
         # multigpu clone should not have multigpu additional_models entry
         n.remove_additional_models("multigpu")
         # multigpu_clone all stored additional_models; make sure circular references are properly handled
diff --git a/comfy/quant_ops.py b/comfy/quant_ops.py
index 15a4f457b..d8addefd8 100644
--- a/comfy/quant_ops.py
+++ b/comfy/quant_ops.py
@@ -20,7 +20,7 @@ try:
     if cuda_version < (13,):
         ck.registry.disable("cuda")
         logging.warning("WARNING: You need pytorch with cu130 or higher to use optimized CUDA operations.")
-
+    ck.registry.disable("cuda") # multigpu will not work rn with comfy-kitchen on cuda backend
     ck.registry.disable("triton")
     for k, v in ck.list_backends().items():
         logging.info(f"Found comfy_kitchen backend {k}: {v}")
diff --git a/comfy/samplers.py b/comfy/samplers.py
index 3f5a699d9..5dee49e7e 100755
--- a/comfy/samplers.py
+++ b/comfy/samplers.py
@@ -418,7 +418,7 @@ def _calc_cond_batch_multigpu(model: BaseModel, conds: list[list[dict]], x_in: t
             to_batch_temp.reverse()
             to_batch = to_batch_temp[:1]
 
-            free_memory = model_management.get_free_memory(current_device)
+            free_memory = comfy.model_management.get_free_memory(current_device)
             for i in range(1, len(to_batch_temp) + 1):
                 batch_amount = to_batch_temp[:len(to_batch_temp)//i]
                 input_shape = [len(batch_amount) * first_shape[0]] + list(first_shape)[1:]
@@ -487,7 +487,7 @@ def _calc_cond_batch_multigpu(model: BaseModel, conds: list[list[dict]], x_in: t
            transformer_options["cond_or_uncond"] = cond_or_uncond[:]
            transformer_options["uuids"] = uuids[:]
 
-           transformer_options["sigmas"] = timestep
+           transformer_options["sigmas"] = timestep.to(device)
            transformer_options["sample_sigmas"] = transformer_options["sample_sigmas"].to(device)
            transformer_options["multigpu_thread_device"] = device
 
diff --git a/comfy/sd.py b/comfy/sd.py
index f65e7cadd..2643de26d 100644
--- a/comfy/sd.py
+++ b/comfy/sd.py
@@ -1510,6 +1510,7 @@ def load_checkpoint_guess_config(ckpt_path, output_vae=True, output_clip=True, o
     out = load_state_dict_guess_config(sd, output_vae, output_clip, output_clipvision, embedding_directory, output_model, model_options, te_model_options=te_model_options, metadata=metadata)
     if out is None:
         raise RuntimeError("ERROR: Could not detect model type of: {}\n{}".format(ckpt_path, model_detection_error_hint(ckpt_path, sd)))
+    out[0].cached_patcher_init = (load_checkpoint_guess_config, (ckpt_path, False, False, False, embedding_directory, output_model, model_options, te_model_options), 0)
     return out
 
 def load_state_dict_guess_config(sd, output_vae=True, output_clip=True, output_clipvision=False, embedding_directory=None, output_model=True, model_options={}, te_model_options={}, metadata=None):
@@ -1711,6 +1712,7 @@ def load_diffusion_model(unet_path, model_options={}):
     if model is None:
         logging.error("ERROR UNSUPPORTED DIFFUSION MODEL {}".format(unet_path))
         raise RuntimeError("ERROR: Could not detect model type of: {}\n{}".format(unet_path, model_detection_error_hint(unet_path, sd)))
+    model.cached_patcher_init = (load_diffusion_model, (unet_path, model_options))
     return model
 
 def load_unet(unet_path, dtype=None):
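
Notes (not part of the patch): the sketch below illustrates how the new cached_patcher_init field is meant to be exercised. It is a rough usage sketch, not documented API: the name of the patched clone method (assumed here to be ModelPatcher.deepclone_multigpu with a new_load_device keyword), the checkpoint path, and the device string are all assumptions for illustration.

    import torch
    import comfy.sd

    # Loading a checkpoint now records how the patcher was created, so a later
    # multigpu deep clone can rebuild the model by re-running the loader
    # instead of copy.deepcopy-ing the in-memory model object.
    model_patcher, clip, vae, _ = comfy.sd.load_checkpoint_guess_config(
        "models/checkpoints/example.safetensors")  # hypothetical path
    # model_patcher.cached_patcher_init is now
    # (load_checkpoint_guess_config, (ckpt_path, False, False, False, ...), 0)

    # Assumed entry point: the method patched above falls back to
    # copy.deepcopy(n.model) when cached_patcher_init is None.
    clone = model_patcher.deepclone_multigpu(new_load_device=torch.device("cuda:1"))
    assert clone.model is not model_patcher.model  # separate model object per device

The index stored as the third tuple element (0 for load_checkpoint_guess_config, absent for load_diffusion_model) selects which patcher to take from a loader that returns a tuple, which is why the clone path checks len(self.cached_patcher_init) > 2.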