diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py
index d0110c7c6..aa7b862e7 100644
--- a/comfy/model_patcher.py
+++ b/comfy/model_patcher.py
@@ -23,6 +23,7 @@ import inspect
 import logging
 import math
 import uuid
+import copy
 from typing import Callable, Optional

 import torch
@@ -274,6 +275,7 @@ class ModelPatcher:
         self.is_clip = False
         self.hook_mode = comfy.hooks.EnumHookMode.MaxSpeed
+        self.cached_patcher_init: tuple[Callable, tuple] | tuple[Callable, tuple, int] | None = None

         self.is_multigpu_base_clone = False
         self.clone_base_uuid = uuid.uuid4()

@@ -368,6 +370,7 @@
         n.is_clip = self.is_clip
         n.hook_mode = self.hook_mode
+        n.cached_patcher_init = self.cached_patcher_init

         n.is_multigpu_base_clone = self.is_multigpu_base_clone
         n.clone_base_uuid = self.clone_base_uuid

@@ -382,12 +385,18 @@
         # set load device, if present
         if new_load_device is not None:
             n.load_device = new_load_device
+        if self.cached_patcher_init is not None:
+            temp_model_patcher: ModelPatcher | list[ModelPatcher] = self.cached_patcher_init[0](*self.cached_patcher_init[1])
+            if len(self.cached_patcher_init) > 2:
+                temp_model_patcher = temp_model_patcher[self.cached_patcher_init[2]]
+            n.model = temp_model_patcher.model
+        else:
+            n.model = copy.deepcopy(n.model)
         # unlike for normal clone, backup dicts that shared same ref should not;
         # otherwise, patchers that have deep copies of base models will erroneously influence each other.
         n.backup = copy.deepcopy(n.backup)
         n.object_patches_backup = copy.deepcopy(n.object_patches_backup)
         n.hook_backup = copy.deepcopy(n.hook_backup)
-        n.model = copy.deepcopy(n.model)
         # multigpu clone should not have multigpu additional_models entry
         n.remove_additional_models("multigpu")
         # multigpu_clone all stored additional_models; make sure circular references are properly handled
diff --git a/comfy/quant_ops.py b/comfy/quant_ops.py
index 15a4f457b..d8addefd8 100644
--- a/comfy/quant_ops.py
+++ b/comfy/quant_ops.py
@@ -20,7 +20,7 @@ try:
         if cuda_version < (13,):
             ck.registry.disable("cuda")
             logging.warning("WARNING: You need pytorch with cu130 or higher to use optimized CUDA operations.")
-
+    ck.registry.disable("cuda") # multigpu will not work rn with comfy-kitchen on cuda backend
     ck.registry.disable("triton")
     for k, v in ck.list_backends().items():
         logging.info(f"Found comfy_kitchen backend {k}: {v}")
diff --git a/comfy/samplers.py b/comfy/samplers.py
index 3f5a699d9..5dee49e7e 100755
--- a/comfy/samplers.py
+++ b/comfy/samplers.py
@@ -418,7 +418,7 @@ def _calc_cond_batch_multigpu(model: BaseModel, conds: list[list[dict]], x_in: t
             to_batch_temp.reverse()
             to_batch = to_batch_temp[:1]

-            free_memory = model_management.get_free_memory(current_device)
+            free_memory = comfy.model_management.get_free_memory(current_device)
             for i in range(1, len(to_batch_temp) + 1):
                 batch_amount = to_batch_temp[:len(to_batch_temp)//i]
                 input_shape = [len(batch_amount) * first_shape[0]] + list(first_shape)[1:]
@@ -487,7 +487,7 @@
             transformer_options["cond_or_uncond"] = cond_or_uncond[:]
             transformer_options["uuids"] = uuids[:]
-            transformer_options["sigmas"] = timestep
+            transformer_options["sigmas"] = timestep.to(device)
             transformer_options["sample_sigmas"] = transformer_options["sample_sigmas"].to(device)

             transformer_options["multigpu_thread_device"] = device

diff --git a/comfy/sd.py b/comfy/sd.py
index f65e7cadd..2643de26d 100644
--- a/comfy/sd.py
+++ b/comfy/sd.py
@@ -1510,6 +1510,7 @@ def load_checkpoint_guess_config(ckpt_path, output_vae=True, output_clip=True, o
     out = load_state_dict_guess_config(sd, output_vae, output_clip, output_clipvision, embedding_directory, output_model, model_options, te_model_options=te_model_options, metadata=metadata)
     if out is None:
         raise RuntimeError("ERROR: Could not detect model type of: {}\n{}".format(ckpt_path, model_detection_error_hint(ckpt_path, sd)))
+    out[0].cached_patcher_init = (load_checkpoint_guess_config, (ckpt_path, False, False, False, embedding_directory, output_model, model_options, te_model_options), 0)
     return out

 def load_state_dict_guess_config(sd, output_vae=True, output_clip=True, output_clipvision=False, embedding_directory=None, output_model=True, model_options={}, te_model_options={}, metadata=None):
@@ -1711,6 +1712,7 @@ def load_diffusion_model(unet_path, model_options={}):
     if model is None:
         logging.error("ERROR UNSUPPORTED DIFFUSION MODEL {}".format(unet_path))
         raise RuntimeError("ERROR: Could not detect model type of: {}\n{}".format(unet_path, model_detection_error_hint(unet_path, sd)))
+    model.cached_patcher_init = (load_diffusion_model, (unet_path, model_options))
     return model

 def load_unet(unet_path, dtype=None):
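
Reviewer note: the model_patcher.py and sd.py hunks work together. Loaders record how they were invoked in cached_patcher_init, either (loader, args) or (loader, args, index) when the loader returns a tuple, and the multigpu deep-clone path replays that call to rebuild the model instead of deep-copying the live one. Below is a minimal sketch of that mechanism, not the patched ComfyUI code; ToyPatcher, toy_loader, and multigpu_deepclone as a free function are simplified stand-ins.

import copy
from typing import Optional

class ToyPatcher:
    """Stand-in for ModelPatcher; only the fields this diff touches."""
    def __init__(self, model):
        self.model = model
        # (loader, args) or (loader, args, index); index selects the patcher
        # when the loader returns a tuple such as (model, clip, vae, ...).
        self.cached_patcher_init: Optional[tuple] = None

def toy_loader(path: str) -> ToyPatcher:
    """Stand-in for load_diffusion_model / load_checkpoint_guess_config."""
    return ToyPatcher(model={"weights_from": path})

def multigpu_deepclone(p: ToyPatcher) -> ToyPatcher:
    n = ToyPatcher(p.model)
    n.cached_patcher_init = p.cached_patcher_init
    if p.cached_patcher_init is not None:
        # Rebuild the model by re-running the cached loader call instead of
        # deep-copying the live module (the behavior the diff replaces).
        reloaded = p.cached_patcher_init[0](*p.cached_patcher_init[1])
        if len(p.cached_patcher_init) > 2:
            reloaded = reloaded[p.cached_patcher_init[2]]
        n.model = reloaded.model
    else:
        n.model = copy.deepcopy(n.model)
    return n

base = toy_loader("/models/unet.safetensors")
base.cached_patcher_init = (toy_loader, ("/models/unet.safetensors",))
clone = multigpu_deepclone(base)
assert clone.model is not base.model  # fresh, independent weights for the second GPU

Note that load_checkpoint_guess_config registers itself with the vae/clip/clipvision outputs forced to False and a trailing index of 0, so, judging by the load_state_dict_guess_config signature shown in the hunk, the replay reloads only the diffusion model and selects its ModelPatcher from the returned tuple.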
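
The samplers.py hunks are independent multigpu correctness fixes: model_management was referenced without its comfy. prefix inside _calc_cond_batch_multigpu (presumably a NameError under that module's imports), and sigmas was handed to per-device worker threads while still on the source device. A toy illustration of the device fix follows; prepare_thread_options is a hypothetical helper, not a real ComfyUI function.

import torch

def prepare_thread_options(opts: dict, timestep: torch.Tensor, device: torch.device) -> dict:
    # Each multigpu worker gets its own copy of transformer_options with the
    # tensors it reads moved to that worker's device, mirroring the diff's
    # timestep.to(device) fix; mixing devices would raise at kernel dispatch.
    opts = dict(opts)
    opts["sigmas"] = timestep.to(device)
    opts["sample_sigmas"] = opts["sample_sigmas"].to(device)
    opts["multigpu_thread_device"] = device
    return opts

opts = {"sample_sigmas": torch.linspace(1.0, 0.0, 10)}
dev = torch.device("cuda:1" if torch.cuda.device_count() > 1 else "cpu")
thread_opts = prepare_thread_options(opts, torch.tensor([0.5]), dev)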