Compare commits


2 Commits

Author     SHA1        Message             Date
Terry Jia  30c87e2a37  color correct       2026-02-14 08:15:25 -05:00
Terry Jia  85b8ee1390  Boundingbox widget  2026-02-13 07:56:20 -05:00
62 changed files with 777 additions and 2262 deletions

.gitignore vendored (2 changes)
View File

@@ -11,7 +11,7 @@ extra_model_paths.yaml
/.vs
.vscode/
.idea/
venv*/
venv/
.venv/
/web/extensions/*
!/web/extensions/logging.js.example

View File

@@ -227,7 +227,7 @@ Put your VAE in: models/vae
AMD users can install ROCm and PyTorch with pip if they are not already installed; this is the command to install the stable version:
```pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm7.1```
```pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.4```
This is the command to install the nightly with ROCm 7.1, which might have some performance improvements:

View File

@@ -1,105 +0,0 @@
from __future__ import annotations
from aiohttp import web
from typing import TYPE_CHECKING, TypedDict
if TYPE_CHECKING:
from comfy_api.latest._io_public import NodeReplace
from comfy_execution.graph_utils import is_link
import nodes
class NodeStruct(TypedDict):
inputs: dict[str, str | int | float | bool | tuple[str, int]]
class_type: str
_meta: dict[str, str]
def copy_node_struct(node_struct: NodeStruct, empty_inputs: bool = False) -> NodeStruct:
new_node_struct = node_struct.copy()
if empty_inputs:
new_node_struct["inputs"] = {}
else:
new_node_struct["inputs"] = node_struct["inputs"].copy()
new_node_struct["_meta"] = node_struct["_meta"].copy()
return new_node_struct
class NodeReplaceManager:
"""Manages node replacement registrations."""
def __init__(self):
self._replacements: dict[str, list[NodeReplace]] = {}
def register(self, node_replace: NodeReplace):
"""Register a node replacement mapping."""
self._replacements.setdefault(node_replace.old_node_id, []).append(node_replace)
def get_replacement(self, old_node_id: str) -> list[NodeReplace] | None:
"""Get replacements for an old node ID."""
return self._replacements.get(old_node_id)
def has_replacement(self, old_node_id: str) -> bool:
"""Check if a replacement exists for an old node ID."""
return old_node_id in self._replacements
def apply_replacements(self, prompt: dict[str, NodeStruct]):
connections: dict[str, list[tuple[str, str, int]]] = {}
need_replacement: set[str] = set()
for node_number, node_struct in prompt.items():
class_type = node_struct["class_type"]
# need replacement if not in NODE_CLASS_MAPPINGS and has replacement
if class_type not in nodes.NODE_CLASS_MAPPINGS.keys() and self.has_replacement(class_type):
need_replacement.add(node_number)
# keep track of connections
for input_id, input_value in node_struct["inputs"].items():
if is_link(input_value):
conn_number = input_value[0]
connections.setdefault(conn_number, []).append((node_number, input_id, input_value[1]))
for node_number in need_replacement:
node_struct = prompt[node_number]
class_type = node_struct["class_type"]
replacements = self.get_replacement(class_type)
if replacements is None:
continue
# just use the first replacement
replacement = replacements[0]
new_node_id = replacement.new_node_id
# if replacement is not a valid node, skip trying to replace it, as that would only cause confusion
if new_node_id not in nodes.NODE_CLASS_MAPPINGS.keys():
continue
# first, replace node id (class_type)
new_node_struct = copy_node_struct(node_struct, empty_inputs=True)
new_node_struct["class_type"] = new_node_id
# TODO: consider replacing display_name in _meta as well for error reporting purposes; would need to query node schema
# second, replace inputs
if replacement.input_mapping is not None:
for input_map in replacement.input_mapping:
if "set_value" in input_map:
new_node_struct["inputs"][input_map["new_id"]] = input_map["set_value"]
elif "old_id" in input_map:
new_node_struct["inputs"][input_map["new_id"]] = node_struct["inputs"][input_map["old_id"]]
# finalize input replacement
prompt[node_number] = new_node_struct
# third, replace outputs
if replacement.output_mapping is not None:
# re-mapping outputs requires changing the input values of nodes that receive connections from this one
if node_number in connections:
for conns in connections[node_number]:
conn_node_number, conn_input_id, old_output_idx = conns
for output_map in replacement.output_mapping:
if output_map["old_idx"] == old_output_idx:
new_output_idx = output_map["new_idx"]
previous_input = prompt[conn_node_number]["inputs"][conn_input_id]
previous_input[1] = new_output_idx
def as_dict(self):
"""Serialize all replacements to dict."""
return {
k: [v.as_dict() for v in v_list]
for k, v_list in self._replacements.items()
}
def add_routes(self, routes):
@routes.get("/node_replacements")
async def get_node_replacements(request):
return web.json_response(self.as_dict())
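A minimal usage sketch of the manager above (illustrative only; `NodeReplace` lives in `comfy_api` and its constructor is assumed here from the fields the code reads):

```
# Hypothetical sketch; NodeReplace field names inferred from apply_replacements.
manager = NodeReplaceManager()
manager.register(NodeReplace(
    old_node_id="OldLoader",    # class_type no longer in NODE_CLASS_MAPPINGS
    new_node_id="NewLoader",    # must exist in NODE_CLASS_MAPPINGS
    input_mapping=[
        {"new_id": "ckpt_name", "old_id": "name"},  # carry an input across
        {"new_id": "device", "set_value": "cpu"},   # pin a constant value
    ],
    output_mapping=[{"old_idx": 0, "new_idx": 1}],  # output slot 0 moved to 1
))
manager.apply_replacements(prompt)  # prompt: dict[str, NodeStruct]; mutated in place
```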

View File

@@ -0,0 +1,13 @@
import pickle
load = pickle.load
class Empty:
pass
class Unpickler(pickle.Unpickler):
def find_class(self, module, name):
#TODO: safe unpickle
if module.startswith("pytorch_lightning"):
return Empty
return super().find_class(module, name)
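As a quick illustration (not part of the diff), the class above is a drop-in for `pickle.Unpickler`, so legacy checkpoints that reference `pytorch_lightning` classes deserialize them as the inert `Empty` placeholder instead of importing the real package:

```
# Sketch: load a legacy checkpoint through the restricted Unpickler above.
def restricted_load(path):
    with open(path, "rb") as f:
        return Unpickler(f).load()
```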

View File

@@ -49,7 +49,7 @@ parser.add_argument("--temp-directory", type=str, default=None, help="Set the Co
parser.add_argument("--input-directory", type=str, default=None, help="Set the ComfyUI input directory. Overrides --base-directory.")
parser.add_argument("--auto-launch", action="store_true", help="Automatically launch ComfyUI in the default browser.")
parser.add_argument("--disable-auto-launch", action="store_true", help="Disable auto launching the browser.")
parser.add_argument("--cuda-device", type=str, default=None, metavar="DEVICE_ID", help="Set the ids of cuda devices this instance will use. All other devices will not be visible.")
parser.add_argument("--cuda-device", type=int, default=None, metavar="DEVICE_ID", help="Set the id of the cuda device this instance will use. All other devices will not be visible.")
parser.add_argument("--default-device", type=int, default=None, metavar="DEFAULT_DEVICE_ID", help="Set the id of the default device, all other devices will stay visible.")
cm_group = parser.add_mutually_exclusive_group()
cm_group.add_argument("--cuda-malloc", action="store_true", help="Enable cudaMallocAsync (enabled by default for torch 2.0 and up).")
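A behavior sketch (not from the diff) of what switching `--cuda-device` from `type=str` to `type=int` changes at parse time: comma-separated device lists that the string form tolerated are now rejected:

```
args = parser.parse_args(["--cuda-device", "1"])
print(args.cuda_device)                      # 1 (an int; previously the string "1")
parser.parse_args(["--cuda-device", "0,1"])  # now exits: invalid int value: '0,1'
```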

View File

@@ -15,14 +15,13 @@
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
"""
from __future__ import annotations
import torch
from enum import Enum
import math
import os
import logging
import copy
import comfy.utils
import comfy.model_management
import comfy.model_detection
@@ -39,7 +38,7 @@ import comfy.ldm.hydit.controlnet
import comfy.ldm.flux.controlnet
import comfy.ldm.qwen_image.controlnet
import comfy.cldm.dit_embedder
from typing import TYPE_CHECKING, Union
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from comfy.hooks import HookGroup
@@ -65,18 +64,6 @@ class StrengthType(Enum):
CONSTANT = 1
LINEAR_UP = 2
class ControlIsolation:
'''Temporarily set a ControlBase object's previous_controlnet to None to prevent cascading calls.'''
def __init__(self, control: ControlBase):
self.control = control
self.orig_previous_controlnet = control.previous_controlnet
def __enter__(self):
self.control.previous_controlnet = None
def __exit__(self, *args):
self.control.previous_controlnet = self.orig_previous_controlnet
class ControlBase:
def __init__(self):
self.cond_hint_original = None
@@ -90,7 +77,7 @@ class ControlBase:
self.compression_ratio = 8
self.upscale_algorithm = 'nearest-exact'
self.extra_args = {}
self.previous_controlnet: Union[ControlBase, None] = None
self.previous_controlnet = None
self.extra_conds = []
self.strength_type = StrengthType.CONSTANT
self.concat_mask = False
@@ -98,7 +85,6 @@ class ControlBase:
self.extra_concat = None
self.extra_hooks: HookGroup = None
self.preprocess_image = lambda a: a
self.multigpu_clones: dict[torch.device, ControlBase] = {}
def set_cond_hint(self, cond_hint, strength=1.0, timestep_percent_range=(0.0, 1.0), vae=None, extra_concat=[]):
self.cond_hint_original = cond_hint
@@ -125,38 +111,17 @@ class ControlBase:
def cleanup(self):
if self.previous_controlnet is not None:
self.previous_controlnet.cleanup()
for device_cnet in self.multigpu_clones.values():
with ControlIsolation(device_cnet):
device_cnet.cleanup()
self.cond_hint = None
self.extra_concat = None
self.timestep_range = None
def get_models(self):
out = []
for device_cnet in self.multigpu_clones.values():
out += device_cnet.get_models_only_self()
if self.previous_controlnet is not None:
out += self.previous_controlnet.get_models()
return out
def get_models_only_self(self):
'Calls get_models, but temporarily sets previous_controlnet to None.'
with ControlIsolation(self):
return self.get_models()
def get_instance_for_device(self, device):
'Returns instance of this Control object intended for selected device.'
return self.multigpu_clones.get(device, self)
def deepclone_multigpu(self, load_device, autoregister=False):
'''
Create deep clone of Control object where model(s) is set to other devices.
When autoregister is set to True, the deep clone is also added to multigpu_clones dict.
'''
raise NotImplementedError("Classes inheriting from ControlBase should define their own deepclone_multigpu funtion.")
def get_extra_hooks(self):
out = []
if self.extra_hooks is not None:
@@ -165,7 +130,7 @@ class ControlBase:
out += self.previous_controlnet.get_extra_hooks()
return out
def copy_to(self, c: ControlBase):
def copy_to(self, c):
c.cond_hint_original = self.cond_hint_original
c.strength = self.strength
c.timestep_percent_range = self.timestep_percent_range
@@ -319,14 +284,6 @@ class ControlNet(ControlBase):
self.copy_to(c)
return c
def deepclone_multigpu(self, load_device, autoregister=False):
c = self.copy()
c.control_model = copy.deepcopy(c.control_model)
c.control_model_wrapped = comfy.model_patcher.ModelPatcher(c.control_model, load_device=load_device, offload_device=comfy.model_management.unet_offload_device())
if autoregister:
self.multigpu_clones[load_device] = c
return c
def get_models(self):
out = super().get_models()
out.append(self.control_model_wrapped)
@@ -340,30 +297,6 @@ class ControlNet(ControlBase):
self.model_sampling_current = None
super().cleanup()
class QwenFunControlNet(ControlNet):
def get_control(self, x_noisy, t, cond, batched_number, transformer_options):
# Fun checkpoints are more sensitive to high strengths in the generic
# ControlNet merge path. Use a soft response curve so strength=1.0 stays
# unchanged while >1 grows more gently.
original_strength = self.strength
self.strength = math.sqrt(max(self.strength, 0.0))
try:
return super().get_control(x_noisy, t, cond, batched_number, transformer_options)
finally:
self.strength = original_strength
def pre_run(self, model, percent_to_timestep_function):
super().pre_run(model, percent_to_timestep_function)
self.set_extra_arg("base_model", model.diffusion_model)
def copy(self):
c = QwenFunControlNet(None, global_average_pooling=self.global_average_pooling, load_device=self.load_device, manual_cast_dtype=self.manual_cast_dtype)
c.control_model = self.control_model
c.control_model_wrapped = self.control_model_wrapped
self.copy_to(c)
return c
class ControlLoraOps:
class Linear(torch.nn.Module, comfy.ops.CastWeightBiasOp):
def __init__(self, in_features: int, out_features: int, bias: bool = True,
@@ -627,7 +560,6 @@ def load_controlnet_hunyuandit(controlnet_data, model_options={}):
def load_controlnet_flux_xlabs_mistoline(sd, mistoline=False, model_options={}):
model_config, operations, load_device, unet_dtype, manual_cast_dtype, offload_device = controlnet_config(sd, model_options=model_options)
control_model = comfy.ldm.flux.controlnet.ControlNetFlux(mistoline=mistoline, operations=operations, device=offload_device, dtype=unet_dtype, **model_config.unet_config)
sd = model_config.process_unet_state_dict(sd)
control_model = controlnet_load_state_dict(control_model, sd)
extra_conds = ['y', 'guidance']
control = ControlNet(control_model, load_device=load_device, manual_cast_dtype=manual_cast_dtype, extra_conds=extra_conds)
@@ -673,53 +605,6 @@ def load_controlnet_qwen_instantx(sd, model_options={}):
control = ControlNet(control_model, compression_ratio=1, latent_format=latent_format, concat_mask=concat_mask, load_device=load_device, manual_cast_dtype=manual_cast_dtype, extra_conds=extra_conds)
return control
def load_controlnet_qwen_fun(sd, model_options={}):
load_device = comfy.model_management.get_torch_device()
weight_dtype = comfy.utils.weight_dtype(sd)
unet_dtype = model_options.get("dtype", weight_dtype)
manual_cast_dtype = comfy.model_management.unet_manual_cast(unet_dtype, load_device)
operations = model_options.get("custom_operations", None)
if operations is None:
operations = comfy.ops.pick_operations(unet_dtype, manual_cast_dtype, disable_fast_fp8=True)
in_features = sd["control_img_in.weight"].shape[1]
inner_dim = sd["control_img_in.weight"].shape[0]
block_weight = sd["control_blocks.0.attn.to_q.weight"]
attention_head_dim = sd["control_blocks.0.attn.norm_q.weight"].shape[0]
num_attention_heads = max(1, block_weight.shape[0] // max(1, attention_head_dim))
model = comfy.ldm.qwen_image.controlnet.QwenImageFunControlNetModel(
control_in_features=in_features,
inner_dim=inner_dim,
num_attention_heads=num_attention_heads,
attention_head_dim=attention_head_dim,
num_control_blocks=5,
main_model_double=60,
injection_layers=(0, 12, 24, 36, 48),
operations=operations,
device=comfy.model_management.unet_offload_device(),
dtype=unet_dtype,
)
model = controlnet_load_state_dict(model, sd)
latent_format = comfy.latent_formats.Wan21()
control = QwenFunControlNet(
model,
compression_ratio=1,
latent_format=latent_format,
# Fun checkpoints already expect their own 33-channel context handling.
# Enabling generic concat_mask injects an extra mask channel at apply-time
# and breaks the intended fallback packing path.
concat_mask=False,
load_device=load_device,
manual_cast_dtype=manual_cast_dtype,
extra_conds=[],
)
return control
def convert_mistoline(sd):
return comfy.utils.state_dict_prefix_replace(sd, {"single_controlnet_blocks.": "controlnet_single_blocks."})
@@ -797,8 +682,6 @@ def load_controlnet_state_dict(state_dict, model=None, model_options={}):
return load_controlnet_qwen_instantx(controlnet_data, model_options=model_options)
elif "controlnet_x_embedder.weight" in controlnet_data:
return load_controlnet_flux_instantx(controlnet_data, model_options=model_options)
elif "control_blocks.0.after_proj.weight" in controlnet_data and "control_img_in.weight" in controlnet_data:
return load_controlnet_qwen_fun(controlnet_data, model_options=model_options)
elif "controlnet_blocks.0.linear.weight" in controlnet_data: #mistoline flux
return load_controlnet_flux_xlabs_mistoline(convert_mistoline(controlnet_data), mistoline=True, model_options=model_options)
@@ -949,14 +832,6 @@ class T2IAdapter(ControlBase):
self.copy_to(c)
return c
def deepclone_multigpu(self, load_device, autoregister=False):
c = self.copy()
c.t2i_model = copy.deepcopy(c.t2i_model)
c.device = load_device
if autoregister:
self.multigpu_clones[load_device] = c
return c
def load_t2i_adapter(t2i_data, model_options={}): #TODO: model_options
compression_ratio = 8
upscale_algorithm = 'nearest-exact'
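The soft response curve in `QwenFunControlNet.get_control` above is a plain square root with a floor at zero; a small numeric check (illustrative only) confirms that strength 1.0 is a fixed point while larger values grow sub-linearly:

```
import math

for s in (0.0, 0.25, 1.0, 4.0):
    print(s, "->", math.sqrt(max(s, 0.0)))  # 0.0, 0.5, 1.0, 2.0
```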

View File

@@ -3,6 +3,7 @@ from torch import Tensor, nn
from comfy.ldm.flux.layers import (
MLPEmbedder,
RMSNorm,
ModulationOut,
)
@@ -28,7 +29,7 @@ class Approximator(nn.Module):
super().__init__()
self.in_proj = operations.Linear(in_dim, hidden_dim, bias=True, dtype=dtype, device=device)
self.layers = nn.ModuleList([MLPEmbedder(hidden_dim, hidden_dim, dtype=dtype, device=device, operations=operations) for x in range(n_layers)])
self.norms = nn.ModuleList([operations.RMSNorm(hidden_dim, dtype=dtype, device=device) for x in range(n_layers)])
self.norms = nn.ModuleList([RMSNorm(hidden_dim, dtype=dtype, device=device, operations=operations) for x in range(n_layers)])
self.out_proj = operations.Linear(hidden_dim, out_dim, dtype=dtype, device=device)
@property

View File

@@ -152,7 +152,6 @@ class Chroma(nn.Module):
transformer_options={},
attn_mask: Tensor = None,
) -> Tensor:
transformer_options = transformer_options.copy()
patches_replace = transformer_options.get("patches_replace", {})
# running on sequences img
@@ -229,7 +228,6 @@ class Chroma(nn.Module):
transformer_options["total_blocks"] = len(self.single_blocks)
transformer_options["block_type"] = "single"
transformer_options["img_slice"] = [txt.shape[1], img.shape[1]]
for i, block in enumerate(self.single_blocks):
transformer_options["block_index"] = i
if i not in self.skip_dit:

View File

@@ -4,6 +4,8 @@ from functools import lru_cache
import torch
from torch import nn
from comfy.ldm.flux.layers import RMSNorm
class NerfEmbedder(nn.Module):
"""
@@ -143,7 +145,7 @@ class NerfGLUBlock(nn.Module):
# We now need to generate parameters for 3 matrices.
total_params = 3 * hidden_size_x**2 * mlp_ratio
self.param_generator = operations.Linear(hidden_size_s, total_params, dtype=dtype, device=device)
self.norm = operations.RMSNorm(hidden_size_x, dtype=dtype, device=device)
self.norm = RMSNorm(hidden_size_x, dtype=dtype, device=device, operations=operations)
self.mlp_ratio = mlp_ratio
@@ -176,7 +178,7 @@ class NerfGLUBlock(nn.Module):
class NerfFinalLayer(nn.Module):
def __init__(self, hidden_size, out_channels, dtype=None, device=None, operations=None):
super().__init__()
self.norm = operations.RMSNorm(hidden_size, dtype=dtype, device=device)
self.norm = RMSNorm(hidden_size, dtype=dtype, device=device, operations=operations)
self.linear = operations.Linear(hidden_size, out_channels, dtype=dtype, device=device)
def forward(self, x: torch.Tensor) -> torch.Tensor:
@@ -188,7 +190,7 @@ class NerfFinalLayer(nn.Module):
class NerfFinalLayerConv(nn.Module):
def __init__(self, hidden_size: int, out_channels: int, dtype=None, device=None, operations=None):
super().__init__()
self.norm = operations.RMSNorm(hidden_size, dtype=dtype, device=device)
self.norm = RMSNorm(hidden_size, dtype=dtype, device=device, operations=operations)
self.conv = operations.Conv2d(
in_channels=hidden_size,
out_channels=out_channels,

View File

@@ -5,9 +5,9 @@ import torch
from torch import Tensor, nn
from .math import attention, rope
import comfy.ops
import comfy.ldm.common_dit
# Fix import for some custom nodes, TODO: delete eventually.
RMSNorm = None
class EmbedND(nn.Module):
def __init__(self, dim: int, theta: int, axes_dim: list):
@@ -87,12 +87,20 @@ def build_mlp(hidden_size, mlp_hidden_dim, mlp_silu_act=False, yak_mlp=False, dt
operations.Linear(mlp_hidden_dim, hidden_size, bias=True, dtype=dtype, device=device),
)
class RMSNorm(torch.nn.Module):
def __init__(self, dim: int, dtype=None, device=None, operations=None):
super().__init__()
self.scale = nn.Parameter(torch.empty((dim), dtype=dtype, device=device))
def forward(self, x: Tensor):
return comfy.ldm.common_dit.rms_norm(x, self.scale, 1e-6)
class QKNorm(torch.nn.Module):
def __init__(self, dim: int, dtype=None, device=None, operations=None):
super().__init__()
self.query_norm = operations.RMSNorm(dim, dtype=dtype, device=device)
self.key_norm = operations.RMSNorm(dim, dtype=dtype, device=device)
self.query_norm = RMSNorm(dim, dtype=dtype, device=device, operations=operations)
self.key_norm = RMSNorm(dim, dtype=dtype, device=device, operations=operations)
def forward(self, q: Tensor, k: Tensor, v: Tensor) -> tuple:
q = self.query_norm(q)
@@ -161,7 +169,7 @@ class SiLUActivation(nn.Module):
class DoubleStreamBlock(nn.Module):
def __init__(self, hidden_size: int, num_heads: int, mlp_ratio: float, qkv_bias: bool = False, modulation=True, mlp_silu_act=False, proj_bias=True, yak_mlp=False, dtype=None, device=None, operations=None):
def __init__(self, hidden_size: int, num_heads: int, mlp_ratio: float, qkv_bias: bool = False, flipped_img_txt=False, modulation=True, mlp_silu_act=False, proj_bias=True, yak_mlp=False, dtype=None, device=None, operations=None):
super().__init__()
mlp_hidden_dim = int(hidden_size * mlp_ratio)
@@ -189,6 +197,8 @@ class DoubleStreamBlock(nn.Module):
self.txt_mlp = build_mlp(hidden_size, mlp_hidden_dim, mlp_silu_act=mlp_silu_act, yak_mlp=yak_mlp, dtype=dtype, device=device, operations=operations)
self.flipped_img_txt = flipped_img_txt
def forward(self, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor, attn_mask=None, modulation_dims_img=None, modulation_dims_txt=None, transformer_options={}):
if self.modulation:
img_mod1, img_mod2 = self.img_mod(vec)
@@ -196,9 +206,6 @@ class DoubleStreamBlock(nn.Module):
else:
(img_mod1, img_mod2), (txt_mod1, txt_mod2) = vec
transformer_patches = transformer_options.get("patches", {})
extra_options = transformer_options.copy()
# prepare image for attention
img_modulated = self.img_norm1(img)
img_modulated = apply_mod(img_modulated, (1 + img_mod1.scale), img_mod1.shift, modulation_dims_img)
@@ -217,23 +224,32 @@ class DoubleStreamBlock(nn.Module):
del txt_qkv
txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k, txt_v)
q = torch.cat((txt_q, img_q), dim=2)
del txt_q, img_q
k = torch.cat((txt_k, img_k), dim=2)
del txt_k, img_k
v = torch.cat((txt_v, img_v), dim=2)
del txt_v, img_v
# run actual attention
attn = attention(q, k, v, pe=pe, mask=attn_mask, transformer_options=transformer_options)
del q, k, v
if self.flipped_img_txt:
q = torch.cat((img_q, txt_q), dim=2)
del img_q, txt_q
k = torch.cat((img_k, txt_k), dim=2)
del img_k, txt_k
v = torch.cat((img_v, txt_v), dim=2)
del img_v, txt_v
# run actual attention
attn = attention(q, k, v,
pe=pe, mask=attn_mask, transformer_options=transformer_options)
del q, k, v
if "attn1_output_patch" in transformer_patches:
extra_options["img_slice"] = [txt.shape[1], attn.shape[1]]
patch = transformer_patches["attn1_output_patch"]
for p in patch:
attn = p(attn, extra_options)
img_attn, txt_attn = attn[:, : img.shape[1]], attn[:, img.shape[1]:]
else:
q = torch.cat((txt_q, img_q), dim=2)
del txt_q, img_q
k = torch.cat((txt_k, img_k), dim=2)
del txt_k, img_k
v = torch.cat((txt_v, img_v), dim=2)
del txt_v, img_v
# run actual attention
attn = attention(q, k, v,
pe=pe, mask=attn_mask, transformer_options=transformer_options)
del q, k, v
txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1]:]
txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1]:]
# calculate the img blocks
img += apply_mod(self.img_attn.proj(img_attn), img_mod1.gate, None, modulation_dims_img)
@@ -312,9 +328,6 @@ class SingleStreamBlock(nn.Module):
else:
mod = vec
transformer_patches = transformer_options.get("patches", {})
extra_options = transformer_options.copy()
qkv, mlp = torch.split(self.linear1(apply_mod(self.pre_norm(x), (1 + mod.scale), mod.shift, modulation_dims)), [3 * self.hidden_size, self.mlp_hidden_dim_first], dim=-1)
q, k, v = qkv.view(qkv.shape[0], qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
@@ -324,12 +337,6 @@ class SingleStreamBlock(nn.Module):
# compute attention
attn = attention(q, k, v, pe=pe, mask=attn_mask, transformer_options=transformer_options)
del q, k, v
if "attn1_output_patch" in transformer_patches:
patch = transformer_patches["attn1_output_patch"]
for p in patch:
attn = p(attn, extra_options)
# compute activation in mlp stream, cat again and run second linear layer
if self.yak_mlp:
mlp = self.mlp_act(mlp[..., self.mlp_hidden_dim_first // 2:]) * mlp[..., :self.mlp_hidden_dim_first // 2]
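For readers tracking the `operations.RMSNorm` to local `RMSNorm` swap in these hunks, this is the computation `comfy.ldm.common_dit.rms_norm` is expected to perform (a schematic reference assuming the standard RMSNorm definition; the real helper may differ in dtype handling and fused kernels):

```
import torch

def rms_norm_ref(x: torch.Tensor, scale: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    # normalize by the root mean square over the last dim, then rescale
    rms = torch.sqrt(torch.mean(x.float() ** 2, dim=-1, keepdim=True) + eps)
    return (x.float() / rms).to(x.dtype) * scale
```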

View File

@@ -16,6 +16,7 @@ from .layers import (
SingleStreamBlock,
timestep_embedding,
Modulation,
RMSNorm
)
@dataclass
@@ -80,7 +81,7 @@ class Flux(nn.Module):
self.txt_in = operations.Linear(params.context_in_dim, self.hidden_size, bias=params.ops_bias, dtype=dtype, device=device)
if params.txt_norm:
self.txt_norm = operations.RMSNorm(params.context_in_dim, dtype=dtype, device=device)
self.txt_norm = RMSNorm(params.context_in_dim, dtype=dtype, device=device, operations=operations)
else:
self.txt_norm = None
@@ -142,7 +143,6 @@ class Flux(nn.Module):
attn_mask: Tensor = None,
) -> Tensor:
transformer_options = transformer_options.copy()
patches = transformer_options.get("patches", {})
patches_replace = transformer_options.get("patches_replace", {})
if img.ndim != 3 or txt.ndim != 3:
@@ -232,7 +232,6 @@ class Flux(nn.Module):
transformer_options["total_blocks"] = len(self.single_blocks)
transformer_options["block_type"] = "single"
transformer_options["img_slice"] = [txt.shape[1], img.shape[1]]
for i, block in enumerate(self.single_blocks):
transformer_options["block_index"] = i
if ("single_block", i) in blocks_replace:

View File

@@ -241,6 +241,7 @@ class HunyuanVideo(nn.Module):
self.num_heads,
mlp_ratio=params.mlp_ratio,
qkv_bias=params.qkv_bias,
flipped_img_txt=True,
dtype=dtype, device=device, operations=operations
)
for _ in range(params.depth)
@@ -304,7 +305,6 @@ class HunyuanVideo(nn.Module):
control=None,
transformer_options={},
) -> Tensor:
transformer_options = transformer_options.copy()
patches_replace = transformer_options.get("patches_replace", {})
initial_shape = list(img.shape)
@@ -378,14 +378,14 @@ class HunyuanVideo(nn.Module):
extra_txt_ids = torch.zeros((txt_ids.shape[0], txt_vision_states.shape[1], txt_ids.shape[-1]), device=txt_ids.device, dtype=txt_ids.dtype)
txt_ids = torch.cat((txt_ids, extra_txt_ids), dim=1)
ids = torch.cat((txt_ids, img_ids), dim=1)
ids = torch.cat((img_ids, txt_ids), dim=1)
pe = self.pe_embedder(ids)
img_len = img.shape[1]
if txt_mask is not None:
attn_mask_len = img_len + txt.shape[1]
attn_mask = torch.zeros((1, 1, attn_mask_len), dtype=img.dtype, device=img.device)
attn_mask[:, 0, :txt.shape[1]] = txt_mask
attn_mask[:, 0, img_len:] = txt_mask
else:
attn_mask = None
@@ -413,11 +413,10 @@ class HunyuanVideo(nn.Module):
if add is not None:
img += add
img = torch.cat((txt, img), 1)
img = torch.cat((img, txt), 1)
transformer_options["total_blocks"] = len(self.single_blocks)
transformer_options["block_type"] = "single"
transformer_options["img_slice"] = [txt.shape[1], img.shape[1]]
for i, block in enumerate(self.single_blocks):
transformer_options["block_index"] = i
if ("single_block", i) in blocks_replace:
@@ -436,9 +435,9 @@ class HunyuanVideo(nn.Module):
if i < len(control_o):
add = control_o[i]
if add is not None:
img[:, txt.shape[1]: img_len + txt.shape[1]] += add
img[:, : img_len] += add
img = img[:, txt.shape[1]: img_len + txt.shape[1]]
img = img[:, : img_len]
if ref_latent is not None:
img = img[:, ref_latent.shape[1]:]
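The HunyuanVideo hunks above all follow from one layout change: with `flipped_img_txt=True` the joint sequence becomes `[img | txt]` instead of `[txt | img]`, so image tokens come out of a prefix slice. A toy shape check (illustrative, not from the diff):

```
import torch

img_len, txt_len, dim = 4, 3, 2
img = torch.zeros(1, img_len, dim)
txt = torch.ones(1, txt_len, dim)

seq = torch.cat((img, txt), dim=1)         # new layout: [img | txt]
assert torch.equal(seq[:, :img_len], img)  # new prefix slice
# old layout was torch.cat((txt, img), 1) with slice
# seq[:, txt_len : txt_len + img_len]
```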

View File

@@ -102,7 +102,19 @@ class VideoConv3d(nn.Module):
return self.conv(x)
def interpolate_up(x, scale_factor):
return torch.nn.functional.interpolate(x, scale_factor=scale_factor, mode="nearest")
try:
return torch.nn.functional.interpolate(x, scale_factor=scale_factor, mode="nearest")
except Exception: # operation not implemented for bf16
orig_shape = list(x.shape)
out_shape = orig_shape[:2]
for i in range(len(orig_shape) - 2):
out_shape.append(round(orig_shape[i + 2] * scale_factor[i]))
out = torch.empty(out_shape, dtype=x.dtype, layout=x.layout, device=x.device)
split = 8
l = max(1, out.shape[1] // split)  # never let the chunk stride reach zero
for i in range(0, out.shape[1], l):
out[:,i:i+l] = torch.nn.functional.interpolate(x[:,i:i+l].to(torch.float32), scale_factor=scale_factor, mode="nearest").to(x.dtype)
return out
class Upsample(nn.Module):
def __init__(self, in_channels, with_conv, conv_op=ops.Conv2d, scale_factor=2.0):
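A usage sketch for the fallback path above (shapes are illustrative): when the nearest-neighbor kernel is unavailable for bf16, channels are upsampled in fp32 chunks and cast back, so callers see the same shape and dtype either way:

```
import torch

x = torch.randn(1, 16, 8, 8, dtype=torch.bfloat16)
y = interpolate_up(x, scale_factor=(2.0, 2.0))
print(y.shape, y.dtype)  # torch.Size([1, 16, 16, 16]) torch.bfloat16
```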

View File

@@ -2,196 +2,6 @@ import torch
import math
from .model import QwenImageTransformer2DModel
from .model import QwenImageTransformerBlock
class QwenImageFunControlBlock(QwenImageTransformerBlock):
def __init__(self, dim, num_attention_heads, attention_head_dim, has_before_proj=False, dtype=None, device=None, operations=None):
super().__init__(
dim=dim,
num_attention_heads=num_attention_heads,
attention_head_dim=attention_head_dim,
dtype=dtype,
device=device,
operations=operations,
)
self.has_before_proj = has_before_proj
if has_before_proj:
self.before_proj = operations.Linear(dim, dim, device=device, dtype=dtype)
self.after_proj = operations.Linear(dim, dim, device=device, dtype=dtype)
class QwenImageFunControlNetModel(torch.nn.Module):
def __init__(
self,
control_in_features=132,
inner_dim=3072,
num_attention_heads=24,
attention_head_dim=128,
num_control_blocks=5,
main_model_double=60,
injection_layers=(0, 12, 24, 36, 48),
dtype=None,
device=None,
operations=None,
):
super().__init__()
self.dtype = dtype
self.main_model_double = main_model_double
self.injection_layers = tuple(injection_layers)
# Keep base hint scaling at 1.0 so user-facing strength behaves similarly
# to the reference Gen2/VideoX implementation around strength=1.
self.hint_scale = 1.0
self.control_img_in = operations.Linear(control_in_features, inner_dim, device=device, dtype=dtype)
self.control_blocks = torch.nn.ModuleList([])
for i in range(num_control_blocks):
self.control_blocks.append(
QwenImageFunControlBlock(
dim=inner_dim,
num_attention_heads=num_attention_heads,
attention_head_dim=attention_head_dim,
has_before_proj=(i == 0),
dtype=dtype,
device=device,
operations=operations,
)
)
def _process_hint_tokens(self, hint):
if hint is None:
return None
if hint.ndim == 4:
hint = hint.unsqueeze(2)
# Fun checkpoints are trained with 33 latent channels before 2x2 packing:
# [control_latent(16), mask(1), inpaint_latent(16)] -> 132 features.
# Default behavior (no inpaint input in stock Apply ControlNet) should use
# zeros for mask/inpaint branches, matching VideoX fallback semantics.
expected_c = self.control_img_in.weight.shape[1] // 4
if hint.shape[1] == 16 and expected_c == 33:
zeros_mask = torch.zeros_like(hint[:, :1])
zeros_inpaint = torch.zeros_like(hint)
hint = torch.cat([hint, zeros_mask, zeros_inpaint], dim=1)
bs, c, t, h, w = hint.shape
hidden_states = torch.nn.functional.pad(hint, (0, w % 2, 0, h % 2))
orig_shape = hidden_states.shape
hidden_states = hidden_states.view(
orig_shape[0],
orig_shape[1],
orig_shape[-3],
orig_shape[-2] // 2,
2,
orig_shape[-1] // 2,
2,
)
hidden_states = hidden_states.permute(0, 2, 3, 5, 1, 4, 6)
hidden_states = hidden_states.reshape(
bs,
t * ((h + 1) // 2) * ((w + 1) // 2),
c * 4,
)
expected_in = self.control_img_in.weight.shape[1]
cur_in = hidden_states.shape[-1]
if cur_in < expected_in:
pad = torch.zeros(
(hidden_states.shape[0], hidden_states.shape[1], expected_in - cur_in),
device=hidden_states.device,
dtype=hidden_states.dtype,
)
hidden_states = torch.cat([hidden_states, pad], dim=-1)
elif cur_in > expected_in:
hidden_states = hidden_states[:, :, :expected_in]
return hidden_states
def forward(
self,
x,
timesteps,
context,
attention_mask=None,
guidance: torch.Tensor = None,
hint=None,
transformer_options={},
base_model=None,
**kwargs,
):
if base_model is None:
raise RuntimeError("Qwen Fun ControlNet requires a QwenImage base model at runtime.")
encoder_hidden_states_mask = attention_mask
# Keep attention mask disabled inside Fun control blocks to mirror
# VideoX behavior (they rely on seq lengths for RoPE, not masked attention).
encoder_hidden_states_mask = None
hidden_states, img_ids, _ = base_model.process_img(x)
hint_tokens = self._process_hint_tokens(hint)
if hint_tokens is None:
raise RuntimeError("Qwen Fun ControlNet requires a control hint image.")
if hint_tokens.shape[1] != hidden_states.shape[1]:
max_tokens = min(hint_tokens.shape[1], hidden_states.shape[1])
hint_tokens = hint_tokens[:, :max_tokens]
hidden_states = hidden_states[:, :max_tokens]
img_ids = img_ids[:, :max_tokens]
txt_start = round(
max(
((x.shape[-1] + (base_model.patch_size // 2)) // base_model.patch_size) // 2,
((x.shape[-2] + (base_model.patch_size // 2)) // base_model.patch_size) // 2,
)
)
txt_ids = torch.arange(txt_start, txt_start + context.shape[1], device=x.device).reshape(1, -1, 1).repeat(x.shape[0], 1, 3)
ids = torch.cat((txt_ids, img_ids), dim=1)
image_rotary_emb = base_model.pe_embedder(ids).to(x.dtype).contiguous()
hidden_states = base_model.img_in(hidden_states)
encoder_hidden_states = base_model.txt_norm(context)
encoder_hidden_states = base_model.txt_in(encoder_hidden_states)
if guidance is not None:
guidance = guidance * 1000
temb = (
base_model.time_text_embed(timesteps, hidden_states)
if guidance is None
else base_model.time_text_embed(timesteps, guidance, hidden_states)
)
c = self.control_img_in(hint_tokens)
for i, block in enumerate(self.control_blocks):
if i == 0:
c_in = block.before_proj(c) + hidden_states
all_c = []
else:
all_c = list(torch.unbind(c, dim=0))
c_in = all_c.pop(-1)
encoder_hidden_states, c_out = block(
hidden_states=c_in,
encoder_hidden_states=encoder_hidden_states,
encoder_hidden_states_mask=encoder_hidden_states_mask,
temb=temb,
image_rotary_emb=image_rotary_emb,
transformer_options=transformer_options,
)
c_skip = block.after_proj(c_out) * self.hint_scale
all_c += [c_skip, c_out]
c = torch.stack(all_c, dim=0)
hints = torch.unbind(c, dim=0)[:-1]
controlnet_block_samples = [None] * self.main_model_double
for local_idx, base_idx in enumerate(self.injection_layers):
if local_idx < len(hints) and base_idx < len(controlnet_block_samples):
controlnet_block_samples[base_idx] = hints[local_idx]
return {"input": controlnet_block_samples}
class QwenImageControlNetModel(QwenImageTransformer2DModel):
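The channel arithmetic in `_process_hint_tokens` deserves a worked example (values illustrative, using the default configuration): 16 control-latent channels are padded with 1 mask channel and 16 zero inpaint channels to reach 33, and the 2x2 spatial packing multiplies that by 4 to match `control_in_features=132`:

```
import math

c, t, h, w = 33, 1, 64, 64
tokens = t * math.ceil(h / 2) * math.ceil(w / 2)
features = c * 4
print(tokens, features)  # 1024 132
```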

View File

@@ -374,31 +374,6 @@ def pad_tensor_to_shape(tensor: torch.Tensor, new_shape: list[int]) -> torch.Ten
return padded_tensor
def calculate_shape(patches, weight, key, original_weights=None):
current_shape = weight.shape
for p in patches:
v = p[1]
offset = p[3]
# Offsets restore the old shape; lists force a diff without metadata
if offset is not None or isinstance(v, list):
continue
if isinstance(v, weight_adapter.WeightAdapterBase):
adapter_shape = v.calculate_shape(key)
if adapter_shape is not None:
current_shape = adapter_shape
continue
# Standard diff logic with padding
if len(v) == 2:
patch_type, patch_data = v[0], v[1]
if patch_type == "diff" and len(patch_data) > 1 and patch_data[1]['pad_weight']:
current_shape = patch_data[0].shape
return current_shape
def calculate_weight(patches, weight, key, intermediate_dtype=torch.float32, original_weights=None):
for p in patches:
strength = p[0]

View File

@@ -5,7 +5,7 @@ import comfy.utils
def convert_lora_bfl_control(sd): #BFL loras for Flux
sd_out = {}
for k in sd:
k_to = "diffusion_model.{}".format(k.replace(".lora_B.bias", ".diff_b").replace("_norm.scale", "_norm.set_weight"))
k_to = "diffusion_model.{}".format(k.replace(".lora_B.bias", ".diff_b").replace("_norm.scale", "_norm.scale.set_weight"))
sd_out[k_to] = sd[k]
sd_out["diffusion_model.img_in.reshape_weight"] = torch.tensor([sd["img_in.lora_B.weight"].shape[0], sd["img_in.lora_A.weight"].shape[1]])

View File

@@ -178,7 +178,10 @@ class BaseModel(torch.nn.Module):
xc = torch.cat([xc] + [comfy.model_management.cast_to_device(c_concat, xc.device, xc.dtype)], dim=1)
context = c_crossattn
dtype = self.get_dtype_inference()
dtype = self.get_dtype()
if self.manual_cast_dtype is not None:
dtype = self.manual_cast_dtype
xc = xc.to(dtype)
device = xc.device
@@ -215,13 +218,6 @@ class BaseModel(torch.nn.Module):
def get_dtype(self):
return self.diffusion_model.dtype
def get_dtype_inference(self):
dtype = self.get_dtype()
if self.manual_cast_dtype is not None:
dtype = self.manual_cast_dtype
return dtype
def encode_adm(self, **kwargs):
return None
@@ -376,7 +372,9 @@ class BaseModel(torch.nn.Module):
input_shapes += shape
if comfy.model_management.xformers_enabled() or comfy.model_management.pytorch_attention_flash_attention():
dtype = self.get_dtype_inference()
dtype = self.get_dtype()
if self.manual_cast_dtype is not None:
dtype = self.manual_cast_dtype
#TODO: this needs to be tweaked
area = sum(map(lambda input_shape: input_shape[0] * math.prod(input_shape[2:]), input_shapes))
return (area * comfy.model_management.dtype_size(dtype) * 0.01 * self.memory_usage_factor) * (1024 * 1024)
@@ -1167,7 +1165,7 @@ class Anima(BaseModel):
t5xxl_ids = t5xxl_ids.unsqueeze(0)
if torch.is_inference_mode_enabled(): # if not, we are training
cross_attn = self.diffusion_model.preprocess_text_embeds(cross_attn.to(device=device, dtype=self.get_dtype_inference()), t5xxl_ids.to(device=device), t5xxl_weights=t5xxl_weights.to(device=device, dtype=self.get_dtype_inference()))
cross_attn = self.diffusion_model.preprocess_text_embeds(cross_attn.to(device=device, dtype=self.get_dtype()), t5xxl_ids.to(device=device), t5xxl_weights=t5xxl_weights.to(device=device, dtype=self.get_dtype()))
else:
out['t5xxl_ids'] = comfy.conds.CONDRegular(t5xxl_ids)
out['t5xxl_weights'] = comfy.conds.CONDRegular(t5xxl_weights)

View File

@@ -19,12 +19,6 @@ def count_blocks(state_dict_keys, prefix_string):
count += 1
return count
def any_suffix_in(keys, prefix, main, suffix_list=[]):
for x in suffix_list:
if "{}{}{}".format(prefix, main, x) in keys:
return True
return False
def calculate_transformer_depth(prefix, state_dict_keys, state_dict):
context_dim = None
use_linear_in_transformer = False
@@ -192,7 +186,7 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
dit_config["meanflow_sum"] = False
return dit_config
if any_suffix_in(state_dict_keys, key_prefix, 'double_blocks.0.img_attn.norm.key_norm.', ["weight", "scale"]) and ('{}img_in.weight'.format(key_prefix) in state_dict_keys or any_suffix_in(state_dict_keys, key_prefix, 'distilled_guidance_layer.norms.0.', ["weight", "scale"])): #Flux, Chroma or Chroma Radiance (has no img_in.weight)
if '{}double_blocks.0.img_attn.norm.key_norm.scale'.format(key_prefix) in state_dict_keys and ('{}img_in.weight'.format(key_prefix) in state_dict_keys or f"{key_prefix}distilled_guidance_layer.norms.0.scale" in state_dict_keys): #Flux, Chroma or Chroma Radiance (has no img_in.weight)
dit_config = {}
if '{}double_stream_modulation_img.lin.weight'.format(key_prefix) in state_dict_keys:
dit_config["image_model"] = "flux2"
@@ -247,8 +241,7 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
dit_config["depth"] = count_blocks(state_dict_keys, '{}double_blocks.'.format(key_prefix) + '{}.')
dit_config["depth_single_blocks"] = count_blocks(state_dict_keys, '{}single_blocks.'.format(key_prefix) + '{}.')
if any_suffix_in(state_dict_keys, key_prefix, 'distilled_guidance_layer.0.norms.0.', ["weight", "scale"]) or any_suffix_in(state_dict_keys, key_prefix, 'distilled_guidance_layer.norms.0.', ["weight", "scale"]): #Chroma
if '{}distilled_guidance_layer.0.norms.0.scale'.format(key_prefix) in state_dict_keys or '{}distilled_guidance_layer.norms.0.scale'.format(key_prefix) in state_dict_keys: #Chroma
dit_config["image_model"] = "chroma"
dit_config["in_channels"] = 64
dit_config["out_channels"] = 64
@@ -256,8 +249,7 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
dit_config["out_dim"] = 3072
dit_config["hidden_dim"] = 5120
dit_config["n_layers"] = 5
if any_suffix_in(state_dict_keys, key_prefix, 'nerf_blocks.0.norm.', ["weight", "scale"]): #Chroma Radiance
if f"{key_prefix}nerf_blocks.0.norm.scale" in state_dict_keys: #Chroma Radiance
dit_config["image_model"] = "chroma_radiance"
dit_config["in_channels"] = 3
dit_config["out_channels"] = 3
@@ -267,7 +259,7 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
dit_config["nerf_depth"] = 4
dit_config["nerf_max_freqs"] = 8
dit_config["nerf_tile_size"] = 512
dit_config["nerf_final_head_type"] = "conv" if any_suffix_in(state_dict_keys, key_prefix, 'nerf_final_layer_conv.norm.', ["weight", "scale"]) else "linear"
dit_config["nerf_final_head_type"] = "conv" if f"{key_prefix}nerf_final_layer_conv.norm.scale" in state_dict_keys else "linear"
dit_config["nerf_embedder_dtype"] = torch.float32
if "{}__x0__".format(key_prefix) in state_dict_keys: # x0 pred
dit_config["use_x0"] = True
@@ -276,7 +268,7 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
else:
dit_config["guidance_embed"] = "{}guidance_in.in_layer.weight".format(key_prefix) in state_dict_keys
dit_config["yak_mlp"] = '{}double_blocks.0.img_mlp.gate_proj.weight'.format(key_prefix) in state_dict_keys
dit_config["txt_norm"] = any_suffix_in(state_dict_keys, key_prefix, 'txt_norm.', ["weight", "scale"])
dit_config["txt_norm"] = "{}txt_norm.scale".format(key_prefix) in state_dict_keys
if dit_config["yak_mlp"] and dit_config["txt_norm"]: # Ovis model
dit_config["txt_ids_dims"] = [1, 2]

View File

@@ -15,7 +15,6 @@
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
"""
from __future__ import annotations
import psutil
import logging
@@ -36,10 +35,6 @@ import comfy.quant_ops
import comfy_aimdo.torch
import comfy_aimdo.model_vbar
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from comfy.model_patcher import ModelPatcher
class VRAMState(Enum):
DISABLED = 0 #No vram present: no need to move models to vram
NO_VRAM = 1 #Very low vram: enable all the options to save vram
@@ -205,25 +200,6 @@ def get_torch_device():
else:
return torch.device(torch.cuda.current_device())
def get_all_torch_devices(exclude_current=False):
global cpu_state
devices = []
if cpu_state == CPUState.GPU:
if is_nvidia():
for i in range(torch.cuda.device_count()):
devices.append(torch.device(i))
elif is_intel_xpu():
for i in range(torch.xpu.device_count()):
devices.append(torch.device(i))
elif is_ascend_npu():
for i in range(torch.npu.device_count()):
devices.append(torch.device(i))
else:
devices.append(get_torch_device())
if exclude_current:
devices.remove(get_torch_device())
return devices
def get_total_memory(dev=None, torch_total_too=False):
global directml_enabled
if dev is None:
@@ -495,13 +471,9 @@ try:
logging.info("Device: {}".format(get_torch_device_name(get_torch_device())))
except:
logging.warning("Could not pick default device.")
try:
for device in get_all_torch_devices(exclude_current=True):
logging.info("Device: {}".format(get_torch_device_name(device)))
except:
pass
current_loaded_models: list[LoadedModel] = []
current_loaded_models = []
def module_size(module):
module_mem = 0
@@ -512,7 +484,7 @@ def module_size(module):
return module_mem
class LoadedModel:
def __init__(self, model: ModelPatcher):
def __init__(self, model):
self._set_model(model)
self.device = model.load_device
self.real_model = None
@@ -520,7 +492,7 @@ class LoadedModel:
self.model_finalizer = None
self._patcher_finalizer = None
def _set_model(self, model: ModelPatcher):
def _set_model(self, model):
self._model = weakref.ref(model)
if model.parent is not None:
self._parent_model = weakref.ref(model.parent)
@@ -1741,34 +1713,7 @@ def soft_empty_cache(force=False):
torch.cuda.ipc_collect()
def unload_all_models():
for device in get_all_torch_devices():
free_memory(1e30, device)
def unload_model_and_clones(model: ModelPatcher, unload_additional_models=True, all_devices=False):
'Unload only model and its clones - primarily for multigpu cloning purposes.'
initial_keep_loaded: list[LoadedModel] = current_loaded_models.copy()
additional_models = []
if unload_additional_models:
additional_models = model.get_nested_additional_models()
keep_loaded = []
for loaded_model in initial_keep_loaded:
if loaded_model.model is not None:
if model.clone_base_uuid == loaded_model.model.clone_base_uuid:
continue
# check additional models if they are a match
skip = False
for add_model in additional_models:
if add_model.clone_base_uuid == loaded_model.model.clone_base_uuid:
skip = True
break
if skip:
continue
keep_loaded.append(loaded_model)
if not all_devices:
free_memory(1e30, get_torch_device(), keep_loaded)
else:
for device in get_all_torch_devices():
free_memory(1e30, device, keep_loaded)
free_memory(1e30, get_torch_device())
def debug_memory_summary():
if is_amd() or is_nvidia():

View File

@@ -23,7 +23,6 @@ import inspect
import logging
import math
import uuid
import copy
from typing import Callable, Optional
import torch
@@ -76,15 +75,12 @@ def set_model_options_pre_cfg_function(model_options, pre_cfg_function, disable_
def create_model_options_clone(orig_model_options: dict):
return comfy.patcher_extension.copy_nested_dicts(orig_model_options)
def create_hook_patches_clone(orig_hook_patches, copy_tuples=False):
def create_hook_patches_clone(orig_hook_patches):
new_hook_patches = {}
for hook_ref in orig_hook_patches:
new_hook_patches[hook_ref] = {}
for k in orig_hook_patches[hook_ref]:
new_hook_patches[hook_ref][k] = orig_hook_patches[hook_ref][k][:]
if copy_tuples:
for i in range(len(new_hook_patches[hook_ref][k])):
new_hook_patches[hook_ref][k][i] = tuple(new_hook_patches[hook_ref][k][i])
return new_hook_patches
def wipe_lowvram_weight(m):
@@ -275,10 +271,6 @@ class ModelPatcher:
self.is_clip = False
self.hook_mode = comfy.hooks.EnumHookMode.MaxSpeed
self.cached_patcher_init: tuple[Callable, tuple] | tuple[Callable, tuple, int] | None = None
self.is_multigpu_base_clone = False
self.clone_base_uuid = uuid.uuid4()
if not hasattr(self.model, 'model_loaded_weight_memory'):
self.model.model_loaded_weight_memory = 0
@@ -370,99 +362,18 @@ class ModelPatcher:
n.is_clip = self.is_clip
n.hook_mode = self.hook_mode
n.cached_patcher_init = self.cached_patcher_init
n.is_multigpu_base_clone = self.is_multigpu_base_clone
n.clone_base_uuid = self.clone_base_uuid
for callback in self.get_all_callbacks(CallbacksMP.ON_CLONE):
callback(self, n)
return n
def deepclone_multigpu(self, new_load_device=None, models_cache: dict[uuid.UUID,ModelPatcher]=None):
logging.info(f"Creating deepclone of {self.model.__class__.__name__} for {new_load_device if new_load_device else self.load_device}.")
comfy.model_management.unload_model_and_clones(self)
n = self.clone()
# set load device, if present
if new_load_device is not None:
n.load_device = new_load_device
if self.cached_patcher_init is not None:
temp_model_patcher: ModelPatcher | list[ModelPatcher] = self.cached_patcher_init[0](*self.cached_patcher_init[1])
if len(self.cached_patcher_init) > 2:
temp_model_patcher = temp_model_patcher[self.cached_patcher_init[2]]
n.model = temp_model_patcher.model
else:
n.model = copy.deepcopy(n.model)
# unlike for normal clone, backup dicts that shared same ref should not;
# otherwise, patchers that have deep copies of base models will erroneously influence each other.
n.backup = copy.deepcopy(n.backup)
n.object_patches_backup = copy.deepcopy(n.object_patches_backup)
n.hook_backup = copy.deepcopy(n.hook_backup)
# multigpu clone should not have multigpu additional_models entry
n.remove_additional_models("multigpu")
# multigpu_clone all stored additional_models; make sure circular references are properly handled
if models_cache is None:
models_cache = {}
for key, model_list in n.additional_models.items():
for i in range(len(model_list)):
add_model = n.additional_models[key][i]
if add_model.clone_base_uuid not in models_cache:
models_cache[add_model.clone_base_uuid] = add_model.deepclone_multigpu(new_load_device=new_load_device, models_cache=models_cache)
n.additional_models[key][i] = models_cache[add_model.clone_base_uuid]
for callback in self.get_all_callbacks(CallbacksMP.ON_DEEPCLONE_MULTIGPU):
callback(self, n)
return n
def match_multigpu_clones(self):
multigpu_models = self.get_additional_models_with_key("multigpu")
if len(multigpu_models) > 0:
new_multigpu_models = []
for mm in multigpu_models:
# clone main model, but bring over relevant props from existing multigpu clone
n = self.clone()
n.load_device = mm.load_device
n.backup = mm.backup
n.object_patches_backup = mm.object_patches_backup
n.hook_backup = mm.hook_backup
n.model = mm.model
n.is_multigpu_base_clone = mm.is_multigpu_base_clone
n.remove_additional_models("multigpu")
orig_additional_models: dict[str, list[ModelPatcher]] = comfy.patcher_extension.copy_nested_dicts(n.additional_models)
n.additional_models = comfy.patcher_extension.copy_nested_dicts(mm.additional_models)
# figure out which additional models are not present in multigpu clone
models_cache = {}
for mm_add_model in mm.get_additional_models():
models_cache[mm_add_model.clone_base_uuid] = mm_add_model
remove_models_uuids = set(list(models_cache.keys()))
for key, model_list in orig_additional_models.items():
for orig_add_model in model_list:
if orig_add_model.clone_base_uuid not in models_cache:
models_cache[orig_add_model.clone_base_uuid] = orig_add_model.deepclone_multigpu(new_load_device=n.load_device, models_cache=models_cache)
existing_list = n.get_additional_models_with_key(key)
existing_list.append(models_cache[orig_add_model.clone_base_uuid])
n.set_additional_models(key, existing_list)
if orig_add_model.clone_base_uuid in remove_models_uuids:
remove_models_uuids.remove(orig_add_model.clone_base_uuid)
# remove duplicate additional models
for key, model_list in n.additional_models.items():
new_model_list = [x for x in model_list if x.clone_base_uuid not in remove_models_uuids]
n.set_additional_models(key, new_model_list)
for callback in self.get_all_callbacks(CallbacksMP.ON_MATCH_MULTIGPU_CLONES):
callback(self, n)
new_multigpu_models.append(n)
self.set_additional_models("multigpu", new_multigpu_models)
def is_clone(self, other):
if hasattr(other, 'model') and self.model is other.model:
return True
return False
def clone_has_same_weights(self, clone: ModelPatcher, allow_multigpu=False):
if allow_multigpu:
if self.clone_base_uuid != clone.clone_base_uuid:
return False
else:
if not self.is_clone(clone):
return False
def clone_has_same_weights(self, clone: 'ModelPatcher'):
if not self.is_clone(clone):
return False
if self.current_hooks != clone.current_hooks:
return False
@@ -495,16 +406,13 @@ class ModelPatcher:
def memory_required(self, input_shape):
return self.model.memory_required(input_shape=input_shape)
def disable_model_cfg1_optimization(self):
self.model_options["disable_cfg1_optimization"] = True
def set_model_sampler_cfg_function(self, sampler_cfg_function, disable_cfg1_optimization=False):
if len(inspect.signature(sampler_cfg_function).parameters) == 3:
self.model_options["sampler_cfg_function"] = lambda args: sampler_cfg_function(args["cond"], args["uncond"], args["cond_scale"]) #Old way
else:
self.model_options["sampler_cfg_function"] = sampler_cfg_function
if disable_cfg1_optimization:
self.disable_model_cfg1_optimization()
self.model_options["disable_cfg1_optimization"] = True
def set_model_sampler_post_cfg_function(self, post_cfg_function, disable_cfg1_optimization=False):
self.model_options = set_model_options_post_cfg_function(self.model_options, post_cfg_function, disable_cfg1_optimization)
@@ -771,19 +679,18 @@ class ModelPatcher:
for key in list(self.pinned):
self.unpin_weight(key)
def _load_list(self, prio_comfy_cast_weights=False, default_device=None):
def _load_list(self, prio_comfy_cast_weights=False):
loading = []
for n, m in self.model.named_modules():
default = False
params = { name: param for name, param in m.named_parameters(recurse=False) }
params = []
skip = False
for name, param in m.named_parameters(recurse=False):
params.append(name)
for name, param in m.named_parameters(recurse=True):
if name not in params:
default = True # default random weights in non leaf modules
skip = True # skip random weights in non leaf modules
break
if default and default_device is not None:
for param in params.values():
param.data = param.data.to(device=default_device)
if not default and (hasattr(m, "comfy_cast_weights") or len(params) > 0):
if not skip and (hasattr(m, "comfy_cast_weights") or len(params) > 0):
module_mem = comfy.model_management.module_size(m)
module_offload_mem = module_mem
if hasattr(m, "comfy_cast_weights"):
@@ -1202,7 +1109,7 @@ class ModelPatcher:
return self.additional_models.get(key, [])
def get_additional_models(self):
all_models: list[ModelPatcher] = []
all_models = []
for models in self.additional_models.values():
all_models.extend(models)
return all_models
@@ -1256,13 +1163,9 @@ class ModelPatcher:
for callback in self.get_all_callbacks(CallbacksMP.ON_PRE_RUN):
callback(self)
def prepare_state(self, timestep, model_options, ignore_multigpu=False):
def prepare_state(self, timestep):
for callback in self.get_all_callbacks(CallbacksMP.ON_PREPARE_STATE):
callback(self, timestep, model_options, ignore_multigpu)
if not ignore_multigpu and "multigpu_clones" in model_options:
for p in model_options["multigpu_clones"].values():
p: ModelPatcher
p.prepare_state(timestep, model_options, ignore_multigpu=True)
callback(self, timestep)
def restore_hook_patches(self):
if self.hook_patches_backup is not None:
@@ -1275,18 +1178,12 @@ class ModelPatcher:
def prepare_hook_patches_current_keyframe(self, t: torch.Tensor, hook_group: comfy.hooks.HookGroup, model_options: dict[str]):
curr_t = t[0]
reset_current_hooks = False
multigpu_kf_changed_cache = None
transformer_options = model_options.get("transformer_options", {})
for hook in hook_group.hooks:
changed = hook.hook_keyframe.prepare_current_keyframe(curr_t=curr_t, transformer_options=transformer_options)
# if keyframe changed, remove any cached HookGroups that contain hook with the same hook_ref;
# this will cause the weights to be recalculated when sampling
if changed:
# cache changed for multigpu usage
if "multigpu_clones" in model_options:
if multigpu_kf_changed_cache is None:
multigpu_kf_changed_cache = []
multigpu_kf_changed_cache.append(hook)
# reset current_hooks if contains hook that changed
if self.current_hooks is not None:
for current_hook in self.current_hooks.hooks:
@@ -1298,28 +1195,6 @@ class ModelPatcher:
self.cached_hook_patches.pop(cached_group)
if reset_current_hooks:
self.patch_hooks(None)
if "multigpu_clones" in model_options:
for p in model_options["multigpu_clones"].values():
p: ModelPatcher
p._handle_changed_hook_keyframes(multigpu_kf_changed_cache)
def _handle_changed_hook_keyframes(self, kf_changed_cache: list[comfy.hooks.Hook]):
'Used to handle multigpu behavior inside prepare_hook_patches_current_keyframe.'
if kf_changed_cache is None:
return
reset_current_hooks = False
# reset current_hooks if contains hook that changed
for hook in kf_changed_cache:
if self.current_hooks is not None:
for current_hook in self.current_hooks.hooks:
if current_hook == hook:
reset_current_hooks = True
break
for cached_group in list(self.cached_hook_patches.keys()):
if cached_group.contains(hook):
self.cached_hook_patches.pop(cached_group)
if reset_current_hooks:
self.patch_hooks(None)
def register_all_hook_patches(self, hooks: comfy.hooks.HookGroup, target_dict: dict[str], model_options: dict=None,
registered: comfy.hooks.HookGroup = None):
@@ -1620,7 +1495,7 @@ class ModelPatcherDynamic(ModelPatcher):
#with pin and unpin synchronization which can be expensive for small weights
#with a high layer rate (e.g. autoregressive LLMs).
#prioritize the non-comfy weights (note the order reverse).
loading = self._load_list(prio_comfy_cast_weights=True, default_device=device_to)
loading = self._load_list(prio_comfy_cast_weights=True)
loading.sort(reverse=True)
for x in loading:
@@ -1638,10 +1513,8 @@ class ModelPatcherDynamic(ModelPatcher):
weight, _, _ = get_key_weight(self.model, key)
if weight is None:
return (False, 0)
return 0
if key in self.patches:
if comfy.lora.calculate_shape(self.patches[key], weight, key) != weight.shape:
return (True, 0)
setattr(m, param_key + "_lowvram_function", LowVramPatch(key, self.patches))
num_patches += 1
else:
@@ -1655,13 +1528,7 @@ class ModelPatcherDynamic(ModelPatcher):
model_dtype = getattr(m, param_key + "_comfy_model_dtype", None) or weight.dtype
weight._model_dtype = model_dtype
geometry = comfy.memory_management.TensorGeometry(shape=weight.shape, dtype=model_dtype)
return (False, comfy.memory_management.vram_aligned_size(geometry))
def force_load_param(self, param_key, device_to):
key = key_param_name_to_key(n, param_key)
if key in self.backup:
comfy.utils.set_attr_param(self.model, key, self.backup[key].weight)
self.patch_weight_to_device(key, device_to=device_to)
return comfy.memory_management.vram_aligned_size(geometry)
if hasattr(m, "comfy_cast_weights"):
m.comfy_cast_weights = True
@@ -1669,19 +1536,13 @@ class ModelPatcherDynamic(ModelPatcher):
m.seed_key = n
set_dirty(m, dirty)
force_load, v_weight_size = setup_param(self, m, n, "weight")
force_load_bias, v_weight_bias = setup_param(self, m, n, "bias")
force_load = force_load or force_load_bias
v_weight_size += v_weight_bias
v_weight_size = 0
v_weight_size += setup_param(self, m, n, "weight")
v_weight_size += setup_param(self, m, n, "bias")
if force_load:
logging.info(f"Module {n} has resizing Lora - force loading")
force_load_param(self, "weight", device_to)
force_load_param(self, "bias", device_to)
else:
if vbar is not None and not hasattr(m, "_v"):
m._v = vbar.alloc(v_weight_size)
allocated_size += v_weight_size
if vbar is not None and not hasattr(m, "_v"):
m._v = vbar.alloc(v_weight_size)
allocated_size += v_weight_size
else:
for param in params:
@@ -1699,8 +1560,6 @@ class ModelPatcherDynamic(ModelPatcher):
allocated_size += weight_size
vbar.set_watermark_limit(allocated_size)
move_weight_functions(m, device_to)
logging.info(f"Model {self.model.__class__.__name__} prepared for dynamic VRAM loading. {allocated_size // (1024 ** 2)}MB Staged. {num_patches} patches attached.")
self.model.device = device_to
@@ -1720,7 +1579,7 @@ class ModelPatcherDynamic(ModelPatcher):
return 0 if vbar is None else vbar.free_memory(memory_to_free)
def partially_unload_ram(self, ram_to_unload):
loading = self._load_list(prio_comfy_cast_weights=True, default_device=self.offload_device)
loading = self._load_list(prio_comfy_cast_weights=True)
for x in loading:
_, _, _, _, m, _ = x
ram_to_unload -= comfy.pinned_memory.unpin_memory(m)
@@ -1741,13 +1600,6 @@ class ModelPatcherDynamic(ModelPatcher):
if unpatch_weights:
self.partially_unload_ram(1e32)
self.partially_unload(None, 1e32)
for m in self.model.modules():
move_weight_functions(m, device_to)
keys = list(self.backup.keys())
for k in keys:
bk = self.backup[k]
comfy.utils.set_attr_param(self.model, k, bk.weight)
def partially_load(self, device_to, extra_memory=0, force_patch_weights=False):
assert not force_patch_weights #See above

View File

@@ -1,167 +0,0 @@
from __future__ import annotations
import torch
import logging
from collections import namedtuple
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from comfy.model_patcher import ModelPatcher
import comfy.utils
import comfy.patcher_extension
import comfy.model_management
class GPUOptions:
def __init__(self, device_index: int, relative_speed: float):
self.device_index = device_index
self.relative_speed = relative_speed
def clone(self):
return GPUOptions(self.device_index, self.relative_speed)
def create_dict(self):
return {
"relative_speed": self.relative_speed
}
class GPUOptionsGroup:
def __init__(self):
self.options: dict[int, GPUOptions] = {}
def add(self, info: GPUOptions):
self.options[info.device_index] = info
def clone(self):
c = GPUOptionsGroup()
for opt in self.options.values():
c.add(opt)
return c
def register(self, model: ModelPatcher):
opts_dict = {}
# get devices that are valid for this model
devices: list[torch.device] = [model.load_device]
for extra_model in model.get_additional_models_with_key("multigpu"):
extra_model: ModelPatcher
devices.append(extra_model.load_device)
# create dictionary with actual device mapped to its GPUOptions
device_opts_list: list[GPUOptions] = []
for device in devices:
device_opts = self.options.get(device.index, GPUOptions(device_index=device.index, relative_speed=1.0))
opts_dict[device] = device_opts.create_dict()
device_opts_list.append(device_opts)
# make relative_speed relative to 1.0
min_speed = min([x.relative_speed for x in device_opts_list])
for value in opts_dict.values():
value['relative_speed'] /= min_speed
model.model_options['multigpu_options'] = opts_dict
def create_multigpu_deepclones(model: ModelPatcher, max_gpus: int, gpu_options: GPUOptionsGroup=None, reuse_loaded=False):
'Prepare ModelPatcher to contain deepclones of its BaseModel and related properties.'
model = model.clone()
# check if multigpu is already prepared; if so, collect the load devices of existing clones so they can be excluded
skip_devices = set()
multigpu_models = model.get_additional_models_with_key("multigpu")
if len(multigpu_models) > 0:
for mm in multigpu_models:
skip_devices.add(mm.load_device)
skip_devices = list(skip_devices)
full_extra_devices = comfy.model_management.get_all_torch_devices(exclude_current=True)
limit_extra_devices = full_extra_devices[:max_gpus-1]
extra_devices = limit_extra_devices.copy()
# exclude skipped devices
for skip in skip_devices:
if skip in extra_devices:
extra_devices.remove(skip)
# create new deepclones
if len(extra_devices) > 0:
for device in extra_devices:
device_patcher = None
if reuse_loaded:
# check if there are any ModelPatchers currently loaded that could be referenced here after a clone
loaded_models: list[ModelPatcher] = comfy.model_management.loaded_models()
for lm in loaded_models:
if lm.model is not None and lm.clone_base_uuid == model.clone_base_uuid and lm.load_device == device:
device_patcher = lm.clone()
logging.info(f"Reusing loaded deepclone of {device_patcher.model.__class__.__name__} for {device}")
break
if device_patcher is None:
device_patcher = model.deepclone_multigpu(new_load_device=device)
device_patcher.is_multigpu_base_clone = True
multigpu_models = model.get_additional_models_with_key("multigpu")
multigpu_models.append(device_patcher)
model.set_additional_models("multigpu", multigpu_models)
model.match_multigpu_clones()
if gpu_options is None:
gpu_options = GPUOptionsGroup()
gpu_options.register(model)
else:
logging.info("No extra torch devices need initialization, skipping initializing MultiGPU Work Units.")
# TODO: only keep model clones that don't go 'past' the intended max_gpu count
# multigpu_models = model.get_additional_models_with_key("multigpu")
# new_multigpu_models = []
# for m in multigpu_models:
# if m.load_device in limit_extra_devices:
# new_multigpu_models.append(m)
# model.set_additional_models("multigpu", new_multigpu_models)
# persist skip_devices for use in sampling code
# if len(skip_devices) > 0 or "multigpu_skip_devices" in model.model_options:
# model.model_options["multigpu_skip_devices"] = skip_devices
return model
LoadBalance = namedtuple('LoadBalance', ['work_per_device', 'idle_time'])
def load_balance_devices(model_options: dict[str], total_work: int, return_idle_time=False, work_normalized: int=None):
'Optimize work assigned to different devices, accounting for their relative speeds and splittable work.'
opts_dict = model_options['multigpu_options']
devices = list(model_options['multigpu_clones'].keys())
speed_per_device = []
work_per_device = []
# get sum of each device's relative_speed
total_speed = 0.0
for opts in opts_dict.values():
total_speed += opts['relative_speed']
# get relative work for each device;
# obtained by w = (W*r)/R
for device in devices:
relative_speed = opts_dict[device]['relative_speed']
relative_work = (total_work*relative_speed) / total_speed
speed_per_device.append(relative_speed)
work_per_device.append(relative_work)
# relative work must be expressed in whole numbers, but likely is a decimal;
# perform rounding while maintaining total sum equal to total work (sum of relative works)
work_per_device = round_preserved(work_per_device)
dict_work_per_device = {}
for device, relative_work in zip(devices, work_per_device):
dict_work_per_device[device] = relative_work
if not return_idle_time:
return LoadBalance(dict_work_per_device, None)
# divide relative work by relative speed to get estimated completion time of said work by each device;
# time here is relative and does not correspond to real-world units
completion_time = [w/r for w,r in zip(work_per_device, speed_per_device)]
# calculate relative time spent by the devices waiting on each other after their work is completed
idle_time = abs(min(completion_time) - max(completion_time))
# to compare idle times across different workloads, normalize to a common total work
if work_normalized:
idle_time *= (work_normalized/total_work)
return LoadBalance(dict_work_per_device, idle_time)
def round_preserved(values: list[float]):
'Round all values in a list, preserving the combined sum of values.'
# get floor of values; casting to int achieves the same for non-negative inputs
floored = [int(x) for x in values]
total_floored = sum(floored)
# get remainder to distribute
remainder = round(sum(values)) - total_floored
# pair values with fractional portions
fractional = [(i, x-floored[i]) for i, x in enumerate(values)]
# sort by fractional part in descending order
fractional.sort(key=lambda x: x[1], reverse=True)
# distribute the remainder
for i in range(remainder):
index = fractional[i][0]
floored[index] += 1
return floored
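
The deleted `round_preserved` helper above is the crux of the balancer: fractional work shares computed as w = (W*r)/R must become whole conds without changing their sum. A minimal, self-contained sketch of the same logic with a worked input (the numbers are illustrative):

```python
def round_preserved(values):
    floored = [int(x) for x in values]             # floor of each share
    remainder = round(sum(values)) - sum(floored)  # whole units left to assign
    # hand the remainder to the entries with the largest fractional parts
    by_fraction = sorted(((i, v - floored[i]) for i, v in enumerate(values)),
                         key=lambda p: p[1], reverse=True)
    for i in range(remainder):
        floored[by_fraction[i][0]] += 1
    return floored

# 10 units of work split across devices with relative speeds 1.0, 1.0, 0.9:
print(round_preserved([3.45, 3.45, 3.10]))  # -> [4, 3, 3]; the sum stays 10
```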

View File

@@ -21,6 +21,7 @@ import logging
import comfy.model_management
from comfy.cli_args import args, PerformanceFeature, enables_dynamic_vram
import comfy.float
import comfy.rmsnorm
import json
import comfy.memory_management
import comfy.pinned_memory
@@ -462,7 +463,7 @@ class disable_weight_init:
else:
return super().forward(*args, **kwargs)
class RMSNorm(torch.nn.RMSNorm, CastWeightBiasOp):
class RMSNorm(comfy.rmsnorm.RMSNorm, CastWeightBiasOp):
def reset_parameters(self):
self.bias = None
return None
@@ -474,7 +475,8 @@ class disable_weight_init:
weight = None
bias = None
offload_stream = None
x = torch.nn.functional.rms_norm(input, self.normalized_shape, weight, self.eps)
x = comfy.rmsnorm.rms_norm(input, weight, self.eps) # TODO: switch to commented out line when old torch is deprecated
# x = torch.nn.functional.rms_norm(input, self.normalized_shape, weight, self.eps)
uncast_bias_weight(self, weight, bias, offload_stream)
return x

View File

@@ -3,8 +3,6 @@ from typing import Callable
class CallbacksMP:
ON_CLONE = "on_clone"
ON_DEEPCLONE_MULTIGPU = "on_deepclone_multigpu"
ON_MATCH_MULTIGPU_CLONES = "on_match_multigpu_clones"
ON_LOAD = "on_load_after"
ON_DETACH = "on_detach_after"
ON_CLEANUP = "on_cleanup"

View File

@@ -20,7 +20,7 @@ try:
if cuda_version < (13,):
ck.registry.disable("cuda")
logging.warning("WARNING: You need pytorch with cu130 or higher to use optimized CUDA operations.")
ck.registry.disable("cuda") # multigpu will not work rn with comfy-kitchen on cuda backend
ck.registry.disable("triton")
for k, v in ck.list_backends().items():
logging.info(f"Found comfy_kitchen backend {k}: {v}")

View File

@@ -1,10 +1,57 @@
import torch
import comfy.model_management
import numbers
import logging
RMSNorm = None
try:
rms_norm_torch = torch.nn.functional.rms_norm
RMSNorm = torch.nn.RMSNorm
except:
rms_norm_torch = None
logging.warning("Please update pytorch to use native RMSNorm")
RMSNorm = torch.nn.RMSNorm
def rms_norm(x, weight=None, eps=1e-6):
if weight is None:
return torch.nn.functional.rms_norm(x, (x.shape[-1],), eps=eps)
if rms_norm_torch is not None and not (torch.jit.is_tracing() or torch.jit.is_scripting()):
if weight is None:
return rms_norm_torch(x, (x.shape[-1],), eps=eps)
else:
return rms_norm_torch(x, weight.shape, weight=comfy.model_management.cast_to(weight, dtype=x.dtype, device=x.device), eps=eps)
else:
return torch.nn.functional.rms_norm(x, weight.shape, weight=comfy.model_management.cast_to(weight, dtype=x.dtype, device=x.device), eps=eps)
r = x * torch.rsqrt(torch.mean(x**2, dim=-1, keepdim=True) + eps)
if weight is None:
return r
else:
return r * comfy.model_management.cast_to(weight, dtype=x.dtype, device=x.device)
if RMSNorm is None:
class RMSNorm(torch.nn.Module):
def __init__(
self,
normalized_shape,
eps=1e-6,
elementwise_affine=True,
device=None,
dtype=None,
):
factory_kwargs = {"device": device, "dtype": dtype}
super().__init__()
if isinstance(normalized_shape, numbers.Integral):
# mypy error: incompatible types in assignment
normalized_shape = (normalized_shape,) # type: ignore[assignment]
self.normalized_shape = tuple(normalized_shape) # type: ignore[arg-type]
self.eps = eps
self.elementwise_affine = elementwise_affine
if self.elementwise_affine:
self.weight = torch.nn.Parameter(
torch.empty(self.normalized_shape, **factory_kwargs)
)
else:
self.register_parameter("weight", None)
self.bias = None
def forward(self, x):
return rms_norm(x, self.weight, self.eps)
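
On torch builds without the native op, rms_norm above falls back to the manual formula r = x * rsqrt(mean(x^2) + eps). A quick self-check that the two paths agree (a sketch assuming torch >= 2.4 for the native call):

```python
import torch

x = torch.randn(2, 8)
eps = 1e-6
# manual fallback, as in the module above
manual = x * torch.rsqrt(torch.mean(x ** 2, dim=-1, keepdim=True) + eps)
# native op, available on torch >= 2.4
native = torch.nn.functional.rms_norm(x, (x.shape[-1],), eps=eps)
assert torch.allclose(manual, native, atol=1e-5)
```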

View File

@@ -1,17 +1,16 @@
from __future__ import annotations
import torch
import uuid
import math
import collections
import comfy.model_management
import comfy.conds
import comfy.model_patcher
import comfy.utils
import comfy.hooks
import comfy.patcher_extension
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from comfy.model_patcher import ModelPatcher
from comfy.model_base import BaseModel
from comfy.controlnet import ControlBase
def prepare_mask(noise_mask, shape, device):
@@ -107,47 +106,6 @@ def cleanup_additional_models(models):
if hasattr(m, 'cleanup'):
m.cleanup()
def preprocess_multigpu_conds(conds: dict[str, list[dict[str]]], model: ModelPatcher, model_options: dict[str]):
'''If multigpu acceleration is required, creates deepclones of ControlNets and GLIGEN per device.'''
multigpu_models: list[ModelPatcher] = model.get_additional_models_with_key("multigpu")
if len(multigpu_models) == 0:
return
extra_devices = [x.load_device for x in multigpu_models]
# handle controlnets
controlnets: set[ControlBase] = set()
for k in conds:
for kk in conds[k]:
if 'control' in kk:
controlnets.add(kk['control'])
if len(controlnets) > 0:
# first, unload all controlnet clones
for cnet in list(controlnets):
cnet_models = cnet.get_models()
for cm in cnet_models:
comfy.model_management.unload_model_and_clones(cm, unload_additional_models=True)
# next, make sure each controlnet has a deepclone for all relevant devices
for cnet in controlnets:
curr_cnet = cnet
while curr_cnet is not None:
for device in extra_devices:
if device not in curr_cnet.multigpu_clones:
curr_cnet.deepclone_multigpu(device, autoregister=True)
curr_cnet = curr_cnet.previous_controlnet
# since all device clones are now present, recreate the linked list for cloned cnets per device
for cnet in controlnets:
curr_cnet = cnet
while curr_cnet is not None:
prev_cnet = curr_cnet.previous_controlnet
for device in extra_devices:
device_cnet = curr_cnet.get_instance_for_device(device)
prev_device_cnet = None
if prev_cnet is not None:
prev_device_cnet = prev_cnet.get_instance_for_device(device)
device_cnet.set_previous_controlnet(prev_device_cnet)
curr_cnet = prev_cnet
# potentially handle gligen - since not widely used, ignored for now
def estimate_memory(model, noise_shape, conds):
cond_shapes = collections.defaultdict(list)
cond_shapes_min = {}
@@ -172,8 +130,7 @@ def prepare_sampling(model: ModelPatcher, noise_shape, conds, model_options=None
return executor.execute(model, noise_shape, conds, model_options=model_options, force_full_load=force_full_load, force_offload=force_offload)
def _prepare_sampling(model: ModelPatcher, noise_shape, conds, model_options=None, force_full_load=False, force_offload=False):
model.match_multigpu_clones()
preprocess_multigpu_conds(conds, model, model_options)
real_model: BaseModel = None
models, inference_memory = get_additional_models(conds, model.model_dtype())
models += get_additional_models_from_model_options(model_options)
models += model.get_nested_additional_models() # TODO: does this require inference_memory update?
@@ -185,7 +142,7 @@ def _prepare_sampling(model: ModelPatcher, noise_shape, conds, model_options=Non
memory_required += inference_memory
minimum_memory_required += inference_memory
comfy.model_management.load_models_gpu([model] + models, memory_required=memory_required, minimum_memory_required=minimum_memory_required, force_full_load=force_full_load)
real_model: BaseModel = model.model
real_model = model.model
return real_model, conds, models
@@ -231,18 +188,3 @@ def prepare_model_patcher(model: ModelPatcher, conds, model_options: dict):
comfy.patcher_extension.merge_nested_dicts(to_load_options.setdefault(wc_name, {}), model_options["transformer_options"][wc_name],
copy_dict1=False)
return to_load_options
def prepare_model_patcher_multigpu_clones(model_patcher: ModelPatcher, loaded_models: list[ModelPatcher], model_options: dict):
'''
In case multigpu acceleration is enabled, prep ModelPatchers for each device.
'''
multigpu_patchers: list[ModelPatcher] = [x for x in loaded_models if x.is_multigpu_base_clone]
if len(multigpu_patchers) > 0:
multigpu_dict: dict[torch.device, ModelPatcher] = {}
multigpu_dict[model_patcher.load_device] = model_patcher
for x in multigpu_patchers:
x.hook_patches = comfy.model_patcher.create_hook_patches_clone(model_patcher.hook_patches, copy_tuples=True)
x.hook_mode = model_patcher.hook_mode # match main model's hook_mode
multigpu_dict[x.load_device] = x
model_options["multigpu_clones"] = multigpu_dict
return multigpu_patchers

View File

@@ -1,9 +1,7 @@
from __future__ import annotations
import comfy.model_management
from .k_diffusion import sampling as k_diffusion_sampling
from .extra_samplers import uni_pc
from typing import TYPE_CHECKING, Callable, NamedTuple, Any
from typing import TYPE_CHECKING, Callable, NamedTuple
if TYPE_CHECKING:
from comfy.model_patcher import ModelPatcher
from comfy.model_base import BaseModel
@@ -21,7 +19,6 @@ import comfy.context_windows
import comfy.utils
import scipy.stats
import numpy
import threading
def add_area_dims(area, num_dims):
@@ -144,7 +141,7 @@ def can_concat_cond(c1, c2):
return cond_equal_size(c1.conditioning, c2.conditioning)
def cond_cat(c_list, device=None):
def cond_cat(c_list):
temp = {}
for x in c_list:
for k in x:
@@ -156,8 +153,6 @@ def cond_cat(c_list, device=None):
for k in temp:
conds = temp[k]
out[k] = conds[0].concat(conds[1:])
if device is not None and hasattr(out[k], 'to'):
out[k] = out[k].to(device)
return out
@@ -217,9 +212,7 @@ def _calc_cond_batch_outer(model: BaseModel, conds: list[list[dict]], x_in: torc
)
return executor.execute(model, conds, x_in, timestep, model_options)
def _calc_cond_batch(model: BaseModel, conds: list[list[dict]], x_in: torch.Tensor, timestep: torch.Tensor, model_options: dict[str]):
if 'multigpu_clones' in model_options:
return _calc_cond_batch_multigpu(model, conds, x_in, timestep, model_options)
def _calc_cond_batch(model: BaseModel, conds: list[list[dict]], x_in: torch.Tensor, timestep, model_options):
out_conds = []
out_counts = []
# separate conds by matching hooks
@@ -251,7 +244,7 @@ def _calc_cond_batch(model: BaseModel, conds: list[list[dict]], x_in: torch.Tens
if has_default_conds:
finalize_default_conds(model, hooked_to_run, default_conds, x_in, timestep, model_options)
model.current_patcher.prepare_state(timestep, model_options)
model.current_patcher.prepare_state(timestep)
# run every hooked_to_run separately
for hooks, to_run in hooked_to_run.items():
@@ -352,196 +345,6 @@ def _calc_cond_batch(model: BaseModel, conds: list[list[dict]], x_in: torch.Tens
return out_conds
def _calc_cond_batch_multigpu(model: BaseModel, conds: list[list[dict]], x_in: torch.Tensor, timestep: torch.Tensor, model_options: dict[str]):
out_conds = []
out_counts = []
# separate conds by matching hooks
hooked_to_run: dict[comfy.hooks.HookGroup,list[tuple[tuple,int]]] = {}
default_conds = []
has_default_conds = False
output_device = x_in.device
for i in range(len(conds)):
out_conds.append(torch.zeros_like(x_in))
out_counts.append(torch.ones_like(x_in) * 1e-37)
cond = conds[i]
default_c = []
if cond is not None:
for x in cond:
if 'default' in x:
default_c.append(x)
has_default_conds = True
continue
p = get_area_and_mult(x, x_in, timestep)
if p is None:
continue
if p.hooks is not None:
model.current_patcher.prepare_hook_patches_current_keyframe(timestep, p.hooks, model_options)
hooked_to_run.setdefault(p.hooks, list())
hooked_to_run[p.hooks] += [(p, i)]
default_conds.append(default_c)
if has_default_conds:
finalize_default_conds(model, hooked_to_run, default_conds, x_in, timestep, model_options)
model.current_patcher.prepare_state(timestep, model_options)
devices = [dev_m for dev_m in model_options['multigpu_clones'].keys()]
device_batched_hooked_to_run: dict[torch.device, list[tuple[comfy.hooks.HookGroup, tuple]]] = {}
total_conds = 0
for to_run in hooked_to_run.values():
total_conds += len(to_run)
conds_per_device = max(1, math.ceil(total_conds / len(devices)))
index_device = 0
current_device = devices[index_device]
# run every hooked_to_run separately
for hooks, to_run in hooked_to_run.items():
while len(to_run) > 0:
current_device = devices[index_device % len(devices)]
batched_to_run = device_batched_hooked_to_run.setdefault(current_device, [])
# keep track of conds currently scheduled onto this device
batched_to_run_length = 0
for btr in batched_to_run:
batched_to_run_length += len(btr[1])
first = to_run[0]
first_shape = first[0][0].shape
to_batch_temp = []
# make sure the temp batch does not exceed the conds_per_device limit
for x in range(len(to_run)):
if can_concat_cond(to_run[x][0], first[0]) and len(to_batch_temp) < (conds_per_device - batched_to_run_length):
to_batch_temp += [x]
to_batch_temp.reverse()
to_batch = to_batch_temp[:1]
free_memory = comfy.model_management.get_free_memory(current_device)
for i in range(1, len(to_batch_temp) + 1):
batch_amount = to_batch_temp[:len(to_batch_temp)//i]
input_shape = [len(batch_amount) * first_shape[0]] + list(first_shape)[1:]
if model.memory_required(input_shape) * 1.5 < free_memory:
to_batch = batch_amount
break
conds_to_batch = []
for x in to_batch:
conds_to_batch.append(to_run.pop(x))
batched_to_run_length += len(conds_to_batch)
batched_to_run.append((hooks, conds_to_batch))
if batched_to_run_length >= conds_per_device:
index_device += 1
class thread_result(NamedTuple):
output: Any
mult: Any
area: Any
batch_chunks: int
cond_or_uncond: Any
error: Exception = None
def _handle_batch(device: torch.device, batch_tuple: tuple[comfy.hooks.HookGroup, tuple], results: list[thread_result]):
try:
model_current: BaseModel = model_options["multigpu_clones"][device].model
# run every hooked_to_run separately
with torch.no_grad():
for hooks, to_batch in batch_tuple:
input_x = []
mult = []
c = []
cond_or_uncond = []
uuids = []
area = []
control: ControlBase = None
patches = None
for x in to_batch:
o = x
p = o[0]
input_x.append(p.input_x)
mult.append(p.mult)
c.append(p.conditioning)
area.append(p.area)
cond_or_uncond.append(o[1])
uuids.append(p.uuid)
control = p.control
patches = p.patches
batch_chunks = len(cond_or_uncond)
input_x = torch.cat(input_x).to(device)
c = cond_cat(c, device=device)
timestep_ = torch.cat([timestep.to(device)] * batch_chunks)
transformer_options = model_current.current_patcher.apply_hooks(hooks=hooks)
if 'transformer_options' in model_options:
transformer_options = comfy.patcher_extension.merge_nested_dicts(transformer_options,
model_options['transformer_options'],
copy_dict1=False)
if patches is not None:
transformer_options["patches"] = comfy.patcher_extension.merge_nested_dicts(
transformer_options.get("patches", {}),
patches
)
transformer_options["cond_or_uncond"] = cond_or_uncond[:]
transformer_options["uuids"] = uuids[:]
transformer_options["sigmas"] = timestep.to(device)
transformer_options["sample_sigmas"] = transformer_options["sample_sigmas"].to(device)
transformer_options["multigpu_thread_device"] = device
cast_transformer_options(transformer_options, device=device)
c['transformer_options'] = transformer_options
if control is not None:
device_control = control.get_instance_for_device(device)
c['control'] = device_control.get_control(input_x, timestep_, c, len(cond_or_uncond), transformer_options)
if 'model_function_wrapper' in model_options:
output = model_options['model_function_wrapper'](model_current.apply_model, {"input": input_x, "timestep": timestep_, "c": c, "cond_or_uncond": cond_or_uncond}).to(output_device).chunk(batch_chunks)
else:
output = model_current.apply_model(input_x, timestep_, **c).to(output_device).chunk(batch_chunks)
results.append(thread_result(output, mult, area, batch_chunks, cond_or_uncond))
except Exception as e:
results.append(thread_result(None, None, None, None, None, error=e))
raise
results: list[thread_result] = []
threads: list[threading.Thread] = []
for device, batch_tuple in device_batched_hooked_to_run.items():
new_thread = threading.Thread(target=_handle_batch, args=(device, batch_tuple, results))
threads.append(new_thread)
new_thread.start()
for thread in threads:
thread.join()
for output, mult, area, batch_chunks, cond_or_uncond, error in results:
if error is not None:
raise error
for o in range(batch_chunks):
cond_index = cond_or_uncond[o]
a = area[o]
if a is None:
out_conds[cond_index] += output[o] * mult[o]
out_counts[cond_index] += mult[o]
else:
out_c = out_conds[cond_index]
out_cts = out_counts[cond_index]
dims = len(a) // 2
for i in range(dims):
out_c = out_c.narrow(i + 2, a[i + dims], a[i])
out_cts = out_cts.narrow(i + 2, a[i + dims], a[i])
out_c += output[o] * mult[o]
out_cts += mult[o]
for i in range(len(out_conds)):
out_conds[i] /= out_counts[i]
return out_conds
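# Illustrative sketch, not part of this diff: the removed multigpu path above
# fans batches out to one worker thread per device, joins them all, and only
# then accumulates outputs, re-raising any error captured by a worker. The
# device keys and per-batch "work" below are hypothetical stand-ins for the
# real apply_model calls.
import threading

def _handle_batch_sketch(device, batches, results):
    try:
        results.append((device, [b * 2 for b in batches], None))
    except Exception as e:  # surfaced after join, as in the removed code
        results.append((device, None, e))

results, threads = [], []
for device, batches in {"cuda:0": [1, 2], "cuda:1": [3]}.items():
    t = threading.Thread(target=_handle_batch_sketch, args=(device, batches, results))
    threads.append(t)
    t.start()
for t in threads:
    t.join()
for device, out, err in results:
    if err is not None:
        raise err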
def calc_cond_uncond_batch(model, cond, uncond, x_in, timestep, model_options): #TODO: remove
logging.warning("WARNING: The comfy.samplers.calc_cond_uncond_batch function is deprecated please use the calc_cond_batch one instead.")
return tuple(calc_cond_batch(model, [cond, uncond], x_in, timestep, model_options))
@@ -846,8 +649,6 @@ def pre_run_control(model, conds):
percent_to_timestep_function = lambda a: s.percent_to_sigma(a)
if 'control' in x:
x['control'].pre_run(model, percent_to_timestep_function)
for device_cnet in x['control'].multigpu_clones.values():
device_cnet.pre_run(model, percent_to_timestep_function)
def apply_empty_x_to_equal_area(conds, uncond, name, uncond_fill_func):
cond_cnets = []
@@ -1090,9 +891,7 @@ def cast_to_load_options(model_options: dict[str], device=None, dtype=None):
to_load_options = model_options.get("to_load_options", None)
if to_load_options is None:
return
cast_transformer_options(to_load_options, device, dtype)
def cast_transformer_options(transformer_options: dict[str], device=None, dtype=None):
casts = []
if device is not None:
casts.append(device)
@@ -1101,17 +900,18 @@ def cast_transformer_options(transformer_options: dict[str], device=None, dtype=
# if nothing to apply, do nothing
if len(casts) == 0:
return
# try to call .to on patches
if "patches" in transformer_options:
patches = transformer_options["patches"]
if "patches" in to_load_options:
patches = to_load_options["patches"]
for name in patches:
patch_list = patches[name]
for i in range(len(patch_list)):
if hasattr(patch_list[i], "to"):
for cast in casts:
patch_list[i] = patch_list[i].to(cast)
if "patches_replace" in transformer_options:
patches = transformer_options["patches_replace"]
if "patches_replace" in to_load_options:
patches = to_load_options["patches_replace"]
for name in patches:
patch_list = patches[name]
for k in patch_list:
@@ -1121,8 +921,8 @@ def cast_transformer_options(transformer_options: dict[str], device=None, dtype=
# try to call .to on any wrappers/callbacks
wrappers_and_callbacks = ["wrappers", "callbacks"]
for wc_name in wrappers_and_callbacks:
if wc_name in transformer_options:
wc: dict[str, list] = transformer_options[wc_name]
if wc_name in to_load_options:
wc: dict[str, list] = to_load_options[wc_name]
for wc_dict in wc.values():
for wc_list in wc_dict.values():
for i in range(len(wc_list)):
@@ -1130,6 +930,7 @@ def cast_transformer_options(transformer_options: dict[str], device=None, dtype=
for cast in casts:
wc_list[i] = wc_list[i].to(cast)
class CFGGuider:
def __init__(self, model_patcher: ModelPatcher):
self.model_patcher = model_patcher
@@ -1182,8 +983,6 @@ class CFGGuider:
self.inner_model, self.conds, self.loaded_models = comfy.sampler_helpers.prepare_sampling(self.model_patcher, noise.shape, self.conds, self.model_options)
device = self.model_patcher.load_device
multigpu_patchers = comfy.sampler_helpers.prepare_model_patcher_multigpu_clones(self.model_patcher, self.loaded_models, self.model_options)
noise = noise.to(device)
latent_image = latent_image.to(device)
sigmas = sigmas.to(device)
@@ -1191,13 +990,9 @@ class CFGGuider:
try:
self.model_patcher.pre_run()
for multigpu_patcher in multigpu_patchers:
multigpu_patcher.pre_run()
output = self.inner_sample(noise, latent_image, device, sampler, sigmas, denoise_mask, callback, disable_pbar, seed, latent_shapes=latent_shapes)
finally:
self.model_patcher.cleanup()
for multigpu_patcher in multigpu_patchers:
multigpu_patcher.cleanup()
comfy.sampler_helpers.cleanup_models(self.conds, self.loaded_models)
del self.inner_model

View File

@@ -1510,7 +1510,6 @@ def load_checkpoint_guess_config(ckpt_path, output_vae=True, output_clip=True, o
out = load_state_dict_guess_config(sd, output_vae, output_clip, output_clipvision, embedding_directory, output_model, model_options, te_model_options=te_model_options, metadata=metadata)
if out is None:
raise RuntimeError("ERROR: Could not detect model type of: {}\n{}".format(ckpt_path, model_detection_error_hint(ckpt_path, sd)))
out[0].cached_patcher_init = (load_checkpoint_guess_config, (ckpt_path, False, False, False, embedding_directory, output_model, model_options, te_model_options), 0)
return out
def load_state_dict_guess_config(sd, output_vae=True, output_clip=True, output_clipvision=False, embedding_directory=None, output_model=True, model_options={}, te_model_options={}, metadata=None):
@@ -1712,7 +1711,6 @@ def load_diffusion_model(unet_path, model_options={}):
if model is None:
logging.error("ERROR UNSUPPORTED DIFFUSION MODEL {}".format(unet_path))
raise RuntimeError("ERROR: Could not detect model type of: {}\n{}".format(unet_path, model_detection_error_hint(unet_path, sd)))
model.cached_patcher_init = (load_diffusion_model, (unet_path, model_options))
return model
def load_unet(unet_path, dtype=None):

View File

@@ -171,9 +171,8 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder):
def process_tokens(self, tokens, device):
end_token = self.special_tokens.get("end", None)
pad_token = self.special_tokens.get("pad", -1)
if end_token is None:
cmp_token = pad_token
cmp_token = self.special_tokens.get("pad", -1)
else:
cmp_token = end_token
@@ -187,21 +186,15 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder):
other_embeds = []
eos = False
index = 0
left_pad = False
for y in x:
if isinstance(y, numbers.Integral):
token = int(y)
if index == 0 and token == pad_token:
left_pad = True
if eos or (left_pad and token == pad_token):
if eos:
attention_mask.append(0)
else:
attention_mask.append(1)
left_pad = False
token = int(y)
tokens_temp += [token]
if not eos and token == cmp_token and not left_pad:
if not eos and token == cmp_token:
if end_token is None:
attention_mask[-1] = 0
eos = True

View File

@@ -710,15 +710,6 @@ class Flux(supported_models_base.BASE):
supported_inference_dtypes = [torch.bfloat16, torch.float16, torch.float32]
def process_unet_state_dict(self, state_dict):
out_sd = {}
for k in list(state_dict.keys()):
key_out = k
if key_out.endswith("_norm.scale"):
key_out = "{}.weight".format(key_out[:-len(".scale")])
out_sd[key_out] = state_dict[k]
return out_sd
vae_key_prefix = ["vae."]
text_encoder_key_prefix = ["text_encoders."]
@@ -907,13 +898,11 @@ class HunyuanVideo(supported_models_base.BASE):
key_out = key_out.replace("txt_in.c_embedder.linear_1.", "txt_in.c_embedder.in_layer.").replace("txt_in.c_embedder.linear_2.", "txt_in.c_embedder.out_layer.")
key_out = key_out.replace("_mod.linear.", "_mod.lin.").replace("_attn_qkv.", "_attn.qkv.")
key_out = key_out.replace("mlp.fc1.", "mlp.0.").replace("mlp.fc2.", "mlp.2.")
key_out = key_out.replace("_attn_q_norm.weight", "_attn.norm.query_norm.weight").replace("_attn_k_norm.weight", "_attn.norm.key_norm.weight")
key_out = key_out.replace(".q_norm.weight", ".norm.query_norm.weight").replace(".k_norm.weight", ".norm.key_norm.weight")
key_out = key_out.replace("_attn_q_norm.weight", "_attn.norm.query_norm.scale").replace("_attn_k_norm.weight", "_attn.norm.key_norm.scale")
key_out = key_out.replace(".q_norm.weight", ".norm.query_norm.scale").replace(".k_norm.weight", ".norm.key_norm.scale")
key_out = key_out.replace("_attn_proj.", "_attn.proj.")
key_out = key_out.replace(".modulation.linear.", ".modulation.lin.")
key_out = key_out.replace("_in.mlp.2.", "_in.out_layer.").replace("_in.mlp.0.", "_in.in_layer.")
if key_out.endswith(".scale"):
key_out = "{}.weight".format(key_out[:-len(".scale")])
out_sd[key_out] = state_dict[k]
return out_sd
@@ -1275,15 +1264,6 @@ class Hunyuan3Dv2(supported_models_base.BASE):
latent_format = latent_formats.Hunyuan3Dv2
def process_unet_state_dict(self, state_dict):
out_sd = {}
for k in list(state_dict.keys()):
key_out = k
if key_out.endswith(".scale"):
key_out = "{}.weight".format(key_out[:-len(".scale")])
out_sd[key_out] = state_dict[k]
return out_sd
def process_unet_state_dict_for_saving(self, state_dict):
replace_prefix = {"": "model."}
return utils.state_dict_prefix_replace(state_dict, replace_prefix)
@@ -1361,14 +1341,6 @@ class Chroma(supported_models_base.BASE):
supported_inference_dtypes = [torch.bfloat16, torch.float16, torch.float32]
def process_unet_state_dict(self, state_dict):
out_sd = {}
for k in list(state_dict.keys()):
key_out = k
if key_out.endswith(".scale"):
key_out = "{}.weight".format(key_out[:-len(".scale")])
out_sd[key_out] = state_dict[k]
return out_sd
def get_model(self, state_dict, prefix="", device=None):
out = model_base.Chroma(self, device=device)

View File

@@ -10,6 +10,7 @@ import comfy.utils
def sample_manual_loop_no_classes(
model,
ids=None,
paddings=[],
execution_dtype=None,
cfg_scale: float = 2.0,
temperature: float = 0.85,
@@ -35,6 +36,9 @@ def sample_manual_loop_no_classes(
embeds, attention_mask, num_tokens, embeds_info = model.process_tokens(ids, device)
embeds_batch = embeds.shape[0]
for i, t in enumerate(paddings):
attention_mask[i, :t] = 0
attention_mask[i, t:] = 1
output_audio_codes = []
past_key_values = []
@@ -131,11 +135,13 @@ def generate_audio_codes(model, positive, negative, min_tokens=1, max_tokens=102
pos_pad = (len(negative) - len(positive))
positive = [model.special_tokens["pad"]] * pos_pad + positive
paddings = [pos_pad, neg_pad]
ids = [positive, negative]
else:
paddings = []
ids = [positive]
return sample_manual_loop_no_classes(model, ids, cfg_scale=cfg_scale, temperature=temperature, top_p=top_p, top_k=top_k, min_p=min_p, seed=seed, min_tokens=min_tokens, max_new_tokens=max_tokens)
return sample_manual_loop_no_classes(model, ids, paddings, cfg_scale=cfg_scale, temperature=temperature, top_p=top_p, top_k=top_k, min_p=min_p, seed=seed, min_tokens=min_tokens, max_new_tokens=max_tokens)
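# Quick illustration, hypothetical values, not part of this diff: the paddings
# list built above zeroes out the left-pad positions of each row's attention
# mask so padded prompt tokens are ignored.
import torch
attention_mask = torch.ones(2, 6, dtype=torch.long)
for i, t in enumerate([2, 0]):  # e.g. pos_pad=2, neg_pad=0
    attention_mask[i, :t] = 0
    attention_mask[i, t:] = 1
print(attention_mask[0])  # tensor([0, 0, 1, 1, 1, 1])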
class ACE15Tokenizer(sd1_clip.SD1Tokenizer):

View File

@@ -355,6 +355,13 @@ class RMSNorm(nn.Module):
def rotate_half(x):
"""Rotates half the hidden dims of the input."""
x1 = x[..., : x.shape[-1] // 2]
x2 = x[..., x.shape[-1] // 2 :]
return torch.cat((-x2, x1), dim=-1)
def precompute_freqs_cis(head_dim, position_ids, theta, rope_scale=None, rope_dims=None, device=None):
if not isinstance(theta, list):
theta = [theta]
@@ -383,30 +390,20 @@ def precompute_freqs_cis(head_dim, position_ids, theta, rope_scale=None, rope_di
else:
cos = cos.unsqueeze(1)
sin = sin.unsqueeze(1)
sin_split = sin.shape[-1] // 2
out.append((cos, sin[..., : sin_split], -sin[..., sin_split :]))
out.append((cos, sin))
if len(out) == 1:
return out[0]
return out
def apply_rope(xq, xk, freqs_cis):
org_dtype = xq.dtype
cos = freqs_cis[0]
sin = freqs_cis[1]
nsin = freqs_cis[2]
q_embed = (xq * cos)
q_split = q_embed.shape[-1] // 2
q_embed[..., : q_split].addcmul_(xq[..., q_split :], nsin)
q_embed[..., q_split :].addcmul_(xq[..., : q_split], sin)
k_embed = (xk * cos)
k_split = k_embed.shape[-1] // 2
k_embed[..., : k_split].addcmul_(xk[..., k_split :], nsin)
k_embed[..., k_split :].addcmul_(xk[..., : k_split], sin)
q_embed = (xq * cos) + (rotate_half(xq) * sin)
k_embed = (xk * cos) + (rotate_half(xk) * sin)
return q_embed.to(org_dtype), k_embed.to(org_dtype)
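
The hunk above replaces an in-place addcmul_ formulation with the more common rotate_half form of rotary embeddings, where cos/sin are duplicated across both halves of the head dimension. A minimal sketch of that form (all shapes are toy values):

```python
import torch

def rotate_half(x):
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)

q = torch.randn(1, 2, 4, 8)   # (batch, heads, seq, head_dim)
angles = torch.rand(4, 4)     # one angle per position, half the head dim
cos = torch.cat((angles.cos(), angles.cos()), dim=-1)  # duplicate to full dim
sin = torch.cat((angles.sin(), angles.sin()), dim=-1)
q_embed = (q * cos) + (rotate_half(q) * sin)           # rotated queries
```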

View File

@@ -25,7 +25,7 @@ def ltxv_te(*args, **kwargs):
class Gemma3_12BTokenizer(sd1_clip.SDTokenizer):
def __init__(self, embedding_directory=None, tokenizer_data={}):
tokenizer = tokenizer_data.get("spiece_model", None)
super().__init__(tokenizer, pad_with_end=False, embedding_size=3840, embedding_key='gemma3_12b', tokenizer_class=SPieceTokenizer, has_end_token=False, pad_to_max_length=False, max_length=99999999, min_length=512, pad_left=True, disable_weights=True, tokenizer_args={"add_bos": True, "add_eos": False}, tokenizer_data=tokenizer_data)
super().__init__(tokenizer, pad_with_end=False, embedding_size=3840, embedding_key='gemma3_12b', tokenizer_class=SPieceTokenizer, has_end_token=False, pad_to_max_length=False, max_length=99999999, min_length=1, disable_weights=True, tokenizer_args={"add_bos": True, "add_eos": False}, tokenizer_data=tokenizer_data)
def state_dict(self):
return {"spiece_model": self.tokenizer.serialize_model()}
@@ -97,7 +97,6 @@ class LTXAVTEModel(torch.nn.Module):
token_weight_pairs = token_weight_pairs["gemma3_12b"]
out, pooled, extra = self.gemma3_12b.encode_token_weights(token_weight_pairs)
out = out[:, :, -torch.sum(extra["attention_mask"]).item():]
out_device = out.device
if comfy.model_management.should_use_bf16(self.execution_device):
out = out.to(device=self.execution_device, dtype=torch.bfloat16)
@@ -139,7 +138,6 @@ class LTXAVTEModel(torch.nn.Module):
token_weight_pairs = token_weight_pairs.get("gemma3_12b", [])
num_tokens = sum(map(lambda a: len(a), token_weight_pairs))
num_tokens = max(num_tokens, 64)
return num_tokens * constant * 1024 * 1024
def ltxav_te(dtype_llama=None, llama_quantization_metadata=None):

View File

@@ -20,7 +20,7 @@
import torch
import math
import struct
import comfy.memory_management
import comfy.checkpoint_pickle
import safetensors.torch
import numpy as np
from PIL import Image
@@ -38,26 +38,26 @@ import warnings
MMAP_TORCH_FILES = args.mmap_torch_files
DISABLE_MMAP = args.disable_mmap
if True: # ckpt/pt file whitelist for safe loading of old sd files
ALWAYS_SAFE_LOAD = False
if hasattr(torch.serialization, "add_safe_globals"): # TODO: this was added in pytorch 2.4, the unsafe path should be removed once earlier versions are deprecated
class ModelCheckpoint:
pass
ModelCheckpoint.__module__ = "pytorch_lightning.callbacks.model_checkpoint"
def scalar(*args, **kwargs):
return None
from numpy.core.multiarray import scalar as sc
return sc(*args, **kwargs)
scalar.__module__ = "numpy.core.multiarray"
from numpy import dtype
from numpy.dtypes import Float64DType
def encode(*args, **kwargs): # no longer necessary on newer torch
return None
encode.__module__ = "_codecs"
from _codecs import encode
torch.serialization.add_safe_globals([ModelCheckpoint, scalar, dtype, Float64DType, encode])
ALWAYS_SAFE_LOAD = True
logging.info("Checkpoint files will always be loaded safely.")
else:
logging.warning("Warning, you are using an old pytorch version and some ckpt/pt files might be loaded unsafely. Upgrading to 2.4 or above is recommended as older versions of pytorch are no longer supported.")
# Current as of safetensors 0.7.0
_TYPES = {
@@ -140,8 +140,11 @@ def load_torch_file(ckpt, safe_load=False, device=None, return_metadata=False):
if MMAP_TORCH_FILES:
torch_args["mmap"] = True
pl_sd = torch.load(ckpt, map_location=device, weights_only=True, **torch_args)
if safe_load or ALWAYS_SAFE_LOAD:
pl_sd = torch.load(ckpt, map_location=device, weights_only=True, **torch_args)
else:
logging.warning("WARNING: loading {} unsafely, upgrade your pytorch to 2.4 or newer to load this file safely.".format(ckpt))
pl_sd = torch.load(ckpt, map_location=device, pickle_module=comfy.checkpoint_pickle)
if "state_dict" in pl_sd:
sd = pl_sd["state_dict"]
else:
@@ -672,10 +675,10 @@ def flux_to_diffusers(mmdit_config, output_prefix=""):
"ff_context.linear_in.bias": "txt_mlp.0.bias",
"ff_context.linear_out.weight": "txt_mlp.2.weight",
"ff_context.linear_out.bias": "txt_mlp.2.bias",
"attn.norm_q.weight": "img_attn.norm.query_norm.weight",
"attn.norm_k.weight": "img_attn.norm.key_norm.weight",
"attn.norm_added_q.weight": "txt_attn.norm.query_norm.weight",
"attn.norm_added_k.weight": "txt_attn.norm.key_norm.weight",
"attn.norm_q.weight": "img_attn.norm.query_norm.scale",
"attn.norm_k.weight": "img_attn.norm.key_norm.scale",
"attn.norm_added_q.weight": "txt_attn.norm.query_norm.scale",
"attn.norm_added_k.weight": "txt_attn.norm.key_norm.scale",
}
for k in block_map:
@@ -698,8 +701,8 @@ def flux_to_diffusers(mmdit_config, output_prefix=""):
"norm.linear.bias": "modulation.lin.bias",
"proj_out.weight": "linear2.weight",
"proj_out.bias": "linear2.bias",
"attn.norm_q.weight": "norm.query_norm.weight",
"attn.norm_k.weight": "norm.key_norm.weight",
"attn.norm_q.weight": "norm.query_norm.scale",
"attn.norm_k.weight": "norm.key_norm.scale",
"attn.to_qkv_mlp_proj.weight": "linear1.weight", # Flux 2
"attn.to_out.weight": "linear2.weight", # Flux 2
}

View File

@@ -49,12 +49,6 @@ class WeightAdapterBase:
"""
raise NotImplementedError
def calculate_shape(
self,
key
):
return None
def calculate_weight(
self,
weight,

View File

@@ -214,13 +214,6 @@ class LoRAAdapter(WeightAdapterBase):
else:
return None
def calculate_shape(
self,
key
):
reshape = self.weights[5]
return tuple(reshape) if reshape is not None else None
def calculate_weight(
self,
weight,

View File

@@ -14,7 +14,6 @@ SERVER_FEATURE_FLAGS: dict[str, Any] = {
"supports_preview_metadata": True,
"max_upload_size": args.max_upload_size * 1024 * 1024, # Convert MB to bytes
"extension": {"manager": {"supports_v4": True}},
"node_replacements": True,
}

View File

@@ -21,17 +21,6 @@ class ComfyAPI_latest(ComfyAPIBase):
VERSION = "latest"
STABLE = False
def __init__(self):
super().__init__()
self.node_replacement = self.NodeReplacement()
self.execution = self.Execution()
class NodeReplacement(ProxiedSingleton):
async def register(self, node_replace: io.NodeReplace) -> None:
"""Register a node replacement mapping."""
from server import PromptServer
PromptServer.instance.node_replace_manager.register(node_replace)
class Execution(ProxiedSingleton):
async def set_progress(
self,
@@ -84,6 +73,8 @@ class ComfyAPI_latest(ComfyAPIBase):
image=to_display,
)
execution: Execution
class ComfyExtension(ABC):
async def on_load(self) -> None:
"""

View File

@@ -75,12 +75,6 @@ class NumberDisplay(str, Enum):
slider = "slider"
class ControlAfterGenerate(str, Enum):
fixed = "fixed"
increment = "increment"
decrement = "decrement"
randomize = "randomize"
class _ComfyType(ABC):
Type = Any
io_type: str = None
@@ -269,7 +263,7 @@ class Int(ComfyTypeIO):
class Input(WidgetInput):
'''Integer input.'''
def __init__(self, id: str, display_name: str=None, optional=False, tooltip: str=None, lazy: bool=None,
default: int=None, min: int=None, max: int=None, step: int=None, control_after_generate: bool | ControlAfterGenerate=None,
default: int=None, min: int=None, max: int=None, step: int=None, control_after_generate: bool=None,
display_mode: NumberDisplay=None, socketless: bool=None, force_input: bool=None, extra_dict=None, raw_link: bool=None, advanced: bool=None):
super().__init__(id, display_name, optional, tooltip, lazy, default, socketless, None, force_input, extra_dict, raw_link, advanced)
self.min = min
@@ -351,7 +345,7 @@ class Combo(ComfyTypeIO):
tooltip: str=None,
lazy: bool=None,
default: str | int | Enum = None,
control_after_generate: bool | ControlAfterGenerate=None,
control_after_generate: bool=None,
upload: UploadType=None,
image_folder: FolderType=None,
remote: RemoteOptions=None,
@@ -395,7 +389,7 @@ class MultiCombo(ComfyTypeI):
Type = list[str]
class Input(Combo.Input):
def __init__(self, id: str, options: list[str], display_name: str=None, optional=False, tooltip: str=None, lazy: bool=None,
default: list[str]=None, placeholder: str=None, chip: bool=None, control_after_generate: bool | ControlAfterGenerate=None,
default: list[str]=None, placeholder: str=None, chip: bool=None, control_after_generate: bool=None,
socketless: bool=None, extra_dict=None, raw_link: bool=None, advanced: bool=None):
super().__init__(id, options, display_name, optional, tooltip, lazy, default, control_after_generate, socketless=socketless, extra_dict=extra_dict, raw_link=raw_link, advanced=advanced)
self.multiselect = True
@@ -1209,6 +1203,89 @@ class Color(ComfyTypeIO):
def as_dict(self):
return super().as_dict()
@comfytype(io_type="COLOR_CORRECT")
class ColorCorrect(ComfyTypeIO):
Type = dict
class Input(WidgetInput):
def __init__(self, id: str, display_name: str=None, optional=False, tooltip: str=None,
socketless: bool=True, default: dict=None, advanced: bool=None):
super().__init__(id, display_name, optional, tooltip, None, default, socketless, None, None, None, None, advanced)
if default is None:
self.default = {
"temperature": 0,
"hue": 0,
"brightness": 0,
"contrast": 0,
"saturation": 0,
"gamma": 1.0
}
def as_dict(self):
return super().as_dict()
@comfytype(io_type="COLOR_BALANCE")
class ColorBalance(ComfyTypeIO):
Type = dict
class Input(WidgetInput):
def __init__(self, id: str, display_name: str=None, optional=False, tooltip: str=None,
socketless: bool=True, default: dict=None, advanced: bool=None):
super().__init__(id, display_name, optional, tooltip, None, default, socketless, None, None, None, None, advanced)
if default is None:
self.default = {
"shadows_red": 0,
"shadows_green": 0,
"shadows_blue": 0,
"midtones_red": 0,
"midtones_green": 0,
"midtones_blue": 0,
"highlights_red": 0,
"highlights_green": 0,
"highlights_blue": 0
}
def as_dict(self):
return super().as_dict()
@comfytype(io_type="COLOR_CURVES")
class ColorCurves(ComfyTypeIO):
Type = dict
class Input(WidgetInput):
def __init__(self, id: str, display_name: str=None, optional=False, tooltip: str=None,
socketless: bool=True, default: dict=None, advanced: bool=None):
super().__init__(id, display_name, optional, tooltip, None, default, socketless, None, None, None, None, advanced)
if default is None:
self.default = {
"rgb": [[0, 0], [1, 1]],
"red": [[0, 0], [1, 1]],
"green": [[0, 0], [1, 1]],
"blue": [[0, 0], [1, 1]]
}
def as_dict(self):
return super().as_dict()
@comfytype(io_type="BOUNDING_BOX")
class BoundingBox(ComfyTypeIO):
Type = dict
class Input(WidgetInput):
def __init__(self, id: str, display_name: str=None, optional=False, tooltip: str=None,
socketless: bool=True, default: dict=None, component: str=None):
super().__init__(id, display_name, optional, tooltip, None, default, socketless)
self.component = component
if default is None:
self.default = {"x": 0, "y": 0, "width": 512, "height": 512}
def as_dict(self):
d = super().as_dict()
if self.component:
d["component"] = self.component
return d
DYNAMIC_INPUT_LOOKUP: dict[str, Callable[[dict[str, Any], dict[str, Any], tuple[str, dict[str, Any]], str, list[str] | None], None]] = {}
def register_dynamic_input_func(io_type: str, func: Callable[[dict[str, Any], dict[str, Any], tuple[str, dict[str, Any]], str, list[str] | None], None]):
DYNAMIC_INPUT_LOOKUP[io_type] = func
@@ -2036,74 +2113,11 @@ class _UIOutput(ABC):
...
class InputMapOldId(TypedDict):
"""Map an old node input to a new node input by ID."""
new_id: str
old_id: str
class InputMapSetValue(TypedDict):
"""Set a specific value for a new node input."""
new_id: str
set_value: Any
InputMap = InputMapOldId | InputMapSetValue
"""
Input mapping for node replacement. Type is inferred by dictionary keys:
- {"new_id": str, "old_id": str} - maps old input to new input
- {"new_id": str, "set_value": Any} - sets a specific value for new input
"""
class OutputMap(TypedDict):
"""Map outputs of node replacement via indexes."""
new_idx: int
old_idx: int
class NodeReplace:
"""
Defines a possible node replacement, mapping inputs and outputs of the old node to the new node.
Also supports assigning specific values to the input widgets of the new node.
Args:
new_node_id: The class name of the new replacement node.
old_node_id: The class name of the deprecated node.
old_widget_ids: Ordered list of input IDs for widgets that may not have an input slot
connected. The workflow JSON stores widget values by their relative position index,
not by ID. This list maps those positional indexes to input IDs, enabling the
replacement system to correctly identify widget values during node migration.
input_mapping: List of input mappings from old node to new node.
output_mapping: List of output mappings from old node to new node.
"""
def __init__(self,
new_node_id: str,
old_node_id: str,
old_widget_ids: list[str] | None=None,
input_mapping: list[InputMap] | None=None,
output_mapping: list[OutputMap] | None=None,
):
self.new_node_id = new_node_id
self.old_node_id = old_node_id
self.old_widget_ids = old_widget_ids
self.input_mapping = input_mapping
self.output_mapping = output_mapping
def as_dict(self):
"""Create serializable representation of the node replacement."""
return {
"new_node_id": self.new_node_id,
"old_node_id": self.old_node_id,
"old_widget_ids": self.old_widget_ids,
"input_mapping": list(self.input_mapping) if self.input_mapping else None,
"output_mapping": list(self.output_mapping) if self.output_mapping else None,
}
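# Hedged usage sketch, not part of this diff: a replacement for a deprecated
# node would have been declared with the NodeReplace API removed above roughly
# like this; all node and input IDs here are hypothetical.
replacement = NodeReplace(
    new_node_id="ImageScaleV2",
    old_node_id="ImageScale",
    old_widget_ids=["width", "height"],
    input_mapping=[
        {"new_id": "width", "old_id": "width"},         # carry a value over
        {"new_id": "method", "set_value": "bilinear"},  # pin a new widget
    ],
    output_mapping=[{"new_idx": 0, "old_idx": 0}],
)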
__all__ = [
"FolderType",
"UploadType",
"RemoteOptions",
"NumberDisplay",
"ControlAfterGenerate",
"comfytype",
"Custom",
@@ -2190,5 +2204,8 @@ __all__ = [
"ImageCompare",
"PriceBadgeDepends",
"PriceBadge",
"NodeReplace",
"BoundingBox",
"ColorCorrect",
"ColorBalance",
"ColorCurves"
]
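
For context, a node schema could consume the widget types added above roughly as follows; the node, category, and input IDs are hypothetical and this is a sketch, not a shipped node:

```python
from comfy_api.latest import IO

class ExampleCropNode(IO.ComfyNode):
    @classmethod
    def define_schema(cls):
        return IO.Schema(
            node_id="ExampleCrop",
            display_name="Example Crop",
            category="example",
            inputs=[
                IO.Image.Input("image"),
                # new widget types introduced in this change
                IO.BoundingBox.Input("region", tooltip="Area of the image to keep"),
                IO.ColorCorrect.Input("color", optional=True),
            ],
            outputs=[IO.Image.Output()],
        )
```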

View File

@@ -45,55 +45,17 @@ class BriaEditImageRequest(BaseModel):
)
class BriaRemoveBackgroundRequest(BaseModel):
image: str = Field(...)
sync: bool = Field(False)
visual_input_content_moderation: bool = Field(
False, description="If true, returns 422 on input image moderation failure."
)
visual_output_content_moderation: bool = Field(
False, description="If true, returns 422 on visual output moderation failure."
)
seed: int = Field(...)
class BriaStatusResponse(BaseModel):
request_id: str = Field(...)
status_url: str = Field(...)
warning: str | None = Field(None)
class BriaRemoveBackgroundResult(BaseModel):
image_url: str = Field(...)
class BriaRemoveBackgroundResponse(BaseModel):
status: str = Field(...)
result: BriaRemoveBackgroundResult | None = Field(None)
class BriaImageEditResult(BaseModel):
class BriaResult(BaseModel):
structured_prompt: str = Field(...)
image_url: str = Field(...)
class BriaImageEditResponse(BaseModel):
class BriaResponse(BaseModel):
status: str = Field(...)
result: BriaImageEditResult | None = Field(None)
class BriaRemoveVideoBackgroundRequest(BaseModel):
video: str = Field(...)
background_color: str = Field(default="transparent", description="Background color for the output video.")
output_container_and_codec: str = Field(...)
preserve_audio: bool = Field(True)
seed: int = Field(...)
class BriaRemoveVideoBackgroundResult(BaseModel):
video_url: str = Field(...)
class BriaRemoveVideoBackgroundResponse(BaseModel):
status: str = Field(...)
result: BriaRemoveVideoBackgroundResult | None = Field(None)
result: BriaResult | None = Field(None)

View File

@@ -64,23 +64,3 @@ class To3DProTaskResultResponse(BaseModel):
class To3DProTaskQueryRequest(BaseModel):
JobId: str = Field(...)
class To3DUVFileInput(BaseModel):
Type: str = Field(..., description="File type: GLB, OBJ, or FBX")
Url: str = Field(...)
class To3DUVTaskRequest(BaseModel):
File: To3DUVFileInput = Field(...)
class TextureEditImageInfo(BaseModel):
Url: str = Field(...)
class TextureEditTaskRequest(BaseModel):
File3D: To3DUVFileInput = Field(...)
Image: TextureEditImageInfo | None = Field(None)
Prompt: str | None = Field(None)
EnablePBR: bool | None = Field(None)

View File

@@ -3,11 +3,7 @@ from typing_extensions import override
from comfy_api.latest import IO, ComfyExtension, Input
from comfy_api_nodes.apis.bria import (
BriaEditImageRequest,
BriaRemoveBackgroundRequest,
BriaRemoveBackgroundResponse,
BriaRemoveVideoBackgroundRequest,
BriaRemoveVideoBackgroundResponse,
BriaImageEditResponse,
BriaResponse,
BriaStatusResponse,
InputModerationSettings,
)
@@ -15,12 +11,10 @@ from comfy_api_nodes.util import (
ApiEndpoint,
convert_mask_to_image,
download_url_to_image_tensor,
download_url_to_video_output,
get_number_of_images,
poll_op,
sync_op,
upload_image_to_comfyapi,
upload_video_to_comfyapi,
validate_video_duration,
upload_images_to_comfyapi,
)
@@ -79,15 +73,21 @@ class BriaImageEditNode(IO.ComfyNode):
IO.DynamicCombo.Input(
"moderation",
options=[
IO.DynamicCombo.Option("false", []),
IO.DynamicCombo.Option(
"true",
[
IO.Boolean.Input("prompt_content_moderation", default=False),
IO.Boolean.Input("visual_input_moderation", default=False),
IO.Boolean.Input("visual_output_moderation", default=True),
IO.Boolean.Input(
"prompt_content_moderation", default=False
),
IO.Boolean.Input(
"visual_input_moderation", default=False
),
IO.Boolean.Input(
"visual_output_moderation", default=True
),
],
),
IO.DynamicCombo.Option("false", []),
],
tooltip="Moderation settings",
),
@@ -127,26 +127,50 @@ class BriaImageEditNode(IO.ComfyNode):
mask: Input.Image | None = None,
) -> IO.NodeOutput:
if not prompt and not structured_prompt:
raise ValueError("One of prompt or structured_prompt is required to be non-empty.")
raise ValueError(
"One of prompt or structured_prompt is required to be non-empty."
)
if get_number_of_images(image) != 1:
raise ValueError("Exactly one input image is required.")
mask_url = None
if mask is not None:
mask_url = await upload_image_to_comfyapi(cls, convert_mask_to_image(mask), wait_label="Uploading mask")
mask_url = (
await upload_images_to_comfyapi(
cls,
convert_mask_to_image(mask),
max_images=1,
mime_type="image/png",
wait_label="Uploading mask",
)
)[0]
response = await sync_op(
cls,
ApiEndpoint(path="proxy/bria/v2/image/edit", method="POST"),
data=BriaEditImageRequest(
instruction=prompt if prompt else None,
structured_instruction=structured_prompt if structured_prompt else None,
images=[await upload_image_to_comfyapi(cls, image, wait_label="Uploading image")],
images=await upload_images_to_comfyapi(
cls,
image,
max_images=1,
mime_type="image/png",
wait_label="Uploading image",
),
mask=mask_url,
negative_prompt=negative_prompt if negative_prompt else None,
guidance_scale=guidance_scale,
seed=seed,
model_version=model,
steps_num=steps,
prompt_content_moderation=moderation.get("prompt_content_moderation", False),
visual_input_content_moderation=moderation.get("visual_input_moderation", False),
visual_output_content_moderation=moderation.get("visual_output_moderation", False),
prompt_content_moderation=moderation.get(
"prompt_content_moderation", False
),
visual_input_content_moderation=moderation.get(
"visual_input_moderation", False
),
visual_output_content_moderation=moderation.get(
"visual_output_moderation", False
),
),
response_model=BriaStatusResponse,
)
@@ -154,7 +178,7 @@ class BriaImageEditNode(IO.ComfyNode):
cls,
ApiEndpoint(path=f"/proxy/bria/v2/status/{response.request_id}"),
status_extractor=lambda r: r.status,
response_model=BriaImageEditResponse,
response_model=BriaResponse,
)
return IO.NodeOutput(
await download_url_to_image_tensor(response.result.image_url),
@@ -162,167 +186,11 @@ class BriaImageEditNode(IO.ComfyNode):
)
class BriaRemoveImageBackground(IO.ComfyNode):
@classmethod
def define_schema(cls):
return IO.Schema(
node_id="BriaRemoveImageBackground",
display_name="Bria Remove Image Background",
category="api node/image/Bria",
description="Remove the background from an image using Bria RMBG 2.0.",
inputs=[
IO.Image.Input("image"),
IO.DynamicCombo.Input(
"moderation",
options=[
IO.DynamicCombo.Option("false", []),
IO.DynamicCombo.Option(
"true",
[
IO.Boolean.Input("visual_input_moderation", default=False),
IO.Boolean.Input("visual_output_moderation", default=True),
],
),
],
tooltip="Moderation settings",
),
IO.Int.Input(
"seed",
default=0,
min=0,
max=2147483647,
display_mode=IO.NumberDisplay.number,
control_after_generate=True,
tooltip="Seed controls whether the node should re-run; "
"results are non-deterministic regardless of seed.",
),
],
outputs=[IO.Image.Output()],
hidden=[
IO.Hidden.auth_token_comfy_org,
IO.Hidden.api_key_comfy_org,
IO.Hidden.unique_id,
],
is_api_node=True,
price_badge=IO.PriceBadge(
expr="""{"type":"usd","usd":0.018}""",
),
)
@classmethod
async def execute(
cls,
image: Input.Image,
moderation: dict,
seed: int,
) -> IO.NodeOutput:
response = await sync_op(
cls,
ApiEndpoint(path="/proxy/bria/v2/image/edit/remove_background", method="POST"),
data=BriaRemoveBackgroundRequest(
image=await upload_image_to_comfyapi(cls, image, wait_label="Uploading image"),
sync=False,
visual_input_content_moderation=moderation.get("visual_input_moderation", False),
visual_output_content_moderation=moderation.get("visual_output_moderation", False),
seed=seed,
),
response_model=BriaStatusResponse,
)
response = await poll_op(
cls,
ApiEndpoint(path=f"/proxy/bria/v2/status/{response.request_id}"),
status_extractor=lambda r: r.status,
response_model=BriaRemoveBackgroundResponse,
)
return IO.NodeOutput(await download_url_to_image_tensor(response.result.image_url))
class BriaRemoveVideoBackground(IO.ComfyNode):
@classmethod
def define_schema(cls):
return IO.Schema(
node_id="BriaRemoveVideoBackground",
display_name="Bria Remove Video Background",
category="api node/video/Bria",
description="Remove the background from a video using Bria. ",
inputs=[
IO.Video.Input("video"),
IO.Combo.Input(
"background_color",
options=[
"Black",
"White",
"Gray",
"Red",
"Green",
"Blue",
"Yellow",
"Cyan",
"Magenta",
"Orange",
],
tooltip="Background color for the output video.",
),
IO.Int.Input(
"seed",
default=0,
min=0,
max=2147483647,
display_mode=IO.NumberDisplay.number,
control_after_generate=True,
tooltip="Seed controls whether the node should re-run; "
"results are non-deterministic regardless of seed.",
),
],
outputs=[IO.Video.Output()],
hidden=[
IO.Hidden.auth_token_comfy_org,
IO.Hidden.api_key_comfy_org,
IO.Hidden.unique_id,
],
is_api_node=True,
price_badge=IO.PriceBadge(
expr="""{"type":"usd","usd":0.14,"format":{"suffix":"/second"}}""",
),
)
@classmethod
async def execute(
cls,
video: Input.Video,
background_color: str,
seed: int,
) -> IO.NodeOutput:
validate_video_duration(video, max_duration=60.0)
response = await sync_op(
cls,
ApiEndpoint(path="/proxy/bria/v2/video/edit/remove_background", method="POST"),
data=BriaRemoveVideoBackgroundRequest(
video=await upload_video_to_comfyapi(cls, video),
background_color=background_color,
output_container_and_codec="mp4_h264",
seed=seed,
),
response_model=BriaStatusResponse,
)
response = await poll_op(
cls,
ApiEndpoint(path=f"/proxy/bria/v2/status/{response.request_id}"),
status_extractor=lambda r: r.status,
response_model=BriaRemoveVideoBackgroundResponse,
)
return IO.NodeOutput(await download_url_to_video_output(response.result.video_url))
class BriaExtension(ComfyExtension):
@override
async def get_node_list(self) -> list[type[IO.ComfyNode]]:
return [
BriaImageEditNode,
BriaRemoveImageBackground,
BriaRemoveVideoBackground,
]

View File

@@ -1,48 +1,31 @@
from typing_extensions import override
from comfy_api.latest import IO, ComfyExtension, Input, Types
from comfy_api.latest import IO, ComfyExtension, Input
from comfy_api_nodes.apis.hunyuan3d import (
Hunyuan3DViewImage,
InputGenerateType,
ResultFile3D,
TextureEditTaskRequest,
To3DProTaskCreateResponse,
To3DProTaskQueryRequest,
To3DProTaskRequest,
To3DProTaskResultResponse,
To3DUVFileInput,
To3DUVTaskRequest,
)
from comfy_api_nodes.util import (
ApiEndpoint,
download_url_to_file_3d,
download_url_to_image_tensor,
downscale_image_tensor_by_max_side,
poll_op,
sync_op,
upload_3d_model_to_comfyapi,
upload_image_to_comfyapi,
validate_image_dimensions,
validate_string,
)
def _is_tencent_rate_limited(status: int, body: object) -> bool:
return (
status == 400
and isinstance(body, dict)
and "RequestLimitExceeded" in str(body.get("Response", {}).get("Error", {}).get("Code", ""))
)
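For reference, the kind of error envelope the predicate above is written against — Tencent nests the code under `Response.Error`, and the substring check tolerates suffixed codes. An illustrative check (bodies are hypothetical, shaped after the lookups in the function):

```
body = {"Response": {"Error": {"Code": "RequestLimitExceeded", "Message": "rate limited"}}}
assert _is_tencent_rate_limited(400, body)
assert not _is_tencent_rate_limited(400, {"Response": {"Error": {"Code": "InvalidParameter"}}})
assert not _is_tencent_rate_limited(429, body)  # only HTTP 400 qualifies here
```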
def get_file_from_response(
response_objs: list[ResultFile3D], file_type: str, raise_if_not_found: bool = True
) -> ResultFile3D | None:
def get_file_from_response(response_objs: list[ResultFile3D], file_type: str) -> ResultFile3D | None:
for i in response_objs:
if i.Type.lower() == file_type.lower():
return i
if raise_if_not_found:
raise ValueError(f"'{file_type}' file type is not found in the response.")
return None
@@ -52,7 +35,7 @@ class TencentTextToModelNode(IO.ComfyNode):
def define_schema(cls):
return IO.Schema(
node_id="TencentTextToModelNode",
display_name="Hunyuan3D: Text to Model",
display_name="Hunyuan3D: Text to Model (Pro)",
category="api node/3d/Tencent",
inputs=[
IO.Combo.Input(
@@ -137,7 +120,6 @@ class TencentTextToModelNode(IO.ComfyNode):
EnablePBR=generate_type.get("pbr", None),
PolygonType=generate_type.get("polygon_type", None),
),
is_rate_limited=_is_tencent_rate_limited,
)
if response.Error:
raise ValueError(f"Task creation failed with code {response.Error.Code}: {response.Error.Message}")
@@ -149,14 +131,11 @@ class TencentTextToModelNode(IO.ComfyNode):
response_model=To3DProTaskResultResponse,
status_extractor=lambda r: r.Status,
)
glb_result = get_file_from_response(result.ResultFile3Ds, "glb")
obj_result = get_file_from_response(result.ResultFile3Ds, "obj")
file_glb = await download_url_to_file_3d(glb_result.Url, "glb", task_id=task_id) if glb_result else None
return IO.NodeOutput(
f"{task_id}.glb",
await download_url_to_file_3d(
get_file_from_response(result.ResultFile3Ds, "glb").Url, "glb", task_id=task_id
),
await download_url_to_file_3d(
get_file_from_response(result.ResultFile3Ds, "obj").Url, "obj", task_id=task_id
),
file_glb, file_glb, await download_url_to_file_3d(obj_result.Url, "obj", task_id=task_id) if obj_result else None
)
@@ -166,7 +145,7 @@ class TencentImageToModelNode(IO.ComfyNode):
def define_schema(cls):
return IO.Schema(
node_id="TencentImageToModelNode",
display_name="Hunyuan3D: Image(s) to Model",
display_name="Hunyuan3D: Image(s) to Model (Pro)",
category="api node/3d/Tencent",
inputs=[
IO.Combo.Input(
@@ -289,7 +268,6 @@ class TencentImageToModelNode(IO.ComfyNode):
EnablePBR=generate_type.get("pbr", None),
PolygonType=generate_type.get("polygon_type", None),
),
is_rate_limited=_is_tencent_rate_limited,
)
if response.Error:
raise ValueError(f"Task creation failed with code {response.Error.Code}: {response.Error.Message}")
@@ -301,257 +279,11 @@ class TencentImageToModelNode(IO.ComfyNode):
response_model=To3DProTaskResultResponse,
status_extractor=lambda r: r.Status,
)
glb_result = get_file_from_response(result.ResultFile3Ds, "glb")
obj_result = get_file_from_response(result.ResultFile3Ds, "obj")
file_glb = await download_url_to_file_3d(glb_result.Url, "glb", task_id=task_id) if glb_result else None
return IO.NodeOutput(
f"{task_id}.glb",
await download_url_to_file_3d(
get_file_from_response(result.ResultFile3Ds, "glb").Url, "glb", task_id=task_id
),
await download_url_to_file_3d(
get_file_from_response(result.ResultFile3Ds, "obj").Url, "obj", task_id=task_id
),
)
class TencentModelTo3DUVNode(IO.ComfyNode):
@classmethod
def define_schema(cls):
return IO.Schema(
node_id="TencentModelTo3DUVNode",
display_name="Hunyuan3D: Model to UV",
category="api node/3d/Tencent",
description="Perform UV unfolding on a 3D model to generate UV texture. "
"Input model must have less than 30000 faces.",
inputs=[
IO.MultiType.Input(
"model_3d",
types=[IO.File3DGLB, IO.File3DOBJ, IO.File3DFBX, IO.File3DAny],
tooltip="Input 3D model (GLB, OBJ, or FBX)",
),
IO.Int.Input(
"seed",
default=1,
min=0,
max=2147483647,
display_mode=IO.NumberDisplay.number,
control_after_generate=True,
tooltip="Seed controls whether the node should re-run; "
"results are non-deterministic regardless of seed.",
),
],
outputs=[
IO.File3DOBJ.Output(display_name="OBJ"),
IO.File3DFBX.Output(display_name="FBX"),
IO.Image.Output(),
],
hidden=[
IO.Hidden.auth_token_comfy_org,
IO.Hidden.api_key_comfy_org,
IO.Hidden.unique_id,
],
is_api_node=True,
price_badge=IO.PriceBadge(expr='{"type":"usd","usd":0.2}'),
)
SUPPORTED_FORMATS = {"glb", "obj", "fbx"}
@classmethod
async def execute(
cls,
model_3d: Types.File3D,
seed: int,
) -> IO.NodeOutput:
_ = seed
file_format = model_3d.format.lower()
if file_format not in cls.SUPPORTED_FORMATS:
raise ValueError(
f"Unsupported file format: '{file_format}'. "
f"Supported formats: {', '.join(sorted(cls.SUPPORTED_FORMATS))}."
)
response = await sync_op(
cls,
ApiEndpoint(path="/proxy/tencent/hunyuan/3d-uv", method="POST"),
response_model=To3DProTaskCreateResponse,
data=To3DUVTaskRequest(
File=To3DUVFileInput(
Type=file_format.upper(),
Url=await upload_3d_model_to_comfyapi(cls, model_3d, file_format),
)
),
is_rate_limited=_is_tencent_rate_limited,
)
if response.Error:
raise ValueError(f"Task creation failed with code {response.Error.Code}: {response.Error.Message}")
result = await poll_op(
cls,
ApiEndpoint(path="/proxy/tencent/hunyuan/3d-uv/query", method="POST"),
data=To3DProTaskQueryRequest(JobId=response.JobId),
response_model=To3DProTaskResultResponse,
status_extractor=lambda r: r.Status,
)
return IO.NodeOutput(
await download_url_to_file_3d(get_file_from_response(result.ResultFile3Ds, "obj").Url, "obj"),
await download_url_to_file_3d(get_file_from_response(result.ResultFile3Ds, "fbx").Url, "fbx"),
await download_url_to_image_tensor(get_file_from_response(result.ResultFile3Ds, "image").Url),
)
class Tencent3DTextureEditNode(IO.ComfyNode):
@classmethod
def define_schema(cls):
return IO.Schema(
node_id="Tencent3DTextureEditNode",
display_name="Hunyuan3D: 3D Texture Edit",
category="api node/3d/Tencent",
description="After inputting the 3D model, perform 3D model texture redrawing.",
inputs=[
IO.MultiType.Input(
"model_3d",
types=[IO.File3DFBX, IO.File3DAny],
tooltip="3D model in FBX format. Model should have less than 100000 faces.",
),
IO.String.Input(
"prompt",
multiline=True,
default="",
tooltip="Describes texture editing. Supports up to 1024 UTF-8 characters.",
),
IO.Int.Input(
"seed",
default=0,
min=0,
max=2147483647,
display_mode=IO.NumberDisplay.number,
control_after_generate=True,
tooltip="Seed controls whether the node should re-run; "
"results are non-deterministic regardless of seed.",
),
],
outputs=[
IO.File3DGLB.Output(display_name="GLB"),
IO.File3DFBX.Output(display_name="FBX"),
],
hidden=[
IO.Hidden.auth_token_comfy_org,
IO.Hidden.api_key_comfy_org,
IO.Hidden.unique_id,
],
is_api_node=True,
price_badge=IO.PriceBadge(
expr="""{"type":"usd","usd": 0.6}""",
),
)
@classmethod
async def execute(
cls,
model_3d: Types.File3D,
prompt: str,
seed: int,
) -> IO.NodeOutput:
_ = seed
file_format = model_3d.format.lower()
if file_format != "fbx":
raise ValueError(f"Unsupported file format: '{file_format}'. Only FBX format is supported.")
validate_string(prompt, field_name="prompt", min_length=1, max_length=1024)
model_url = await upload_3d_model_to_comfyapi(cls, model_3d, file_format)
response = await sync_op(
cls,
ApiEndpoint(path="/proxy/tencent/hunyuan/3d-texture-edit", method="POST"),
response_model=To3DProTaskCreateResponse,
data=TextureEditTaskRequest(
File3D=To3DUVFileInput(Type=file_format.upper(), Url=model_url),
Prompt=prompt,
EnablePBR=True,
),
is_rate_limited=_is_tencent_rate_limited,
)
if response.Error:
raise ValueError(f"Task creation failed with code {response.Error.Code}: {response.Error.Message}")
result = await poll_op(
cls,
ApiEndpoint(path="/proxy/tencent/hunyuan/3d-texture-edit/query", method="POST"),
data=To3DProTaskQueryRequest(JobId=response.JobId),
response_model=To3DProTaskResultResponse,
status_extractor=lambda r: r.Status,
)
return IO.NodeOutput(
await download_url_to_file_3d(get_file_from_response(result.ResultFile3Ds, "glb").Url, "glb"),
await download_url_to_file_3d(get_file_from_response(result.ResultFile3Ds, "fbx").Url, "fbx"),
)
class Tencent3DPartNode(IO.ComfyNode):
@classmethod
def define_schema(cls):
return IO.Schema(
node_id="Tencent3DPartNode",
display_name="Hunyuan3D: 3D Part",
category="api node/3d/Tencent",
description="Automatically perform component identification and generation based on the model structure.",
inputs=[
IO.MultiType.Input(
"model_3d",
types=[IO.File3DFBX, IO.File3DAny],
tooltip="3D model in FBX format. Model should have less than 30000 faces.",
),
IO.Int.Input(
"seed",
default=0,
min=0,
max=2147483647,
display_mode=IO.NumberDisplay.number,
control_after_generate=True,
tooltip="Seed controls whether the node should re-run; "
"results are non-deterministic regardless of seed.",
),
],
outputs=[
IO.File3DFBX.Output(display_name="FBX"),
],
hidden=[
IO.Hidden.auth_token_comfy_org,
IO.Hidden.api_key_comfy_org,
IO.Hidden.unique_id,
],
is_api_node=True,
price_badge=IO.PriceBadge(expr='{"type":"usd","usd":0.6}'),
)
@classmethod
async def execute(
cls,
model_3d: Types.File3D,
seed: int,
) -> IO.NodeOutput:
_ = seed
file_format = model_3d.format.lower()
if file_format != "fbx":
raise ValueError(f"Unsupported file format: '{file_format}'. Only FBX format is supported.")
model_url = await upload_3d_model_to_comfyapi(cls, model_3d, file_format)
response = await sync_op(
cls,
ApiEndpoint(path="/proxy/tencent/hunyuan/3d-part", method="POST"),
response_model=To3DProTaskCreateResponse,
data=To3DUVTaskRequest(
File=To3DUVFileInput(Type=file_format.upper(), Url=model_url),
),
is_rate_limited=_is_tencent_rate_limited,
)
if response.Error:
raise ValueError(f"Task creation failed with code {response.Error.Code}: {response.Error.Message}")
result = await poll_op(
cls,
ApiEndpoint(path="/proxy/tencent/hunyuan/3d-part/query", method="POST"),
data=To3DProTaskQueryRequest(JobId=response.JobId),
response_model=To3DProTaskResultResponse,
status_extractor=lambda r: r.Status,
)
return IO.NodeOutput(
await download_url_to_file_3d(get_file_from_response(result.ResultFile3Ds, "fbx").Url, "fbx"),
file_glb, file_glb, await download_url_to_file_3d(obj_result.Url, "obj", task_id=task_id) if obj_result else None
)
@@ -561,9 +293,6 @@ class TencentHunyuan3DExtension(ComfyExtension):
return [
TencentTextToModelNode,
TencentImageToModelNode,
# TencentModelTo3DUVNode,
# Tencent3DTextureEditNode,
Tencent3DPartNode,
]

View File

@@ -43,6 +43,7 @@ class SupportedOpenAIModel(str, Enum):
o1 = "o1"
o3 = "o3"
o1_pro = "o1-pro"
gpt_4o = "gpt-4o"
gpt_4_1 = "gpt-4.1"
gpt_4_1_mini = "gpt-4.1-mini"
gpt_4_1_nano = "gpt-4.1-nano"
@@ -648,6 +649,11 @@ class OpenAIChatNode(IO.ComfyNode):
"usd": [0.01, 0.04],
"format": { "approximate": true, "separator": "-", "suffix": " per 1K tokens" }
}
: $contains($m, "gpt-4o") ? {
"type": "list_usd",
"usd": [0.0025, 0.01],
"format": { "approximate": true, "separator": "-", "suffix": " per 1K tokens" }
}
: $contains($m, "gpt-4.1-nano") ? {
"type": "list_usd",
"usd": [0.0001, 0.0004],

View File

@@ -33,7 +33,6 @@ from .download_helpers import (
download_url_to_video_output,
)
from .upload_helpers import (
upload_3d_model_to_comfyapi,
upload_audio_to_comfyapi,
upload_file_to_comfyapi,
upload_image_to_comfyapi,
@@ -63,7 +62,6 @@ __all__ = [
"sync_op",
"sync_op_raw",
# Upload helpers
"upload_3d_model_to_comfyapi",
"upload_audio_to_comfyapi",
"upload_file_to_comfyapi",
"upload_image_to_comfyapi",

View File

@@ -57,7 +57,7 @@ def tensor_to_bytesio(
image: torch.Tensor,
*,
total_pixels: int | None = 2048 * 2048,
mime_type: str | None = "image/png",
mime_type: str = "image/png",
) -> BytesIO:
"""Converts a torch.Tensor image to a named BytesIO object.

View File

@@ -164,27 +164,6 @@ async def upload_video_to_comfyapi(
return await upload_file_to_comfyapi(cls, video_bytes_io, filename, upload_mime_type, wait_label)
_3D_MIME_TYPES = {
"glb": "model/gltf-binary",
"obj": "model/obj",
"fbx": "application/octet-stream",
}
async def upload_3d_model_to_comfyapi(
cls: type[IO.ComfyNode],
model_3d: Types.File3D,
file_format: str,
) -> str:
"""Uploads a 3D model file to ComfyUI API and returns its download URL."""
return await upload_file_to_comfyapi(
cls,
model_3d.get_data(),
f"{uuid.uuid4()}.{file_format}",
_3D_MIME_TYPES.get(file_format, "application/octet-stream"),
)
async def upload_file_to_comfyapi(
cls: type[IO.ComfyNode],
file_bytes_io: BytesIO,

View File

@@ -0,0 +1,78 @@
from typing_extensions import override
import torch
from comfy_api.latest import ComfyExtension, io, ui
def _smoothstep(edge0: float, edge1: float, x: torch.Tensor) -> torch.Tensor:
t = torch.clamp((x - edge0) / (edge1 - edge0), 0.0, 1.0)
return t * t * (3.0 - 2.0 * t)
class ColorBalanceNode(io.ComfyNode):
@classmethod
def define_schema(cls):
return io.Schema(
node_id="ColorBalance",
display_name="Color Balance",
category="image/adjustment",
inputs=[
io.Image.Input("image"),
io.ColorBalance.Input("settings"),
],
outputs=[
io.Image.Output(),
],
)
@classmethod
def execute(cls, image: torch.Tensor, settings: dict) -> io.NodeOutput:
shadows_red = settings.get("shadows_red", 0)
shadows_green = settings.get("shadows_green", 0)
shadows_blue = settings.get("shadows_blue", 0)
midtones_red = settings.get("midtones_red", 0)
midtones_green = settings.get("midtones_green", 0)
midtones_blue = settings.get("midtones_blue", 0)
highlights_red = settings.get("highlights_red", 0)
highlights_green = settings.get("highlights_green", 0)
highlights_blue = settings.get("highlights_blue", 0)
result = image.clone().float()
# Compute per-pixel luminance
luminance = (
0.2126 * result[..., 0]
+ 0.7152 * result[..., 1]
+ 0.0722 * result[..., 2]
)
# Compute tonal range weights
shadow_weight = 1.0 - _smoothstep(0.0, 0.5, luminance)
highlight_weight = _smoothstep(0.5, 1.0, luminance)
midtone_weight = 1.0 - shadow_weight - highlight_weight
# Apply offsets per channel
for ch, (s, m, h) in enumerate([
(shadows_red, midtones_red, highlights_red),
(shadows_green, midtones_green, highlights_green),
(shadows_blue, midtones_blue, highlights_blue),
]):
offset = (
shadow_weight * (s / 100.0)
+ midtone_weight * (m / 100.0)
+ highlight_weight * (h / 100.0)
)
result[..., ch] = result[..., ch] + offset
result = torch.clamp(result, 0, 1)
return io.NodeOutput(result, ui=ui.PreviewImage(result))
class ColorBalanceExtension(ComfyExtension):
@override
async def get_node_list(self) -> list[type[io.ComfyNode]]:
return [ColorBalanceNode]
async def comfy_entrypoint() -> ColorBalanceExtension:
return ColorBalanceExtension()
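The three tonal weights above partition unity at every luminance, so equal offsets in all three bands shift a channel uniformly by `offset / 100`. A quick check, reusing `_smoothstep` from the top of this file:

```
import torch

lum = torch.linspace(0.0, 1.0, 11)
shadow = 1.0 - _smoothstep(0.0, 0.5, lum)
highlight = _smoothstep(0.5, 1.0, lum)
midtone = 1.0 - shadow - highlight
# shadow + midtone + highlight == 1 everywhere by construction
assert torch.allclose(shadow + midtone + highlight, torch.ones_like(lum))
```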

View File

@@ -0,0 +1,88 @@
from typing_extensions import override
import torch
import numpy as np
from comfy_api.latest import ComfyExtension, io, ui
class ColorCorrectNode(io.ComfyNode):
@classmethod
def define_schema(cls):
return io.Schema(
node_id="ColorCorrect",
display_name="Color Correct",
category="image/adjustment",
inputs=[
io.Image.Input("image"),
io.ColorCorrect.Input("settings"),
],
outputs=[
io.Image.Output(),
],
)
@classmethod
def execute(cls, image: torch.Tensor, settings: dict) -> io.NodeOutput:
temperature = settings.get("temperature", 0)
hue = settings.get("hue", 0)
brightness = settings.get("brightness", 0)
contrast = settings.get("contrast", 0)
saturation = settings.get("saturation", 0)
gamma = settings.get("gamma", 1.0)
result = image.clone()
# Brightness: scale RGB values
if brightness != 0:
factor = 1.0 + brightness / 100.0
result = result * factor
# Contrast: adjust around midpoint
if contrast != 0:
factor = 1.0 + contrast / 100.0
mean = result[..., :3].mean()
result[..., :3] = (result[..., :3] - mean) * factor + mean
# Temperature: shift warm (red+) / cool (blue+)
if temperature != 0:
temp_factor = temperature / 100.0
result[..., 0] = result[..., 0] + temp_factor * 0.1 # Red
result[..., 2] = result[..., 2] - temp_factor * 0.1 # Blue
# Gamma correction
if gamma != 1.0:
result[..., :3] = torch.pow(torch.clamp(result[..., :3], 0, 1), 1.0 / gamma)
# Saturation: convert to HSV-like space
if saturation != 0:
factor = 1.0 + saturation / 100.0
gray = result[..., :3].mean(dim=-1, keepdim=True)
result[..., :3] = gray + (result[..., :3] - gray) * factor
# Hue rotation: rotate in RGB space using rotation matrix
if hue != 0:
angle = np.radians(hue)
cos_a = np.cos(angle)
sin_a = np.sin(angle)
# Rodrigues' rotation formula around (1,1,1)/sqrt(3) axis
k = 1.0 / 3.0
rotation = torch.tensor([
[cos_a + k * (1 - cos_a), k * (1 - cos_a) - sin_a / np.sqrt(3), k * (1 - cos_a) + sin_a / np.sqrt(3)],
[k * (1 - cos_a) + sin_a / np.sqrt(3), cos_a + k * (1 - cos_a), k * (1 - cos_a) - sin_a / np.sqrt(3)],
[k * (1 - cos_a) - sin_a / np.sqrt(3), k * (1 - cos_a) + sin_a / np.sqrt(3), cos_a + k * (1 - cos_a)]
], dtype=result.dtype, device=result.device)
rgb = result[..., :3]
result[..., :3] = torch.matmul(rgb, rotation.T)
result = torch.clamp(result, 0, 1)
return io.NodeOutput(result, ui=ui.PreviewImage(result))
class ColorCorrectExtension(ComfyExtension):
@override
async def get_node_list(self) -> list[type[io.ComfyNode]]:
return [ColorCorrectNode]
async def comfy_entrypoint() -> ColorCorrectExtension:
return ColorCorrectExtension()
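One property of the hue rotation worth noting: Rodrigues' rotation about the (1,1,1)/sqrt(3) axis leaves the gray axis fixed, so hue shifts never tint neutral pixels. A standalone check of the same matrix:

```
import numpy as np

angle = np.radians(30)
c, s, k = np.cos(angle), np.sin(angle), 1.0 / 3.0
rotation = np.array([
    [c + k * (1 - c), k * (1 - c) - s / np.sqrt(3), k * (1 - c) + s / np.sqrt(3)],
    [k * (1 - c) + s / np.sqrt(3), c + k * (1 - c), k * (1 - c) - s / np.sqrt(3)],
    [k * (1 - c) - s / np.sqrt(3), k * (1 - c) + s / np.sqrt(3), c + k * (1 - c)],
])
gray = np.array([0.5, 0.5, 0.5])
assert np.allclose(rotation @ gray, gray)  # neutral grays pass through unchanged
```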

View File

@@ -0,0 +1,137 @@
from typing_extensions import override
import torch
import numpy as np
from comfy_api.latest import ComfyExtension, io, ui
def _monotone_cubic_hermite(xs, ys, x_query):
"""Evaluate monotone cubic Hermite interpolation at x_query points."""
n = len(xs)
if n == 0:
return np.zeros_like(x_query)
if n == 1:
return np.full_like(x_query, ys[0])
# Compute slopes
deltas = np.diff(ys) / np.maximum(np.diff(xs), 1e-10)
# Compute tangents (Fritsch-Carlson)
slopes = np.zeros(n)
slopes[0] = deltas[0]
slopes[-1] = deltas[-1]
for i in range(1, n - 1):
if deltas[i - 1] * deltas[i] <= 0:
slopes[i] = 0
else:
slopes[i] = (deltas[i - 1] + deltas[i]) / 2
# Enforce monotonicity
for i in range(n - 1):
if deltas[i] == 0:
slopes[i] = 0
slopes[i + 1] = 0
else:
alpha = slopes[i] / deltas[i]
beta = slopes[i + 1] / deltas[i]
s = alpha ** 2 + beta ** 2
if s > 9:
t = 3 / np.sqrt(s)
slopes[i] = t * alpha * deltas[i]
slopes[i + 1] = t * beta * deltas[i]
# Evaluate
result = np.zeros_like(x_query, dtype=np.float64)
indices = np.searchsorted(xs, x_query, side='right') - 1
indices = np.clip(indices, 0, n - 2)
for i in range(n - 1):
mask = indices == i
if not np.any(mask):
continue
dx = xs[i + 1] - xs[i]
if dx == 0:
result[mask] = ys[i]
continue
t = (x_query[mask] - xs[i]) / dx
t2 = t * t
t3 = t2 * t
h00 = 2 * t3 - 3 * t2 + 1
h10 = t3 - 2 * t2 + t
h01 = -2 * t3 + 3 * t2
h11 = t3 - t2
result[mask] = h00 * ys[i] + h10 * dx * slopes[i] + h01 * ys[i + 1] + h11 * dx * slopes[i + 1]
# Clamp edges
result[x_query <= xs[0]] = ys[0]
result[x_query >= xs[-1]] = ys[-1]
return result
def _build_lut(points):
"""Build a 256-entry LUT from curve control points in [0,1] space."""
if not points or len(points) < 2:
return np.arange(256, dtype=np.float64) / 255.0
pts = sorted(points, key=lambda p: p[0])
xs = np.array([p[0] for p in pts], dtype=np.float64)
ys = np.array([p[1] for p in pts], dtype=np.float64)
x_query = np.linspace(0, 1, 256)
lut = _monotone_cubic_hermite(xs, ys, x_query)
return np.clip(lut, 0, 1)
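A quick sanity check of the LUT builder with a classic contrast S-curve — edge clamping pins the endpoints and the Fritsch-Carlson tangents keep the curve monotone:

```
import numpy as np

lut = _build_lut([[0.0, 0.0], [0.25, 0.15], [0.75, 0.85], [1.0, 1.0]])
print(lut[0], lut[255])           # 0.0 1.0 — endpoints stay pinned
print(np.all(np.diff(lut) >= 0))  # True — no inversions anywhere in the ramp
```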
class ColorCurvesNode(io.ComfyNode):
@classmethod
def define_schema(cls):
return io.Schema(
node_id="ColorCurves",
display_name="Color Curves",
category="image/adjustment",
inputs=[
io.Image.Input("image"),
io.ColorCurves.Input("settings"),
],
outputs=[
io.Image.Output(),
],
)
@classmethod
def execute(cls, image: torch.Tensor, settings: dict) -> io.NodeOutput:
rgb_pts = settings.get("rgb", [[0, 0], [1, 1]])
red_pts = settings.get("red", [[0, 0], [1, 1]])
green_pts = settings.get("green", [[0, 0], [1, 1]])
blue_pts = settings.get("blue", [[0, 0], [1, 1]])
rgb_lut = _build_lut(rgb_pts)
red_lut = _build_lut(red_pts)
green_lut = _build_lut(green_pts)
blue_lut = _build_lut(blue_pts)
# Convert to numpy for LUT application
img_np = image.cpu().numpy().copy()
# Apply per-channel curves then RGB master curve
for ch, ch_lut in enumerate([red_lut, green_lut, blue_lut]):
# Per-channel curve
indices = np.clip(img_np[..., ch] * 255, 0, 255).astype(np.int32)
img_np[..., ch] = ch_lut[indices]
# RGB master curve
indices = np.clip(img_np[..., ch] * 255, 0, 255).astype(np.int32)
img_np[..., ch] = rgb_lut[indices]
result = torch.from_numpy(np.clip(img_np, 0, 1)).to(image.device, dtype=image.dtype)
return io.NodeOutput(result, ui=ui.PreviewImage(result))
class ColorCurvesExtension(ComfyExtension):
@override
async def get_node_list(self) -> list[type[io.ComfyNode]]:
return [ColorCurvesNode]
async def comfy_entrypoint() -> ColorCurvesExtension:
return ColorCurvesExtension()

View File

@@ -23,8 +23,9 @@ class ImageCrop(IO.ComfyNode):
return IO.Schema(
node_id="ImageCrop",
search_aliases=["trim"],
display_name="Image Crop",
display_name="Image Crop (Deprecated)",
category="image/transform",
is_deprecated=True,
inputs=[
IO.Image.Input("image"),
IO.Int.Input("width", default=512, min=1, max=nodes.MAX_RESOLUTION, step=1),
@@ -47,6 +48,57 @@ class ImageCrop(IO.ComfyNode):
crop = execute # TODO: remove
class ImageCropV2(IO.ComfyNode):
@classmethod
def define_schema(cls):
return IO.Schema(
node_id="ImageCropV2",
search_aliases=["trim"],
display_name="Image Crop",
category="image/transform",
inputs=[
IO.Image.Input("image"),
IO.BoundingBox.Input("crop_region", component="ImageCrop"),
],
outputs=[IO.Image.Output()],
)
@classmethod
def execute(cls, image, crop_region) -> IO.NodeOutput:
x = crop_region.get("x", 0)
y = crop_region.get("y", 0)
width = crop_region.get("width", 512)
height = crop_region.get("height", 512)
x = min(x, image.shape[2] - 1)
y = min(y, image.shape[1] - 1)
to_x = width + x
to_y = height + y
img = image[:, y:to_y, x:to_x, :]
return IO.NodeOutput(img)
class BoundingBox(IO.ComfyNode):
@classmethod
def define_schema(cls):
return IO.Schema(
node_id="PrimitiveBoundingBox",
display_name="Bounding Box",
category="utils/primitive",
inputs=[
IO.Int.Input("x", default=0, min=0, max=MAX_RESOLUTION),
IO.Int.Input("y", default=0, min=0, max=MAX_RESOLUTION),
IO.Int.Input("width", default=512, min=1, max=MAX_RESOLUTION),
IO.Int.Input("height", default=512, min=1, max=MAX_RESOLUTION),
],
outputs=[IO.BoundingBox.Output()],
)
@classmethod
def execute(cls, x, y, width, height) -> IO.NodeOutput:
return IO.NodeOutput({"x": x, "y": y, "width": width, "height": height})
class RepeatImageBatch(IO.ComfyNode):
@classmethod
def define_schema(cls):
@@ -632,6 +684,8 @@ class ImagesExtension(ComfyExtension):
async def get_node_list(self) -> list[type[IO.ComfyNode]]:
return [
ImageCrop,
ImageCropV2,
BoundingBox,
RepeatImageBatch,
ImageFromBatch,
ImageAddNoise,

View File

@@ -7,7 +7,6 @@ import logging
from enum import Enum
from typing_extensions import override
from comfy_api.latest import ComfyExtension, io
from tqdm.auto import trange
CLAMP_QUANTILE = 0.99
@@ -50,22 +49,12 @@ LORA_TYPES = {"standard": LORAType.STANDARD,
"full_diff": LORAType.FULL_DIFF}
def calc_lora_model(model_diff, rank, prefix_model, prefix_lora, output_sd, lora_type, bias_diff=False):
comfy.model_management.load_models_gpu([model_diff])
comfy.model_management.load_models_gpu([model_diff], force_patch_weights=True)
sd = model_diff.model_state_dict(filter_prefix=prefix_model)
sd_keys = list(sd.keys())
for index in trange(len(sd_keys), unit="weight"):
k = sd_keys[index]
op_keys = sd_keys[index].rsplit('.', 1)
if len(op_keys) < 2 or op_keys[1] not in ["weight", "bias"] or (op_keys[1] == "bias" and not bias_diff):
continue
op = comfy.utils.get_attr(model_diff.model, op_keys[0])
if hasattr(op, "comfy_cast_weights") and not getattr(op, "comfy_patched_weights", False):
weight_diff = model_diff.patch_weight_to_device(k, model_diff.load_device, return_weight=True)
else:
for k in sd:
if k.endswith(".weight"):
weight_diff = sd[k]
if op_keys[1] == "weight":
if lora_type == LORAType.STANDARD:
if weight_diff.ndim < 2:
if bias_diff:
@@ -80,8 +69,8 @@ def calc_lora_model(model_diff, rank, prefix_model, prefix_lora, output_sd, lora
elif lora_type == LORAType.FULL_DIFF:
output_sd["{}{}.diff".format(prefix_lora, k[len(prefix_model):-7])] = weight_diff.contiguous().half().cpu()
elif bias_diff and op_keys[1] == "bias":
output_sd["{}{}.diff_b".format(prefix_lora, k[len(prefix_model):-5])] = weight_diff.contiguous().half().cpu()
elif bias_diff and k.endswith(".bias"):
output_sd["{}{}.diff_b".format(prefix_lora, k[len(prefix_model):-5])] = sd[k].contiguous().half().cpu()
return output_sd
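Not part of this hunk, but useful orientation: for `LORAType.STANDARD` the extraction reduces to a truncated SVD of each weight delta. A minimal sketch (the real code additionally applies CLAMP_QUANTILE clamping and dtype handling):

```
import torch

def extract_standard_lora(weight_diff: torch.Tensor, rank: int):
    # Best rank-r approximation of the delta: lora_up @ lora_down ~= weight_diff
    u, s, vh = torch.linalg.svd(weight_diff.float().flatten(1), full_matrices=False)
    lora_up = u[:, :rank] @ torch.diag(s[:rank])
    lora_down = vh[:rank]
    return lora_up, lora_down
```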
class LoraSave(io.ComfyNode):

View File

@@ -1,86 +0,0 @@
from __future__ import annotations
from inspect import cleandoc
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from comfy.model_patcher import ModelPatcher
import comfy.multigpu
class MultiGPUWorkUnitsNode:
"""
Prepares a model for accelerated sampling by splitting work units across GPUs.
Should be placed after nodes that modify the model object itself, such as compile or attention-switch nodes.
Other than those exceptions, this node can be placed in any order.
"""
NodeId = "MultiGPU_WorkUnits"
NodeName = "MultiGPU Work Units"
@classmethod
def INPUT_TYPES(cls):
return {
"required": {
"model": ("MODEL",),
"max_gpus" : ("INT", {"default": 8, "min": 1, "step": 1}),
},
"optional": {
"gpu_options": ("GPU_OPTIONS",)
}
}
RETURN_TYPES = ("MODEL",)
FUNCTION = "init_multigpu"
CATEGORY = "advanced/multigpu"
DESCRIPTION = cleandoc(__doc__)
def init_multigpu(self, model: ModelPatcher, max_gpus: int, gpu_options: comfy.multigpu.GPUOptionsGroup=None):
model = comfy.multigpu.create_multigpu_deepclones(model, max_gpus, gpu_options, reuse_loaded=True)
return (model,)
class MultiGPUOptionsNode:
"""
Set the relative speed of each GPU for the special case where GPUs differ significantly in performance.
"""
NodeId = "MultiGPU_Options"
NodeName = "MultiGPU Options"
@classmethod
def INPUT_TYPES(cls):
return {
"required": {
"device_index": ("INT", {"default": 0, "min": 0, "max": 64}),
"relative_speed": ("FLOAT", {"default": 1.0, "min": 0.0, "step": 0.01})
},
"optional": {
"gpu_options": ("GPU_OPTIONS",)
}
}
RETURN_TYPES = ("GPU_OPTIONS",)
FUNCTION = "create_gpu_options"
CATEGORY = "advanced/multigpu"
DESCRIPTION = cleandoc(__doc__)
def create_gpu_options(self, device_index: int, relative_speed: float, gpu_options: comfy.multigpu.GPUOptionsGroup=None):
if not gpu_options:
gpu_options = comfy.multigpu.GPUOptionsGroup()
gpu_options = gpu_options.clone()
opt = comfy.multigpu.GPUOptions(device_index=device_index, relative_speed=relative_speed)
gpu_options.add(opt)
return (gpu_options,)
node_list = [
MultiGPUWorkUnitsNode,
MultiGPUOptionsNode
]
NODE_CLASS_MAPPINGS = {}
NODE_DISPLAY_NAME_MAPPINGS = {}
for node in node_list:
NODE_CLASS_MAPPINGS[node.NodeId] = node
NODE_DISPLAY_NAME_MAPPINGS[node.NodeId] = node.NodeName

View File

@@ -1,99 +0,0 @@
import torch
from comfy_api.latest import ComfyExtension, io
from typing_extensions import override
class NAGuidance(io.ComfyNode):
@classmethod
def define_schema(cls) -> io.Schema:
return io.Schema(
node_id="NAGuidance",
display_name="Normalized Attention Guidance",
description="Applies Normalized Attention Guidance to models, enabling negative prompts on distilled/schnell models.",
category="",
is_experimental=True,
inputs=[
io.Model.Input("model", tooltip="The model to apply NAG to."),
io.Float.Input("nag_scale", min=0.0, default=5.0, max=50.0, step=0.1, tooltip="The guidance scale factor. Higher values push further from the negative prompt."),
io.Float.Input("nag_alpha", min=0.0, default=0.5, max=1.0, step=0.01, tooltip="Blending factor for the normalized attention. 1.0 is full replacement, 0.0 is no effect."),
io.Float.Input("nag_tau", min=1.0, default=1.5, max=10.0, step=0.01),
# io.Float.Input("start_percent", min=0.0, default=0.0, max=1.0, step=0.01, tooltip="The relative sampling step to begin applying NAG."),
# io.Float.Input("end_percent", min=0.0, default=1.0, max=1.0, step=0.01, tooltip="The relative sampling step to stop applying NAG."),
],
outputs=[
io.Model.Output(tooltip="The patched model with NAG enabled."),
],
)
@classmethod
def execute(cls, model: io.Model.Type, nag_scale: float, nag_alpha: float, nag_tau: float) -> io.NodeOutput:
m = model.clone()
# sigma_start = m.get_model_object("model_sampling").percent_to_sigma(start_percent)
# sigma_end = m.get_model_object("model_sampling").percent_to_sigma(end_percent)
def nag_attention_output_patch(out, extra_options):
cond_or_uncond = extra_options.get("cond_or_uncond", None)
if cond_or_uncond is None:
return out
if not (1 in cond_or_uncond and 0 in cond_or_uncond):
return out
# sigma = extra_options.get("sigmas", None)
# if sigma is not None and len(sigma) > 0:
# sigma = sigma[0].item()
# if sigma > sigma_start or sigma < sigma_end:
# return out
img_slice = extra_options.get("img_slice", None)
if img_slice is not None:
orig_out = out
out = out[:, img_slice[0]:img_slice[1]] # only apply on img part
batch_size = out.shape[0]
half_size = batch_size // len(cond_or_uncond)
ind_neg = cond_or_uncond.index(1)
ind_pos = cond_or_uncond.index(0)
z_pos = out[half_size * ind_pos:half_size * (ind_pos + 1)]
z_neg = out[half_size * ind_neg:half_size * (ind_neg + 1)]
guided = z_pos * nag_scale - z_neg * (nag_scale - 1.0)
eps = 1e-6
norm_pos = torch.norm(z_pos, p=1, dim=-1, keepdim=True).clamp_min(eps)
norm_guided = torch.norm(guided, p=1, dim=-1, keepdim=True).clamp_min(eps)
ratio = norm_guided / norm_pos
scale_factor = torch.minimum(ratio, torch.full_like(ratio, nag_tau)) / ratio
guided_normalized = guided * scale_factor
z_final = guided_normalized * nag_alpha + z_pos * (1.0 - nag_alpha)
if img_slice is not None:
orig_out[half_size * ind_neg:half_size * (ind_neg + 1), img_slice[0]:img_slice[1]] = z_final
orig_out[half_size * ind_pos:half_size * (ind_pos + 1), img_slice[0]:img_slice[1]] = z_final
return orig_out
else:
out[half_size * ind_pos:half_size * (ind_pos + 1)] = z_final
return out
m.set_model_attn1_output_patch(nag_attention_output_patch)
m.disable_model_cfg1_optimization()
return io.NodeOutput(m)
class NagExtension(ComfyExtension):
@override
async def get_node_list(self) -> list[type[io.ComfyNode]]:
return [
NAGuidance,
]
async def comfy_entrypoint() -> NagExtension:
return NagExtension()
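For readers skimming the patch, the attention-output math above reduces to one standalone update (same names; the real patch additionally handles the cond/uncond batch halves and the img_slice window):

```
import torch

def nag_update(z_pos, z_neg, nag_scale=5.0, nag_tau=1.5, nag_alpha=0.5, eps=1e-6):
    guided = z_pos * nag_scale - z_neg * (nag_scale - 1.0)   # extrapolate away from the negative
    norm_pos = z_pos.norm(p=1, dim=-1, keepdim=True).clamp_min(eps)
    norm_guided = guided.norm(p=1, dim=-1, keepdim=True).clamp_min(eps)
    ratio = norm_guided / norm_pos
    guided = guided * (torch.minimum(ratio, torch.full_like(ratio, nag_tau)) / ratio)  # cap growth at tau
    return guided * nag_alpha + z_pos * (1.0 - nag_alpha)    # blend back toward the positive branch

z_final = nag_update(torch.randn(2, 16), torch.randn(2, 16))
```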

View File

@@ -655,7 +655,6 @@ class BatchImagesMasksLatentsNode(io.ComfyNode):
batched = batch_masks(values)
return io.NodeOutput(batched)
class PostProcessingExtension(ComfyExtension):
@override
async def get_node_list(self) -> list[type[io.ComfyNode]]:

View File

@@ -1,103 +0,0 @@
from comfy_api.latest import ComfyExtension, io, ComfyAPI
api = ComfyAPI()
async def register_replacements():
"""Register all built-in node replacements."""
await register_replacements_longeredge()
await register_replacements_batchimages()
await register_replacements_upscaleimage()
await register_replacements_controlnet()
await register_replacements_load3d()
await register_replacements_preview3d()
await register_replacements_svdimg2vid()
await register_replacements_conditioningavg()
async def register_replacements_longeredge():
# No dynamic inputs here
await api.node_replacement.register(io.NodeReplace(
new_node_id="ImageScaleToMaxDimension",
old_node_id="ResizeImagesByLongerEdge",
old_widget_ids=["longer_edge"],
input_mapping=[
{"new_id": "image", "old_id": "images"},
{"new_id": "largest_size", "old_id": "longer_edge"},
{"new_id": "upscale_method", "set_value": "lanczos"},
],
# just to test the frontend output_mapping code, does nothing really here
output_mapping=[{"new_idx": 0, "old_idx": 0}],
))
async def register_replacements_batchimages():
# BatchImages node uses Autogrow
await api.node_replacement.register(io.NodeReplace(
new_node_id="BatchImagesNode",
old_node_id="ImageBatch",
input_mapping=[
{"new_id": "images.image0", "old_id": "image1"},
{"new_id": "images.image1", "old_id": "image2"},
],
))
async def register_replacements_upscaleimage():
# ResizeImageMaskNode uses DynamicCombo
await api.node_replacement.register(io.NodeReplace(
new_node_id="ResizeImageMaskNode",
old_node_id="ImageScaleBy",
old_widget_ids=["upscale_method", "scale_by"],
input_mapping=[
{"new_id": "input", "old_id": "image"},
{"new_id": "resize_type", "set_value": "scale by multiplier"},
{"new_id": "resize_type.multiplier", "old_id": "scale_by"},
{"new_id": "scale_method", "old_id": "upscale_method"},
],
))
async def register_replacements_controlnet():
# T2IAdapterLoader → ControlNetLoader
await api.node_replacement.register(io.NodeReplace(
new_node_id="ControlNetLoader",
old_node_id="T2IAdapterLoader",
input_mapping=[
{"new_id": "control_net_name", "old_id": "t2i_adapter_name"},
],
))
async def register_replacements_load3d():
# Load3DAnimation merged into Load3D
await api.node_replacement.register(io.NodeReplace(
new_node_id="Load3D",
old_node_id="Load3DAnimation",
))
async def register_replacements_preview3d():
# Preview3DAnimation merged into Preview3D
await api.node_replacement.register(io.NodeReplace(
new_node_id="Preview3D",
old_node_id="Preview3DAnimation",
))
async def register_replacements_svdimg2vid():
# Typo fix: SDV → SVD
await api.node_replacement.register(io.NodeReplace(
new_node_id="SVD_img2vid_Conditioning",
old_node_id="SDV_img2vid_Conditioning",
))
async def register_replacements_conditioningavg():
# Typo fix: trailing space in node name
await api.node_replacement.register(io.NodeReplace(
new_node_id="ConditioningAverage",
old_node_id="ConditioningAverage ",
))
class NodeReplacementsExtension(ComfyExtension):
async def on_load(self) -> None:
await register_replacements()
async def get_node_list(self) -> list[type[io.ComfyNode]]:
return []
async def comfy_entrypoint() -> NodeReplacementsExtension:
return NodeReplacementsExtension()
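To make the mappings concrete, here is roughly how the `ImageScaleBy` registration above would rewrite a serialized prompt node — the JSON shapes are assumed for illustration:

```
old_node = {"class_type": "ImageScaleBy",
            "inputs": {"image": ["4", 0], "upscale_method": "lanczos", "scale_by": 1.5}}

new_node = {"class_type": "ResizeImageMaskNode",
            "inputs": {"input": ["4", 0],                     # old_id "image" -> new_id "input"
                       "resize_type": "scale by multiplier",  # injected via set_value
                       "resize_type.multiplier": 1.5,         # old_id "scale_by"
                       "scale_method": "lanczos"}}            # old_id "upscale_method"
```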

View File

@@ -1035,7 +1035,7 @@ class TrainLoraNode(io.ComfyNode):
io.Boolean.Input(
"offloading",
default=False,
tooltip="Offload the Model to RAM. Requires Bypass Mode.",
tooltip="Depth level for gradient checkpointing.",
),
io.Combo.Input(
"existing_lora",
@@ -1124,15 +1124,6 @@ class TrainLoraNode(io.ComfyNode):
lora_dtype = node_helpers.string_to_torch_dtype(lora_dtype)
mp.set_model_compute_dtype(dtype)
if mp.is_dynamic():
if not bypass_mode:
logging.info("Training MP is Dynamic - forcing bypass mode. Start comfy with --highvram to force weight diff mode")
bypass_mode = True
offloading = True
elif offloading:
if not bypass_mode:
logging.info("Training Offload selected - forcing bypass mode. Set bypass = True to remove this message")
# Prepare latents and compute counts
latents, num_images, multi_res = _prepare_latents_and_count(
latents, dtype, bucket_mode

View File

@@ -1,3 +1,3 @@
# This file is automatically generated by the build process when version is
# updated in pyproject.toml.
__version__ = "0.14.0"
__version__ = "0.13.0"

View File

@@ -2264,7 +2264,6 @@ async def load_custom_node(module_path: str, ignore=set(), module_parent="custom
if not isinstance(extension, ComfyExtension):
logging.warning(f"comfy_entrypoint in {module_path} did not return a ComfyExtension, skipping.")
return False
await extension.on_load()
node_list = await extension.get_node_list()
if not isinstance(node_list, list):
logging.warning(f"comfy_entrypoint in {module_path} did not return a list of nodes, skipping.")
@@ -2401,7 +2400,6 @@ async def init_builtin_extra_nodes():
"nodes_lt_audio.py",
"nodes_lt.py",
"nodes_hooks.py",
"nodes_multigpu.py",
"nodes_load_3d.py",
"nodes_cosmos.py",
"nodes_video.py",
@@ -2437,8 +2435,9 @@ async def init_builtin_extra_nodes():
"nodes_lora_debug.py",
"nodes_color.py",
"nodes_toolkit.py",
"nodes_replacements.py",
"nodes_nag.py",
"nodes_color_correct.py",
"nodes_color_balance.py",
"nodes_color_curves.py"
]
import_failed = []

View File

@@ -1,6 +1,6 @@
[project]
name = "ComfyUI"
version = "0.14.0"
version = "0.13.0"
readme = "README.md"
license = { file = "LICENSE" }
requires-python = ">=3.10"

View File

@@ -1,5 +1,5 @@
comfyui-frontend-package==1.38.14
comfyui-workflow-templates==0.8.42
comfyui-frontend-package==1.38.13
comfyui-workflow-templates==0.8.38
comfyui-embedded-docs==0.4.1
torch
torchsde

View File

@@ -40,7 +40,6 @@ from app.user_manager import UserManager
from app.model_manager import ModelFileManager
from app.custom_node_manager import CustomNodeManager
from app.subgraph_manager import SubgraphManager
from app.node_replace_manager import NodeReplaceManager
from typing import Optional, Union
from api_server.routes.internal.internal_routes import InternalRoutes
from protocol import BinaryEventTypes
@@ -205,7 +204,6 @@ class PromptServer():
self.model_file_manager = ModelFileManager()
self.custom_node_manager = CustomNodeManager()
self.subgraph_manager = SubgraphManager()
self.node_replace_manager = NodeReplaceManager()
self.internal_routes = InternalRoutes(self)
self.supports = ["custom_nodes_from_web"]
self.prompt_queue = execution.PromptQueue(self)
@@ -889,8 +887,6 @@ class PromptServer():
if "partial_execution_targets" in json_data:
partial_execution_targets = json_data["partial_execution_targets"]
self.node_replace_manager.apply_replacements(prompt)
valid = await execution.validate_prompt(prompt_id, prompt, partial_execution_targets)
extra_data = {}
if "extra_data" in json_data:
@@ -999,7 +995,6 @@ class PromptServer():
self.model_file_manager.add_routes(self.routes)
self.custom_node_manager.add_routes(self.routes, self.app, nodes.LOADED_MODULE_DIRS.items())
self.subgraph_manager.add_routes(self.routes, nodes.LOADED_MODULE_DIRS.items())
self.node_replace_manager.add_routes(self.routes)
self.app.add_subapp('/internal', self.internal_routes.get_app())
# Prefix every route with /api for easier matching for delegation.