Merge branch 'master' into flipflop-stream

2026-04-26 09:29:07 +00:00 · 2025-10-28 15:08:27 -07:00
parent 61133af772 210f7a1ba5
commit 386e854aab
71 changed files with 7456 additions and 5444 deletions
--- a/comfy/ldm/chroma_radiance/model.py
+++ b/comfy/ldm/chroma_radiance/model.py
@@ -189,15 +189,15 @@ class ChromaRadiance(Chroma):
        nerf_pixels = nn.functional.unfold(img_orig, kernel_size=patch_size, stride=patch_size)
        nerf_pixels = nerf_pixels.transpose(1, 2) # -> [B, NumPatches, C * P * P]

+        # Reshape for per-patch processing
+        nerf_hidden = img_out.reshape(B * num_patches, params.hidden_size)
+        nerf_pixels = nerf_pixels.reshape(B * num_patches, C, patch_size**2).transpose(1, 2)
+
        if params.nerf_tile_size > 0 and num_patches > params.nerf_tile_size:
            # Enable tiling if nerf_tile_size isn't 0 and we actually have more patches than
            # the tile size.
-            img_dct = self.forward_tiled_nerf(img_out, nerf_pixels, B, C, num_patches, patch_size, params)
+            img_dct = self.forward_tiled_nerf(nerf_hidden, nerf_pixels, B, C, num_patches, patch_size, params)
        else:
-            # Reshape for per-patch processing
-            nerf_hidden = img_out.reshape(B * num_patches, params.hidden_size)
-            nerf_pixels = nerf_pixels.reshape(B * num_patches, C, patch_size**2).transpose(1, 2)
-
            # Get DCT-encoded pixel embeddings [pixel-dct]
            img_dct = self.nerf_image_embedder(nerf_pixels)

@@ -240,17 +240,8 @@ class ChromaRadiance(Chroma):
            end = min(i + tile_size, num_patches)

            # Slice the current tile from the input tensors
-            nerf_hidden_tile = nerf_hidden[:, i:end, :]
-            nerf_pixels_tile = nerf_pixels[:, i:end, :]
-
-            # Get the actual number of patches in this tile (can be smaller for the last tile)
-            num_patches_tile = nerf_hidden_tile.shape[1]
-
-            # Reshape the tile for per-patch processing
-            # [B, NumPatches_tile, D] -> [B * NumPatches_tile, D]
-            nerf_hidden_tile = nerf_hidden_tile.reshape(batch * num_patches_tile, params.hidden_size)
-            # [B, NumPatches_tile, C*P*P] -> [B*NumPatches_tile, C, P*P] -> [B*NumPatches_tile, P*P, C]
-            nerf_pixels_tile = nerf_pixels_tile.reshape(batch * num_patches_tile, channels, patch_size**2).transpose(1, 2)
+            nerf_hidden_tile = nerf_hidden[i * batch:end * batch]
+            nerf_pixels_tile = nerf_pixels[i * batch:end * batch]

            # get DCT-encoded pixel embeddings [pixel-dct]
            img_dct_tile = self.nerf_image_embedder(nerf_pixels_tile)
--- a/comfy/model_base.py
+++ b/comfy/model_base.py
@@ -134,7 +134,7 @@ class BaseModel(torch.nn.Module):
        if not unet_config.get("disable_unet_model_creation", False):
            if model_config.custom_operations is None:
                fp8 = model_config.optimizations.get("fp8", False)
-                operations = comfy.ops.pick_operations(unet_config.get("dtype", None), self.manual_cast_dtype, fp8_optimizations=fp8, scaled_fp8=model_config.scaled_fp8)
+                operations = comfy.ops.pick_operations(unet_config.get("dtype", None), self.manual_cast_dtype, fp8_optimizations=fp8, scaled_fp8=model_config.scaled_fp8, model_config=model_config)
            else:
                operations = model_config.custom_operations
            self.diffusion_model = unet_model(**unet_config, device=device, operations=operations)
@@ -197,8 +197,14 @@ class BaseModel(torch.nn.Module):
            extra_conds[o] = extra

        t = self.process_timestep(t, x=x, **extra_conds)
-        model_output = self.diffusion_model(xc, t, context=context, control=control, transformer_options=transformer_options, **extra_conds).float()
-        return self.model_sampling.calculate_denoised(sigma, model_output, x)
+        if "latent_shapes" in extra_conds:
+            xc = utils.unpack_latents(xc, extra_conds.pop("latent_shapes"))
+
+        model_output = self.diffusion_model(xc, t, context=context, control=control, transformer_options=transformer_options, **extra_conds)
+        if len(model_output) > 1 and not torch.is_tensor(model_output):
+            model_output, _ = utils.pack_latents(model_output)
+
+        return self.model_sampling.calculate_denoised(sigma, model_output.float(), x)

    def process_timestep(self, timestep, **kwargs):
        return timestep
@@ -327,6 +333,14 @@ class BaseModel(torch.nn.Module):
        if self.model_config.scaled_fp8 is not None:
            unet_state_dict["scaled_fp8"] = torch.tensor([], dtype=self.model_config.scaled_fp8)

+        # Save mixed precision metadata
+        if hasattr(self.model_config, 'layer_quant_config') and self.model_config.layer_quant_config:
+            metadata = {
+                "format_version": "1.0",
+                "layers": self.model_config.layer_quant_config
+            }
+            unet_state_dict["_quantization_metadata"] = metadata
+
        unet_state_dict = self.model_config.process_unet_state_dict_for_saving(unet_state_dict)

        if self.model_type == ModelType.V_PREDICTION:
--- a/comfy/model_detection.py
+++ b/comfy/model_detection.py
@@ -6,6 +6,20 @@ import math
 import logging
 import torch

+
+def detect_layer_quantization(metadata):
+    quant_key = "_quantization_metadata"
+    if metadata is not None and quant_key in metadata:
+        quant_metadata = metadata.pop(quant_key)
+        quant_metadata = json.loads(quant_metadata)
+        if isinstance(quant_metadata, dict) and "layers" in quant_metadata:
+            logging.info(f"Found quantization metadata (version {quant_metadata.get('format_version', 'unknown')})")
+            return quant_metadata["layers"]
+        else:
+            raise ValueError("Invalid quantization metadata format")
+    return None
+
+
 def count_blocks(state_dict_keys, prefix_string):
    count = 0
    while True:
@@ -213,7 +227,7 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
                dit_config["nerf_mlp_ratio"] = 4
                dit_config["nerf_depth"] = 4
                dit_config["nerf_max_freqs"] = 8
-                dit_config["nerf_tile_size"] = 32
+                dit_config["nerf_tile_size"] = 512
                dit_config["nerf_final_head_type"] = "conv" if f"{key_prefix}nerf_final_layer_conv.norm.scale" in state_dict_keys else "linear"
                dit_config["nerf_embedder_dtype"] = torch.float32
        else:
@@ -701,6 +715,12 @@ def model_config_from_unet(state_dict, unet_key_prefix, use_base_if_no_match=Fal
        else:
            model_config.optimizations["fp8"] = True

+    # Detect per-layer quantization (mixed precision)
+    layer_quant_config = detect_layer_quantization(metadata)
+    if layer_quant_config:
+        model_config.layer_quant_config = layer_quant_config
+        logging.info(f"Detected mixed precision quantization: {len(layer_quant_config)} layers quantized")
+
    return model_config

 def unet_prefix_from_state_dict(state_dict):
--- a/comfy/model_management.py
+++ b/comfy/model_management.py
@@ -89,6 +89,7 @@ if args.deterministic:

 directml_enabled = False
 if args.directml is not None:
+    logging.warning("WARNING: torch-directml barely works, is very slow, has not been updated in over 1 year and might be removed soon, please don't use it, there are better options.")
    import torch_directml
    directml_enabled = True
    device_index = args.directml
@@ -330,14 +331,21 @@ except:


 SUPPORT_FP8_OPS = args.supports_fp8_compute
+
+AMD_RDNA2_AND_OLDER_ARCH = ["gfx1030", "gfx1031", "gfx1010", "gfx1011", "gfx1012", "gfx906", "gfx900", "gfx803"]
+
 try:
    if is_amd():
-        torch.backends.cudnn.enabled = False  # Seems to improve things a lot on AMD
+        arch = torch.cuda.get_device_properties(get_torch_device()).gcnArchName
+        if not (any((a in arch) for a in AMD_RDNA2_AND_OLDER_ARCH)):
+            torch.backends.cudnn.enabled = False  # Seems to improve things a lot on AMD
+            logging.info("Set: torch.backends.cudnn.enabled = False for better AMD performance.")
+
        try:
            rocm_version = tuple(map(int, str(torch.version.hip).split(".")[:2]))
        except:
            rocm_version = (6, -1)
-        arch = torch.cuda.get_device_properties(get_torch_device()).gcnArchName
+
        logging.info("AMD arch: {}".format(arch))
        logging.info("ROCm version: {}".format(rocm_version))
        if args.use_split_cross_attention == False and args.use_quad_cross_attention == False:
@@ -349,7 +357,7 @@ try:
                   if any((a in arch) for a in ["gfx1201"]):
                       ENABLE_PYTORCH_ATTENTION = True
        if torch_version_numeric >= (2, 7) and rocm_version >= (6, 4):
-            if any((a in arch) for a in ["gfx1200", "gfx1201", "gfx942", "gfx950"]):  # TODO: more arches
+            if any((a in arch) for a in ["gfx1200", "gfx1201", "gfx950"]):  # TODO: more arches, "gfx942" gives error on pytorch nightly 2.10 1013 rocm7.0
                SUPPORT_FP8_OPS = True

 except:
@@ -371,6 +379,9 @@ try:
 except:
    pass

+if torch.cuda.is_available() and torch.backends.cudnn.is_available() and PerformanceFeature.AutoTune in args.fast:
+    torch.backends.cudnn.benchmark = True
+
 try:
    if torch_version_numeric >= (2, 5):
        torch.backends.cuda.allow_fp16_bf16_reduction_math_sdp(True)
@@ -988,12 +999,6 @@ def device_supports_non_blocking(device):
        return False
    return True

-def device_should_use_non_blocking(device):
-    if not device_supports_non_blocking(device):
-        return False
-    return False
-    # return True #TODO: figure out why this causes memory issues on Nvidia and possibly others
-
 def force_channels_last():
    if args.force_channels_last:
        return True
@@ -1329,7 +1334,7 @@ def should_use_bf16(device=None, model_params=0, prioritize_performance=True, ma

    if is_amd():
        arch = torch.cuda.get_device_properties(device).gcnArchName
-        if any((a in arch) for a in ["gfx1030", "gfx1031", "gfx1010", "gfx1011", "gfx1012", "gfx906", "gfx900", "gfx803"]):  # RDNA2 and older don't support bf16
+        if any((a in arch) for a in AMD_RDNA2_AND_OLDER_ARCH):  # RDNA2 and older don't support bf16
            if manual_cast:
                return True
            return False
--- a/comfy/nested_tensor.py
+++ b/comfy/nested_tensor.py
@@ -0,0 +1,91 @@
+import torch
+
+class NestedTensor:
+    def __init__(self, tensors):
+        self.tensors = list(tensors)
+        self.is_nested = True
+
+    def _copy(self):
+        return NestedTensor(self.tensors)
+
+    def apply_operation(self, other, operation):
+        o = self._copy()
+        if isinstance(other, NestedTensor):
+            for i, t in enumerate(o.tensors):
+                o.tensors[i] = operation(t, other.tensors[i])
+        else:
+            for i, t in enumerate(o.tensors):
+                o.tensors[i] = operation(t, other)
+        return o
+
+    def __add__(self, b):
+        return self.apply_operation(b, lambda x, y: x + y)
+
+    def __sub__(self, b):
+        return self.apply_operation(b, lambda x, y: x - y)
+
+    def __mul__(self, b):
+        return self.apply_operation(b, lambda x, y: x * y)
+
+    # def __itruediv__(self, b):
+    #     return self.apply_operation(b, lambda x, y: x / y)
+
+    def __truediv__(self, b):
+        return self.apply_operation(b, lambda x, y: x / y)
+
+    def __getitem__(self, *args, **kwargs):
+        return self.apply_operation(None, lambda x, y: x.__getitem__(*args, **kwargs))
+
+    def unbind(self):
+        return self.tensors
+
+    def to(self, *args, **kwargs):
+        o = self._copy()
+        for i, t in enumerate(o.tensors):
+            o.tensors[i] = t.to(*args, **kwargs)
+        return o
+
+    def new_ones(self, *args, **kwargs):
+        return self.tensors[0].new_ones(*args, **kwargs)
+
+    def float(self):
+        return self.to(dtype=torch.float)
+
+    def chunk(self, *args, **kwargs):
+        return self.apply_operation(None, lambda x, y: x.chunk(*args, **kwargs))
+
+    def size(self):
+        return self.tensors[0].size()
+
+    @property
+    def shape(self):
+        return self.tensors[0].shape
+
+    @property
+    def ndim(self):
+        dims = 0
+        for t in self.tensors:
+            dims = max(t.ndim, dims)
+        return dims
+
+    @property
+    def device(self):
+        return self.tensors[0].device
+
+    @property
+    def dtype(self):
+        return self.tensors[0].dtype
+
+    @property
+    def layout(self):
+        return self.tensors[0].layout
+
+
+def cat_nested(tensors, *args, **kwargs):
+    cated_tensors = []
+    for i in range(len(tensors[0].tensors)):
+        tens = []
+        for j in range(len(tensors)):
+            tens.append(tensors[j].tensors[i])
+        cated_tensors.append(torch.cat(tens, *args, **kwargs))
+    return NestedTensor(cated_tensors)
--- a/comfy/ops.py
+++ b/comfy/ops.py
@@ -25,6 +25,9 @@ import comfy.rmsnorm
 import contextlib

 def run_every_op():
+    if torch.compiler.is_compiling():
+        return
+
    comfy.model_management.throw_exception_if_processing_interrupted()

 def scaled_dot_product_attention(q, k, v, *args, **kwargs):
@@ -52,14 +55,22 @@ try:
 except (ModuleNotFoundError, TypeError):
    logging.warning("Could not set sdpa backend priority.")

-cast_to = comfy.model_management.cast_to #TODO: remove once no more references
+NVIDIA_MEMORY_CONV_BUG_WORKAROUND = False
+try:
+    if comfy.model_management.is_nvidia():
+        if torch.backends.cudnn.version() >= 91002 and comfy.model_management.torch_version_numeric >= (2, 9) and comfy.model_management.torch_version_numeric <= (2, 10):
+            #TODO: change upper bound version once it's fixed'
+            NVIDIA_MEMORY_CONV_BUG_WORKAROUND = True
+            logging.info("working around nvidia conv3d memory bug.")
+except:
+    pass

-if torch.cuda.is_available() and torch.backends.cudnn.is_available() and PerformanceFeature.AutoTune in args.fast:
-    torch.backends.cudnn.benchmark = True
+cast_to = comfy.model_management.cast_to #TODO: remove once no more references

 def cast_to_input(weight, input, non_blocking=False, copy=True):
    return comfy.model_management.cast_to(weight, input.dtype, input.device, non_blocking=non_blocking, copy=copy)

+@torch.compiler.disable()
 def cast_bias_weight(s, input=None, dtype=None, device=None, bias_dtype=None):
    if input is not None:
        if dtype is None:
@@ -151,6 +162,15 @@ class disable_weight_init:
        def reset_parameters(self):
            return None

+        def _conv_forward(self, input, weight, bias, *args, **kwargs):
+            if NVIDIA_MEMORY_CONV_BUG_WORKAROUND and weight.dtype in (torch.float16, torch.bfloat16):
+                out = torch.cudnn_convolution(input, weight, self.padding, self.stride, self.dilation, self.groups, benchmark=False, deterministic=False, allow_tf32=True)
+                if bias is not None:
+                    out += bias.reshape((1, -1) + (1,) * (out.ndim - 2))
+                return out
+            else:
+                return super()._conv_forward(input, weight, bias, *args, **kwargs)
+
        def forward_comfy_cast_weights(self, input):
            weight, bias = cast_bias_weight(self, input)
            return self._conv_forward(input, weight, bias)
@@ -324,6 +344,10 @@ class manual_cast(disable_weight_init):


 def fp8_linear(self, input):
+    """
+    Legacy FP8 linear function for backward compatibility.
+    Uses QuantizedTensor subclass for dispatch.
+    """
    dtype = self.weight.dtype
    if dtype not in [torch.float8_e4m3fn]:
        return None
@@ -335,9 +359,9 @@ def fp8_linear(self, input):

    input_shape = input.shape
    input_dtype = input.dtype
+
    if len(input.shape) == 3:
        w, bias = cast_bias_weight(self, input, dtype=dtype, bias_dtype=input_dtype)
-        w = w.t()

        scale_weight = self.scale_weight
        scale_input = self.scale_input
@@ -348,23 +372,18 @@ def fp8_linear(self, input):

        if scale_input is None:
            scale_input = torch.ones((), device=input.device, dtype=torch.float32)
-            input = torch.clamp(input, min=-448, max=448, out=input)
-            input = input.reshape(-1, input_shape[2]).to(dtype).contiguous()
        else:
            scale_input = scale_input.to(input.device)
-            input = (input * (1.0 / scale_input).to(input_dtype)).reshape(-1, input_shape[2]).to(dtype).contiguous()

-        if bias is not None:
-            o = torch._scaled_mm(input, w, out_dtype=input_dtype, bias=bias, scale_a=scale_input, scale_b=scale_weight)
-        else:
-            o = torch._scaled_mm(input, w, out_dtype=input_dtype, scale_a=scale_input, scale_b=scale_weight)
-
-        if isinstance(o, tuple):
-            o = o[0]
+        # Wrap weight in QuantizedTensor - this enables unified dispatch
+        # Call F.linear - __torch_dispatch__ routes to fp8_linear handler in quant_ops.py!
+        layout_params_weight = {'scale': scale_weight, 'orig_dtype': input_dtype}
+        quantized_weight = QuantizedTensor(w, TensorCoreFP8Layout, layout_params_weight)
+        quantized_input = QuantizedTensor.from_float(input.reshape(-1, input_shape[2]), TensorCoreFP8Layout, scale=scale_input, dtype=dtype)
+        o = torch.nn.functional.linear(quantized_input, quantized_weight, bias)

        if tensor_2d:
            return o.reshape(input_shape[0], -1)
-
        return o.reshape((-1, input_shape[1], self.weight.shape[0]))

    return None
@@ -458,7 +477,128 @@ if CUBLAS_IS_AVAILABLE:
            def forward(self, *args, **kwargs):
                return super().forward(*args, **kwargs)

-def pick_operations(weight_dtype, compute_dtype, load_device=None, disable_fast_fp8=False, fp8_optimizations=False, scaled_fp8=None):
+
+# ==============================================================================
+# Mixed Precision Operations
+# ==============================================================================
+from .quant_ops import QuantizedTensor, TensorCoreFP8Layout
+
+QUANT_FORMAT_MIXINS = {
+    "float8_e4m3fn": {
+        "dtype": torch.float8_e4m3fn,
+        "layout_type": TensorCoreFP8Layout,
+        "parameters": {
+            "weight_scale": torch.nn.Parameter(torch.zeros((), dtype=torch.float32), requires_grad=False),
+            "input_scale": torch.nn.Parameter(torch.zeros((), dtype=torch.float32), requires_grad=False),
+        }
+    }
+}
+
+class MixedPrecisionOps(disable_weight_init):
+    _layer_quant_config = {}
+    _compute_dtype = torch.bfloat16
+
+    class Linear(torch.nn.Module, CastWeightBiasOp):
+        def __init__(
+            self,
+            in_features: int,
+            out_features: int,
+            bias: bool = True,
+            device=None,
+            dtype=None,
+        ) -> None:
+            super().__init__()
+
+            self.factory_kwargs = {"device": device, "dtype": MixedPrecisionOps._compute_dtype}
+            # self.factory_kwargs = {"device": device, "dtype": dtype}
+
+            self.in_features = in_features
+            self.out_features = out_features
+            if bias:
+                self.bias = torch.nn.Parameter(torch.empty(out_features, **self.factory_kwargs))
+            else:
+                self.register_parameter("bias", None)
+
+            self.tensor_class = None
+
+        def reset_parameters(self):
+            return None
+
+        def _load_from_state_dict(self, state_dict, prefix, local_metadata,
+                                  strict, missing_keys, unexpected_keys, error_msgs):
+
+            device = self.factory_kwargs["device"]
+            layer_name = prefix.rstrip('.')
+            weight_key = f"{prefix}weight"
+            weight = state_dict.pop(weight_key, None)
+            if weight is None:
+                raise ValueError(f"Missing weight for layer {layer_name}")
+
+            manually_loaded_keys = [weight_key]
+
+            if layer_name not in MixedPrecisionOps._layer_quant_config:
+                self.weight = torch.nn.Parameter(weight.to(device=device, dtype=MixedPrecisionOps._compute_dtype), requires_grad=False)
+            else:
+                quant_format = MixedPrecisionOps._layer_quant_config[layer_name].get("format", None)
+                if quant_format is None:
+                    raise ValueError(f"Unknown quantization format for layer {layer_name}")
+
+                mixin = QUANT_FORMAT_MIXINS[quant_format]
+                self.layout_type = mixin["layout_type"]
+
+                scale_key = f"{prefix}weight_scale"
+                layout_params = {
+                    'scale': state_dict.pop(scale_key, None),
+                    'orig_dtype': MixedPrecisionOps._compute_dtype
+                }
+                if layout_params['scale'] is not None:
+                    manually_loaded_keys.append(scale_key)
+
+                self.weight = torch.nn.Parameter(
+                    QuantizedTensor(weight.to(device=device, dtype=mixin["dtype"]), self.layout_type, layout_params),
+                    requires_grad=False
+                )
+
+                for param_name, param_value in mixin["parameters"].items():
+                    param_key = f"{prefix}{param_name}"
+                    _v = state_dict.pop(param_key, None)
+                    if _v is None:
+                        continue
+                    setattr(self, param_name, torch.nn.Parameter(_v.to(device=device), requires_grad=False))
+                    manually_loaded_keys.append(param_key)
+
+            super()._load_from_state_dict(state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs)
+
+            for key in manually_loaded_keys:
+                if key in missing_keys:
+                    missing_keys.remove(key)
+
+        def _forward(self, input, weight, bias):
+            return torch.nn.functional.linear(input, weight, bias)
+
+        def forward_comfy_cast_weights(self, input):
+            weight, bias = cast_bias_weight(self, input)
+            return self._forward(input, weight, bias)
+
+        def forward(self, input, *args, **kwargs):
+            run_every_op()
+
+            if self.comfy_cast_weights or len(self.weight_function) > 0 or len(self.bias_function) > 0:
+                return self.forward_comfy_cast_weights(input, *args, **kwargs)
+            if (getattr(self, 'layout_type', None) is not None and
+                getattr(self, 'input_scale', None) is not None and
+                not isinstance(input, QuantizedTensor)):
+                input = QuantizedTensor.from_float(input, self.layout_type, scale=self.input_scale, fp8_dtype=self.weight.dtype)
+            return self._forward(input, self.weight, self.bias)
+
+
+def pick_operations(weight_dtype, compute_dtype, load_device=None, disable_fast_fp8=False, fp8_optimizations=False, scaled_fp8=None, model_config=None):
+    if model_config and hasattr(model_config, 'layer_quant_config') and model_config.layer_quant_config:
+        MixedPrecisionOps._layer_quant_config = model_config.layer_quant_config
+        MixedPrecisionOps._compute_dtype = compute_dtype
+        logging.info(f"Using mixed precision operations: {len(model_config.layer_quant_config)} quantized layers")
+        return MixedPrecisionOps
+
    fp8_compute = comfy.model_management.supports_fp8_compute(load_device)
    if scaled_fp8 is not None:
        return scaled_fp8_ops(fp8_matrix_mult=fp8_compute and fp8_optimizations, scale_input=fp8_optimizations, override_dtype=scaled_fp8)
--- a/comfy/patcher_extension.py
+++ b/comfy/patcher_extension.py
@@ -150,7 +150,7 @@ def merge_nested_dicts(dict1: dict, dict2: dict, copy_dict1=True):
    for key, value in dict2.items():
        if isinstance(value, dict):
            curr_value = merged_dict.setdefault(key, {})
-            merged_dict[key] = merge_nested_dicts(value, curr_value)
+            merged_dict[key] = merge_nested_dicts(curr_value, value)
        elif isinstance(value, list):
            merged_dict.setdefault(key, []).extend(value)
        else:
--- a/comfy/quant_ops.py
+++ b/comfy/quant_ops.py
@@ -0,0 +1,437 @@
+import torch
+import logging
+from typing import Tuple, Dict
+
+_LAYOUT_REGISTRY = {}
+_GENERIC_UTILS = {}
+
+
+def register_layout_op(torch_op, layout_type):
+    """
+    Decorator to register a layout-specific operation handler.
+    Args:
+        torch_op: PyTorch operation (e.g., torch.ops.aten.linear.default)
+        layout_type: Layout class (e.g., TensorCoreFP8Layout)
+    Example:
+        @register_layout_op(torch.ops.aten.linear.default, TensorCoreFP8Layout)
+        def fp8_linear(func, args, kwargs):
+            # FP8-specific linear implementation
+            ...
+    """
+    def decorator(handler_func):
+        if torch_op not in _LAYOUT_REGISTRY:
+            _LAYOUT_REGISTRY[torch_op] = {}
+        _LAYOUT_REGISTRY[torch_op][layout_type] = handler_func
+        return handler_func
+    return decorator
+
+
+def register_generic_util(torch_op):
+    """
+    Decorator to register a generic utility that works for all layouts.
+    Args:
+        torch_op: PyTorch operation (e.g., torch.ops.aten.detach.default)
+
+    Example:
+        @register_generic_util(torch.ops.aten.detach.default)
+        def generic_detach(func, args, kwargs):
+            # Works for any layout
+            ...
+    """
+    def decorator(handler_func):
+        _GENERIC_UTILS[torch_op] = handler_func
+        return handler_func
+    return decorator
+
+
+def _get_layout_from_args(args):
+    for arg in args:
+        if isinstance(arg, QuantizedTensor):
+            return arg._layout_type
+        elif isinstance(arg, (list, tuple)):
+            for item in arg:
+                if isinstance(item, QuantizedTensor):
+                    return item._layout_type
+    return None
+
+
+def _move_layout_params_to_device(params, device):
+    new_params = {}
+    for k, v in params.items():
+        if isinstance(v, torch.Tensor):
+            new_params[k] = v.to(device=device)
+        else:
+            new_params[k] = v
+    return new_params
+
+
+def _copy_layout_params(params):
+    new_params = {}
+    for k, v in params.items():
+        if isinstance(v, torch.Tensor):
+            new_params[k] = v.clone()
+        else:
+            new_params[k] = v
+    return new_params
+
+
+class QuantizedLayout:
+    """
+    Base class for quantization layouts.
+
+    A layout encapsulates the format-specific logic for quantization/dequantization
+    and provides a uniform interface for extracting raw tensors needed for computation.
+
+    New quantization formats should subclass this and implement the required methods.
+    """
+    @classmethod
+    def quantize(cls, tensor, **kwargs) -> Tuple[torch.Tensor, Dict]:
+        raise NotImplementedError(f"{cls.__name__} must implement quantize()")
+
+    @staticmethod
+    def dequantize(qdata, **layout_params) -> torch.Tensor:
+        raise NotImplementedError("TensorLayout must implement dequantize()")
+
+    @classmethod
+    def get_plain_tensors(cls, qtensor) -> torch.Tensor:
+        raise NotImplementedError(f"{cls.__name__} must implement get_plain_tensors()")
+
+
+class QuantizedTensor(torch.Tensor):
+    """
+    Universal quantized tensor that works with any layout.
+
+    This tensor subclass uses a pluggable layout system to support multiple
+    quantization formats (FP8, INT4, INT8, etc.) without code duplication.
+
+    The layout_type determines format-specific behavior, while common operations
+    (detach, clone, to) are handled generically.
+
+    Attributes:
+        _qdata: The quantized tensor data
+        _layout_type: Layout class (e.g., TensorCoreFP8Layout)
+        _layout_params: Dict with layout-specific params (scale, zero_point, etc.)
+    """
+
+    @staticmethod
+    def __new__(cls, qdata, layout_type, layout_params):
+        """
+        Create a quantized tensor.
+
+        Args:
+            qdata: The quantized data tensor
+            layout_type: Layout class (subclass of QuantizedLayout)
+            layout_params: Dict with layout-specific parameters
+        """
+        return torch.Tensor._make_subclass(cls, qdata, require_grad=False)
+
+    def __init__(self, qdata, layout_type, layout_params):
+        self._qdata = qdata.contiguous()
+        self._layout_type = layout_type
+        self._layout_params = layout_params
+
+    def __repr__(self):
+        layout_name = self._layout_type.__name__
+        param_str = ", ".join(f"{k}={v}" for k, v in list(self._layout_params.items())[:2])
+        return f"QuantizedTensor(shape={self.shape}, layout={layout_name}, {param_str})"
+
+    @property
+    def layout_type(self):
+        return self._layout_type
+
+    def __tensor_flatten__(self):
+        """
+        Tensor flattening protocol for proper device movement.
+        """
+        inner_tensors = ["_qdata"]
+        ctx = {
+            "layout_type": self._layout_type,
+        }
+
+        tensor_params = {}
+        non_tensor_params = {}
+        for k, v in self._layout_params.items():
+            if isinstance(v, torch.Tensor):
+                tensor_params[k] = v
+            else:
+                non_tensor_params[k] = v
+
+        ctx["tensor_param_keys"] = list(tensor_params.keys())
+        ctx["non_tensor_params"] = non_tensor_params
+
+        for k, v in tensor_params.items():
+            attr_name = f"_layout_param_{k}"
+            object.__setattr__(self, attr_name, v)
+            inner_tensors.append(attr_name)
+
+        return inner_tensors, ctx
+
+    @staticmethod
+    def __tensor_unflatten__(inner_tensors, ctx, outer_size, outer_stride):
+        """
+        Tensor unflattening protocol for proper device movement.
+        Reconstructs the QuantizedTensor after device movement.
+        """
+        layout_type = ctx["layout_type"]
+        layout_params = dict(ctx["non_tensor_params"])
+
+        for key in ctx["tensor_param_keys"]:
+            attr_name = f"_layout_param_{key}"
+            layout_params[key] = inner_tensors[attr_name]
+
+        return QuantizedTensor(inner_tensors["_q_data"], layout_type, layout_params)
+
+    @classmethod
+    def from_float(cls, tensor, layout_type, **quantize_kwargs) -> 'QuantizedTensor':
+        qdata, layout_params = layout_type.quantize(tensor, **quantize_kwargs)
+        return cls(qdata, layout_type, layout_params)
+
+    def dequantize(self) -> torch.Tensor:
+        return self._layout_type.dequantize(self._qdata, **self._layout_params)
+
+    @classmethod
+    def __torch_dispatch__(cls, func, types, args=(), kwargs=None):
+        kwargs = kwargs or {}
+
+        # Step 1: Check generic utilities first (detach, clone, to, etc.)
+        if func in _GENERIC_UTILS:
+            return _GENERIC_UTILS[func](func, args, kwargs)
+
+        # Step 2: Check layout-specific handlers (linear, matmul, etc.)
+        layout_type = _get_layout_from_args(args)
+        if layout_type and func in _LAYOUT_REGISTRY:
+            handler = _LAYOUT_REGISTRY[func].get(layout_type)
+            if handler:
+                return handler(func, args, kwargs)
+
+        # Step 3: Fallback to dequantization
+        if isinstance(args[0] if args else None, QuantizedTensor):
+            logging.info(f"QuantizedTensor: Unhandled operation {func}, falling back to dequantization. kwargs={kwargs}")
+        return cls._dequant_and_fallback(func, args, kwargs)
+
+    @classmethod
+    def _dequant_and_fallback(cls, func, args, kwargs):
+        def dequant_arg(arg):
+            if isinstance(arg, QuantizedTensor):
+                return arg.dequantize()
+            elif isinstance(arg, (list, tuple)):
+                return type(arg)(dequant_arg(a) for a in arg)
+            return arg
+
+        new_args = dequant_arg(args)
+        new_kwargs = dequant_arg(kwargs)
+        return func(*new_args, **new_kwargs)
+
+
+# ==============================================================================
+# Generic Utilities (Layout-Agnostic Operations)
+# ==============================================================================
+
+def _create_transformed_qtensor(qt, transform_fn):
+    new_data = transform_fn(qt._qdata)
+    new_params = _copy_layout_params(qt._layout_params)
+    return QuantizedTensor(new_data, qt._layout_type, new_params)
+
+
+def _handle_device_transfer(qt, target_device, target_dtype=None, target_layout=None, op_name="to"):
+    if target_dtype is not None and target_dtype != qt.dtype:
+        logging.warning(
+            f"QuantizedTensor: dtype conversion requested to {target_dtype}, "
+            f"but not supported for quantized tensors. Ignoring dtype."
+        )
+
+    if target_layout is not None and target_layout != torch.strided:
+        logging.warning(
+            f"QuantizedTensor: layout change requested to {target_layout}, "
+            f"but not supported. Ignoring layout."
+        )
+
+    # Handle device transfer
+    current_device = qt._qdata.device
+    if target_device is not None:
+        # Normalize device for comparison
+        if isinstance(target_device, str):
+            target_device = torch.device(target_device)
+        if isinstance(current_device, str):
+            current_device = torch.device(current_device)
+
+        if target_device != current_device:
+            logging.debug(f"QuantizedTensor.{op_name}: Moving from {current_device} to {target_device}")
+            new_q_data = qt._qdata.to(device=target_device)
+            new_params = _move_layout_params_to_device(qt._layout_params, target_device)
+            new_qt = QuantizedTensor(new_q_data, qt._layout_type, new_params)
+            logging.debug(f"QuantizedTensor.{op_name}: Created new tensor on {target_device}")
+            return new_qt
+
+    logging.debug(f"QuantizedTensor.{op_name}: No device change needed, returning original")
+    return qt
+
+
+@register_generic_util(torch.ops.aten.detach.default)
+def generic_detach(func, args, kwargs):
+    """Detach operation - creates a detached copy of the quantized tensor."""
+    qt = args[0]
+    if isinstance(qt, QuantizedTensor):
+        return _create_transformed_qtensor(qt, lambda x: x.detach())
+    return func(*args, **kwargs)
+
+
+@register_generic_util(torch.ops.aten.clone.default)
+def generic_clone(func, args, kwargs):
+    """Clone operation - creates a deep copy of the quantized tensor."""
+    qt = args[0]
+    if isinstance(qt, QuantizedTensor):
+        return _create_transformed_qtensor(qt, lambda x: x.clone())
+    return func(*args, **kwargs)
+
+
+@register_generic_util(torch.ops.aten._to_copy.default)
+def generic_to_copy(func, args, kwargs):
+    """Device/dtype transfer operation - handles .to(device) calls."""
+    qt = args[0]
+    if isinstance(qt, QuantizedTensor):
+        return _handle_device_transfer(
+            qt,
+            target_device=kwargs.get('device', None),
+            target_dtype=kwargs.get('dtype', None),
+            op_name="_to_copy"
+        )
+    return func(*args, **kwargs)
+
+
+@register_generic_util(torch.ops.aten.to.dtype_layout)
+def generic_to_dtype_layout(func, args, kwargs):
+    """Handle .to(device) calls using the dtype_layout variant."""
+    qt = args[0]
+    if isinstance(qt, QuantizedTensor):
+        return _handle_device_transfer(
+            qt,
+            target_device=kwargs.get('device', None),
+            target_dtype=kwargs.get('dtype', None),
+            target_layout=kwargs.get('layout', None),
+            op_name="to"
+        )
+    return func(*args, **kwargs)
+
+
+@register_generic_util(torch.ops.aten.copy_.default)
+def generic_copy_(func, args, kwargs):
+    qt_dest = args[0]
+    src = args[1]
+
+    if isinstance(qt_dest, QuantizedTensor):
+        if isinstance(src, QuantizedTensor):
+            # Copy from another quantized tensor
+            qt_dest._qdata.copy_(src._qdata)
+            qt_dest._layout_type = src._layout_type
+            qt_dest._layout_params = _copy_layout_params(src._layout_params)
+        else:
+            # Copy from regular tensor - just copy raw data
+            qt_dest._qdata.copy_(src)
+        return qt_dest
+    return func(*args, **kwargs)
+
+
+@register_generic_util(torch.ops.aten._has_compatible_shallow_copy_type.default)
+def generic_has_compatible_shallow_copy_type(func, args, kwargs):
+    return True
+
+# ==============================================================================
+# FP8 Layout + Operation Handlers
+# ==============================================================================
+class TensorCoreFP8Layout(QuantizedLayout):
+    """
+    Storage format:
+    - qdata: FP8 tensor (torch.float8_e4m3fn or torch.float8_e5m2)
+    - scale: Scalar tensor (float32) for dequantization
+    - orig_dtype: Original dtype before quantization (for casting back)
+    """
+    @classmethod
+    def quantize(cls, tensor, scale=None, dtype=torch.float8_e4m3fn):
+        orig_dtype = tensor.dtype
+
+        if scale is None:
+            scale = torch.amax(tensor.abs()) / torch.finfo(dtype).max
+
+        if not isinstance(scale, torch.Tensor):
+            scale = torch.tensor(scale)
+        scale = scale.to(device=tensor.device, dtype=torch.float32)
+
+        lp_amax = torch.finfo(dtype).max
+        tensor_scaled = tensor.float() / scale
+        torch.clamp(tensor_scaled, min=-lp_amax, max=lp_amax, out=tensor_scaled)
+        qdata = tensor_scaled.to(dtype, memory_format=torch.contiguous_format)
+
+        layout_params = {
+            'scale': scale,
+            'orig_dtype': orig_dtype
+        }
+        return qdata, layout_params
+
+    @staticmethod
+    def dequantize(qdata, scale, orig_dtype, **kwargs):
+        plain_tensor = torch.ops.aten._to_copy.default(qdata, dtype=orig_dtype)
+        return plain_tensor * scale
+
+    @classmethod
+    def get_plain_tensors(cls, qtensor):
+        return qtensor._qdata, qtensor._layout_params['scale']
+
+
+@register_layout_op(torch.ops.aten.linear.default, TensorCoreFP8Layout)
+def fp8_linear(func, args, kwargs):
+    input_tensor = args[0]
+    weight = args[1]
+    bias = args[2] if len(args) > 2 else None
+
+    if isinstance(input_tensor, QuantizedTensor) and isinstance(weight, QuantizedTensor):
+        plain_input, scale_a = TensorCoreFP8Layout.get_plain_tensors(input_tensor)
+        plain_weight, scale_b = TensorCoreFP8Layout.get_plain_tensors(weight)
+
+        out_dtype = kwargs.get("out_dtype")
+        if out_dtype is None:
+            out_dtype = input_tensor._layout_params['orig_dtype']
+
+        weight_t = plain_weight.t()
+
+        tensor_2d = False
+        if len(plain_input.shape) == 2:
+            tensor_2d = True
+            plain_input = plain_input.unsqueeze(1)
+
+        input_shape = plain_input.shape
+        if len(input_shape) != 3:
+            return None
+
+        try:
+            output = torch._scaled_mm(
+                plain_input.reshape(-1, input_shape[2]),
+                weight_t,
+                bias=bias,
+                scale_a=scale_a,
+                scale_b=scale_b,
+                out_dtype=out_dtype,
+            )
+            if not tensor_2d:
+                output = output.reshape((-1, input_shape[1], weight.shape[0]))
+
+            if output.dtype in [torch.float8_e4m3fn, torch.float8_e5m2]:
+                output_scale = scale_a * scale_b
+                output_params = {
+                    'scale': output_scale,
+                    'orig_dtype': input_tensor._layout_params['orig_dtype']
+                }
+                return QuantizedTensor(output, TensorCoreFP8Layout, output_params)
+            else:
+                return output
+
+        except Exception as e:
+            raise RuntimeError(f"FP8 _scaled_mm failed, falling back to dequantization: {e}")
+
+    # Case 2: DQ Fallback
+    if isinstance(weight, QuantizedTensor):
+        weight = weight.dequantize()
+    if isinstance(input_tensor, QuantizedTensor):
+        input_tensor = input_tensor.dequantize()
+
+    return torch.nn.functional.linear(input_tensor, weight, bias)
--- a/comfy/sample.py
+++ b/comfy/sample.py
@@ -4,13 +4,9 @@ import comfy.samplers
 import comfy.utils
 import numpy as np
 import logging
+import comfy.nested_tensor

-def prepare_noise(latent_image, seed, noise_inds=None):
-    """
-    creates random noise given a latent image and a seed.
-    optional arg skip can be used to skip and discard x number of noise generations for a given seed
-    """
-    generator = torch.manual_seed(seed)
+def prepare_noise_inner(latent_image, generator, noise_inds=None):
    if noise_inds is None:
        return torch.randn(latent_image.size(), dtype=latent_image.dtype, layout=latent_image.layout, generator=generator, device="cpu")

@@ -21,10 +17,29 @@ def prepare_noise(latent_image, seed, noise_inds=None):
        if i in unique_inds:
            noises.append(noise)
    noises = [noises[i] for i in inverse]
-    noises = torch.cat(noises, axis=0)
+    return torch.cat(noises, axis=0)
+
+def prepare_noise(latent_image, seed, noise_inds=None):
+    """
+    creates random noise given a latent image and a seed.
+    optional arg skip can be used to skip and discard x number of noise generations for a given seed
+    """
+    generator = torch.manual_seed(seed)
+
+    if latent_image.is_nested:
+        tensors = latent_image.unbind()
+        noises = []
+        for t in tensors:
+            noises.append(prepare_noise_inner(t, generator, noise_inds))
+        noises = comfy.nested_tensor.NestedTensor(noises)
+    else:
+        noises = prepare_noise_inner(latent_image, generator, noise_inds)
+
    return noises

 def fix_empty_latent_channels(model, latent_image):
+    if latent_image.is_nested:
+        return latent_image
    latent_format = model.get_model_object("latent_format") #Resize the empty latent image so it has the right number of channels
    if latent_format.latent_channels != latent_image.shape[1] and torch.count_nonzero(latent_image) == 0:
        latent_image = comfy.utils.repeat_to_batch_size(latent_image, latent_format.latent_channels, dim=1)
--- a/comfy/samplers.py
+++ b/comfy/samplers.py
@@ -306,17 +306,10 @@ def _calc_cond_batch(model: BaseModel, conds: list[list[dict]], x_in: torch.Tens
                                                                                 copy_dict1=False)

            if patches is not None:
-                # TODO: replace with merge_nested_dicts function
-                if "patches" in transformer_options:
-                    cur_patches = transformer_options["patches"].copy()
-                    for p in patches:
-                        if p in cur_patches:
-                            cur_patches[p] = cur_patches[p] + patches[p]
-                        else:
-                            cur_patches[p] = patches[p]
-                    transformer_options["patches"] = cur_patches
-                else:
-                    transformer_options["patches"] = patches
+                transformer_options["patches"] = comfy.patcher_extension.merge_nested_dicts(
+                    transformer_options.get("patches", {}),
+                    patches
+                )

            transformer_options["cond_or_uncond"] = cond_or_uncond[:]
            transformer_options["uuids"] = uuids[:]
@@ -789,7 +782,7 @@ def ksampler(sampler_name, extra_options={}, inpaint_options={}):
    return KSAMPLER(sampler_function, extra_options, inpaint_options)


-def process_conds(model, noise, conds, device, latent_image=None, denoise_mask=None, seed=None):
+def process_conds(model, noise, conds, device, latent_image=None, denoise_mask=None, seed=None, latent_shapes=None):
    for k in conds:
        conds[k] = conds[k][:]
        resolve_areas_and_cond_masks_multidim(conds[k], noise.shape[2:], device)
@@ -799,7 +792,7 @@ def process_conds(model, noise, conds, device, latent_image=None, denoise_mask=N

    if hasattr(model, 'extra_conds'):
        for k in conds:
-            conds[k] = encode_model_conds(model.extra_conds, conds[k], noise, device, k, latent_image=latent_image, denoise_mask=denoise_mask, seed=seed)
+            conds[k] = encode_model_conds(model.extra_conds, conds[k], noise, device, k, latent_image=latent_image, denoise_mask=denoise_mask, seed=seed, latent_shapes=latent_shapes)

    #make sure each cond area has an opposite one with the same area
    for k in conds:
@@ -969,11 +962,11 @@ class CFGGuider:
    def predict_noise(self, x, timestep, model_options={}, seed=None):
        return sampling_function(self.inner_model, x, timestep, self.conds.get("negative", None), self.conds.get("positive", None), self.cfg, model_options=model_options, seed=seed)

-    def inner_sample(self, noise, latent_image, device, sampler, sigmas, denoise_mask, callback, disable_pbar, seed):
+    def inner_sample(self, noise, latent_image, device, sampler, sigmas, denoise_mask, callback, disable_pbar, seed, latent_shapes=None):
        if latent_image is not None and torch.count_nonzero(latent_image) > 0: #Don't shift the empty latent image.
            latent_image = self.inner_model.process_latent_in(latent_image)

-        self.conds = process_conds(self.inner_model, noise, self.conds, device, latent_image, denoise_mask, seed)
+        self.conds = process_conds(self.inner_model, noise, self.conds, device, latent_image, denoise_mask, seed, latent_shapes=latent_shapes)

        extra_model_options = comfy.model_patcher.create_model_options_clone(self.model_options)
        extra_model_options.setdefault("transformer_options", {})["sample_sigmas"] = sigmas
@@ -987,7 +980,7 @@ class CFGGuider:
        samples = executor.execute(self, sigmas, extra_args, callback, noise, latent_image, denoise_mask, disable_pbar)
        return self.inner_model.process_latent_out(samples.to(torch.float32))

-    def outer_sample(self, noise, latent_image, sampler, sigmas, denoise_mask=None, callback=None, disable_pbar=False, seed=None):
+    def outer_sample(self, noise, latent_image, sampler, sigmas, denoise_mask=None, callback=None, disable_pbar=False, seed=None, latent_shapes=None):
        self.inner_model, self.conds, self.loaded_models = comfy.sampler_helpers.prepare_sampling(self.model_patcher, noise.shape, self.conds, self.model_options)
        device = self.model_patcher.load_device

@@ -1001,7 +994,7 @@ class CFGGuider:

        try:
            self.model_patcher.pre_run()
-            output = self.inner_sample(noise, latent_image, device, sampler, sigmas, denoise_mask, callback, disable_pbar, seed)
+            output = self.inner_sample(noise, latent_image, device, sampler, sigmas, denoise_mask, callback, disable_pbar, seed, latent_shapes=latent_shapes)
        finally:
            self.model_patcher.cleanup()

@@ -1014,6 +1007,12 @@ class CFGGuider:
        if sigmas.shape[-1] == 0:
            return latent_image

+        if latent_image.is_nested:
+            latent_image, latent_shapes = comfy.utils.pack_latents(latent_image.unbind())
+            noise, _ = comfy.utils.pack_latents(noise.unbind())
+        else:
+            latent_shapes = [latent_image.shape]
+
        self.conds = {}
        for k in self.original_conds:
            self.conds[k] = list(map(lambda a: a.copy(), self.original_conds[k]))
@@ -1033,7 +1032,7 @@ class CFGGuider:
                self,
                comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.OUTER_SAMPLE, self.model_options, is_model_options=True)
            )
-            output = executor.execute(noise, latent_image, sampler, sigmas, denoise_mask, callback, disable_pbar, seed)
+            output = executor.execute(noise, latent_image, sampler, sigmas, denoise_mask, callback, disable_pbar, seed, latent_shapes=latent_shapes)
        finally:
            cast_to_load_options(self.model_options, device=self.model_patcher.offload_device)
            self.model_options = orig_model_options
@@ -1041,6 +1040,9 @@ class CFGGuider:
            self.model_patcher.restore_hook_patches()

        del self.conds
+
+        if len(latent_shapes) > 1:
+            output = comfy.nested_tensor.NestedTensor(comfy.utils.unpack_latents(output, latent_shapes))
        return output


--- a/comfy/sd.py
+++ b/comfy/sd.py
@@ -1262,7 +1262,7 @@ def load_state_dict_guess_config(sd, output_vae=True, output_clip=True, output_c
    return (model_patcher, clip, vae, clipvision)


-def load_diffusion_model_state_dict(sd, model_options={}):
+def load_diffusion_model_state_dict(sd, model_options={}, metadata=None):
    """
    Loads a UNet diffusion model from a state dictionary, supporting both diffusers and regular formats.

@@ -1296,7 +1296,7 @@ def load_diffusion_model_state_dict(sd, model_options={}):
    weight_dtype = comfy.utils.weight_dtype(sd)

    load_device = model_management.get_torch_device()
-    model_config = model_detection.model_config_from_unet(sd, "")
+    model_config = model_detection.model_config_from_unet(sd, "", metadata=metadata)

    if model_config is not None:
        new_sd = sd
@@ -1330,7 +1330,10 @@ def load_diffusion_model_state_dict(sd, model_options={}):
    else:
        unet_dtype = dtype

-    manual_cast_dtype = model_management.unet_manual_cast(unet_dtype, load_device, model_config.supported_inference_dtypes)
+    if hasattr(model_config, "layer_quant_config"):
+        manual_cast_dtype = model_management.unet_manual_cast(None, load_device, model_config.supported_inference_dtypes)
+    else:
+        manual_cast_dtype = model_management.unet_manual_cast(unet_dtype, load_device, model_config.supported_inference_dtypes)
    model_config.set_inference_dtype(unet_dtype, manual_cast_dtype)
    model_config.custom_operations = model_options.get("custom_operations", model_config.custom_operations)
    if model_options.get("fp8_optimizations", False):
@@ -1346,8 +1349,8 @@ def load_diffusion_model_state_dict(sd, model_options={}):


 def load_diffusion_model(unet_path, model_options={}):
-    sd = comfy.utils.load_torch_file(unet_path)
-    model = load_diffusion_model_state_dict(sd, model_options=model_options)
+    sd, metadata = comfy.utils.load_torch_file(unet_path, return_metadata=True)
+    model = load_diffusion_model_state_dict(sd, model_options=model_options, metadata=metadata)
    if model is None:
        logging.error("ERROR UNSUPPORTED DIFFUSION MODEL {}".format(unet_path))
        raise RuntimeError("ERROR: Could not detect model type of: {}\n{}".format(unet_path, model_detection_error_hint(unet_path, sd)))
--- a/comfy/supported_models_base.py
+++ b/comfy/supported_models_base.py
@@ -50,6 +50,7 @@ class BASE:
    manual_cast_dtype = None
    custom_operations = None
    scaled_fp8 = None
+    layer_quant_config = None  # Per-layer quantization configuration for mixed precision
    optimizations = {"fp8": False}

    @classmethod
--- a/comfy/utils.py
+++ b/comfy/utils.py
@@ -1106,3 +1106,25 @@ def upscale_dit_mask(mask: torch.Tensor, img_size_in, img_size_out):
            dim=1
        )
        return out
+
+def pack_latents(latents):
+    latent_shapes = []
+    tensors = []
+    for tensor in latents:
+        latent_shapes.append(tensor.shape)
+        tensors.append(tensor.reshape(tensor.shape[0], 1, -1))
+
+    latent = torch.cat(tensors, dim=-1)
+    return latent, latent_shapes
+
+def unpack_latents(combined_latent, latent_shapes):
+    if len(latent_shapes) > 1:
+        output_tensors = []
+        for shape in latent_shapes:
+            cut = math.prod(shape[1:])
+            tens = combined_latent[:, :, :cut]
+            combined_latent = combined_latent[:, :, cut:]
+            output_tensors.append(tens.reshape([tens.shape[0]] + list(shape)[1:]))
+    else:
+        output_tensors = combined_latent
+    return output_tensors