test

test2
test
2026-02-13 03:30:01 +00:00 · 2026-02-10 22:57:00 -08:00 · 2026-02-10 22:54:55 -08:00 · 2026-02-10 22:54:55 -08:00 · 2026-02-10 22:04:32 -05:00 · 2026-02-10 21:45:19 -05:00
41 changed files with 1938 additions and 380 deletions
--- a/comfy/ldm/ace/ace_step15.py
+++ b/comfy/ldm/ace/ace_step15.py
@@ -7,6 +7,67 @@ from comfy.ldm.modules.attention import optimized_attention
 import comfy.model_management
 from comfy.ldm.flux.layers import timestep_embedding

+def get_silence_latent(length, device):  # returns a (1, 64, length) tensor on `device`: a repeated "silence" frame whose first head.shape[-1] frames are replaced by `head`
+    head = torch.tensor([[[ 0.5707,  0.0982,  0.6909, -0.5658,  0.6266,  0.6996, -0.1365, -0.1291,
+                        -0.0776, -0.1171, -0.2743, -0.8422, -0.1168,  1.5539, -4.6936,  0.7436,
+                        -1.1846, -0.2637,  0.6933, -6.7266,  0.0966, -0.1187, -0.3501, -1.1736,
+                        0.0587, -2.0517, -1.3651,  0.7508, -0.2490, -1.3548, -0.1290, -0.7261,
+                        1.1132, -0.3249,  0.2337,  0.3004,  0.6605, -0.0298, -0.1989, -0.4041,
+                        0.2843, -1.0963, -0.5519,  0.2639, -1.0436, -0.1183,  0.0640,  0.4460,
+                        -1.1001, -0.6172, -1.3241,  1.1379,  0.5623, -0.1507, -0.1963, -0.4742,
+                        -2.4697,  0.5302,  0.5381,  0.4636, -0.1782, -0.0687,  1.0333,  0.4202],
+                        [ 0.3040, -0.1367,  0.6200,  0.0665, -0.0642,  0.4655, -0.1187, -0.0440,
+                        0.2941, -0.2753,  0.0173, -0.2421, -0.0147,  1.5603, -2.7025,  0.7907,
+                        -0.9736, -0.0682,  0.1294, -5.0707, -0.2167,  0.3302, -0.1513, -0.8100,
+                        -0.3894, -0.2884, -0.3149,  0.8660, -0.3817, -1.7061,  0.5824, -0.4840,
+                        0.6938,  0.1859,  0.1753,  0.3081,  0.0195,  0.1403, -0.0754, -0.2091,
+                        0.1251, -0.1578, -0.4968, -0.1052, -0.4554, -0.0320,  0.1284,  0.4974,
+                        -1.1889, -0.0344, -0.8313,  0.2953,  0.5445, -0.6249, -0.1595, -0.0682,
+                        -3.1412,  0.0484,  0.4153,  0.8260, -0.1526, -0.0625,  0.5366,  0.8473],
+                        [ 5.3524e-02, -1.7534e-01,  5.4443e-01, -4.3501e-01, -2.1317e-03,
+                        3.7200e-01, -4.0143e-03, -1.5516e-01, -1.2968e-01, -1.5375e-01,
+                        -7.7107e-02, -2.0593e-01, -3.2780e-01,  1.5142e+00, -2.6101e+00,
+                        5.8698e-01, -1.2716e+00, -2.4773e-01, -2.7933e-02, -5.0799e+00,
+                        1.1601e-01,  4.0987e-01, -2.2030e-02, -6.6495e-01, -2.0995e-01,
+                        -6.3474e-01, -1.5893e-01,  8.2745e-01, -2.2992e-01, -1.6816e+00,
+                        5.4440e-01, -4.9579e-01,  5.5128e-01,  3.0477e-01,  8.3052e-02,
+                        -6.1782e-02,  5.9036e-03,  2.9553e-01, -8.0645e-02, -1.0060e-01,
+                        1.9144e-01, -3.8124e-01, -7.2949e-01,  2.4520e-02, -5.0814e-01,
+                        2.3977e-01,  9.2943e-02,  3.9256e-01, -1.1993e+00, -3.2752e-01,
+                        -7.2707e-01,  2.9476e-01,  4.3542e-01, -8.8597e-01, -4.1686e-01,
+                        -8.5390e-02, -2.9018e+00,  6.4988e-02,  5.3945e-01,  9.1988e-01,
+                        5.8762e-02, -7.0098e-02,  6.4772e-01,  8.9118e-01],
+                        [-3.2225e-02, -1.3195e-01,  5.6411e-01, -5.4766e-01, -5.2170e-03,
+                        3.1425e-01, -5.4367e-02, -1.9419e-01, -1.3059e-01, -1.3660e-01,
+                        -9.0984e-02, -1.9540e-01, -2.5590e-01,  1.5440e+00, -2.6349e+00,
+                        6.8273e-01, -1.2532e+00, -1.9810e-01, -2.2793e-02, -5.0506e+00,
+                        1.8818e-01,  5.0109e-01,  7.3546e-03, -6.8771e-01, -3.0676e-01,
+                        -7.3257e-01, -1.6687e-01,  9.2232e-01, -1.8987e-01, -1.7267e+00,
+                        5.3355e-01, -5.3179e-01,  4.4953e-01,  2.8820e-01,  1.3012e-01,
+                        -2.0943e-01, -1.1348e-01,  3.3929e-01, -1.5069e-01, -1.2919e-01,
+                        1.8929e-01, -3.6166e-01, -8.0756e-01,  6.6387e-02, -5.8867e-01,
+                        1.6978e-01,  1.0134e-01,  3.3877e-01, -1.2133e+00, -3.2492e-01,
+                        -8.1237e-01,  3.8101e-01,  4.3765e-01, -8.0596e-01, -4.4531e-01,
+                        -4.7513e-02, -2.9266e+00,  1.1741e-03,  4.5123e-01,  9.3075e-01,
+                        5.3688e-02, -1.9621e-01,  6.4530e-01,  9.3870e-01]]], device=device).movedim(-1, 1)
+    # Single 64-value frame, moved to the channel axis and repeated `length` times along the last axis.
+    silence_latent = torch.tensor([[[-1.3672e-01, -1.5820e-01,  5.8594e-01, -5.7422e-01,  3.0273e-02,
+                                2.7930e-01, -2.5940e-03, -2.0703e-01, -1.6113e-01, -1.4746e-01,
+                                -2.7710e-02, -1.8066e-01, -2.9688e-01,  1.6016e+00, -2.6719e+00,
+                                7.7734e-01, -1.3516e+00, -1.9434e-01, -7.1289e-02, -5.0938e+00,
+                                2.4316e-01,  4.7266e-01,  4.6387e-02, -6.6406e-01, -2.1973e-01,
+                                -6.7578e-01, -1.5723e-01,  9.5312e-01, -2.0020e-01, -1.7109e+00,
+                                5.8984e-01, -5.7422e-01,  5.1562e-01,  2.8320e-01,  1.4551e-01,
+                                -1.8750e-01, -5.9814e-02,  3.6719e-01, -1.0059e-01, -1.5723e-01,
+                                2.0605e-01, -4.3359e-01, -8.2812e-01,  4.5654e-02, -6.6016e-01,
+                                1.4844e-01,  9.4727e-02,  3.8477e-01, -1.2578e+00, -3.3203e-01,
+                                -8.5547e-01,  4.3359e-01,  4.2383e-01, -8.9453e-01, -5.0391e-01,
+                                -5.6152e-02, -2.9219e+00, -2.4658e-02,  5.0391e-01,  9.8438e-01,
+                                7.2754e-02, -2.1582e-01,  6.3672e-01,  1.0000e+00]]], device=device).movedim(-1, 1).repeat(1, 1, length)
+    silence_latent[:, :, :head.shape[-1]] = head[:, :, :length]  # slice head so a length shorter than the head does not raise a shape mismatch
+    return silence_latent
+
+
 def get_layer_class(operations, layer_name):
    if operations is not None and hasattr(operations, layer_name):
        return getattr(operations, layer_name)
@@ -183,7 +244,7 @@ class AceStepAttention(nn.Module):
            else:
                attn_bias = window_bias

-        attn_output = optimized_attention(query_states, key_states, value_states, self.num_heads, attn_bias, skip_reshape=True)
+        attn_output = optimized_attention(query_states, key_states, value_states, self.num_heads, attn_bias, skip_reshape=True, low_precision_attention=False)
        attn_output = self.o_proj(attn_output)

        return attn_output
@@ -677,7 +738,7 @@ class AttentionPooler(nn.Module):
    def forward(self, x):
        B, T, P, D = x.shape
        x = self.embed_tokens(x)
-        special = self.special_token.expand(B, T, 1, -1)
+        special = comfy.model_management.cast_to(self.special_token, device=x.device, dtype=x.dtype).expand(B, T, 1, -1)
        x = torch.cat([special, x], dim=2)
        x = x.view(B * T, P + 1, D)

@@ -728,7 +789,7 @@ class FSQ(nn.Module):
        self.register_buffer('implicit_codebook', implicit_codebook, persistent=False)

    def bound(self, z):
-        levels_minus_1 = (self._levels - 1).to(z.dtype)
+        levels_minus_1 = (comfy.model_management.cast_to(self._levels, device=z.device, dtype=z.dtype) - 1)
        scale = 2. / levels_minus_1
        bracket = (levels_minus_1 * (torch.tanh(z) + 1) / 2.) + 0.5

@@ -743,8 +804,8 @@ class FSQ(nn.Module):
        return codes_non_centered.float() * (2. / (self._levels.float() - 1)) - 1.

    def codes_to_indices(self, zhat):
-        zhat_normalized = (zhat + 1.) / (2. / (self._levels.to(zhat.dtype) - 1))
-        return (zhat_normalized * self._basis.to(zhat.dtype)).sum(dim=-1).round().to(torch.int32)
+        zhat_normalized = (zhat + 1.) / (2. / (comfy.model_management.cast_to(self._levels, device=zhat.device, dtype=zhat.dtype) - 1))
+        return (zhat_normalized * comfy.model_management.cast_to(self._basis, device=zhat.device, dtype=zhat.dtype)).sum(dim=-1).round().to(torch.int32)

    def forward(self, z):
        orig_dtype = z.dtype
@@ -826,7 +887,7 @@ class ResidualFSQ(nn.Module):
        x = self.project_in(x)

        if hasattr(self, 'soft_clamp_input_value'):
-            sc_val = self.soft_clamp_input_value.to(x.dtype)
+            sc_val = comfy.model_management.cast_to(self.soft_clamp_input_value, device=x.device, dtype=x.dtype)
            x = (x / sc_val).tanh() * sc_val

        quantized_out = torch.tensor(0., device=x.device, dtype=x.dtype)
@@ -834,7 +895,7 @@ class ResidualFSQ(nn.Module):
        all_indices = []

        for layer, scale in zip(self.layers, self.scales):
-            scale = scale.to(residual.dtype)
+            scale = comfy.model_management.cast_to(scale, device=x.device, dtype=x.dtype)

            quantized, indices = layer(residual / scale)
            quantized = quantized * scale
@@ -1035,28 +1096,26 @@ class AceStepConditionGenerationModel(nn.Module):
                    audio_codes = torch.nn.functional.pad(audio_codes, (0, math.ceil(src_latents.shape[1] / 5) - audio_codes.shape[1]), "constant", 35847)
                lm_hints_5Hz = self.tokenizer.quantizer.get_output_from_indices(audio_codes, dtype=text_hidden_states.dtype)
            else:
-                assert False
-                # TODO ?
+                lm_hints_5Hz, indices = self.tokenizer.tokenize(refer_audio_acoustic_hidden_states_packed)

            lm_hints = self.detokenizer(lm_hints_5Hz)

        lm_hints = lm_hints[:, :src_latents.shape[1], :]
-        if is_covers is None:
+        if is_covers is None or is_covers is True:
            src_latents = lm_hints
-        else:
-            src_latents = torch.where(is_covers.unsqueeze(-1).unsqueeze(-1) > 0, lm_hints, src_latents)
+        elif is_covers is False:
+            src_latents = refer_audio_acoustic_hidden_states_packed

        context_latents = torch.cat([src_latents, chunk_masks.to(src_latents.dtype)], dim=-1)

        return encoder_hidden, encoder_mask, context_latents

-    def forward(self, x, timestep, context, lyric_embed=None, refer_audio=None, audio_codes=None, **kwargs):
+    def forward(self, x, timestep, context, lyric_embed=None, refer_audio=None, audio_codes=None, is_covers=None, replace_with_null_embeds=False, **kwargs):
        text_attention_mask = None
        lyric_attention_mask = None
        refer_audio_order_mask = None
        attention_mask = None
        chunk_masks = None
-        is_covers = None
        src_latents = None
        precomputed_lm_hints_25Hz = None
        lyric_hidden_states = lyric_embed
@@ -1068,7 +1127,7 @@ class AceStepConditionGenerationModel(nn.Module):
        if refer_audio_order_mask is None:
            refer_audio_order_mask = torch.zeros((x.shape[0],), device=x.device, dtype=torch.long)

-        if src_latents is None and is_covers is None:
+        if src_latents is None:
            src_latents = x

        if chunk_masks is None:
@@ -1081,6 +1140,9 @@ class AceStepConditionGenerationModel(nn.Module):
            src_latents, chunk_masks, is_covers, precomputed_lm_hints_25Hz=precomputed_lm_hints_25Hz, audio_codes=audio_codes
        )

+        if replace_with_null_embeds:
+            enc_hidden[:] = self.null_condition_emb.to(enc_hidden)
+
        out = self.decoder(hidden_states=x,
                           timestep=timestep,
                           timestep_r=timestep,
--- a/comfy/ldm/anima/model.py
+++ b/comfy/ldm/anima/model.py
@@ -195,8 +195,20 @@ class Anima(MiniTrainDIT):
        super().__init__(*args, **kwargs)
        self.llm_adapter = LLMAdapter(device=kwargs.get("device"), dtype=kwargs.get("dtype"), operations=kwargs.get("operations"))

-    def preprocess_text_embeds(self, text_embeds, text_ids):
+    def preprocess_text_embeds(self, text_embeds, text_ids, t5xxl_weights=None):
        if text_ids is not None:
-            return self.llm_adapter(text_embeds, text_ids)
+            out = self.llm_adapter(text_embeds, text_ids)
+            if t5xxl_weights is not None:
+                out = out * t5xxl_weights
+
+            if out.shape[1] < 512:
+                out = torch.nn.functional.pad(out, (0, 0, 0, 512 - out.shape[1]))
+            return out
        else:
            return text_embeds
+
+    def forward(self, x, timesteps, context, **kwargs):  # runs preprocess_text_embeds on context when "t5xxl_ids" is passed, then defers to the parent forward
+        t5xxl_ids = kwargs.pop("t5xxl_ids", None)  # popped so it is not forwarded to super().forward
+        if t5xxl_ids is not None:  # NOTE: "t5xxl_weights" is only popped inside this branch; without ids it stays in kwargs
+            context = self.preprocess_text_embeds(context, t5xxl_ids, t5xxl_weights=kwargs.pop("t5xxl_weights", None))
+        return super().forward(x, timesteps, context, **kwargs)
--- a/comfy/ldm/cosmos/predict2.py
+++ b/comfy/ldm/cosmos/predict2.py
@@ -335,7 +335,7 @@ class FinalLayer(nn.Module):
        device=None, dtype=None, operations=None
    ):
        super().__init__()
-        self.layer_norm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
+        self.layer_norm = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.linear = operations.Linear(
            hidden_size, spatial_patch_size * spatial_patch_size * temporal_patch_size * out_channels, bias=False, device=device, dtype=dtype
        )
@@ -463,6 +463,8 @@ class Block(nn.Module):
        extra_per_block_pos_emb: Optional[torch.Tensor] = None,
        transformer_options: Optional[dict] = {},
    ) -> torch.Tensor:
+        residual_dtype = x_B_T_H_W_D.dtype
+        compute_dtype = emb_B_T_D.dtype
        if extra_per_block_pos_emb is not None:
            x_B_T_H_W_D = x_B_T_H_W_D + extra_per_block_pos_emb

@@ -512,7 +514,7 @@ class Block(nn.Module):
        result_B_T_H_W_D = rearrange(
            self.self_attn(
                # normalized_x_B_T_HW_D,
-                rearrange(normalized_x_B_T_H_W_D, "b t h w d -> b (t h w) d"),
+                rearrange(normalized_x_B_T_H_W_D.to(compute_dtype), "b t h w d -> b (t h w) d"),
                None,
                rope_emb=rope_emb_L_1_1_D,
                transformer_options=transformer_options,
@@ -522,7 +524,7 @@ class Block(nn.Module):
            h=H,
            w=W,
        )
-        x_B_T_H_W_D = x_B_T_H_W_D + gate_self_attn_B_T_1_1_D * result_B_T_H_W_D
+        x_B_T_H_W_D = x_B_T_H_W_D + gate_self_attn_B_T_1_1_D.to(residual_dtype) * result_B_T_H_W_D.to(residual_dtype)

        def _x_fn(
            _x_B_T_H_W_D: torch.Tensor,
@@ -536,7 +538,7 @@ class Block(nn.Module):
            )
            _result_B_T_H_W_D = rearrange(
                self.cross_attn(
-                    rearrange(_normalized_x_B_T_H_W_D, "b t h w d -> b (t h w) d"),
+                    rearrange(_normalized_x_B_T_H_W_D.to(compute_dtype), "b t h w d -> b (t h w) d"),
                    crossattn_emb,
                    rope_emb=rope_emb_L_1_1_D,
                    transformer_options=transformer_options,
@@ -555,7 +557,7 @@ class Block(nn.Module):
            shift_cross_attn_B_T_1_1_D,
            transformer_options=transformer_options,
        )
-        x_B_T_H_W_D = result_B_T_H_W_D * gate_cross_attn_B_T_1_1_D + x_B_T_H_W_D
+        x_B_T_H_W_D = result_B_T_H_W_D.to(residual_dtype) * gate_cross_attn_B_T_1_1_D.to(residual_dtype) + x_B_T_H_W_D

        normalized_x_B_T_H_W_D = _fn(
            x_B_T_H_W_D,
@@ -563,8 +565,8 @@ class Block(nn.Module):
            scale_mlp_B_T_1_1_D,
            shift_mlp_B_T_1_1_D,
        )
-        result_B_T_H_W_D = self.mlp(normalized_x_B_T_H_W_D)
-        x_B_T_H_W_D = x_B_T_H_W_D + gate_mlp_B_T_1_1_D * result_B_T_H_W_D
+        result_B_T_H_W_D = self.mlp(normalized_x_B_T_H_W_D.to(compute_dtype))
+        x_B_T_H_W_D = x_B_T_H_W_D + gate_mlp_B_T_1_1_D.to(residual_dtype) * result_B_T_H_W_D.to(residual_dtype)
        return x_B_T_H_W_D


@@ -876,6 +878,14 @@ class MiniTrainDIT(nn.Module):
            "extra_per_block_pos_emb": extra_pos_emb_B_T_H_W_D_or_T_H_W_B_D,
            "transformer_options": kwargs.get("transformer_options", {}),
        }
+
+        # The residual stream for this model has large values. To make fp16 compute_dtype work, we keep the residual stream
+        # in fp32, but run attention and MLP modules in fp16.
+        # An alternate method that clamps fp16 values "works" in the sense that it makes coherent images, but there is noticeable
+        # quality degradation and visual artifacts.
+        if x_B_T_H_W_D.dtype == torch.float16:
+            x_B_T_H_W_D = x_B_T_H_W_D.float()
+
        for block in self.blocks:
            x_B_T_H_W_D = block(
                x_B_T_H_W_D,
@@ -884,6 +894,6 @@ class MiniTrainDIT(nn.Module):
                **block_kwargs,
            )

-        x_B_T_H_W_O = self.final_layer(x_B_T_H_W_D, t_embedding_B_T_D, adaln_lora_B_T_3D=adaln_lora_B_T_3D)
+        x_B_T_H_W_O = self.final_layer(x_B_T_H_W_D.to(crossattn_emb.dtype), t_embedding_B_T_D, adaln_lora_B_T_3D=adaln_lora_B_T_3D)
        x_B_C_Tt_Hp_Wp = self.unpatchify(x_B_T_H_W_O)[:, :, :orig_shape[-3], :orig_shape[-2], :orig_shape[-1]]
        return x_B_C_Tt_Hp_Wp
--- a/comfy/ldm/flux/math.py
+++ b/comfy/ldm/flux/math.py
@@ -29,19 +29,34 @@ def rope(pos: Tensor, dim: int, theta: int) -> Tensor:
    return out.to(dtype=torch.float32, device=pos.device)


+def _apply_rope1(x: Tensor, freqs_cis: Tensor):  # pure-PyTorch application of freqs_cis to a single tensor
+    x_ = x.to(dtype=freqs_cis.dtype).reshape(*x.shape[:-1], -1, 1, 2)
+    # Weight the two components of each trailing pair by the matching freqs_cis entries, accumulating in place.
+    x_out = freqs_cis[..., 0] * x_[..., 0]
+    x_out.addcmul_(freqs_cis[..., 1], x_[..., 1])
+    # The math above runs in freqs_cis.dtype; hand back the caller's shape and dtype.
+    return x_out.reshape(*x.shape).type_as(x)
+
+
+def _apply_rope(xq: Tensor, xk: Tensor, freqs_cis: Tensor):  # pure-PyTorch rope for a (query, key) pair; returns (xq_out, xk_out)
+    return _apply_rope1(xq, freqs_cis), _apply_rope1(xk, freqs_cis)  # use the torch helper directly rather than the apply_rope1 dispatcher defined later
+
+
 try:
    import comfy.quant_ops
-    apply_rope = comfy.quant_ops.ck.apply_rope
-    apply_rope1 = comfy.quant_ops.ck.apply_rope1
+    q_apply_rope = comfy.quant_ops.ck.apply_rope
+    q_apply_rope1 = comfy.quant_ops.ck.apply_rope1
+    def apply_rope(xq, xk, freqs_cis):
+        if comfy.model_management.in_training:
+            return _apply_rope(xq, xk, freqs_cis)
+        else:
+            return apply_rope1(xq, freqs_cis), apply_rope1(xk, freqs_cis)
+    def apply_rope1(x, freqs_cis):
+        if comfy.model_management.in_training:
+            return _apply_rope1(x, freqs_cis)
+        else:
+            return q_apply_rope1(x, freqs_cis)
 except:
    logging.warning("No comfy kitchen, using old apply_rope functions.")
-    def apply_rope1(x: Tensor, freqs_cis: Tensor):
-        x_ = x.to(dtype=freqs_cis.dtype).reshape(*x.shape[:-1], -1, 1, 2)
-
-        x_out = freqs_cis[..., 0] * x_[..., 0]
-        x_out.addcmul_(freqs_cis[..., 1], x_[..., 1])
-
-        return x_out.reshape(*x.shape).type_as(x)
-
-    def apply_rope(xq: Tensor, xk: Tensor, freqs_cis: Tensor):
-        return apply_rope1(xq, freqs_cis), apply_rope1(xk, freqs_cis)
+    apply_rope = _apply_rope
+    apply_rope1 = _apply_rope1
--- a/comfy/ldm/modules/attention.py
+++ b/comfy/ldm/modules/attention.py
@@ -524,6 +524,9 @@ def attention_pytorch(q, k, v, heads, mask=None, attn_precision=None, skip_resha

@wrap_attn
 def attention_sage(q, k, v, heads, mask=None, attn_precision=None, skip_reshape=False, skip_output_reshape=False, **kwargs):
+    if kwargs.get("low_precision_attention", True) is False:
+        return attention_pytorch(q, k, v, heads, mask=mask, skip_reshape=skip_reshape, skip_output_reshape=skip_output_reshape, **kwargs)
+
    exception_fallback = False
    if skip_reshape:
        b, _, _, dim_head = q.shape
--- a/comfy/model_base.py
+++ b/comfy/model_base.py
@@ -147,11 +147,11 @@ class BaseModel(torch.nn.Module):
                self.diffusion_model.to(memory_format=torch.channels_last)
                logging.debug("using channels last mode for diffusion model")
            logging.info("model weight dtype {}, manual cast: {}".format(self.get_dtype(), self.manual_cast_dtype))
+            comfy.model_management.archive_model_dtypes(self.diffusion_model)
+
        self.model_type = model_type
        self.model_sampling = model_sampling(model_config, model_type)

-        comfy.model_management.archive_model_dtypes(self.diffusion_model)
-
        self.adm_channels = unet_config.get("adm_in_channels", None)
        if self.adm_channels is None:
            self.adm_channels = 0
@@ -1160,12 +1160,16 @@ class Anima(BaseModel):
        device = kwargs["device"]
        if cross_attn is not None:
            if t5xxl_ids is not None:
-                cross_attn = self.diffusion_model.preprocess_text_embeds(cross_attn.to(device=device, dtype=self.get_dtype()), t5xxl_ids.unsqueeze(0).to(device=device))
                if t5xxl_weights is not None:
-                    cross_attn *= t5xxl_weights.unsqueeze(0).unsqueeze(-1).to(cross_attn)
+                    t5xxl_weights = t5xxl_weights.unsqueeze(0).unsqueeze(-1).to(cross_attn)
+                t5xxl_ids = t5xxl_ids.unsqueeze(0)
+
+                if torch.is_inference_mode_enabled():  # if not we are training
+                    cross_attn = self.diffusion_model.preprocess_text_embeds(cross_attn.to(device=device, dtype=self.get_dtype()), t5xxl_ids.to(device=device), t5xxl_weights=t5xxl_weights.to(device=device, dtype=self.get_dtype()))
+                else:
+                    out['t5xxl_ids'] = comfy.conds.CONDRegular(t5xxl_ids)
+                    out['t5xxl_weights'] = comfy.conds.CONDRegular(t5xxl_weights)

-                if cross_attn.shape[1] < 512:
-                    cross_attn = torch.nn.functional.pad(cross_attn, (0, 0, 0, 512 - cross_attn.shape[1]))
            out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn)
        return out

@@ -1548,9 +1552,12 @@ class ACEStep15(BaseModel):
    def extra_conds(self, **kwargs):
        out = super().extra_conds(**kwargs)
        device = kwargs["device"]
+        noise = kwargs["noise"]

        cross_attn = kwargs.get("cross_attn", None)
        if cross_attn is not None:
+            if torch.count_nonzero(cross_attn) == 0:
+                out['replace_with_null_embeds'] = comfy.conds.CONDConstant(True)
            out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn)

        conditioning_lyrics = kwargs.get("conditioning_lyrics", None)
@@ -1559,27 +1566,26 @@ class ACEStep15(BaseModel):

        refer_audio = kwargs.get("reference_audio_timbre_latents", None)
        if refer_audio is None or len(refer_audio) == 0:
-            refer_audio = torch.tensor([[[-1.3672e-01, -1.5820e-01,  5.8594e-01, -5.7422e-01,  3.0273e-02,
-                                        2.7930e-01, -2.5940e-03, -2.0703e-01, -1.6113e-01, -1.4746e-01,
-                                        -2.7710e-02, -1.8066e-01, -2.9688e-01,  1.6016e+00, -2.6719e+00,
-                                        7.7734e-01, -1.3516e+00, -1.9434e-01, -7.1289e-02, -5.0938e+00,
-                                        2.4316e-01,  4.7266e-01,  4.6387e-02, -6.6406e-01, -2.1973e-01,
-                                        -6.7578e-01, -1.5723e-01,  9.5312e-01, -2.0020e-01, -1.7109e+00,
-                                        5.8984e-01, -5.7422e-01,  5.1562e-01,  2.8320e-01,  1.4551e-01,
-                                        -1.8750e-01, -5.9814e-02,  3.6719e-01, -1.0059e-01, -1.5723e-01,
-                                        2.0605e-01, -4.3359e-01, -8.2812e-01,  4.5654e-02, -6.6016e-01,
-                                        1.4844e-01,  9.4727e-02,  3.8477e-01, -1.2578e+00, -3.3203e-01,
-                                        -8.5547e-01,  4.3359e-01,  4.2383e-01, -8.9453e-01, -5.0391e-01,
-                                        -5.6152e-02, -2.9219e+00, -2.4658e-02,  5.0391e-01,  9.8438e-01,
-                                        7.2754e-02, -2.1582e-01,  6.3672e-01,  1.0000e+00]]], device=device).movedim(-1, 1).repeat(1, 1, 750)
+            refer_audio = comfy.ldm.ace.ace_step15.get_silence_latent(noise.shape[2], device)
+            pass_audio_codes = True
        else:
-            refer_audio = refer_audio[-1]
+            refer_audio = refer_audio[-1][:, :, :noise.shape[2]]
+            out['is_covers'] = comfy.conds.CONDConstant(True)
+            pass_audio_codes = False
+
+        if pass_audio_codes:
+            audio_codes = kwargs.get("audio_codes", None)
+            if audio_codes is not None:
+                out['audio_codes'] = comfy.conds.CONDRegular(torch.tensor(audio_codes, device=device))
+                refer_audio = refer_audio[:, :, :750]
+            else:
+                out['is_covers'] = comfy.conds.CONDConstant(False)
+
+        if refer_audio.shape[2] < noise.shape[2]:
+            pad = comfy.ldm.ace.ace_step15.get_silence_latent(noise.shape[2], device)
+            refer_audio = torch.cat([refer_audio.to(pad), pad[:, :, refer_audio.shape[2]:]], dim=2)
+
        out['refer_audio'] = comfy.conds.CONDRegular(refer_audio)
-
-        audio_codes = kwargs.get("audio_codes", None)
-        if audio_codes is not None:
-            out['audio_codes'] = comfy.conds.CONDRegular(torch.tensor(audio_codes, device=device))
-
        return out

 class Omnigen2(BaseModel):
--- a/comfy/model_management.py
+++ b/comfy/model_management.py
@@ -19,7 +19,7 @@
 import psutil
 import logging
 from enum import Enum
-from comfy.cli_args import args, PerformanceFeature, enables_dynamic_vram
+from comfy.cli_args import args, PerformanceFeature
 import threading
 import torch
 import sys
@@ -55,6 +55,11 @@ cpu_state = CPUState.GPU

 total_vram = 0

+
+# Training Related State
+in_training = False
+
+
 def get_supported_float8_types():
    float8_types = []
    try:
@@ -651,7 +656,7 @@ def free_memory(memory_required, device, keep_loaded=[], for_dynamic=False, ram_
                soft_empty_cache()
    return unloaded_models

-def load_models_gpu_orig(models, memory_required=0, force_patch_weights=False, minimum_memory_required=None, force_full_load=False):
+def load_models_gpu(models, memory_required=0, force_patch_weights=False, minimum_memory_required=None, force_full_load=False):
    cleanup_models_gc()
    global vram_state

@@ -747,26 +752,6 @@ def load_models_gpu_orig(models, memory_required=0, force_patch_weights=False, m
        current_loaded_models.insert(0, loaded_model)
    return

-def load_models_gpu_thread(models, memory_required, force_patch_weights, minimum_memory_required, force_full_load):
-    with torch.inference_mode():
-        load_models_gpu_orig(models, memory_required, force_patch_weights, minimum_memory_required, force_full_load)
-        soft_empty_cache()
-
-def load_models_gpu(models, memory_required=0, force_patch_weights=False, minimum_memory_required=None, force_full_load=False):
-    #Deliberately load models outside of the Aimdo mempool so they can be retained accross
-    #nodes. Use a dummy thread to do it as pytorch documents that mempool contexts are
-    #thread local. So exploit that to escape context
-    if enables_dynamic_vram():
-        t = threading.Thread(
-            target=load_models_gpu_thread,
-            args=(models, memory_required, force_patch_weights, minimum_memory_required, force_full_load)
-        )
-        t.start()
-        t.join()
-    else:
-        load_models_gpu_orig(models, memory_required=memory_required, force_patch_weights=force_patch_weights,
-                             minimum_memory_required=minimum_memory_required, force_full_load=force_full_load)
-
 def load_model_gpu(model):
    return load_models_gpu([model])

@@ -1226,21 +1211,16 @@ def cast_to(weight, dtype=None, device=None, non_blocking=False, copy=False, str
        if dtype is None:
            dtype = weight._model_dtype

-        r = torch.empty_like(weight, dtype=dtype, device=device)
-
        signature = comfy_aimdo.model_vbar.vbar_fault(weight._v)
        if signature is not None:
-            raw_tensor = comfy_aimdo.torch.aimdo_to_tensor(weight._v, device)
-            v_tensor = comfy.memory_management.interpret_gathered_like(cast_geometry, raw_tensor)[0]
+            v_tensor = comfy.memory_management.interpret_gathered_like(cast_geometry, weight._v_tensor)[0]
            if not comfy_aimdo.model_vbar.vbar_signature_compare(signature, weight._v_signature):
                weight._v_signature = signature
                #Send it over
                v_tensor.copy_(weight, non_blocking=non_blocking)
-            #always take a deep copy even if _v is good, as we have no reasonable point to unpin
-            #a non comfy weight
-            r.copy_(v_tensor)
-            comfy_aimdo.model_vbar.vbar_unpin(weight._v)
-            return r
+            return v_tensor.to(dtype=dtype)
+
+        r = torch.empty_like(weight, dtype=dtype, device=device)

        if weight.dtype != r.dtype and weight.dtype != weight._model_dtype:
            #Offloaded casting could skip this, however it would make the quantizations
--- a/comfy/model_patcher.py
+++ b/comfy/model_patcher.py
@@ -19,7 +19,6 @@
 from __future__ import annotations

 import collections
-import copy
 import inspect
 import logging
 import math
@@ -317,7 +316,7 @@ class ModelPatcher:

        n.object_patches = self.object_patches.copy()
        n.weight_wrapper_patches = self.weight_wrapper_patches.copy()
-        n.model_options = copy.deepcopy(self.model_options)
+        n.model_options = comfy.utils.deepcopy_list_dict(self.model_options)
        n.backup = self.backup
        n.object_patches_backup = self.object_patches_backup
        n.parent = self
@@ -1400,7 +1399,7 @@ class ModelPatcher:
                continue
            key = "diffusion_model." + k
            unet_state_dict[k] = LazyCastingParam(self, key, comfy.utils.get_attr(self.model, key))
-        return self.model.state_dict_for_saving(unet_state_dict)
+        return self.model.state_dict_for_saving(unet_state_dict, clip_state_dict=clip_state_dict, vae_state_dict=vae_state_dict, clip_vision_state_dict=clip_vision_state_dict)

    def __del__(self):
        self.unpin_all_weights()
@@ -1492,7 +1491,9 @@ class ModelPatcherDynamic(ModelPatcher):
            if vbar is not None:
                vbar.prioritize()

-            #We have way more tools for acceleration on comfy weight offloading, so always
+            #We force reserve VRAM for the non comfy-weight so we don't have to deal
+            #with pin and unpin synchronization which can be expensive for small weights
+            #with a high layer rate (e.g. autoregressive LLMs). To that end, always
            #prioritize the non-comfy weights (note the order reverse).
            loading = self._load_list(prio_comfy_cast_weights=True)
            loading.sort(reverse=True)
@@ -1541,6 +1542,7 @@ class ModelPatcherDynamic(ModelPatcher):

                    if vbar is not None and not hasattr(m, "_v"):
                        m._v = vbar.alloc(v_weight_size)
+                        m._v_tensor = comfy_aimdo.torch.aimdo_to_tensor(m._v, device_to)
                    allocated_size += v_weight_size

                else:
@@ -1555,8 +1557,10 @@ class ModelPatcherDynamic(ModelPatcher):
                        weight_size = geometry.numel() * geometry.element_size()
                        if vbar is not None and not hasattr(weight, "_v"):
                            weight._v = vbar.alloc(weight_size)
+                            weight._v_tensor = comfy_aimdo.torch.aimdo_to_tensor(weight._v, device_to)
                            weight._model_dtype = model_dtype
                        allocated_size += weight_size
+                    vbar.set_watermark_limit(allocated_size)

            logging.info(f"Model {self.model.__class__.__name__} prepared for dynamic VRAM loading. {allocated_size // (1024 ** 2)}MB Staged. {num_patches} patches attached.")

--- a/comfy/ops.py
+++ b/comfy/ops.py
@@ -54,6 +54,8 @@ try:
            SDPA_BACKEND_PRIORITY.insert(0, SDPBackend.CUDNN_ATTENTION)

            def scaled_dot_product_attention(q, k, v, *args, **kwargs):
+                if q.nelement() < 1024 * 128:  # arbitrary number, for small inputs cudnn attention seems slower
+                    return torch.nn.functional.scaled_dot_product_attention(q, k, v, *args, **kwargs)
                with sdpa_kernel(SDPA_BACKEND_PRIORITY, set_priority=True):
                    return torch.nn.functional.scaled_dot_product_attention(q, k, v, *args, **kwargs)
        else:
@@ -85,7 +87,7 @@ def cast_bias_weight_with_vbar(s, dtype, device, bias_dtype, non_blocking, compu

    signature = comfy_aimdo.model_vbar.vbar_fault(s._v)
    if signature is not None:
-        xfer_dest = comfy_aimdo.torch.aimdo_to_tensor(s._v, device)
+        xfer_dest = s._v_tensor
    resident = comfy_aimdo.model_vbar.vbar_signature_compare(signature, s._v_signature)

    if not resident:
@@ -167,8 +169,8 @@ def cast_bias_weight_with_vbar(s, dtype, device, bias_dtype, non_blocking, compu
                if orig.dtype == dtype and len(fns) == 0:
                    #The layer actually wants our freshly saved QT
                    x = y
-            else:
-                y = x
+            elif update_weight:
+                y = comfy.float.stochastic_rounding(x, orig.dtype, seed = comfy.utils.string_to_seed(s.seed_key))
            if update_weight:
                orig.copy_(y)
        for f in fns:
--- a/comfy/sampler_helpers.py
+++ b/comfy/sampler_helpers.py
@@ -122,20 +122,26 @@ def estimate_memory(model, noise_shape, conds):
    minimum_memory_required = model.model.memory_required([noise_shape[0]] + list(noise_shape[1:]), cond_shapes=cond_shapes_min)
    return memory_required, minimum_memory_required

-def prepare_sampling(model: ModelPatcher, noise_shape, conds, model_options=None, force_full_load=False):
+def prepare_sampling(model: ModelPatcher, noise_shape, conds, model_options=None, force_full_load=False, force_offload=False):
    executor = comfy.patcher_extension.WrapperExecutor.new_executor(
        _prepare_sampling,
        comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.PREPARE_SAMPLING, model_options, is_model_options=True)
    )
-    return executor.execute(model, noise_shape, conds, model_options=model_options, force_full_load=force_full_load)
+    return executor.execute(model, noise_shape, conds, model_options=model_options, force_full_load=force_full_load, force_offload=force_offload)

-def _prepare_sampling(model: ModelPatcher, noise_shape, conds, model_options=None, force_full_load=False):
+def _prepare_sampling(model: ModelPatcher, noise_shape, conds, model_options=None, force_full_load=False, force_offload=False):
    real_model: BaseModel = None
    models, inference_memory = get_additional_models(conds, model.model_dtype())
    models += get_additional_models_from_model_options(model_options)
    models += model.get_nested_additional_models()  # TODO: does this require inference_memory update?
-    memory_required, minimum_memory_required = estimate_memory(model, noise_shape, conds)
-    comfy.model_management.load_models_gpu([model] + models, memory_required=memory_required + inference_memory, minimum_memory_required=minimum_memory_required + inference_memory, force_full_load=force_full_load)
+    if force_offload: # In training + offload enabled, we want to force prepare sampling to trigger partial load
+        memory_required = 1e20
+        minimum_memory_required = None
+    else:
+        memory_required, minimum_memory_required = estimate_memory(model, noise_shape, conds)
+        memory_required += inference_memory
+        minimum_memory_required += inference_memory
+    comfy.model_management.load_models_gpu([model] + models, memory_required=memory_required, minimum_memory_required=minimum_memory_required, force_full_load=force_full_load)
    real_model = model.model

    return real_model, conds, models
--- a/comfy/sd.py
+++ b/comfy/sd.py
@@ -793,8 +793,6 @@ class VAE:
            self.first_stage_model = AutoencoderKL(**(config['params']))
        self.first_stage_model = self.first_stage_model.eval()

-        model_management.archive_model_dtypes(self.first_stage_model)
-
        if device is None:
            device = model_management.vae_device()
        self.device = device
@@ -803,6 +801,7 @@ class VAE:
            dtype = model_management.vae_dtype(self.device, self.working_dtypes)
        self.vae_dtype = dtype
        self.first_stage_model.to(self.vae_dtype)
+        model_management.archive_model_dtypes(self.first_stage_model)
        self.output_device = model_management.intermediate_device()

        mp = comfy.model_patcher.CoreModelPatcher
@@ -976,7 +975,7 @@ class VAE:
        if overlap is not None:
            args["overlap"] = overlap

-        if dims == 1:
+        if dims == 1 or self.extra_1d_channel is not None:
            args.pop("tile_y")
            output = self.decode_tiled_1d(samples, **args)
        elif dims == 2:
--- a/comfy/supported_models.py
+++ b/comfy/supported_models.py
@@ -993,7 +993,7 @@ class CosmosT2IPredict2(supported_models_base.BASE):

    memory_usage_factor = 1.0

-    supported_inference_dtypes = [torch.bfloat16, torch.float32]
+    supported_inference_dtypes = [torch.bfloat16, torch.float16, torch.float32]

    def __init__(self, unet_config):
        super().__init__(unet_config)
@@ -1023,11 +1023,7 @@ class Anima(supported_models_base.BASE):

    memory_usage_factor = 1.0

-    supported_inference_dtypes = [torch.bfloat16, torch.float32]
-
-    def __init__(self, unet_config):
-        super().__init__(unet_config)
-        self.memory_usage_factor = (unet_config.get("model_channels", 2048) / 2048) * 0.95
+    supported_inference_dtypes = [torch.bfloat16, torch.float16, torch.float32]

    def get_model(self, state_dict, prefix="", device=None):
        out = model_base.Anima(self, device=device)
@@ -1038,6 +1034,12 @@ class Anima(supported_models_base.BASE):
        detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen3_06b.transformer.".format(pref))
        return supported_models_base.ClipTarget(comfy.text_encoders.anima.AnimaTokenizer, comfy.text_encoders.anima.te(**detect))

+    def set_inference_dtype(self, dtype, manual_cast_dtype, **kwargs):
+        self.memory_usage_factor = (self.unet_config.get("model_channels", 2048) / 2048) * 0.95
+        if dtype is torch.float16:
+            self.memory_usage_factor *= 1.4
+        return super().set_inference_dtype(dtype, manual_cast_dtype, **kwargs)
+
 class CosmosI2VPredict2(CosmosT2IPredict2):
    unet_config = {
        "image_model": "cosmos_predict2",
--- a/comfy/text_encoders/ace15.py
+++ b/comfy/text_encoders/ace15.py
@@ -3,6 +3,8 @@ import comfy.text_encoders.llama
 from comfy import sd1_clip
 import torch
 import math
+from tqdm.auto import trange
+import yaml
 import comfy.utils


@@ -22,6 +24,8 @@ def sample_manual_loop_no_classes(
    audio_end_id: int = 215669,
    eos_token_id: int = 151645,
 ):
+    if ids is None:
+        return []
    device = model.execution_device

    if execution_dtype is None:
@@ -31,6 +35,7 @@ def sample_manual_loop_no_classes(
            execution_dtype = torch.float32

    embeds, attention_mask, num_tokens, embeds_info = model.process_tokens(ids, device)
+    embeds_batch = embeds.shape[0]
    for i, t in enumerate(paddings):
        attention_mask[i, :t] = 0
        attention_mask[i, t:] = 1
@@ -40,22 +45,27 @@ def sample_manual_loop_no_classes(
    generator = torch.Generator(device=device)
    generator.manual_seed(seed)
    model_config = model.transformer.model.config
+    past_kv_shape = [embeds_batch, model_config.num_key_value_heads, embeds.shape[1] + min_tokens, model_config.head_dim]

    for x in range(model_config.num_hidden_layers):
-        past_key_values.append((torch.empty([embeds.shape[0], model_config.num_key_value_heads, embeds.shape[1] + min_tokens, model_config.head_dim], device=device, dtype=execution_dtype), torch.empty([embeds.shape[0], model_config.num_key_value_heads, embeds.shape[1] + min_tokens, model_config.head_dim], device=device, dtype=execution_dtype), 0))
+        past_key_values.append((torch.empty(past_kv_shape, device=device, dtype=execution_dtype), torch.empty(past_kv_shape, device=device, dtype=execution_dtype), 0))

    progress_bar = comfy.utils.ProgressBar(max_new_tokens)

-    for step in range(max_new_tokens):
+    for step in trange(max_new_tokens, desc="LM sampling"):
        outputs = model.transformer(None, attention_mask, embeds=embeds.to(execution_dtype), num_tokens=num_tokens, intermediate_output=None, dtype=execution_dtype, embeds_info=embeds_info, past_key_values=past_key_values)
        next_token_logits = model.transformer.logits(outputs[0])[:, -1]
        past_key_values = outputs[2]

-        cond_logits = next_token_logits[0:1]
-        uncond_logits = next_token_logits[1:2]
-        cfg_logits = uncond_logits + cfg_scale * (cond_logits - uncond_logits)
+        if cfg_scale != 1.0:
+            cond_logits = next_token_logits[0:1]
+            uncond_logits = next_token_logits[1:2]
+            cfg_logits = uncond_logits + cfg_scale * (cond_logits - uncond_logits)
+        else:
+            cfg_logits = next_token_logits[0:1]

-        if eos_token_id is not None and eos_token_id < audio_start_id and min_tokens < step:
+        use_eos_score = eos_token_id is not None and eos_token_id < audio_start_id and min_tokens < step
+        if use_eos_score:
            eos_score = cfg_logits[:, eos_token_id].clone()

        remove_logit_value = torch.finfo(cfg_logits.dtype).min
@@ -63,7 +73,7 @@ def sample_manual_loop_no_classes(
        cfg_logits[:, :audio_start_id] = remove_logit_value
        cfg_logits[:, audio_end_id:] = remove_logit_value

-        if eos_token_id is not None and eos_token_id < audio_start_id and min_tokens < step:
+        if use_eos_score:
            cfg_logits[:, eos_token_id] = eos_score

        if top_k is not None and top_k > 0:
@@ -92,8 +102,8 @@ def sample_manual_loop_no_classes(
            break

        embed, _, _, _ = model.process_tokens([[token]], device)
-        embeds = embed.repeat(2, 1, 1)
-        attention_mask = torch.cat([attention_mask, torch.ones((2, 1), device=device, dtype=attention_mask.dtype)], dim=1)
+        embeds = embed.repeat(embeds_batch, 1, 1)
+        attention_mask = torch.cat([attention_mask, torch.ones((embeds_batch, 1), device=device, dtype=attention_mask.dtype)], dim=1)

        output_audio_codes.append(token - audio_start_id)
        progress_bar.update_absolute(step)
@@ -101,53 +111,136 @@ def sample_manual_loop_no_classes(
    return output_audio_codes


-def generate_audio_codes(model, positive, negative, min_tokens=1, max_tokens=1024, seed=0):
-    cfg_scale = 2.0
-
+def generate_audio_codes(model, positive, negative, min_tokens=1, max_tokens=1024, seed=0, cfg_scale=2.0, temperature=0.85, top_p=0.9, top_k=0):
    positive = [[token for token, _ in inner_list] for inner_list in positive]
-    negative = [[token for token, _ in inner_list] for inner_list in negative]
    positive = positive[0]
-    negative = negative[0]

-    neg_pad = 0
-    if len(negative) < len(positive):
-        neg_pad = (len(positive) - len(negative))
-        negative = [model.special_tokens["pad"]] * neg_pad + negative
+    if cfg_scale != 1.0:
+        negative = [[token for token, _ in inner_list] for inner_list in negative]
+        negative = negative[0]

-    pos_pad = 0
-    if len(negative) > len(positive):
-        pos_pad = (len(negative) - len(positive))
-        positive = [model.special_tokens["pad"]] * pos_pad + positive
+        neg_pad = 0
+        if len(negative) < len(positive):
+            neg_pad = (len(positive) - len(negative))
+            negative = [model.special_tokens["pad"]] * neg_pad + negative

-    paddings = [pos_pad, neg_pad]
-    return sample_manual_loop_no_classes(model, [positive, negative], paddings, cfg_scale=cfg_scale, seed=seed, min_tokens=min_tokens, max_new_tokens=max_tokens)
+        pos_pad = 0
+        if len(negative) > len(positive):
+            pos_pad = (len(negative) - len(positive))
+            positive = [model.special_tokens["pad"]] * pos_pad + positive
+
+        paddings = [pos_pad, neg_pad]
+        ids = [positive, negative]
+    else:
+        paddings = []
+        ids = [positive]
+
+    return sample_manual_loop_no_classes(model, ids, paddings, cfg_scale=cfg_scale, temperature=temperature, top_p=top_p, top_k=top_k, seed=seed, min_tokens=min_tokens, max_new_tokens=max_tokens)


 class ACE15Tokenizer(sd1_clip.SD1Tokenizer):
    def __init__(self, embedding_directory=None, tokenizer_data={}):
        super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, name="qwen3_06b", tokenizer=Qwen3Tokenizer)

+    def _metas_to_cot(self, *, return_yaml: bool = False, **kwargs) -> str:
+        user_metas = {
+            k: kwargs.pop(k)
+            for k in ("bpm", "duration", "keyscale", "timesignature")
+            if k in kwargs
+        }
+        timesignature = user_metas.get("timesignature")
+        if isinstance(timesignature, str) and timesignature.endswith("/4"):
+            user_metas["timesignature"] = timesignature[:-2]
+        user_metas = {
+            k: v if not isinstance(v, str) or not v.isdigit() else int(v)
+            for k, v in user_metas.items()
+            if v not in {"unspecified", None}
+        }
+        if len(user_metas):
+            meta_yaml = yaml.dump(user_metas, allow_unicode=True, sort_keys=True).strip()
+        else:
+            meta_yaml = ""
+        return f"<think>\n{meta_yaml}\n</think>" if not return_yaml else meta_yaml
+
+    def _metas_to_cap(self, **kwargs) -> str:
+        use_keys = ("bpm", "timesignature", "keyscale", "duration")
+        user_metas = { k: kwargs.pop(k, "N/A") for k in use_keys }
+        timesignature = user_metas.get("timesignature")
+        if isinstance(timesignature, str) and timesignature.endswith("/4"):
+            user_metas["timesignature"] = timesignature[:-2]
+        duration = user_metas["duration"]
+        if duration == "N/A":
+            user_metas["duration"] = "30 seconds"
+        elif isinstance(duration, (str, int, float)):
+            user_metas["duration"] = f"{math.ceil(float(duration))} seconds"
+        else:
+            raise TypeError("Unexpected type for duration key, must be str, int or float")
+        return "\n".join(f"- {k}: {user_metas[k]}" for k in use_keys)
+
    def tokenize_with_weights(self, text, return_word_ids=False, **kwargs):
-        out = {}
+        text = text.strip()
+        text_negative = kwargs.get("caption_negative", text).strip()
        lyrics = kwargs.get("lyrics", "")
-        bpm = kwargs.get("bpm", 120)
+        lyrics_negative = kwargs.get("lyrics_negative", lyrics)
        duration = kwargs.get("duration", 120)
-        keyscale = kwargs.get("keyscale", "C major")
-        timesignature = kwargs.get("timesignature", 2)
-        language = kwargs.get("language", "en")
+        if isinstance(duration, str):
+            duration = float(duration.split(None, 1)[0])
+        language = kwargs.get("language")
        seed = kwargs.get("seed", 0)

+        generate_audio_codes = kwargs.get("generate_audio_codes", True)
+        cfg_scale = kwargs.get("cfg_scale", 2.0)
+        temperature = kwargs.get("temperature", 0.85)
+        top_p = kwargs.get("top_p", 0.9)
+        top_k = kwargs.get("top_k", 0.0)
+
        duration = math.ceil(duration)
-        meta_lm = 'bpm: {}\nduration: {}\nkeyscale: {}\ntimesignature: {}'.format(bpm, duration, keyscale, timesignature)
-        lm_template = "<|im_start|>system\n# Instruction\nGenerate audio semantic tokens based on the given conditions:\n\n<|im_end|>\n<|im_start|>user\n# Caption\n{}\n{}\n<|im_end|>\n<|im_start|>assistant\n<think>\n{}\n</think>\n\n<|im_end|>\n"
+        kwargs["duration"] = duration
+        tokens_duration = duration * 5
+        min_tokens = int(kwargs.get("min_tokens", tokens_duration))
+        max_tokens = int(kwargs.get("max_tokens", tokens_duration))

-        meta_cap = '- bpm: {}\n- timesignature: {}\n- keyscale: {}\n- duration: {}\n'.format(bpm, timesignature, keyscale, duration)
-        out["lm_prompt"] = self.qwen3_06b.tokenize_with_weights(lm_template.format(text, lyrics, meta_lm), disable_weights=True)
-        out["lm_prompt_negative"] = self.qwen3_06b.tokenize_with_weights(lm_template.format(text, lyrics, ""), disable_weights=True)
+        metas_negative = {
+            k.rsplit("_", 1)[0]: kwargs.pop(k)
+            for k in ("bpm_negative", "duration_negative", "keyscale_negative", "timesignature_negative", "language_negative", "caption_negative")
+            if k in kwargs
+        }
+        if not kwargs.get("use_negative_caption"):
+            _ = metas_negative.pop("caption", None)

-        out["lyrics"] = self.qwen3_06b.tokenize_with_weights("# Languages\n{}\n\n# Lyric{}<|endoftext|><|endoftext|>".format(language, lyrics), return_word_ids, disable_weights=True, **kwargs)
-        out["qwen3_06b"] = self.qwen3_06b.tokenize_with_weights("# Instruction\nGenerate audio semantic tokens based on the given conditions:\n\n# Caption\n{}# Metas\n{}<|endoftext|>\n<|endoftext|>".format(text, meta_cap), return_word_ids, **kwargs)
-        out["lm_metadata"] = {"min_tokens": duration * 5, "seed": seed}
+        cot_text = self._metas_to_cot(caption=text, **kwargs)
+        cot_text_negative = "<think>\n\n</think>" if not metas_negative else self._metas_to_cot(**metas_negative)
+        meta_cap = self._metas_to_cap(**kwargs)
+
+        lm_template = "<|im_start|>system\n# Instruction\nGenerate audio semantic tokens based on the given conditions:\n\n<|im_end|>\n<|im_start|>user\n# Caption\n{}\n\n# Lyric\n{}\n<|im_end|>\n<|im_start|>assistant\n{}\n\n<|im_end|>\n"
+        lyrics_template = "# Languages\n{}\n\n# Lyric\n{}<|endoftext|><|endoftext|>"
+        qwen3_06b_template = "# Instruction\nGenerate audio semantic tokens based on the given conditions:\n\n# Caption\n{}\n\n# Metas\n{}\n<|endoftext|>\n<|endoftext|>"
+
+        llm_prompts = {
+            "lm_prompt": lm_template.format(text, lyrics.strip(), cot_text),
+            "lm_prompt_negative": lm_template.format(text_negative, lyrics_negative.strip(), cot_text_negative),
+            "lyrics": lyrics_template.format(language if language is not None else "", lyrics),
+            "qwen3_06b": qwen3_06b_template.format(text, meta_cap),
+        }
+
+        out = {
+            prompt_key: self.qwen3_06b.tokenize_with_weights(
+                prompt,
+                prompt_key == "qwen3_06b" and return_word_ids,
+                disable_weights = True,
+                **kwargs,
+            )
+            for prompt_key, prompt in llm_prompts.items()
+        }
+        out["lm_metadata"] = {"min_tokens": min_tokens,
+                              "max_tokens": max_tokens,
+                              "seed": seed,
+                              "generate_audio_codes": generate_audio_codes,
+                              "cfg_scale": cfg_scale,
+                              "temperature": temperature,
+                              "top_p": top_p,
+                              "top_k": top_k,
+                              }
        return out


@@ -203,10 +296,14 @@ class ACE15TEModel(torch.nn.Module):
        self.qwen3_06b.set_clip_options({"layer": [0]})
        lyrics_embeds, _, extra_l = self.qwen3_06b.encode_token_weights(token_weight_pairs_lyrics)

-        lm_metadata = token_weight_pairs["lm_metadata"]
-        audio_codes = generate_audio_codes(getattr(self, self.lm_model, self.qwen3_06b), token_weight_pairs["lm_prompt"], token_weight_pairs["lm_prompt_negative"], min_tokens=lm_metadata["min_tokens"], max_tokens=lm_metadata["min_tokens"], seed=lm_metadata["seed"])
+        out = {"conditioning_lyrics": lyrics_embeds[:, 0]}

-        return base_out, None, {"conditioning_lyrics": lyrics_embeds[:, 0], "audio_codes": [audio_codes]}
+        lm_metadata = token_weight_pairs["lm_metadata"]
+        if lm_metadata["generate_audio_codes"]:
+            audio_codes = generate_audio_codes(getattr(self, self.lm_model, self.qwen3_06b), token_weight_pairs["lm_prompt"], token_weight_pairs["lm_prompt_negative"], min_tokens=lm_metadata["min_tokens"], max_tokens=lm_metadata["max_tokens"], seed=lm_metadata["seed"], cfg_scale=lm_metadata["cfg_scale"], temperature=lm_metadata["temperature"], top_p=lm_metadata["top_p"], top_k=lm_metadata["top_k"])
+            out["audio_codes"] = [audio_codes]
+
+        return base_out, None, out

    def set_clip_options(self, options):
        self.qwen3_06b.set_clip_options(options)
--- a/comfy/text_encoders/anima.py
+++ b/comfy/text_encoders/anima.py
@@ -23,7 +23,7 @@ class AnimaTokenizer:
    def tokenize_with_weights(self, text:str, return_word_ids=False, **kwargs):
        out = {}
        qwen_ids = self.qwen3_06b.tokenize_with_weights(text, return_word_ids, **kwargs)
-        out["qwen3_06b"] = [[(token, 1.0) for token, _ in inner_list] for inner_list in qwen_ids]  # Set weights to 1.0
+        out["qwen3_06b"] = [[(k[0], 1.0, k[2]) if return_word_ids else (k[0], 1.0) for k in inner_list] for inner_list in qwen_ids]  # Set weights to 1.0
        out["t5xxl"] = self.t5xxl.tokenize_with_weights(text, return_word_ids, **kwargs)
        return out

--- a/comfy/text_encoders/llama.py
+++ b/comfy/text_encoders/llama.py
@@ -651,10 +651,10 @@ class Llama2_(nn.Module):
        mask = None
        if attention_mask is not None:
            mask = 1.0 - attention_mask.to(x.dtype).reshape((attention_mask.shape[0], 1, -1, attention_mask.shape[-1])).expand(attention_mask.shape[0], 1, seq_len, attention_mask.shape[-1])
-            mask = mask.masked_fill(mask.to(torch.bool), torch.finfo(x.dtype).min)
+            mask = mask.masked_fill(mask.to(torch.bool), torch.finfo(x.dtype).min / 4)

        if seq_len > 1:
-            causal_mask = torch.empty(past_len + seq_len, past_len + seq_len, dtype=x.dtype, device=x.device).fill_(torch.finfo(x.dtype).min).triu_(1)
+            causal_mask = torch.empty(past_len + seq_len, past_len + seq_len, dtype=x.dtype, device=x.device).fill_(torch.finfo(x.dtype).min / 4).triu_(1)
            if mask is not None:
                mask += causal_mask
            else:
--- a/comfy/text_encoders/lt.py
+++ b/comfy/text_encoders/lt.py
@@ -25,7 +25,7 @@ def ltxv_te(*args, **kwargs):
 class Gemma3_12BTokenizer(sd1_clip.SDTokenizer):
    def __init__(self, embedding_directory=None, tokenizer_data={}):
        tokenizer = tokenizer_data.get("spiece_model", None)
-        super().__init__(tokenizer, pad_with_end=False, embedding_size=3840, embedding_key='gemma3_12b', tokenizer_class=SPieceTokenizer, has_end_token=False, pad_to_max_length=False, max_length=99999999, min_length=1, tokenizer_args={"add_bos": True, "add_eos": False}, tokenizer_data=tokenizer_data)
+        super().__init__(tokenizer, pad_with_end=False, embedding_size=3840, embedding_key='gemma3_12b', tokenizer_class=SPieceTokenizer, has_end_token=False, pad_to_max_length=False, max_length=99999999, min_length=1, disable_weights=True, tokenizer_args={"add_bos": True, "add_eos": False}, tokenizer_data=tokenizer_data)

    def state_dict(self):
        return {"spiece_model": self.tokenizer.serialize_model()}
--- a/comfy/utils.py
+++ b/comfy/utils.py
@@ -82,14 +82,12 @@ _TYPES = {
 def load_safetensors(ckpt):
    f = open(ckpt, "rb")
    mapping = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
+    mv = memoryview(mapping)

    header_size = struct.unpack("<Q", mapping[:8])[0]
    header = json.loads(mapping[8:8+header_size].decode("utf-8"))

-    with warnings.catch_warnings():
-        #We are working with read-only RAM by design
-        warnings.filterwarnings("ignore", message="The given buffer is not writable")
-        data_area = torch.frombuffer(mapping, dtype=torch.uint8)[8 + header_size:]
+    mv = mv[8 + header_size:]

    sd = {}
    for name, info in header.items():
@@ -97,7 +95,13 @@ def load_safetensors(ckpt):
            continue

        start, end = info["data_offsets"]
-        sd[name] = data_area[start:end].view(_TYPES[info["dtype"]]).view(info["shape"])
+        if start == end:
+            sd[name] = torch.empty(info["shape"], dtype =_TYPES[info["dtype"]])
+        else:
+            with warnings.catch_warnings():
+                #We are working with read-only RAM by design
+                warnings.filterwarnings("ignore", message="The given buffer is not writable")
+                sd[name] = torch.frombuffer(mv[start:end], dtype=_TYPES[info["dtype"]]).view(info["shape"])

    return sd, header.get("__metadata__", {}),

@@ -1372,3 +1376,21 @@ def string_to_seed(data):
            else:
                crc >>= 1
    return crc ^ 0xFFFFFFFF
+
+def deepcopy_list_dict(obj, memo=None):
+    """Copy nested dicts/lists (shared refs and cycles preserved); other objects are returned as-is."""
+    if not isinstance(obj, (dict, list)):
+        return obj
+    if memo is None:
+        memo = {}
+    if id(obj) in memo:
+        return memo[id(obj)]
+
+    if isinstance(obj, dict):
+        res = memo[id(obj)] = {}  # register the copy before recursing so a cycle resolves to it
+        for k, v in obj.items():
+            res[deepcopy_list_dict(k, memo)] = deepcopy_list_dict(v, memo)
+    else:
+        res = memo[id(obj)] = []  # same early registration for lists
+        res.extend(deepcopy_list_dict(i, memo) for i in obj)
+    return res
--- a/comfy/weight_adapter/bypass.py
+++ b/comfy/weight_adapter/bypass.py
@@ -21,6 +21,7 @@ from typing import Optional, Union
 import torch
 import torch.nn as nn

+import comfy.model_management
 from .base import WeightAdapterBase, WeightAdapterTrainBase
 from comfy.patcher_extension import PatcherInjection

@@ -181,18 +182,21 @@ class BypassForwardHook:
            )
            return  # Already injected

-        # Move adapter weights to module's device to avoid CPU-GPU transfer on every forward
-        device = None
+        # Move adapter weights to compute device (GPU)
+        # Use get_torch_device() instead of module.weight.device because
+        # with offloading, module weights may be on CPU while compute happens on GPU
+        device = comfy.model_management.get_torch_device()
+
+        # Get dtype from module weight if available
        dtype = None
        if hasattr(self.module, "weight") and self.module.weight is not None:
-            device = self.module.weight.device
            dtype = self.module.weight.dtype
-        elif hasattr(self.module, "W_q"):  # Quantized layers might use different attr
-            device = self.module.W_q.device
-            dtype = self.module.W_q.dtype

-        if device is not None:
-            self._move_adapter_weights_to_device(device, dtype)
+        # Only use dtype if it's a standard float type, not quantized
+        if dtype is not None and dtype not in (torch.float32, torch.float16, torch.bfloat16):
+            dtype = None
+
+        self._move_adapter_weights_to_device(device, dtype)

        self.original_forward = self.module.forward
        self.module.forward = self._bypass_forward
--- a/comfy_api/latest/_input/video_types.py
+++ b/comfy_api/latest/_input/video_types.py
@@ -34,6 +34,21 @@ class VideoInput(ABC):
        """
        pass

+    @abstractmethod
+    def as_trimmed(
+        self,
+        start_time: float | None = None,
+        duration: float | None = None,
+        strict_duration: bool = False,
+    ) -> VideoInput | None:
+        """
+        Create a new VideoInput which is trimmed to have the corresponding start_time and duration
+
+        Returns:
+            A new VideoInput, or None if the result would have negative duration
+        """
+        pass
+
    def get_stream_source(self) -> Union[str, io.BytesIO]:
        """
        Get a streamable source for the video. This allows processing without
--- a/comfy_api/latest/_input_impl/video_types.py
+++ b/comfy_api/latest/_input_impl/video_types.py
@@ -6,6 +6,7 @@ from typing import Optional
 from .._input import AudioInput, VideoInput
 import av
 import io
+import itertools
 import json
 import numpy as np
 import math
@@ -29,7 +30,6 @@ def container_to_output_format(container_format: str | None) -> str | None:
    formats = container_format.split(",")
    return formats[0]

-
 def get_open_write_kwargs(
    dest: str | io.BytesIO, container_format: str, to_format: str | None
 ) -> dict:
@@ -57,12 +57,14 @@ class VideoFromFile(VideoInput):
    Class representing video input from a file.
    """

-    def __init__(self, file: str | io.BytesIO):
+    def __init__(self, file: str | io.BytesIO, *, start_time: float=0, duration: float=0):
        """
        Initialize the VideoFromFile object based off of either a path on disk or a BytesIO object
        containing the file contents.
        """
        self.__file = file
+        self.__start_time = start_time
+        self.__duration = duration

    def get_stream_source(self) -> str | io.BytesIO:
        """
@@ -96,6 +98,16 @@ class VideoFromFile(VideoInput):
        Returns:
            Duration in seconds
        """
+        raw_duration = self._get_raw_duration()
+        if self.__start_time < 0:
+            duration_from_start = min(raw_duration, -self.__start_time)
+        else:
+            duration_from_start = raw_duration - self.__start_time
+        if self.__duration:
+            return min(self.__duration, duration_from_start)
+        return duration_from_start
+
+    def _get_raw_duration(self) -> float:
        if isinstance(self.__file, io.BytesIO):
            self.__file.seek(0)
        with av.open(self.__file, mode="r") as container:
@@ -113,9 +125,13 @@ class VideoFromFile(VideoInput):
            if video_stream and video_stream.average_rate:
                frame_count = 0
                container.seek(0)
-                for packet in container.demux(video_stream):
-                    for _ in packet.decode():
-                        frame_count += 1
+                frame_iterator = (
+                    container.decode(video_stream)
+                    if video_stream.codec.capabilities & 0x100
+                    else container.demux(video_stream)
+                )
+                for packet in frame_iterator:
+                    frame_count += 1
                if frame_count > 0:
                    return float(frame_count / video_stream.average_rate)

@@ -131,36 +147,54 @@ class VideoFromFile(VideoInput):

        with av.open(self.__file, mode="r") as container:
            video_stream = self._get_first_video_stream(container)
-            # 1. Prefer the frames field if available
-            if video_stream.frames and video_stream.frames > 0:
+            # 1. Prefer the frames field if available and usable
+            if (
+                video_stream.frames
+                and video_stream.frames > 0
+                and not self.__start_time
+                and not self.__duration
+            ):
                return int(video_stream.frames)

            # 2. Try to estimate from duration and average_rate using only metadata
-            if container.duration is not None and video_stream.average_rate:
-                duration_seconds = float(container.duration / av.time_base)
-                estimated_frames = int(round(duration_seconds * float(video_stream.average_rate)))
-                if estimated_frames > 0:
-                    return estimated_frames
-
            if (
                getattr(video_stream, "duration", None) is not None
                and getattr(video_stream, "time_base", None) is not None
                and video_stream.average_rate
            ):
-                duration_seconds = float(video_stream.duration * video_stream.time_base)
+                raw_duration = float(video_stream.duration * video_stream.time_base)
+                if self.__start_time < 0:
+                    duration_from_start = min(raw_duration, -self.__start_time)
+                else:
+                    duration_from_start = raw_duration - self.__start_time
+                duration_seconds = min(self.__duration, duration_from_start) if self.__duration else duration_from_start  # duration 0 means "no limit", as in get_duration
                estimated_frames = int(round(duration_seconds * float(video_stream.average_rate)))
                if estimated_frames > 0:
                    return estimated_frames

            # 3. Last resort: decode frames and count them (streaming)
-            frame_count = 0
-            container.seek(0)
-            for packet in container.demux(video_stream):
-                for _ in packet.decode():
-                    frame_count += 1
-
-            if frame_count == 0:
-                raise ValueError(f"Could not determine frame count for file '{self.__file}'")
+            if self.__start_time < 0:
+                start_time = max(self._get_raw_duration() + self.__start_time, 0)
+            else:
+                start_time = self.__start_time
+            frame_count = 1
+            start_pts = int(start_time / video_stream.time_base)
+            end_pts = int((start_time + self.__duration) / video_stream.time_base)
+            container.seek(start_pts, stream=video_stream)
+            frame_iterator = (
+                container.decode(video_stream)
+                if video_stream.codec.capabilities & 0x100
+                else container.demux(video_stream)
+            )
+            for frame in frame_iterator:
+                if frame.pts >= start_pts:
+                    break
+            else:
+                raise ValueError(f"Could not determine frame count for file '{self.__file}'\nNo frames exist for start_time {self.__start_time}")
+            for frame in frame_iterator:
+                if self.__duration and frame.pts >= end_pts:  # duration 0 means "no limit": count to end of stream
+                    break
+                frame_count += 1
            return frame_count

    def get_frame_rate(self) -> Fraction:
@@ -199,9 +233,21 @@ class VideoFromFile(VideoInput):
            return container.format.name

    def get_components_internal(self, container: InputContainer) -> VideoComponents:
+        video_stream = self._get_first_video_stream(container)
+        if self.__start_time < 0:
+            start_time = max(self._get_raw_duration() + self.__start_time, 0)
+        else:
+            start_time = self.__start_time
        # Get video frames
        frames = []
-        for frame in container.decode(video=0):
+        start_pts = int(start_time / video_stream.time_base)
+        end_pts = int((start_time + self.__duration) / video_stream.time_base)
+        container.seek(start_pts, stream=video_stream)
+        for frame in container.decode(video_stream):
+            if frame.pts < start_pts:
+                continue
+            if self.__duration and frame.pts >= end_pts:
+                break
            img = frame.to_ndarray(format='rgb24')  # shape: (H, W, 3)
            img = torch.from_numpy(img) / 255.0  # shape: (H, W, 3)
            frames.append(img)
@@ -209,31 +255,44 @@ class VideoFromFile(VideoInput):
        images = torch.stack(frames) if len(frames) > 0 else torch.zeros(0, 3, 0, 0)

        # Get frame rate
-        video_stream = next(s for s in container.streams if s.type == 'video')
-        frame_rate = Fraction(video_stream.average_rate) if video_stream and video_stream.average_rate else Fraction(1)
+        frame_rate = Fraction(video_stream.average_rate) if video_stream.average_rate else Fraction(1)

        # Get audio if available
        audio = None
-        try:
-            container.seek(0)  # Reset the container to the beginning
-            for stream in container.streams:
-                if stream.type != 'audio':
-                    continue
-                assert isinstance(stream, av.AudioStream)
-                audio_frames = []
-                for packet in container.demux(stream):
-                    for frame in packet.decode():
-                        assert isinstance(frame, av.AudioFrame)
-                        audio_frames.append(frame.to_ndarray())  # shape: (channels, samples)
-                if len(audio_frames) > 0:
-                    audio_data = np.concatenate(audio_frames, axis=1)  # shape: (channels, total_samples)
-                    audio_tensor = torch.from_numpy(audio_data).unsqueeze(0)  # shape: (1, channels, total_samples)
-                    audio = AudioInput({
-                        "waveform": audio_tensor,
-                        "sample_rate": int(stream.sample_rate) if stream.sample_rate else 1,
-                    })
-        except StopIteration:
-            pass  # No audio stream
+        container.seek(start_pts, stream=video_stream)
+        # Use last stream for consistency
+        if len(container.streams.audio):
+            audio_stream = container.streams.audio[-1]
+            audio_frames = []
+            resample = av.audio.resampler.AudioResampler(format='fltp').resample
+            frames = itertools.chain.from_iterable(
+                map(resample, container.decode(audio_stream))
+            )
+
+            # Skip audio frames that end before start_time; trim the head of the first one kept.
+            for frame in frames:
+                offset_seconds = start_time - frame.pts * audio_stream.time_base
+                # Clamp at 0: a frame starting after start_time would otherwise be sliced from its end.
+                to_skip = max(int(offset_seconds * audio_stream.sample_rate), 0)
+                if to_skip < frame.samples:
+                    audio_frames.append(frame.to_ndarray()[..., to_skip:])
+                    break
+
+            for frame in frames:
+                # A duration of 0 means "no limit", matching the video loop and the final trim below.
+                if self.__duration and frame.time > start_time + self.__duration:
+                    break
+                audio_frames.append(frame.to_ndarray())  # shape: (channels, samples)
+            if len(audio_frames) > 0:
+                audio_data = np.concatenate(audio_frames, axis=1)  # shape: (channels, total_samples)
+                if self.__duration:
+                    audio_data = audio_data[..., :int(self.__duration * audio_stream.sample_rate)]
+
+                audio_tensor = torch.from_numpy(audio_data).unsqueeze(0)  # shape: (1, channels, total_samples)
+                audio = AudioInput({
+                    "waveform": audio_tensor,
+                    "sample_rate": int(audio_stream.sample_rate) if audio_stream.sample_rate else 1,
+                })

        metadata = container.metadata
        return VideoComponents(images=images, audio=audio, frame_rate=frame_rate, metadata=metadata)
@@ -250,7 +309,7 @@ class VideoFromFile(VideoInput):
        path: str | io.BytesIO,
        format: VideoContainer = VideoContainer.AUTO,
        codec: VideoCodec = VideoCodec.AUTO,
-        metadata: Optional[dict] = None
+        metadata: Optional[dict] = None,
    ):
        if isinstance(self.__file, io.BytesIO):
            self.__file.seek(0)  # Reset the BytesIO object to the beginning
@@ -262,15 +321,14 @@ class VideoFromFile(VideoInput):
                reuse_streams = False
            if codec != VideoCodec.AUTO and codec != video_encoding and video_encoding is not None:
                reuse_streams = False
+            if self.__start_time or self.__duration:
+                reuse_streams = False

            if not reuse_streams:
                components = self.get_components_internal(container)
                video = VideoFromComponents(components)
                return video.save_to(
-                    path,
-                    format=format,
-                    codec=codec,
-                    metadata=metadata
+                    path, format=format, codec=codec, metadata=metadata
                )

            streams = container.streams
@@ -304,10 +362,21 @@ class VideoFromFile(VideoInput):
                        output_container.mux(packet)

    def _get_first_video_stream(self, container: InputContainer):
-        video_stream = next((s for s in container.streams if s.type == "video"), None)
-        if video_stream is None:
-            raise ValueError(f"No video stream found in file '{self.__file}'")
-        return video_stream
+        if len(container.streams.video):
+            return container.streams.video[0]
+        raise ValueError(f"No video stream found in file '{self.__file}'")
+
+    def as_trimmed(
+        self, start_time: float = 0, duration: float = 0, strict_duration: bool = True
+    ) -> VideoInput | None:
+        """Return a VideoFromFile over the same source, offset by start_time and limited to duration."""
+        source = self.get_stream_source()
+        offset = start_time + self.__start_time  # the offset is added to this instance's own start
+        candidate = VideoFromFile(source, start_time=offset, duration=duration)
+        # With strict_duration set, a result shorter than the requested duration is reported as None.
+        if candidate.get_duration() < duration and strict_duration:
+            return None
+        return candidate


 class VideoFromComponents(VideoInput):
@@ -322,7 +391,7 @@ class VideoFromComponents(VideoInput):
        return VideoComponents(
            images=self.__components.images,
            audio=self.__components.audio,
-            frame_rate=self.__components.frame_rate
+            frame_rate=self.__components.frame_rate,
        )

    def save_to(
@@ -330,7 +399,7 @@ class VideoFromComponents(VideoInput):
        path: str,
        format: VideoContainer = VideoContainer.AUTO,
        codec: VideoCodec = VideoCodec.AUTO,
-        metadata: Optional[dict] = None
+        metadata: Optional[dict] = None,
    ):
        if format != VideoContainer.AUTO and format != VideoContainer.MP4:
            raise ValueError("Only MP4 format is supported for now")
@@ -357,7 +426,10 @@ class VideoFromComponents(VideoInput):
            audio_stream: Optional[av.AudioStream] = None
            if self.__components.audio:
                audio_sample_rate = int(self.__components.audio['sample_rate'])
-                audio_stream = output.add_stream('aac', rate=audio_sample_rate)
+                waveform = self.__components.audio['waveform']
+                waveform = waveform[0, :, :math.ceil((audio_sample_rate / frame_rate) * self.__components.images.shape[0])]
+                layout = {1: 'mono', 2: 'stereo', 6: '5.1'}.get(waveform.shape[0], 'stereo')
+                audio_stream = output.add_stream('aac', rate=audio_sample_rate, layout=layout)

            # Encode video
            for i, frame in enumerate(self.__components.images):
@@ -372,12 +444,21 @@ class VideoFromComponents(VideoInput):
            output.mux(packet)

            if audio_stream and self.__components.audio:
-                waveform = self.__components.audio['waveform']
-                waveform = waveform[:, :, :math.ceil((audio_sample_rate / frame_rate) * self.__components.images.shape[0])]
-                frame = av.AudioFrame.from_ndarray(waveform.movedim(2, 1).reshape(1, -1).float().cpu().numpy(), format='flt', layout='mono' if waveform.shape[1] == 1 else 'stereo')
+                frame = av.AudioFrame.from_ndarray(waveform.float().cpu().numpy(), format='fltp', layout=layout)
                frame.sample_rate = audio_sample_rate
                frame.pts = 0
                output.mux(audio_stream.encode(frame))

                # Flush encoder
                output.mux(audio_stream.encode(None))
+
+    def as_trimmed(
+        self, start_time: float | None = None, duration: float | None = None, strict_duration: bool = True
+    ) -> VideoInput | None:
+        """Return a VideoFromFile trimmed to the window, or None if this video is shorter than it."""
+        # The declared defaults are None, which would raise TypeError in the sum below; treat them as 0.
+        start_time, duration = start_time or 0, duration or 0
+        if self.get_duration() < start_time + duration:  # NOTE(review): strict_duration is not consulted
+            return None
+        # TODO: Consider tracking duration and trimming at time of save?
+        return VideoFromFile(self.get_stream_source(), start_time=start_time, duration=duration)
--- a/comfy_api_nodes/apis/init.py
+++ b/comfy_api_nodes/apis/init.py
@@ -1197,12 +1197,6 @@ class KlingImageGenImageReferenceType(str, Enum):
    face = 'face'


-class KlingImageGenModelName(str, Enum):
-    kling_v1 = 'kling-v1'
-    kling_v1_5 = 'kling-v1-5'
-    kling_v2 = 'kling-v2'
-
-
 class KlingImageGenerationsRequest(BaseModel):
    aspect_ratio: Optional[KlingImageGenAspectRatio] = '16:9'
    callback_url: Optional[AnyUrl] = Field(
@@ -1218,7 +1212,7 @@ class KlingImageGenerationsRequest(BaseModel):
        0.5, description='Reference intensity for user-uploaded images', ge=0.0, le=1.0
    )
    image_reference: Optional[KlingImageGenImageReferenceType] = None
-    model_name: Optional[KlingImageGenModelName] = 'kling-v1'
+    model_name: str = Field(...)
    n: Optional[int] = Field(1, description='Number of generated images', ge=1, le=9)
    negative_prompt: Optional[str] = Field(
        None, description='Negative text prompt', max_length=200
--- a/comfy_api_nodes/apis/kling.py
+++ b/comfy_api_nodes/apis/kling.py
@@ -1,12 +1,22 @@
 from pydantic import BaseModel, Field


+class MultiPromptEntry(BaseModel):  # element type of the `multi_prompt` lists on the request models
+    index: int = Field(...)  # required; NOTE(review): presumably the shot's position — confirm
+    prompt: str = Field(...)  # required
+    duration: str = Field(...)  # required; typed as str, like the request-level `duration` fields
+
+
 class OmniProText2VideoRequest(BaseModel):
    model_name: str = Field(..., description="kling-video-o1")
    aspect_ratio: str = Field(..., description="'16:9', '9:16' or '1:1'")
    duration: str = Field(..., description="'5' or '10'")
    prompt: str = Field(...)
    mode: str = Field("pro")
+    multi_shot: bool | None = Field(None)
+    multi_prompt: list[MultiPromptEntry] | None = Field(None)
+    shot_type: str | None = Field(None)
+    sound: str = Field(..., description="'on' or 'off'")


 class OmniParamImage(BaseModel):
@@ -26,6 +36,10 @@ class OmniProFirstLastFrameRequest(BaseModel):
    duration: str = Field(..., description="'5' or '10'")
    prompt: str = Field(...)
    mode: str = Field("pro")
+    sound: str | None = Field(None, description="'on' or 'off'")
+    multi_shot: bool | None = Field(None)
+    multi_prompt: list[MultiPromptEntry] | None = Field(None)
+    shot_type: str | None = Field(None)


 class OmniProReferences2VideoRequest(BaseModel):
@@ -38,6 +52,10 @@ class OmniProReferences2VideoRequest(BaseModel):
    duration: str | None = Field(..., description="From 3 to 10.")
    prompt: str = Field(...)
    mode: str = Field("pro")
+    sound: str | None = Field(None, description="'on' or 'off'")
+    multi_shot: bool | None = Field(None)
+    multi_prompt: list[MultiPromptEntry] | None = Field(None)
+    shot_type: str | None = Field(None)


 class TaskStatusVideoResult(BaseModel):
@@ -54,6 +72,7 @@ class TaskStatusImageResult(BaseModel):
 class TaskStatusResults(BaseModel):
    videos: list[TaskStatusVideoResult] | None = Field(None)
    images: list[TaskStatusImageResult] | None = Field(None)
+    series_images: list[TaskStatusImageResult] | None = Field(None)


 class TaskStatusResponseData(BaseModel):
@@ -77,31 +96,42 @@ class OmniImageParamImage(BaseModel):


 class OmniProImageRequest(BaseModel):
-    model_name: str = Field(..., description="kling-image-o1")
-    resolution: str = Field(..., description="'1k' or '2k'")
+    model_name: str = Field(...)
+    resolution: str = Field(...)
    aspect_ratio: str | None = Field(...)
    prompt: str = Field(...)
    mode: str = Field("pro")
    n: int | None = Field(1, le=9)
    image_list: list[OmniImageParamImage] | None = Field(..., max_length=10)
+    result_type: str | None = Field(None, description="Set to 'series' for series generation")
+    series_amount: int | None = Field(None, ge=2, le=9, description="Number of images in a series")


 class TextToVideoWithAudioRequest(BaseModel):
-    model_name: str = Field(..., description="kling-v2-6")
+    model_name: str = Field(...)
    aspect_ratio: str = Field(..., description="'16:9', '9:16' or '1:1'")
-    duration: str = Field(..., description="'5' or '10'")
-    prompt: str = Field(...)
+    duration: str = Field(...)
+    prompt: str | None = Field(...)
+    negative_prompt: str | None = Field(None)
    mode: str = Field("pro")
    sound: str = Field(..., description="'on' or 'off'")
+    multi_shot: bool | None = Field(None)
+    multi_prompt: list[MultiPromptEntry] | None = Field(None)
+    shot_type: str | None = Field(None)


 class ImageToVideoWithAudioRequest(BaseModel):
-    model_name: str = Field(..., description="kling-v2-6")
+    model_name: str = Field(...)
    image: str = Field(...)
-    duration: str = Field(..., description="'5' or '10'")
-    prompt: str = Field(...)
+    image_tail: str | None = Field(None)
+    duration: str = Field(...)
+    prompt: str | None = Field(...)
+    negative_prompt: str | None = Field(None)
    mode: str = Field("pro")
    sound: str = Field(..., description="'on' or 'off'")
+    multi_shot: bool | None = Field(None)
+    multi_prompt: list[MultiPromptEntry] | None = Field(None)
+    shot_type: str | None = Field(None)


 class MotionControlRequest(BaseModel):
--- a/comfy_api_nodes/nodes_kling.py
+++ b/comfy_api_nodes/nodes_kling.py
--- a/comfy_api_nodes/nodes_moonvalley.py
+++ b/comfy_api_nodes/nodes_moonvalley.py
@@ -219,8 +219,8 @@ class MoonvalleyImg2VideoNode(IO.ComfyNode):
                ),
                IO.Int.Input(
                    "steps",
-                    default=33,
-                    min=1,
+                    default=80,
+                    min=75,  # steps should be greater or equal to cooldown_steps(75) + warmup_steps(0)
                    max=100,
                    step=1,
                    tooltip="Number of denoising steps",
@@ -340,8 +340,8 @@ class MoonvalleyVideo2VideoNode(IO.ComfyNode):
                ),
                IO.Int.Input(
                    "steps",
-                    default=33,
-                    min=1,
+                    default=60,
+                    min=60,  # steps should be greater or equal to cooldown_steps(36) + warmup_steps(24)
                    max=100,
                    step=1,
                    display_mode=IO.NumberDisplay.number,
@@ -370,7 +370,7 @@ class MoonvalleyVideo2VideoNode(IO.ComfyNode):
        video: Input.Video | None = None,
        control_type: str = "Motion Transfer",
        motion_intensity: int | None = 100,
-        steps=33,
+        steps=60,
        prompt_adherence=4.5,
    ) -> IO.NodeOutput:
        validated_video = validate_video_to_video_input(video)
@@ -465,8 +465,8 @@ class MoonvalleyTxt2VideoNode(IO.ComfyNode):
                ),
                IO.Int.Input(
                    "steps",
-                    default=33,
-                    min=1,
+                    default=80,
+                    min=75,  # steps should be greater or equal to cooldown_steps(75) + warmup_steps(0)
                    max=100,
                    step=1,
                    tooltip="Inference steps",
--- a/comfy_execution/jobs.py
+++ b/comfy_execution/jobs.py
@@ -20,10 +20,60 @@ class JobStatus:


 # Media types that can be previewed in the frontend
-PREVIEWABLE_MEDIA_TYPES = frozenset({'images', 'video', 'audio'})
+PREVIEWABLE_MEDIA_TYPES = frozenset({'images', 'video', 'audio', '3d'})

 # 3D file extensions for preview fallback (no dedicated media_type exists)
-THREE_D_EXTENSIONS = frozenset({'.obj', '.fbx', '.gltf', '.glb'})
+THREE_D_EXTENSIONS = frozenset({'.obj', '.fbx', '.gltf', '.glb', '.usdz'})
+
+
+def has_3d_extension(filename: str) -> bool:
+    """Return True if *filename* ends, case-insensitively, with one of THREE_D_EXTENSIONS."""
+    return filename.lower().endswith(tuple(THREE_D_EXTENSIONS))
+
+
+def normalize_output_item(item):
+    """Map one raw output-list entry to its jobs API form.
+
+    A dict is returned unchanged. A string for which has_3d_extension()
+    is true becomes a file dict with type 'output', an empty subfolder
+    and mediaType '3d'. Anything else, including None and other strings,
+    produces None.
+    """
+    if isinstance(item, dict):
+        return item
+    is_3d_file = isinstance(item, str) and has_3d_extension(item)
+    if not is_3d_file:
+        return None
+    # Bare 3D filenames carry no metadata, so the output-file fields are filled with fixed values.
+    return {'filename': item, 'type': 'output', 'subfolder': '', 'mediaType': '3d'}
+
+
+def normalize_outputs(outputs: dict) -> dict:
+    """Build a normalized copy of raw node outputs for the jobs API.
+
+    For each node's list-valued entries (other than 'animated'), None items
+    are dropped and every remaining item is replaced by its
+    normalize_output_item() form when that is not None, or kept unchanged
+    otherwise. Non-dict node outputs and non-list entries pass through.
+    """
+    def _normalize_list(items):
+        kept = []
+        for entry in items:
+            if entry is None:
+                continue
+            converted = normalize_output_item(entry)
+            kept.append(entry if converted is None else converted)
+        return kept
+    result = {}
+    for node_id, node_outputs in outputs.items():
+        if not isinstance(node_outputs, dict):
+            result[node_id] = node_outputs
+            continue
+        result[node_id] = {
+            media_type: _normalize_list(items) if media_type != 'animated' and isinstance(items, list) else items
+            for media_type, items in node_outputs.items()
+        }
+    return result


 def _extract_job_metadata(extra_data: dict) -> tuple[Optional[int], Optional[str]]:
@@ -45,9 +95,9 @@ def is_previewable(media_type: str, item: dict) -> bool:
    Maintains backwards compatibility with existing logic.

    Priority:
-    1. media_type is 'images', 'video', or 'audio'
+    1. media_type is 'images', 'video', 'audio', or '3d'
    2. format field starts with 'video/' or 'audio/'
-    3. filename has a 3D extension (.obj, .fbx, .gltf, .glb)
+    3. filename has a 3D extension (.obj, .fbx, .gltf, .glb, .usdz)
    """
    if media_type in PREVIEWABLE_MEDIA_TYPES:
        return True
@@ -139,7 +189,7 @@ def normalize_history_item(prompt_id: str, history_item: dict, include_outputs:
    })

    if include_outputs:
-        job['outputs'] = outputs
+        job['outputs'] = normalize_outputs(outputs)
        job['execution_status'] = status_info
        job['workflow'] = {
            'prompt': prompt,
@@ -171,18 +221,23 @@ def get_outputs_summary(outputs: dict) -> tuple[int, Optional[dict]]:
                continue

            for item in items:
-                count += 1
-
-                if not isinstance(item, dict):
+                normalized = normalize_output_item(item)
+                if normalized is None:
                    continue

-                if preview_output is None and is_previewable(media_type, item):
+                count += 1
+
+                if preview_output is not None:
+                    continue
+
+                if isinstance(normalized, dict) and is_previewable(media_type, normalized):
                    enriched = {
-                        **item,
+                        **normalized,
                        'nodeId': node_id,
-                        'mediaType': media_type
                    }
-                    if item.get('type') == 'output':
+                    if 'mediaType' not in normalized:
+                        enriched['mediaType'] = media_type
+                    if normalized.get('type') == 'output':
                        preview_output = enriched
                    elif fallback_preview is None:
                        fallback_preview = enriched
--- a/comfy_extras/nodes_ace.py
+++ b/comfy_extras/nodes_ace.py
@@ -44,13 +44,18 @@ class TextEncodeAceStepAudio15(io.ComfyNode):
                io.Combo.Input("timesignature", options=['2', '3', '4', '6']),
                io.Combo.Input("language", options=["en", "ja", "zh", "es", "de", "fr", "pt", "ru", "it", "nl", "pl", "tr", "vi", "cs", "fa", "id", "ko", "uk", "hu", "ar", "sv", "ro", "el"]),
                io.Combo.Input("keyscale", options=[f"{root} {quality}" for quality in ["major", "minor"] for root in ["C", "C#", "Db", "D", "D#", "Eb", "E", "F", "F#", "Gb", "G", "G#", "Ab", "A", "A#", "Bb", "B"]]),
+                io.Boolean.Input("generate_audio_codes", default=True, tooltip="Enable the LLM that generates audio codes. This can be slow but will increase the quality of the generated audio. Turn this off if you are giving the model an audio reference.", advanced=True),
+                io.Float.Input("cfg_scale", default=2.0, min=0.0, max=100.0, step=0.1, advanced=True),
+                io.Float.Input("temperature", default=0.85, min=0.0, max=2.0, step=0.01, advanced=True),
+                io.Float.Input("top_p", default=0.9, min=0.0, max=2000.0, step=0.01, advanced=True),
+                io.Int.Input("top_k", default=0, min=0, max=100, advanced=True),
            ],
            outputs=[io.Conditioning.Output()],
        )

    @classmethod
-    def execute(cls, clip, tags, lyrics, seed, bpm, duration, timesignature, language, keyscale) -> io.NodeOutput:
-        tokens = clip.tokenize(tags, lyrics=lyrics, bpm=bpm, duration=duration, timesignature=int(timesignature), language=language, keyscale=keyscale, seed=seed)
+    def execute(cls, clip, tags, lyrics, seed, bpm, duration, timesignature, language, keyscale, generate_audio_codes, cfg_scale, temperature, top_p, top_k) -> io.NodeOutput:
+        tokens = clip.tokenize(tags, lyrics=lyrics, bpm=bpm, duration=duration, timesignature=int(timesignature), language=language, keyscale=keyscale, seed=seed, generate_audio_codes=generate_audio_codes, cfg_scale=cfg_scale, temperature=temperature, top_p=top_p, top_k=top_k)
        conditioning = clip.encode_from_tokens_scheduled(tokens)
        return io.NodeOutput(conditioning)

@@ -100,14 +105,15 @@ class EmptyAceStep15LatentAudio(io.ComfyNode):
        latent = torch.zeros([batch_size, 64, length], device=comfy.model_management.intermediate_device())
        return io.NodeOutput({"samples": latent, "type": "audio"})

-class ReferenceTimbreAudio(io.ComfyNode):
+class ReferenceAudio(io.ComfyNode):
    @classmethod
    def define_schema(cls):
        return io.Schema(
            node_id="ReferenceTimbreAudio",
+            display_name="Reference Audio",
            category="advanced/conditioning/audio",
            is_experimental=True,
-            description="This node sets the reference audio for timbre (for ace step 1.5)",
+            description="This node sets the reference audio for ace step 1.5",
            inputs=[
                io.Conditioning.Input("conditioning"),
                io.Latent.Input("latent", optional=True),
@@ -131,7 +137,7 @@ class AceExtension(ComfyExtension):
            EmptyAceStepLatentAudio,
            TextEncodeAceStepAudio15,
            EmptyAceStep15LatentAudio,
-            ReferenceTimbreAudio,
+            ReferenceAudio,
        ]

 async def comfy_entrypoint() -> AceExtension:
--- a/comfy_extras/nodes_audio.py
+++ b/comfy_extras/nodes_audio.py
@@ -94,6 +94,19 @@ class VAEEncodeAudio(IO.ComfyNode):
    encode = execute  # TODO: remove


+def vae_decode_audio(vae, samples, tile=None, overlap=None):
+    """Decode the latent in ``samples`` to audio, tiled when ``tile`` is given, and rescale loud output."""
+    latent = samples["samples"]
+    decoded = vae.decode(latent) if tile is None else vae.decode_tiled(latent, tile_y=tile, overlap=overlap)
+    audio = decoded.movedim(-1, 1)
+    # Divide by 5x the std over dims 1-2, but never by less than 1.0, so quiet audio is left unscaled.
+    scale = torch.std(audio, dim=[1, 2], keepdim=True) * 5.0
+    scale[scale < 1.0] = 1.0
+    audio /= scale
+    default_rate = getattr(vae, "audio_sample_rate", 44100)
+    return {"waveform": audio, "sample_rate": samples["sample_rate"] if "sample_rate" in samples else default_rate}
+
+
 class VAEDecodeAudio(IO.ComfyNode):
    @classmethod
    def define_schema(cls):
@@ -111,16 +124,33 @@ class VAEDecodeAudio(IO.ComfyNode):

    @classmethod
    def execute(cls, vae, samples) -> IO.NodeOutput:
-        audio = vae.decode(samples["samples"]).movedim(-1, 1)
-        std = torch.std(audio, dim=[1,2], keepdim=True) * 5.0
-        std[std < 1.0] = 1.0
-        audio /= std
-        vae_sample_rate = getattr(vae, "audio_sample_rate", 44100)
-        return IO.NodeOutput({"waveform": audio, "sample_rate": vae_sample_rate if "sample_rate" not in samples else samples["sample_rate"]})
+        return IO.NodeOutput(vae_decode_audio(vae, samples))

    decode = execute  # TODO: remove


+class VAEDecodeAudioTiled(IO.ComfyNode):
+    """Tiled counterpart of VAEDecodeAudio: decodes an audio latent via vae_decode_audio with tiling."""
+    @classmethod
+    def define_schema(cls):
+        node_inputs = [
+            IO.Latent.Input("samples"),
+            IO.Vae.Input("vae"),
+            IO.Int.Input("tile_size", default=512, min=32, max=8192, step=8),
+            IO.Int.Input("overlap", default=64, min=0, max=1024, step=8),
+        ]
+        return IO.Schema(
+            node_id="VAEDecodeAudioTiled", search_aliases=["latent to audio"],
+            display_name="VAE Decode Audio (Tiled)", category="latent/audio",
+            inputs=node_inputs, outputs=[IO.Audio.Output()],
+        )
+
+    @classmethod
+    def execute(cls, vae, samples, tile_size, overlap) -> IO.NodeOutput:
+        # tile_size is forwarded as vae_decode_audio's `tile` argument.
+        return IO.NodeOutput(vae_decode_audio(vae, samples, tile=tile_size, overlap=overlap))
+
+
 class SaveAudio(IO.ComfyNode):
    @classmethod
    def define_schema(cls):
@@ -675,6 +705,7 @@ class AudioExtension(ComfyExtension):
            EmptyLatentAudio,
            VAEEncodeAudio,
            VAEDecodeAudio,
+            VAEDecodeAudioTiled,
            SaveAudio,
            SaveAudioMP3,
            SaveAudioOpus,
--- a/comfy_extras/nodes_custom_sampler.py
+++ b/comfy_extras/nodes_custom_sampler.py
@@ -622,6 +622,7 @@ class SamplerSASolver(io.ComfyNode):
    def define_schema(cls):
        return io.Schema(
            node_id="SamplerSASolver",
+            search_aliases=["sde"],
            category="sampling/custom_sampling/samplers",
            inputs=[
                io.Model.Input("model"),
@@ -666,6 +667,7 @@ class SamplerSEEDS2(io.ComfyNode):
    def define_schema(cls):
        return io.Schema(
            node_id="SamplerSEEDS2",
+            search_aliases=["sde", "exp heun"],
            category="sampling/custom_sampling/samplers",
            inputs=[
                io.Combo.Input("solver_type", options=["phi_1", "phi_2"]),
--- a/comfy_extras/nodes_easycache.py
+++ b/comfy_extras/nodes_easycache.py
@@ -9,6 +9,14 @@ if TYPE_CHECKING:
    from uuid import UUID


+def _extract_tensor(data, output_channels):
+    """Slice ``data`` to its first ``output_channels`` entries along dim 1; returns ``(main, audio_or_None)``."""
+    if not isinstance(data, list):
+        return data[:, :output_channels], None
+    video, audio = data[0], data[1]  # list input is treated as LTX2 AV tensors: [video, audio]
+    return video[:, :output_channels], audio[:, :output_channels]
+
+
 def easycache_forward_wrapper(executor, *args, **kwargs):
    # get values from args
    transformer_options: dict[str] = args[-1]
@@ -17,7 +25,7 @@ def easycache_forward_wrapper(executor, *args, **kwargs):
        if not transformer_options:
            transformer_options = args[-2]
    easycache: EasyCacheHolder = transformer_options["easycache"]
-    x: torch.Tensor = args[0][:, :easycache.output_channels]
+    x, ax = _extract_tensor(args[0], easycache.output_channels)
    sigmas = transformer_options["sigmas"]
    uuids = transformer_options["uuids"]
    if sigmas is not None and easycache.is_past_end_timestep(sigmas):
@@ -35,7 +43,11 @@ def easycache_forward_wrapper(executor, *args, **kwargs):
        if easycache.skip_current_step and can_apply_cache_diff:
            if easycache.verbose:
                logging.info(f"EasyCache [verbose] - was marked to skip this step by {easycache.first_cond_uuid}. Present uuids: {uuids}")
-            return easycache.apply_cache_diff(x, uuids)
+            result = easycache.apply_cache_diff(x, uuids)
+            if ax is not None:
+                result_audio = easycache.apply_cache_diff(ax, uuids, is_audio=True)
+                return [result, result_audio]
+            return result
        if easycache.initial_step:
            easycache.first_cond_uuid = uuids[0]
            has_first_cond_uuid = easycache.has_first_cond_uuid(uuids)
@@ -51,13 +63,18 @@ def easycache_forward_wrapper(executor, *args, **kwargs):
                        logging.info(f"EasyCache [verbose] - skipping step; cumulative_change_rate: {easycache.cumulative_change_rate}, reuse_threshold: {easycache.reuse_threshold}")
                    # other conds should also skip this step, and instead use their cached values
                    easycache.skip_current_step = True
-                    return easycache.apply_cache_diff(x, uuids)
+                    result = easycache.apply_cache_diff(x, uuids)
+                    if ax is not None:
+                        result_audio = easycache.apply_cache_diff(ax, uuids, is_audio=True)
+                        return [result, result_audio]
+                    return result
                else:
                    if easycache.verbose:
                        logging.info(f"EasyCache [verbose] - NOT skipping step; cumulative_change_rate: {easycache.cumulative_change_rate}, reuse_threshold: {easycache.reuse_threshold}")
                    easycache.cumulative_change_rate = 0.0

-    output: torch.Tensor = executor(*args, **kwargs)
+    full_output: torch.Tensor = executor(*args, **kwargs)
+    output, audio_output = _extract_tensor(full_output, easycache.output_channels)
    if has_first_cond_uuid and easycache.has_output_prev_norm():
        output_change = (easycache.subsample(output, uuids, clone=False) - easycache.output_prev_subsampled).flatten().abs().mean()
        if easycache.verbose:
@@ -74,13 +91,15 @@ def easycache_forward_wrapper(executor, *args, **kwargs):
            logging.info(f"EasyCache [verbose] - output_change_rate: {output_change_rate}")
    # TODO: allow cache_diff to be offloaded
    easycache.update_cache_diff(output, next_x_prev, uuids)
+    if audio_output is not None:
+        easycache.update_cache_diff(audio_output, ax, uuids, is_audio=True)
    if has_first_cond_uuid:
        easycache.x_prev_subsampled = easycache.subsample(next_x_prev, uuids)
        easycache.output_prev_subsampled = easycache.subsample(output, uuids)
        easycache.output_prev_norm = output.flatten().abs().mean()
        if easycache.verbose:
            logging.info(f"EasyCache [verbose] - x_prev_subsampled: {easycache.x_prev_subsampled.shape}")
-    return output
+    return full_output

 def lazycache_predict_noise_wrapper(executor, *args, **kwargs):
    # get values from args
@@ -89,8 +108,8 @@ def lazycache_predict_noise_wrapper(executor, *args, **kwargs):
    easycache: LazyCacheHolder = model_options["transformer_options"]["easycache"]
    if easycache.is_past_end_timestep(timestep):
        return executor(*args, **kwargs)
-    # prepare next x_prev
    x: torch.Tensor = args[0][:, :easycache.output_channels]
+    # prepare next x_prev
    next_x_prev = x
    input_change = None
    do_easycache = easycache.should_do_easycache(timestep)
@@ -197,6 +216,7 @@ class EasyCacheHolder:
        self.output_prev_subsampled: torch.Tensor = None
        self.output_prev_norm: torch.Tensor = None
        self.uuid_cache_diffs: dict[UUID, torch.Tensor] = {}
+        self.uuid_cache_diffs_audio: dict[UUID, torch.Tensor] = {}
        self.output_change_rates = []
        self.approx_output_change_rates = []
        self.total_steps_skipped = 0
@@ -245,20 +265,21 @@ class EasyCacheHolder:
    def can_apply_cache_diff(self, uuids: list[UUID]) -> bool:
        return all(uuid in self.uuid_cache_diffs for uuid in uuids)

-    def apply_cache_diff(self, x: torch.Tensor, uuids: list[UUID]):
-        if self.first_cond_uuid in uuids:
+    def apply_cache_diff(self, x: torch.Tensor, uuids: list[UUID], is_audio: bool = False):
+        if self.first_cond_uuid in uuids and not is_audio:
            self.total_steps_skipped += 1
+        cache_diffs = self.uuid_cache_diffs_audio if is_audio else self.uuid_cache_diffs
        batch_offset = x.shape[0] // len(uuids)
        for i, uuid in enumerate(uuids):
            # slice out only what is relevant to this cond
            batch_slice = [slice(i*batch_offset,(i+1)*batch_offset)]
            # if cached dims don't match x dims, cut off excess and hope for the best (cosmos world2video)
-            if x.shape[1:] != self.uuid_cache_diffs[uuid].shape[1:]:
+            if x.shape[1:] != cache_diffs[uuid].shape[1:]:
                if not self.allow_mismatch:
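-                    raise ValueError(f"Cached dims {self.uuid_cache_diffs[uuid].shape} don't match x dims {x.shape} - this is no good")
+                    raise ValueError(f"Cached dims {cache_diffs[uuid].shape} don't match x dims {x.shape} - this is no good")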
                slicing = []
                skip_this_dim = True
-                for dim_u, dim_x in zip(self.uuid_cache_diffs[uuid].shape, x.shape):
+                for dim_u, dim_x in zip(cache_diffs[uuid].shape, x.shape):
                    if skip_this_dim:
                        skip_this_dim = False
                        continue
@@ -270,10 +291,11 @@ class EasyCacheHolder:
                    else:
                        slicing.append(slice(None))
                batch_slice = batch_slice + slicing
-            x[tuple(batch_slice)] += self.uuid_cache_diffs[uuid].to(x.device)
+            x[tuple(batch_slice)] += cache_diffs[uuid].to(x.device)
        return x

-    def update_cache_diff(self, output: torch.Tensor, x: torch.Tensor, uuids: list[UUID]):
+    def update_cache_diff(self, output: torch.Tensor, x: torch.Tensor, uuids: list[UUID], is_audio: bool = False):
+        cache_diffs = self.uuid_cache_diffs_audio if is_audio else self.uuid_cache_diffs
        # if output dims don't match x dims, cut off excess and hope for the best (cosmos world2video)
        if output.shape[1:] != x.shape[1:]:
            if not self.allow_mismatch:
@@ -293,7 +315,7 @@ class EasyCacheHolder:
        diff = output - x
        batch_offset = diff.shape[0] // len(uuids)
        for i, uuid in enumerate(uuids):
-            self.uuid_cache_diffs[uuid] = diff[i*batch_offset:(i+1)*batch_offset, ...]
+            cache_diffs[uuid] = diff[i*batch_offset:(i+1)*batch_offset, ...]

    def has_first_cond_uuid(self, uuids: list[UUID]) -> bool:
        return self.first_cond_uuid in uuids
@@ -324,6 +346,8 @@ class EasyCacheHolder:
        self.output_prev_norm = None
        del self.uuid_cache_diffs
        self.uuid_cache_diffs = {}
+        del self.uuid_cache_diffs_audio
+        self.uuid_cache_diffs_audio = {}
        self.total_steps_skipped = 0
        self.state_metadata = None
        return self
--- a/comfy_extras/nodes_hunyuan3d.py
+++ b/comfy_extras/nodes_hunyuan3d.py
@@ -618,6 +618,7 @@ class SaveGLB(IO.ComfyNode):
    def define_schema(cls):
        return IO.Schema(
            node_id="SaveGLB",
+            display_name="Save 3D Model",
            search_aliases=["export 3d model", "save mesh"],
            category="3d",
            is_output_node=True,
@@ -626,8 +627,14 @@ class SaveGLB(IO.ComfyNode):
                    IO.Mesh.Input("mesh"),
                    types=[
                        IO.File3DGLB,
+                        IO.File3DGLTF,
+                        IO.File3DOBJ,
+                        IO.File3DFBX,
+                        IO.File3DSTL,
+                        IO.File3DUSDZ,
+                        IO.File3DAny,
                    ],
-                    tooltip="Mesh or GLB file to save",
+                    tooltip="Mesh or 3D file to save",
                ),
                IO.String.Input("filename_prefix", default="mesh/ComfyUI"),
            ],
@@ -649,7 +656,8 @@ class SaveGLB(IO.ComfyNode):

        if isinstance(mesh, Types.File3D):
            # Handle File3D input - save BytesIO data to output folder
-            f = f"{filename}_{counter:05}_.glb"
+            ext = mesh.format or "glb"
+            f = f"{filename}_{counter:05}_.{ext}"
            mesh.save_to(os.path.join(full_output_folder, f))
            results.append({
                "filename": f,
--- a/comfy_extras/nodes_latent.py
+++ b/comfy_extras/nodes_latent.py
@@ -391,8 +391,9 @@ class LatentOperationTonemapReinhard(io.ComfyNode):
            latent_vector_magnitude = (torch.linalg.vector_norm(latent, dim=(1)) + 0.0000000001)[:,None]
            normalized_latent = latent / latent_vector_magnitude

-            mean = torch.mean(latent_vector_magnitude, dim=(1,2,3), keepdim=True)
-            std = torch.std(latent_vector_magnitude, dim=(1,2,3), keepdim=True)
+            dims = list(range(1, latent_vector_magnitude.ndim))
+            mean = torch.mean(latent_vector_magnitude, dim=dims, keepdim=True)
+            std = torch.std(latent_vector_magnitude, dim=dims, keepdim=True)

            top = (std * 5 + mean) * multiplier

--- a/comfy_extras/nodes_load_3d.py
+++ b/comfy_extras/nodes_load_3d.py
@@ -45,6 +45,7 @@ class Load3D(IO.ComfyNode):
                IO.Image.Output(display_name="normal"),
                IO.Load3DCamera.Output(display_name="camera_info"),
                IO.Video.Output(display_name="recording_video"),
+                IO.File3DAny.Output(display_name="model_3d"),
            ],
        )

@@ -66,7 +67,8 @@ class Load3D(IO.ComfyNode):

            video = InputImpl.VideoFromFile(recording_video_path)

-        return IO.NodeOutput(output_image, output_mask, model_file, normal_image, image['camera_info'], video)
+        file_3d = Types.File3D(folder_paths.get_annotated_filepath(model_file))
+        return IO.NodeOutput(output_image, output_mask, model_file, normal_image, image['camera_info'], video, file_3d)

    process = execute  # TODO: remove

--- a/comfy_extras/nodes_toolkit.py
+++ b/comfy_extras/nodes_toolkit.py
@@ -0,0 +1,47 @@
+from __future__ import annotations
+from typing_extensions import override
+from comfy_api.latest import ComfyExtension, io
+
+
+class CreateList(io.ComfyNode):
+    @classmethod
+    def define_schema(cls):
+        template_matchtype = io.MatchType.Template("type")
+        template_autogrow = io.Autogrow.TemplatePrefix(
+            input=io.MatchType.Input("input", template=template_matchtype),
+            prefix="input",
+        )
+        return io.Schema(
+            node_id="CreateList",
+            display_name="Create List",
+            category="logic",
+            is_input_list=True,
+            search_aliases=["Image Iterator", "Text Iterator", "Iterator"],
+            inputs=[io.Autogrow.Input("inputs", template=template_autogrow)],
+            outputs=[
+                io.MatchType.Output(
+                    template=template_matchtype,
+                    is_output_list=True,
+                    display_name="list",
+                ),
+            ],
+        )
+
+    @classmethod
+    def execute(cls, inputs: io.Autogrow.Type) -> io.NodeOutput:
+        output_list = []
+        for input in inputs.values():
+            output_list += input
+        return io.NodeOutput(output_list)
+
+
+class ToolkitExtension(ComfyExtension):
+    @override
+    async def get_node_list(self) -> list[type[io.ComfyNode]]:
+        return [
+            CreateList,
+        ]
+
+
+async def comfy_entrypoint() -> ToolkitExtension:
+    return ToolkitExtension()
--- a/comfy_extras/nodes_train.py
+++ b/comfy_extras/nodes_train.py
@@ -4,6 +4,7 @@ import os
 import numpy as np
 import safetensors
 import torch
+import torch.nn as nn
 import torch.utils.checkpoint
 from tqdm.auto import trange
 from PIL import Image, ImageDraw, ImageFont
@@ -27,6 +28,11 @@ class TrainGuider(comfy_extras.nodes_custom_sampler.Guider_Basic):
    """
    CFGGuider with modifications for training specific logic
    """
+
+    def __init__(self, *args, offloading=False, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.offloading = offloading
+
    def outer_sample(
        self,
        noise,
@@ -45,9 +51,11 @@ class TrainGuider(comfy_extras.nodes_custom_sampler.Guider_Basic):
                noise.shape,
                self.conds,
                self.model_options,
-                force_full_load=True,  # mirror behavior in TrainLoraNode.execute() to keep model loaded
+                force_full_load=not self.offloading,
+                force_offload=self.offloading,
            )
        )
+        torch.cuda.empty_cache()
        device = self.model_patcher.load_device

        if denoise_mask is not None:
@@ -404,16 +412,97 @@ def find_all_highest_child_module_with_forward(
    return result


-def patch(m):
+def find_modules_at_depth(
+    model: nn.Module, depth: int = 1, result=None, current_depth=0, name=None
+) -> list[nn.Module]:
+    """
+    Find modules at a specific depth level for gradient checkpointing.
+
+    Args:
+        model: The model to search
+        depth: Target depth level (1 = top-level blocks, 2 = their children, etc.)
+        result: Accumulator for results
+        current_depth: Current recursion depth
+        name: Current module name for logging
+
+    Returns:
+        List of modules at the target depth
+    """
+    if result is None:
+        result = []
+    name = name or "root"
+
+    # Skip container modules (they don't have meaningful forward)
+    is_container = isinstance(model, (nn.ModuleList, nn.Sequential, nn.ModuleDict))
+    has_forward = hasattr(model, "forward") and not is_container
+
+    if has_forward:
+        current_depth += 1
+        if current_depth == depth:
+            result.append(model)
+            logging.debug(f"Found module at depth {depth}: {name} ({model.__class__.__name__})")
+            return result
+
+    # Recurse into children
+    for next_name, child in model.named_children():
+        find_modules_at_depth(child, depth, result, current_depth, f"{name}.{next_name}")
+
+    return result
+
+
+class OffloadCheckpointFunction(torch.autograd.Function):
+    """
+    Gradient checkpointing that works with weight offloading.
+
+    Forward: no_grad -> compute -> weights can be freed
+    Backward: enable_grad -> recompute -> backward -> weights can be freed
+
+    For single input, single output modules (Linear, Conv*).
+    """
+
+    @staticmethod
+    def forward(ctx, x: torch.Tensor, forward_fn):
+        ctx.save_for_backward(x)
+        ctx.forward_fn = forward_fn
+        with torch.no_grad():
+            return forward_fn(x)
+
+    @staticmethod
+    def backward(ctx, grad_out: torch.Tensor):
+        x, = ctx.saved_tensors
+        forward_fn = ctx.forward_fn
+
+        # Clear context early
+        ctx.forward_fn = None
+
+        with torch.enable_grad():
+            x_detached = x.detach().requires_grad_(True)
+            y = forward_fn(x_detached)
+            y.backward(grad_out)
+            grad_x = x_detached.grad
+
+        # Explicit cleanup
+        del y, x_detached, forward_fn
+
+        return grad_x, None
+
+
+def patch(m, offloading=False):
    if not hasattr(m, "forward"):
        return
    org_forward = m.forward

-    def fwd(args, kwargs):
-        return org_forward(*args, **kwargs)
+    # Branch 1: Linear/Conv* -> offload-compatible checkpoint (single input/output)
+    if offloading and isinstance(m, (nn.Linear, nn.Conv1d, nn.Conv2d, nn.Conv3d)):
+        def checkpointing_fwd(x):
+            return OffloadCheckpointFunction.apply(x, org_forward)
+    # Branch 2: Others -> standard checkpoint
+    else:
+        def fwd(args, kwargs):
+            return org_forward(*args, **kwargs)

-    def checkpointing_fwd(*args, **kwargs):
-        return torch.utils.checkpoint.checkpoint(fwd, args, kwargs, use_reentrant=False)
+        def checkpointing_fwd(*args, **kwargs):
+            return torch.utils.checkpoint.checkpoint(fwd, args, kwargs, use_reentrant=False)

    m.org_forward = org_forward
    m.forward = checkpointing_fwd
@@ -936,6 +1025,18 @@ class TrainLoraNode(io.ComfyNode):
                    default=True,
                    tooltip="Use gradient checkpointing for training.",
                ),
+                io.Int.Input(
+                    "checkpoint_depth",
+                    default=1,
+                    min=1,
+                    max=5,
+                    tooltip="Depth level for gradient checkpointing (1 = top-level blocks, 2 = their children, etc.).",
+                ),
+                io.Boolean.Input(
+                    "offloading",
+                    default=False,
+                    tooltip="Offload model weights during training instead of forcing a full load onto the GPU.",
+                ),
                io.Combo.Input(
                    "existing_lora",
                    options=folder_paths.get_filename_list("loras") + ["[None]"],
@@ -982,6 +1083,8 @@ class TrainLoraNode(io.ComfyNode):
        lora_dtype,
        algorithm,
        gradient_checkpointing,
+        checkpoint_depth,
+        offloading,
        existing_lora,
        bucket_mode,
        bypass_mode,
@@ -1000,6 +1103,8 @@ class TrainLoraNode(io.ComfyNode):
        lora_dtype = lora_dtype[0]
        algorithm = algorithm[0]
        gradient_checkpointing = gradient_checkpointing[0]
+        offloading = offloading[0]
+        checkpoint_depth = checkpoint_depth[0]
        existing_lora = existing_lora[0]
        bucket_mode = bucket_mode[0]
        bypass_mode = bypass_mode[0]
@@ -1054,16 +1159,18 @@ class TrainLoraNode(io.ComfyNode):

            # Setup gradient checkpointing
            if gradient_checkpointing:
-                for m in find_all_highest_child_module_with_forward(
-                    mp.model.diffusion_model
-                ):
-                    patch(m)
+                modules_to_patch = find_modules_at_depth(
+                    mp.model.diffusion_model, depth=checkpoint_depth
+                )
+                logging.info(f"Gradient checkpointing: patching {len(modules_to_patch)} modules at depth {checkpoint_depth}")
+                for m in modules_to_patch:
+                    patch(m, offloading=offloading)

            torch.cuda.empty_cache()
            # With force_full_load=False we should be able to have offloading
            # But for offloading in training we need custom AutoGrad hooks for fwd/bwd
            comfy.model_management.load_models_gpu(
-                [mp], memory_required=1e20, force_full_load=True
+                [mp], memory_required=1e20, force_full_load=not offloading
            )
            torch.cuda.empty_cache()

@@ -1100,7 +1207,7 @@ class TrainLoraNode(io.ComfyNode):
                )

            # Setup guider
-            guider = TrainGuider(mp)
+            guider = TrainGuider(mp, offloading=offloading)
            guider.set_conds(positive)

            # Inject bypass hooks if bypass mode is enabled
@@ -1113,6 +1220,7 @@ class TrainLoraNode(io.ComfyNode):

            # Run training loop
            try:
+                comfy.model_management.in_training = True
                _run_training_loop(
                    guider,
                    train_sampler,
@@ -1123,6 +1231,7 @@ class TrainLoraNode(io.ComfyNode):
                    multi_res,
                )
            finally:
+                comfy.model_management.in_training = False
                # Eject bypass hooks if they were injected
                if bypass_injections is not None:
                    for injection in bypass_injections:
@@ -1132,19 +1241,20 @@ class TrainLoraNode(io.ComfyNode):
                    unpatch(m)
            del train_sampler, optimizer

-            # Finalize adapters
+            for param in lora_sd:
+                lora_sd[param] = lora_sd[param].to(lora_dtype).detach()
+
            for adapter in all_weight_adapters:
                adapter.requires_grad_(False)
-
-            for param in lora_sd:
-                lora_sd[param] = lora_sd[param].to(lora_dtype)
+                del adapter
+            del all_weight_adapters

            # mp in train node is highly specialized for training
            # use it in inference will result in bad behavior so we don't return it
            return io.NodeOutput(lora_sd, loss_map, steps + existing_steps)


-class LoraModelLoader(io.ComfyNode):#
+class LoraModelLoader(io.ComfyNode):
    @classmethod
    def define_schema(cls):
        return io.Schema(
@@ -1166,6 +1276,11 @@ class LoraModelLoader(io.ComfyNode):#
                    max=100.0,
                    tooltip="How strongly to modify the diffusion model. This value can be negative.",
                ),
+                io.Boolean.Input(
+                    "bypass",
+                    default=False,
+                    tooltip="When enabled, applies LoRA in bypass mode without modifying base model weights. Useful for training and when model weights are offloaded.",
+                ),
            ],
            outputs=[
                io.Model.Output(
@@ -1175,13 +1290,18 @@ class LoraModelLoader(io.ComfyNode):#
        )

    @classmethod
-    def execute(cls, model, lora, strength_model):
+    def execute(cls, model, lora, strength_model, bypass=False):
        if strength_model == 0:
            return io.NodeOutput(model)

-        model_lora, _ = comfy.sd.load_lora_for_models(
-            model, None, lora, strength_model, 0
-        )
+        if bypass:
+            model_lora, _ = comfy.sd.load_bypass_lora_for_models(
+                model, None, lora, strength_model, 0
+            )
+        else:
+            model_lora, _ = comfy.sd.load_lora_for_models(
+                model, None, lora, strength_model, 0
+            )
        return io.NodeOutput(model_lora)


--- a/comfy_extras/nodes_video.py
+++ b/comfy_extras/nodes_video.py
@@ -202,6 +202,56 @@ class LoadVideo(io.ComfyNode):

        return True

+class VideoSlice(io.ComfyNode):
+    @classmethod
+    def define_schema(cls):
+        return io.Schema(
+            node_id="Video Slice",
+            display_name="Video Slice",
+            search_aliases=[
+                "trim video duration",
+                "skip first frames",
+                "frame load cap",
+                "start time",
+            ],
+            category="image/video",
+            inputs=[
+                io.Video.Input("video"),
+                io.Float.Input(
+                    "start_time",
+                    default=0.0,
+                    max=1e5,
+                    min=-1e5,
+                    step=0.001,
+                    tooltip="Start time in seconds",
+                ),
+                io.Float.Input(
+                    "duration",
+                    default=0.0,
+                    min=0.0,
+                    step=0.001,
+                    tooltip="Duration in seconds, or 0 for unlimited duration",
+                ),
+                io.Boolean.Input(
+                    "strict_duration",
+                    default=False,
+                    tooltip="If True, when the specified duration is not possible, an error will be raised.",
+                ),
+            ],
+            outputs=[
+                io.Video.Output(),
+            ],
+        )
+
+    @classmethod
+    def execute(cls, video: io.Video.Type, start_time: float, duration: float, strict_duration: bool) -> io.NodeOutput:
+        trimmed = video.as_trimmed(start_time, duration, strict_duration=strict_duration)
+        if trimmed is not None:
+            return io.NodeOutput(trimmed)
+        raise ValueError(
+            f"Failed to slice video:\nSource duration: {video.get_duration()}\nStart time: {start_time}\nTarget duration: {duration}"
+        )
+

 class VideoExtension(ComfyExtension):
    @override
@@ -212,6 +262,7 @@ class VideoExtension(ComfyExtension):
            CreateVideo,
            GetVideoComponents,
            LoadVideo,
+            VideoSlice,
        ]

 async def comfy_entrypoint() -> VideoExtension:
--- a/comfyui_version.py
+++ b/comfyui_version.py
@@ -1,3 +1,3 @@
 # This file is automatically generated by the build process when version is
 # updated in pyproject.toml.
-__version__ = "0.12.2"
+__version__ = "0.13.0"
--- a/execution.py
+++ b/execution.py
@@ -13,8 +13,11 @@ from contextlib import nullcontext

 import torch

+from comfy.cli_args import args
 import comfy.memory_management
 import comfy.model_management
+import comfy_aimdo.model_vbar
+
 from latent_preview import set_preview_method
 import nodes
 from comfy_execution.caching import (
@@ -527,8 +530,10 @@ async def execute(server, dynprompt, caches, current_item, extra_data, executed,
                    output_data, output_ui, has_subgraph, has_pending_tasks = await get_output_data(prompt_id, unique_id, obj, input_data_all, execution_block_cb=execution_block_cb, pre_execute_cb=pre_execute_cb, v3_data=v3_data)
                finally:
                    if allocator is not None:
+                        if args.verbose == "DEBUG":
+                            comfy_aimdo.model_vbar.vbars_analyze()
                        comfy.model_management.reset_cast_buffers()
-                        torch.cuda.synchronize()
+                        comfy_aimdo.model_vbar.vbars_reset_watermark_limits()

            if has_pending_tasks:
                pending_async_nodes[unique_id] = output_data
--- a/nodes.py
+++ b/nodes.py
@@ -2433,7 +2433,8 @@ async def init_builtin_extra_nodes():
        "nodes_image_compare.py",
        "nodes_zimage.py",
        "nodes_lora_debug.py",
-        "nodes_color.py"
+        "nodes_color.py",
+        "nodes_toolkit.py",
    ]

    import_failed = []
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "ComfyUI"
-version = "0.12.2"
+version = "0.13.0"
 readme = "README.md"
 license = { file = "LICENSE" }
 requires-python = ">=3.10"
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,6 @@
-comfyui-frontend-package==1.37.11
-comfyui-workflow-templates==0.8.31
-comfyui-embedded-docs==0.4.0
+comfyui-frontend-package==1.38.13
+comfyui-workflow-templates==0.8.38
+comfyui-embedded-docs==0.4.1
 torch
 torchsde
 torchvision
@@ -22,7 +22,7 @@ alembic
 SQLAlchemy
 av>=14.2.0
 comfy-kitchen>=0.2.7
-comfy-aimdo>=0.1.7
+comfy-aimdo>=0.1.8
 requests

 #non essential dependencies:
@@ -30,3 +30,6 @@ kornia>=0.7.1
 spandrel
 pydantic~=2.0
 pydantic-settings~=2.0
+
+# test
+fastapi
--- a/tests/execution/test_jobs.py
+++ b/tests/execution/test_jobs.py
@@ -5,8 +5,11 @@ from comfy_execution.jobs import (
    is_previewable,
    normalize_queue_item,
    normalize_history_item,
+    normalize_output_item,
+    normalize_outputs,
    get_outputs_summary,
    apply_sorting,
+    has_3d_extension,
 )


@@ -35,8 +38,8 @@ class TestIsPreviewable:
    """Unit tests for is_previewable()"""

    def test_previewable_media_types(self):
-        """Images, video, audio media types should be previewable."""
-        for media_type in ['images', 'video', 'audio']:
+        """Images, video, audio, 3d media types should be previewable."""
+        for media_type in ['images', 'video', 'audio', '3d']:
            assert is_previewable(media_type, {}) is True

    def test_non_previewable_media_types(self):
@@ -46,7 +49,7 @@ class TestIsPreviewable:

    def test_3d_extensions_previewable(self):
        """3D file extensions should be previewable regardless of media_type."""
-        for ext in ['.obj', '.fbx', '.gltf', '.glb']:
+        for ext in ['.obj', '.fbx', '.gltf', '.glb', '.usdz']:
            item = {'filename': f'model{ext}'}
            assert is_previewable('files', item) is True

@@ -160,7 +163,7 @@ class TestGetOutputsSummary:

    def test_3d_files_previewable(self):
        """3D file extensions should be previewable."""
-        for ext in ['.obj', '.fbx', '.gltf', '.glb']:
+        for ext in ['.obj', '.fbx', '.gltf', '.glb', '.usdz']:
            outputs = {
                'node1': {
                    'files': [{'filename': f'model{ext}', 'type': 'output'}]
@@ -192,6 +195,64 @@ class TestGetOutputsSummary:
        assert preview['mediaType'] == 'images'
        assert preview['subfolder'] == 'outputs'

+    def test_string_3d_filename_creates_preview(self):
+        """String items with 3D extensions should synthesize a preview (Preview3D node output).
+        Only the .glb counts — nulls and non-file strings are excluded."""
+        outputs = {
+            'node1': {
+                'result': ['preview3d_abc123.glb', None, None]
+            }
+        }
+        count, preview = get_outputs_summary(outputs)
+        assert count == 1
+        assert preview is not None
+        assert preview['filename'] == 'preview3d_abc123.glb'
+        assert preview['mediaType'] == '3d'
+        assert preview['nodeId'] == 'node1'
+        assert preview['type'] == 'output'
+
+    def test_string_non_3d_filename_no_preview(self):
+        """String items without 3D extensions should not create a preview."""
+        outputs = {
+            'node1': {
+                'result': ['data.json', None]
+            }
+        }
+        count, preview = get_outputs_summary(outputs)
+        assert count == 0
+        assert preview is None
+
+    def test_string_3d_filename_used_as_fallback(self):
+        """String 3D preview should be used when no dict items are previewable."""
+        outputs = {
+            'node1': {
+                'latents': [{'filename': 'latent.safetensors'}],
+            },
+            'node2': {
+                'result': ['model.glb', None]
+            }
+        }
+        count, preview = get_outputs_summary(outputs)
+        assert preview is not None
+        assert preview['filename'] == 'model.glb'
+        assert preview['mediaType'] == '3d'
+
+
+class TestHas3DExtension:
+    """Unit tests for has_3d_extension()"""
+
+    def test_recognized_extensions(self):
+        for ext in ['.obj', '.fbx', '.gltf', '.glb', '.usdz']:
+            assert has_3d_extension(f'model{ext}') is True
+
+    def test_case_insensitive(self):
+        assert has_3d_extension('MODEL.GLB') is True
+        assert has_3d_extension('Scene.GLTF') is True
+
+    def test_non_3d_extensions(self):
+        for name in ['photo.png', 'video.mp4', 'data.json', 'model']:
+            assert has_3d_extension(name) is False
+

 class TestApplySorting:
    """Unit tests for apply_sorting()"""
@@ -395,3 +456,142 @@ class TestNormalizeHistoryItem:
            'prompt': {'nodes': {'1': {}}},
            'extra_data': {'create_time': 1234567890, 'client_id': 'abc'},
        }
+
+    def test_include_outputs_normalizes_3d_strings(self):
+        """Detail view should transform string 3D filenames into file output dicts."""
+        history_item = {
+            'prompt': (  # NOTE(review): positional tuple; presumably (number, prompt_id, prompt, extra_data, outputs_to_execute) — confirm
+                5,
+                'prompt-3d',
+                {'nodes': {}},
+                {'create_time': 1234567890},
+                ['node1'],
+            ),
+            'status': {'status_str': 'success', 'completed': True, 'messages': []},
+            'outputs': {
+                'node1': {
+                    'result': ['preview3d_abc123.glb', None, None]  # 3D filename as a bare string, followed by two nulls
+                }
+            },
+        }
+        job = normalize_history_item('prompt-3d', history_item, include_outputs=True)
+
+        assert job['outputs_count'] == 1  # only the .glb entry is counted
+        result_items = job['outputs']['node1']['result']
+        assert len(result_items) == 1  # the two None entries must be gone from the returned list
+        assert result_items[0] == {  # the string must have been replaced by a file-output dict
+            'filename': 'preview3d_abc123.glb',
+            'type': 'output',
+            'subfolder': '',
+            'mediaType': '3d',
+        }
+
+    def test_include_outputs_preserves_dict_items(self):
+        """Detail view normalization should pass dict items through unchanged."""
+        history_item = {
+            'prompt': (  # NOTE(review): positional tuple; presumably (number, prompt_id, prompt, extra_data, outputs_to_execute) — confirm
+                5,
+                'prompt-img',
+                {'nodes': {}},
+                {'create_time': 1234567890},
+                ['node1'],
+            ),
+            'status': {'status_str': 'success', 'completed': True, 'messages': []},
+            'outputs': {
+                'node1': {
+                    'images': [
+                        {'filename': 'photo.png', 'type': 'output', 'subfolder': ''},  # already in dict form
+                    ]
+                }
+            },
+        }
+        job = normalize_history_item('prompt-img', history_item, include_outputs=True)
+
+        assert job['outputs_count'] == 1
+        assert job['outputs']['node1']['images'] == [  # compared by equality: same keys and values, no extra fields added
+            {'filename': 'photo.png', 'type': 'output', 'subfolder': ''},
+        ]
+
+
+class TestNormalizeOutputItem:
+    """Unit tests for normalize_output_item()"""
+
+    def test_none_returns_none(self):
+        assert normalize_output_item(None) is None
+
+    def test_string_3d_extension_synthesizes_dict(self):
+        result = normalize_output_item('model.glb')  # bare string filename with a 3D extension
+        assert result == {'filename': 'model.glb', 'type': 'output', 'subfolder': '', 'mediaType': '3d'}
+
+    def test_string_non_3d_extension_returns_none(self):
+        assert normalize_output_item('data.json') is None  # string with an extension, but not a 3D one
+
+    def test_string_no_extension_returns_none(self):
+        assert normalize_output_item('camera_info_string') is None  # string with no extension at all
+
+    def test_dict_passes_through(self):
+        item = {'filename': 'test.png', 'type': 'output'}
+        assert normalize_output_item(item) is item  # identity check: the same dict object must come back, not a copy
+
+    def test_other_types_return_none(self):
+        assert normalize_output_item(42) is None  # int input
+        assert normalize_output_item(True) is None  # bool input
+
+
+class TestNormalizeOutputs:
+    """Unit tests for normalize_outputs()"""
+
+    def test_empty_outputs(self):
+        assert normalize_outputs({}) == {}
+
+    def test_dict_items_pass_through(self):
+        outputs = {
+            'node1': {
+                'images': [{'filename': 'a.png', 'type': 'output'}],
+            }
+        }
+        result = normalize_outputs(outputs)
+        assert result == outputs  # equality only; this does not check whether a new object is returned
+
+    def test_3d_string_synthesized(self):
+        outputs = {
+            'node1': {
+                'result': ['model.glb', None, None],  # one 3D filename string followed by two nulls
+            }
+        }
+        result = normalize_outputs(outputs)
+        assert result == {
+            'node1': {
+                'result': [  # exactly one entry expected: the nulls are dropped and the string becomes a dict
+                    {'filename': 'model.glb', 'type': 'output', 'subfolder': '', 'mediaType': '3d'},
+                ],
+            }
+        }
+
+    def test_animated_key_preserved(self):
+        outputs = {
+            'node1': {
+                'images': [{'filename': 'a.png', 'type': 'output'}],
+                'animated': [True],  # list of booleans rather than file entries
+            }
+        }
+        result = normalize_outputs(outputs)
+        assert result['node1']['animated'] == [True]  # the boolean entry must not be filtered out
+
+    def test_non_dict_node_outputs_preserved(self):
+        outputs = {'node1': 'unexpected_value'}  # node value is a plain string instead of a dict of lists
+        result = normalize_outputs(outputs)
+        assert result == {'node1': 'unexpected_value'}  # expected to come back as-is rather than raise or be dropped
+
+    def test_none_items_filtered_but_other_types_preserved(self):
+        outputs = {
+            'node1': {
+                'result': ['data.json', None, [1, 2, 3]],  # non-3D string, a null, and a nested list
+            }
+        }
+        result = normalize_outputs(outputs)
+        assert result == {
+            'node1': {
+                'result': ['data.json', [1, 2, 3]],  # only the None is removed; the non-3D string and the list stay
+            }
+        }
Author	SHA1	Message	Date
Si Feng	181a43b017	test	2026-02-10 22:57:00 -08:00
Si Feng	fff5a876b7	test2	2026-02-10 22:54:55 -08:00
Si Feng	2a7a8acb29	test	2026-02-10 22:54:55 -08:00
comfyanonymous	76a7fa96db	Make built in lora training work on anima. (#12402 )	2026-02-10 22:04:32 -05:00
Kohaku-Blueleaf	cdcf4119b3	[Trainer] training with proper offloading (#12189 ) * Fix bypass dtype/device moving * Force offloading mode for training * training context var * offloading implementation in training node * fix wrong input type * Support bypass load lora model, correct adapter/offloading handling	2026-02-10 21:45:19 -05:00
AustinMroz	dbe70b6821	Add a VideoSlice node (#12107 ) * Base TrimVideo implementation * Raise error if as_trimmed call fails * Bigger max start_time, tooltips, and formatting * Count packets unless codec has subframes * Remove incorrect nested decode * Add null check for audio streams * Support non-strict duration * Added strict_duration bool to node definition * Empty commit for approval * Fix duration * Support 5.1 audio layout on save --------- Co-authored-by: Jedrzej Kosinski <kosinkadink1@gmail.com>	2026-02-10 14:42:21 -08:00
guill	00fff6019e	feat(jobs): add 3d to PREVIEWABLE_MEDIA_TYPES for first-class 3D output support (#12381 ) Co-authored-by: Jedrzej Kosinski <kosinkadink1@gmail.com>	2026-02-10 14:37:14 -08:00
rattus	123a7874a9	ops: Fix vanilla-fp8 loaded lora quality (#12390 ) This was missing the stochastic rounding required for fp8 downcast to be consistent with model_patcher.patch_weight_to_device. Missed in testing as I spend too much time with quantized tensors and overlooked the simpler ones.	2026-02-10 13:38:28 -05:00
rattus	f719f9c062	sd: delay VAE dtype archive until after override (#12388 ) VAEs have host specific dtype logic that should override the dynamic _model_dtype. Defer the archiving of model dtypes until after.	2026-02-10 13:37:46 -05:00
rattus	fe053ba5eb	mp: dont deep-clone objects from model_options (#12382 ) If there are non-trivial python objects nested in the model_options, this causes all sorts of issues. Traverse lists and dicts so clones can safely overide settings and BYO objects but stop there on the deepclone.	2026-02-10 13:37:17 -05:00
comfyanonymous	6648ab68bc	ComfyUI v0.13.0	2026-02-10 13:26:29 -05:00
ComfyUI Wiki	6615db925c	chore: update workflow templates to v0.8.38 (#12394 )	2026-02-10 13:24:56 -05:00
Alexander Piskun	8ca842a8ed	feat(api-nodes-Kling): add new models (V3, O3) (#12389 ) * feat(api-nodes-Kling): add new models (V3, O3) * remove storyboard from VideoToVideo node * added check for total duration of storyboards * fixed other small things * updated display name for nodes * added "fake" seed	2026-02-10 09:34:54 -08:00
Alexander Piskun	c1b63a7e78	fix(Moonvalley-API-Nodes): adjust "steps" parameter to not raise exception (#12370 )	2026-02-09 21:58:27 -05:00
ComfyUI Wiki	349a636a2b	chore: update workflow templates to v0.8.37 (#12377 )	2026-02-09 21:25:34 -05:00
comfyanonymous	a4be04c5d7	Ace step prompts match now. (#12376 )	2026-02-09 19:45:56 -05:00
blepping	baf8c87455	Iimprovements to ACE-Steps 1.5 text encoding (part 2) (#12350 )	2026-02-09 19:41:49 -05:00
rattus	62315fbb15	Dynamic VRAM fixes - Ace 1.5 performance + a VRAM leak (#12368 ) * revert threaded model loader change This change was only needed to get around the pytorch 2.7 mempool bugs, and should have been reverted along with #12260. This fixes a different memory leak where pytorch gets confused about cache emptying. * load non comfy weights * MPDynamic: Pre-generate the tensors for vbars Apparently this is an expensive operation that slows down things. * bump to aimdo 1.8 New features: watermark limit feature logging enhancements -O2 build on linux	2026-02-09 16:16:08 -05:00
comfyanonymous	a0302cc6a8	Make tonemap latent work on any dim latents. (#12363 )	2026-02-08 21:16:40 -05:00
comfyanonymous	f350a84261	Disable prompt weights for ltxv2. (#12354 )	2026-02-07 19:16:28 -05:00
ComfyUI Wiki	3760d74005	chore: update embedded docs to v0.4.1 (#12346 )	2026-02-07 18:34:52 -05:00
chaObserv	9bf5aa54db	Add search_aliases to sa-solver and seeds-2 node (#12327 )	2026-02-07 17:38:51 -05:00
Jukka Seppänen	5ff4fdedba	Fix LazyCache (#12344 )	2026-02-07 11:25:30 -08:00
comfyanonymous	17e7df43d1	Pad ace step 1.5 ref audio if not long enough. (#12341 )	2026-02-07 00:02:11 -05:00
comfyanonymous	039955c527	Some fixes to previous pr. (#12339 )	2026-02-06 20:14:52 -05:00
tdrussell	6a26328842	Support fp16 for Cosmos-Predict2 and Anima (#12249 )	2026-02-06 20:12:15 -05:00
comfyanonymous	204e65b8dc	Fix bug with last pr (#12338 )	2026-02-06 19:48:20 -05:00
asagi4	a831c19b70	Fix return_word_ids=True with Anima tokenizer (#12328 )	2026-02-06 19:38:04 -05:00
comfyanonymous	eba6c940fd	Make ace step 1.5 base model work properly with default workflow. (#12337 )	2026-02-06 19:14:56 -05:00
Jukka Seppänen	a1c101f861	EasyCache: Support LTX2 (#12231 )	2026-02-06 00:43:09 -05:00
comfyanonymous	c2d7f07dbf	Fix issue when using disable_unet_model_creation (#12315 )	2026-02-05 19:24:09 -05:00
comfyanonymous	458292fef0	Fix some lowvram stuff with ace step 1.5 (#12312 )	2026-02-05 19:15:04 -05:00
comfyanonymous	6555dc65b8	Make ace step 1.5 work without the llm. (#12311 )	2026-02-05 16:43:45 -05:00
AustinMroz	2b70ab9ad0	Add a Create List node (#12173 )	2026-02-05 01:18:21 -05:00
Comfy Org PR Bot	00efcc6cd0	Bump comfyui-frontend-package to 1.38.13 (#12238 )	2026-02-05 01:17:37 -05:00
comfyanonymous	cb459573c8	ComfyUI v0.12.3	2026-02-05 01:13:35 -05:00
comfyanonymous	35183543e0	Add VAE tiled decode node for audio. (#12299 )	2026-02-05 01:12:04 -05:00
blepping	a246cc02b2	Improvements to ACE-Steps 1.5 text encoding (#12283 )	2026-02-05 00:17:37 -05:00
comfyanonymous	a50c32d63f	Disable sage attention on ace step 1.5 (#12297 )	2026-02-04 22:15:30 -05:00
comfyanonymous	6125b80979	Add llm sampling options and make reference audio work on ace step 1.5 (#12295 )	2026-02-04 21:29:22 -05:00
comfyanonymous	c8fcbd66ee	Try to fix ace text encoder slowness on some configs. (#12290 )	2026-02-04 19:37:05 -05:00
comfyanonymous	26dd7eb421	Fix ace step nan issue on some hardware/pytorch configs. (#12289 )	2026-02-04 18:25:06 -05:00
Alexander Piskun	e77b34dfea	add File3DAny output to Load3D node; extend SaveGLB to accept File3DAny as input (#12276 ) * add File3DAny output to Load3D node; extend SaveGLB node to accept File3DAny as input * fix(grammar): capitalize letter	2026-02-04 11:35:38 -08:00
rattus	ef73070ea4	mp: Fix checkpoint saving (#12268 ) Fix regression in the recent model saving refactor. Pass the non unet pieces down the layers so that checkpoints are complete.	2026-02-04 02:08:45 -05:00
rattus	d30c609f5a	utils: safetensors: dont slice data on torch level (#12266 ) Torch has alignment enforcement when viewing with data type changes but only relative to itself. Do all tensor constructions straight off the memory-view individually so pytorch doesnt see an alignment problem. The is needed for handling misaligned safetensors weights, which are reasonably common in third party models. This limits usage of this safetensors loader to GPU compute only as CPUs kernnel are very likely to bus error. But it works for dynamic_vram, where we really dont want to take a deep copy and we always use GPU copy_ which disentangles the misalignment.	2026-02-04 01:48:47 -05:00