feat: add search aliases for old mahiro name

Amp-Thread-ID: https://ampcode.com/threads/T-019c0d36-8b43-745f-b7b2-e35b53f17fa1
refactor: rename Mahiro CFG to Similarity-Adaptive Guidance
2026-02-13 11:40:02 +00:00 · 2026-02-06 00:59:06 -08:00 · 2026-02-06 00:58:53 -08:00 · 2026-02-06 00:43:09 -05:00 · 2026-02-05 19:24:09 -05:00 · 2026-02-05 19:15:04 -05:00
28 changed files with 304 additions and 118 deletions
--- a/comfy/ldm/ace/ace_step15.py
+++ b/comfy/ldm/ace/ace_step15.py
@@ -7,6 +7,67 @@ from comfy.ldm.modules.attention import optimized_attention
 import comfy.model_management
 from comfy.ldm.flux.layers import timestep_embedding

+def get_silence_latent(length, device):  # Build a (1, 64, length) latent on `device`: a fixed 4-frame "head" followed by one constant 64-value frame repeated along the last axis.
+    head = torch.tensor([[[ 0.5707,  0.0982,  0.6909, -0.5658,  0.6266,  0.6996, -0.1365, -0.1291,
+                        -0.0776, -0.1171, -0.2743, -0.8422, -0.1168,  1.5539, -4.6936,  0.7436,
+                        -1.1846, -0.2637,  0.6933, -6.7266,  0.0966, -0.1187, -0.3501, -1.1736,
+                        0.0587, -2.0517, -1.3651,  0.7508, -0.2490, -1.3548, -0.1290, -0.7261,
+                        1.1132, -0.3249,  0.2337,  0.3004,  0.6605, -0.0298, -0.1989, -0.4041,
+                        0.2843, -1.0963, -0.5519,  0.2639, -1.0436, -0.1183,  0.0640,  0.4460,
+                        -1.1001, -0.6172, -1.3241,  1.1379,  0.5623, -0.1507, -0.1963, -0.4742,
+                        -2.4697,  0.5302,  0.5381,  0.4636, -0.1782, -0.0687,  1.0333,  0.4202],
+                        [ 0.3040, -0.1367,  0.6200,  0.0665, -0.0642,  0.4655, -0.1187, -0.0440,
+                        0.2941, -0.2753,  0.0173, -0.2421, -0.0147,  1.5603, -2.7025,  0.7907,
+                        -0.9736, -0.0682,  0.1294, -5.0707, -0.2167,  0.3302, -0.1513, -0.8100,
+                        -0.3894, -0.2884, -0.3149,  0.8660, -0.3817, -1.7061,  0.5824, -0.4840,
+                        0.6938,  0.1859,  0.1753,  0.3081,  0.0195,  0.1403, -0.0754, -0.2091,
+                        0.1251, -0.1578, -0.4968, -0.1052, -0.4554, -0.0320,  0.1284,  0.4974,
+                        -1.1889, -0.0344, -0.8313,  0.2953,  0.5445, -0.6249, -0.1595, -0.0682,
+                        -3.1412,  0.0484,  0.4153,  0.8260, -0.1526, -0.0625,  0.5366,  0.8473],
+                        [ 5.3524e-02, -1.7534e-01,  5.4443e-01, -4.3501e-01, -2.1317e-03,
+                        3.7200e-01, -4.0143e-03, -1.5516e-01, -1.2968e-01, -1.5375e-01,
+                        -7.7107e-02, -2.0593e-01, -3.2780e-01,  1.5142e+00, -2.6101e+00,
+                        5.8698e-01, -1.2716e+00, -2.4773e-01, -2.7933e-02, -5.0799e+00,
+                        1.1601e-01,  4.0987e-01, -2.2030e-02, -6.6495e-01, -2.0995e-01,
+                        -6.3474e-01, -1.5893e-01,  8.2745e-01, -2.2992e-01, -1.6816e+00,
+                        5.4440e-01, -4.9579e-01,  5.5128e-01,  3.0477e-01,  8.3052e-02,
+                        -6.1782e-02,  5.9036e-03,  2.9553e-01, -8.0645e-02, -1.0060e-01,
+                        1.9144e-01, -3.8124e-01, -7.2949e-01,  2.4520e-02, -5.0814e-01,
+                        2.3977e-01,  9.2943e-02,  3.9256e-01, -1.1993e+00, -3.2752e-01,
+                        -7.2707e-01,  2.9476e-01,  4.3542e-01, -8.8597e-01, -4.1686e-01,
+                        -8.5390e-02, -2.9018e+00,  6.4988e-02,  5.3945e-01,  9.1988e-01,
+                        5.8762e-02, -7.0098e-02,  6.4772e-01,  8.9118e-01],
+                        [-3.2225e-02, -1.3195e-01,  5.6411e-01, -5.4766e-01, -5.2170e-03,
+                        3.1425e-01, -5.4367e-02, -1.9419e-01, -1.3059e-01, -1.3660e-01,
+                        -9.0984e-02, -1.9540e-01, -2.5590e-01,  1.5440e+00, -2.6349e+00,
+                        6.8273e-01, -1.2532e+00, -1.9810e-01, -2.2793e-02, -5.0506e+00,
+                        1.8818e-01,  5.0109e-01,  7.3546e-03, -6.8771e-01, -3.0676e-01,
+                        -7.3257e-01, -1.6687e-01,  9.2232e-01, -1.8987e-01, -1.7267e+00,
+                        5.3355e-01, -5.3179e-01,  4.4953e-01,  2.8820e-01,  1.3012e-01,
+                        -2.0943e-01, -1.1348e-01,  3.3929e-01, -1.5069e-01, -1.2919e-01,
+                        1.8929e-01, -3.6166e-01, -8.0756e-01,  6.6387e-02, -5.8867e-01,
+                        1.6978e-01,  1.0134e-01,  3.3877e-01, -1.2133e+00, -3.2492e-01,
+                        -8.1237e-01,  3.8101e-01,  4.3765e-01, -8.0596e-01, -4.4531e-01,
+                        -4.7513e-02, -2.9266e+00,  1.1741e-03,  4.5123e-01,  9.3075e-01,
+                        5.3688e-02, -1.9621e-01,  6.4530e-01,  9.3870e-01]]], device=device).movedim(-1, 1)  # literal is (1, 4, 64); movedim(-1, 1) makes it (1, 64, 4)
+
+    silence_latent = torch.tensor([[[-1.3672e-01, -1.5820e-01,  5.8594e-01, -5.7422e-01,  3.0273e-02,
+                                2.7930e-01, -2.5940e-03, -2.0703e-01, -1.6113e-01, -1.4746e-01,
+                                -2.7710e-02, -1.8066e-01, -2.9688e-01,  1.6016e+00, -2.6719e+00,
+                                7.7734e-01, -1.3516e+00, -1.9434e-01, -7.1289e-02, -5.0938e+00,
+                                2.4316e-01,  4.7266e-01,  4.6387e-02, -6.6406e-01, -2.1973e-01,
+                                -6.7578e-01, -1.5723e-01,  9.5312e-01, -2.0020e-01, -1.7109e+00,
+                                5.8984e-01, -5.7422e-01,  5.1562e-01,  2.8320e-01,  1.4551e-01,
+                                -1.8750e-01, -5.9814e-02,  3.6719e-01, -1.0059e-01, -1.5723e-01,
+                                2.0605e-01, -4.3359e-01, -8.2812e-01,  4.5654e-02, -6.6016e-01,
+                                1.4844e-01,  9.4727e-02,  3.8477e-01, -1.2578e+00, -3.3203e-01,
+                                -8.5547e-01,  4.3359e-01,  4.2383e-01, -8.9453e-01, -5.0391e-01,
+                                -5.6152e-02, -2.9219e+00, -2.4658e-02,  5.0391e-01,  9.8438e-01,
+                                7.2754e-02, -2.1582e-01,  6.3672e-01,  1.0000e+00]]], device=device).movedim(-1, 1).repeat(1, 1, length)  # (1, 1, 64) -> (1, 64, 1) -> tiled to (1, 64, length)
+    silence_latent[:, :, :head.shape[-1]] = head[:, :, :length]  # overwrite the first frames with the head; head is truncated so length < 4 no longer raises a shape mismatch
+    return silence_latent  # freshly allocated tensor, shape (1, 64, length)
+
+
 def get_layer_class(operations, layer_name):
    if operations is not None and hasattr(operations, layer_name):
        return getattr(operations, layer_name)
@@ -183,7 +244,7 @@ class AceStepAttention(nn.Module):
            else:
                attn_bias = window_bias

-        attn_output = optimized_attention(query_states, key_states, value_states, self.num_heads, attn_bias, skip_reshape=True)
+        attn_output = optimized_attention(query_states, key_states, value_states, self.num_heads, attn_bias, skip_reshape=True, low_precision_attention=False)
        attn_output = self.o_proj(attn_output)

        return attn_output
@@ -677,7 +738,7 @@ class AttentionPooler(nn.Module):
    def forward(self, x):
        B, T, P, D = x.shape
        x = self.embed_tokens(x)
-        special = self.special_token.expand(B, T, 1, -1)
+        special = comfy.model_management.cast_to(self.special_token, device=x.device, dtype=x.dtype).expand(B, T, 1, -1)
        x = torch.cat([special, x], dim=2)
        x = x.view(B * T, P + 1, D)

@@ -728,7 +789,7 @@ class FSQ(nn.Module):
        self.register_buffer('implicit_codebook', implicit_codebook, persistent=False)

    def bound(self, z):
-        levels_minus_1 = (self._levels - 1).to(z.dtype)
+        levels_minus_1 = (comfy.model_management.cast_to(self._levels, device=z.device, dtype=z.dtype) - 1)
        scale = 2. / levels_minus_1
        bracket = (levels_minus_1 * (torch.tanh(z) + 1) / 2.) + 0.5

@@ -743,8 +804,8 @@ class FSQ(nn.Module):
        return codes_non_centered.float() * (2. / (self._levels.float() - 1)) - 1.

    def codes_to_indices(self, zhat):
-        zhat_normalized = (zhat + 1.) / (2. / (self._levels.to(zhat.dtype) - 1))
-        return (zhat_normalized * self._basis.to(zhat.dtype)).sum(dim=-1).round().to(torch.int32)
+        zhat_normalized = (zhat + 1.) / (2. / (comfy.model_management.cast_to(self._levels, device=zhat.device, dtype=zhat.dtype) - 1))  # equals (zhat + 1) * (levels - 1) / 2; _levels is cast to zhat's device and dtype first
+        return (zhat_normalized * comfy.model_management.cast_to(self._basis, device=zhat.device, dtype=zhat.dtype)).sum(dim=-1).round().to(torch.int32)  # weight by _basis, sum over the last dim, round, return int32 indices

    def forward(self, z):
        orig_dtype = z.dtype
@@ -826,7 +887,7 @@ class ResidualFSQ(nn.Module):
        x = self.project_in(x)

        if hasattr(self, 'soft_clamp_input_value'):
-            sc_val = self.soft_clamp_input_value.to(x.dtype)
+            sc_val = comfy.model_management.cast_to(self.soft_clamp_input_value, device=x.device, dtype=x.dtype)
            x = (x / sc_val).tanh() * sc_val

        quantized_out = torch.tensor(0., device=x.device, dtype=x.dtype)
@@ -834,7 +895,7 @@ class ResidualFSQ(nn.Module):
        all_indices = []

        for layer, scale in zip(self.layers, self.scales):
-            scale = scale.to(residual.dtype)
+            scale = comfy.model_management.cast_to(scale, device=x.device, dtype=x.dtype)

            quantized, indices = layer(residual / scale)
            quantized = quantized * scale
@@ -1035,28 +1096,26 @@ class AceStepConditionGenerationModel(nn.Module):
                    audio_codes = torch.nn.functional.pad(audio_codes, (0, math.ceil(src_latents.shape[1] / 5) - audio_codes.shape[1]), "constant", 35847)
                lm_hints_5Hz = self.tokenizer.quantizer.get_output_from_indices(audio_codes, dtype=text_hidden_states.dtype)
            else:
-                assert False
-                # TODO ?
+                lm_hints_5Hz, indices = self.tokenizer.tokenize(refer_audio_acoustic_hidden_states_packed)

            lm_hints = self.detokenizer(lm_hints_5Hz)

        lm_hints = lm_hints[:, :src_latents.shape[1], :]
-        if is_covers is None:
+        if is_covers is None or is_covers is True:
            src_latents = lm_hints
-        else:
-            src_latents = torch.where(is_covers.unsqueeze(-1).unsqueeze(-1) > 0, lm_hints, src_latents)
+        elif is_covers is False:
+            src_latents = refer_audio_acoustic_hidden_states_packed

        context_latents = torch.cat([src_latents, chunk_masks.to(src_latents.dtype)], dim=-1)

        return encoder_hidden, encoder_mask, context_latents

-    def forward(self, x, timestep, context, lyric_embed=None, refer_audio=None, audio_codes=None, **kwargs):
+    def forward(self, x, timestep, context, lyric_embed=None, refer_audio=None, audio_codes=None, is_covers=None, **kwargs):
        text_attention_mask = None
        lyric_attention_mask = None
        refer_audio_order_mask = None
        attention_mask = None
        chunk_masks = None
-        is_covers = None
        src_latents = None
        precomputed_lm_hints_25Hz = None
        lyric_hidden_states = lyric_embed
@@ -1068,7 +1127,7 @@ class AceStepConditionGenerationModel(nn.Module):
        if refer_audio_order_mask is None:
            refer_audio_order_mask = torch.zeros((x.shape[0],), device=x.device, dtype=torch.long)

-        if src_latents is None and is_covers is None:
+        if src_latents is None:
            src_latents = x

        if chunk_masks is None:
--- a/comfy/ldm/modules/attention.py
+++ b/comfy/ldm/modules/attention.py
@@ -524,6 +524,9 @@ def attention_pytorch(q, k, v, heads, mask=None, attn_precision=None, skip_resha

@wrap_attn
 def attention_sage(q, k, v, heads, mask=None, attn_precision=None, skip_reshape=False, skip_output_reshape=False, **kwargs):
+    if kwargs.get("low_precision_attention", True) is False:
+        return attention_pytorch(q, k, v, heads, mask=mask, skip_reshape=skip_reshape, skip_output_reshape=skip_output_reshape, **kwargs)
+
    exception_fallback = False
    if skip_reshape:
        b, _, _, dim_head = q.shape
--- a/comfy/model_base.py
+++ b/comfy/model_base.py
@@ -147,11 +147,11 @@ class BaseModel(torch.nn.Module):
                self.diffusion_model.to(memory_format=torch.channels_last)
                logging.debug("using channels last mode for diffusion model")
            logging.info("model weight dtype {}, manual cast: {}".format(self.get_dtype(), self.manual_cast_dtype))
+            comfy.model_management.archive_model_dtypes(self.diffusion_model)
+
        self.model_type = model_type
        self.model_sampling = model_sampling(model_config, model_type)

-        comfy.model_management.archive_model_dtypes(self.diffusion_model)
-
        self.adm_channels = unet_config.get("adm_in_channels", None)
        if self.adm_channels is None:
            self.adm_channels = 0
@@ -1548,6 +1548,7 @@ class ACEStep15(BaseModel):
    def extra_conds(self, **kwargs):
        out = super().extra_conds(**kwargs)
        device = kwargs["device"]
+        noise = kwargs["noise"]

        cross_attn = kwargs.get("cross_attn", None)
        if cross_attn is not None:
@@ -1559,27 +1560,22 @@ class ACEStep15(BaseModel):

        refer_audio = kwargs.get("reference_audio_timbre_latents", None)
        if refer_audio is None or len(refer_audio) == 0:
-            refer_audio = torch.tensor([[[-1.3672e-01, -1.5820e-01,  5.8594e-01, -5.7422e-01,  3.0273e-02,
-                                        2.7930e-01, -2.5940e-03, -2.0703e-01, -1.6113e-01, -1.4746e-01,
-                                        -2.7710e-02, -1.8066e-01, -2.9688e-01,  1.6016e+00, -2.6719e+00,
-                                        7.7734e-01, -1.3516e+00, -1.9434e-01, -7.1289e-02, -5.0938e+00,
-                                        2.4316e-01,  4.7266e-01,  4.6387e-02, -6.6406e-01, -2.1973e-01,
-                                        -6.7578e-01, -1.5723e-01,  9.5312e-01, -2.0020e-01, -1.7109e+00,
-                                        5.8984e-01, -5.7422e-01,  5.1562e-01,  2.8320e-01,  1.4551e-01,
-                                        -1.8750e-01, -5.9814e-02,  3.6719e-01, -1.0059e-01, -1.5723e-01,
-                                        2.0605e-01, -4.3359e-01, -8.2812e-01,  4.5654e-02, -6.6016e-01,
-                                        1.4844e-01,  9.4727e-02,  3.8477e-01, -1.2578e+00, -3.3203e-01,
-                                        -8.5547e-01,  4.3359e-01,  4.2383e-01, -8.9453e-01, -5.0391e-01,
-                                        -5.6152e-02, -2.9219e+00, -2.4658e-02,  5.0391e-01,  9.8438e-01,
-                                        7.2754e-02, -2.1582e-01,  6.3672e-01,  1.0000e+00]]], device=device).movedim(-1, 1).repeat(1, 1, 750)
+            refer_audio = comfy.ldm.ace.ace_step15.get_silence_latent(noise.shape[2], device)
+            pass_audio_codes = True
        else:
-            refer_audio = refer_audio[-1]
+            refer_audio = refer_audio[-1][:, :, :noise.shape[2]]
+            out['is_covers'] = comfy.conds.CONDConstant(True)
+            pass_audio_codes = False
+
+        if pass_audio_codes:
+            audio_codes = kwargs.get("audio_codes", None)
+            if audio_codes is not None:
+                out['audio_codes'] = comfy.conds.CONDRegular(torch.tensor(audio_codes, device=device))
+                refer_audio = refer_audio[:, :, :750]
+            else:
+                out['is_covers'] = comfy.conds.CONDConstant(False)
+
        out['refer_audio'] = comfy.conds.CONDRegular(refer_audio)
-
-        audio_codes = kwargs.get("audio_codes", None)
-        if audio_codes is not None:
-            out['audio_codes'] = comfy.conds.CONDRegular(torch.tensor(audio_codes, device=device))
-
        return out

 class Omnigen2(BaseModel):
--- a/comfy/ops.py
+++ b/comfy/ops.py
@@ -54,6 +54,8 @@ try:
            SDPA_BACKEND_PRIORITY.insert(0, SDPBackend.CUDNN_ATTENTION)

            def scaled_dot_product_attention(q, k, v, *args, **kwargs):
+                if q.nelement() < 1024 * 128:  # arbitrary number, for small inputs cudnn attention seems slower
+                    return torch.nn.functional.scaled_dot_product_attention(q, k, v, *args, **kwargs)  # small q: plain call, made outside the sdpa_kernel(SDPA_BACKEND_PRIORITY) context used below
                with sdpa_kernel(SDPA_BACKEND_PRIORITY, set_priority=True):
                    return torch.nn.functional.scaled_dot_product_attention(q, k, v, *args, **kwargs)
        else:
--- a/comfy/sd.py
+++ b/comfy/sd.py
@@ -976,7 +976,7 @@ class VAE:
        if overlap is not None:
            args["overlap"] = overlap

-        if dims == 1:
+        if dims == 1 or self.extra_1d_channel is not None:
            args.pop("tile_y")
            output = self.decode_tiled_1d(samples, **args)
        elif dims == 2:
--- a/comfy/text_encoders/ace15.py
+++ b/comfy/text_encoders/ace15.py
@@ -3,6 +3,7 @@ import comfy.text_encoders.llama
 from comfy import sd1_clip
 import torch
 import math
+import yaml
 import comfy.utils


@@ -101,9 +102,7 @@ def sample_manual_loop_no_classes(
    return output_audio_codes


-def generate_audio_codes(model, positive, negative, min_tokens=1, max_tokens=1024, seed=0):
-    cfg_scale = 2.0
-
+def generate_audio_codes(model, positive, negative, min_tokens=1, max_tokens=1024, seed=0, cfg_scale=2.0, temperature=0.85, top_p=0.9, top_k=0):
    positive = [[token for token, _ in inner_list] for inner_list in positive]
    negative = [[token for token, _ in inner_list] for inner_list in negative]
    positive = positive[0]
@@ -120,34 +119,80 @@ def generate_audio_codes(model, positive, negative, min_tokens=1, max_tokens=102
        positive = [model.special_tokens["pad"]] * pos_pad + positive

    paddings = [pos_pad, neg_pad]
-    return sample_manual_loop_no_classes(model, [positive, negative], paddings, cfg_scale=cfg_scale, seed=seed, min_tokens=min_tokens, max_new_tokens=max_tokens)
+    return sample_manual_loop_no_classes(model, [positive, negative], paddings, cfg_scale=cfg_scale, temperature=temperature, top_p=top_p, top_k=top_k, seed=seed, min_tokens=min_tokens, max_new_tokens=max_tokens)


 class ACE15Tokenizer(sd1_clip.SD1Tokenizer):
    def __init__(self, embedding_directory=None, tokenizer_data={}):
        super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, name="qwen3_06b", tokenizer=Qwen3Tokenizer)

+    def _metas_to_cot(self, *, return_yaml: bool = False, **kwargs) -> str:  # Render the recognised metadata kwargs as YAML, wrapped in <think> tags unless return_yaml is True.
+        user_metas = {
+            k: kwargs.pop(k)  # pops from this call's own **kwargs dict; unrecognised keys are simply ignored
+            for k in ("bpm", "duration", "keyscale", "timesignature", "language", "caption")
+            if k in kwargs
+        }
+        timesignature = user_metas.get("timesignature")
+        if isinstance(timesignature, str) and timesignature.endswith("/4"):
+            user_metas["timesignature"] = timesignature.rsplit("/", 1)[0]  # e.g. "3/4" -> "3": keep only the part before the final "/"
+        user_metas = {
+            k: v if not isinstance(v, str) or not v.isdigit() else int(v)  # digit-only strings become ints; everything else is kept as-is
+            for k, v in user_metas.items()
+            if v not in {"unspecified", None}  # drop unset values; NOTE(review): an unhashable v (e.g. a list) would raise TypeError here
+        }
+        if len(user_metas):
+            meta_yaml = yaml.dump(user_metas, allow_unicode=True, sort_keys=True).strip()  # sort_keys=True gives a stable key order
+        else:
+            meta_yaml = ""  # nothing to report: the <think> block below is emitted with an empty body
+        return f"<think>\n{meta_yaml}\n</think>" if not return_yaml else meta_yaml  # bare YAML only when return_yaml is True
+
+    def _metas_to_cap(self, **kwargs) -> str:  # Format bpm/duration/keyscale/timesignature as "- key: value" lines joined by newlines.
+        use_keys = ("bpm", "duration", "keyscale", "timesignature")  # also fixes the output line order
+        user_metas = { k: kwargs.pop(k, "N/A") for k in use_keys }  # missing keys are reported as "N/A"
+        duration = user_metas["duration"]
+        if duration == "N/A":
+            user_metas["duration"] = "30 seconds"  # fallback text when no duration was passed
+        elif isinstance(duration, (str, int, float)):
+            user_metas["duration"] = f"{math.ceil(float(duration))} seconds"  # rounded up to whole seconds; a non-numeric str raises ValueError from float()
+        else:
+            raise TypeError("Unexpected type for duration key, must be str, int or float")
+        return "\n".join(f"- {k}: {user_metas[k]}" for k in use_keys)  # no trailing newline
+
    def tokenize_with_weights(self, text, return_word_ids=False, **kwargs):
        out = {}
        lyrics = kwargs.get("lyrics", "")
-        bpm = kwargs.get("bpm", 120)
        duration = kwargs.get("duration", 120)
-        keyscale = kwargs.get("keyscale", "C major")
-        timesignature = kwargs.get("timesignature", 2)
-        language = kwargs.get("language", "en")
+        language = kwargs.get("language")  # None when the caller gives no language; rendered as "" in the lyrics prompt below

        seed = kwargs.get("seed", 0)

+        generate_audio_codes = kwargs.get("generate_audio_codes", True)  # local flag only; it shadows the module-level generate_audio_codes() inside this method, which never calls it
+        cfg_scale = kwargs.get("cfg_scale", 2.0)
+        temperature = kwargs.get("temperature", 0.85)
+        top_p = kwargs.get("top_p", 0.9)
+        top_k = kwargs.get("top_k", 0)  # int fallback, matching generate_audio_codes(top_k=0) and the node's Int input
+
+
        duration = math.ceil(duration)
-        meta_lm = 'bpm: {}\nduration: {}\nkeyscale: {}\ntimesignature: {}'.format(bpm, duration, keyscale, timesignature)
-        lm_template = "<|im_start|>system\n# Instruction\nGenerate audio semantic tokens based on the given conditions:\n\n<|im_end|>\n<|im_start|>user\n# Caption\n{}\n{}\n<|im_end|>\n<|im_start|>assistant\n<think>\n{}\n</think>\n\n<|im_end|>\n"
+        kwargs["duration"] = duration  # so both meta helpers below see the rounded-up duration

-        meta_cap = '- bpm: {}\n- timesignature: {}\n- keyscale: {}\n- duration: {}\n'.format(bpm, timesignature, keyscale, duration)
-        out["lm_prompt"] = self.qwen3_06b.tokenize_with_weights(lm_template.format(text, lyrics, meta_lm), disable_weights=True)
-        out["lm_prompt_negative"] = self.qwen3_06b.tokenize_with_weights(lm_template.format(text, lyrics, ""), disable_weights=True)
+        cot_text = self._metas_to_cot(caption = text, **kwargs)  # YAML metadata (including the caption) wrapped in <think> tags
+        meta_cap = self._metas_to_cap(**kwargs)  # "- key: value" lines for the "# Metas" section

-        out["lyrics"] = self.qwen3_06b.tokenize_with_weights("# Languages\n{}\n\n# Lyric{}<|endoftext|><|endoftext|>".format(language, lyrics), return_word_ids, disable_weights=True, **kwargs)
-        out["qwen3_06b"] = self.qwen3_06b.tokenize_with_weights("# Instruction\nGenerate audio semantic tokens based on the given conditions:\n\n# Caption\n{}# Metas\n{}<|endoftext|>\n<|endoftext|>".format(text, meta_cap), return_word_ids, **kwargs)
-        out["lm_metadata"] = {"min_tokens": duration * 5, "seed": seed}
+        lm_template = "<|im_start|>system\n# Instruction\nGenerate audio semantic tokens based on the given conditions:\n\n<|im_end|>\n<|im_start|>user\n# Caption\n{}\n# Lyric\n{}\n<|im_end|>\n<|im_start|>assistant\n{}\n<|im_end|>\n"
+
+        out["lm_prompt"] = self.qwen3_06b.tokenize_with_weights(lm_template.format(text, lyrics, cot_text), disable_weights=True)
+        out["lm_prompt_negative"] = self.qwen3_06b.tokenize_with_weights(lm_template.format(text, lyrics, "<think>\n</think>"), disable_weights=True)  # negative prompt: same template with an empty think block
+
+        out["lyrics"] = self.qwen3_06b.tokenize_with_weights("# Languages\n{}\n\n# Lyric\n{}<|endoftext|><|endoftext|>".format(language if language is not None else "", lyrics), return_word_ids, disable_weights=True, **kwargs)
+        out["qwen3_06b"] = self.qwen3_06b.tokenize_with_weights("# Instruction\nGenerate audio semantic tokens based on the given conditions:\n\n# Caption\n{}\n# Metas\n{}\n<|endoftext|>\n<|endoftext|>".format(text, meta_cap), return_word_ids, **kwargs)
+        out["lm_metadata"] = {"min_tokens": duration * 5,  # plain values stored alongside the token lists under "lm_metadata"
+                              "seed": seed,
+                              "generate_audio_codes": generate_audio_codes,
+                              "cfg_scale": cfg_scale,
+                              "temperature": temperature,
+                              "top_p": top_p,
+                              "top_k": top_k,
+                              }
        return out


@@ -203,10 +248,14 @@ class ACE15TEModel(torch.nn.Module):
        self.qwen3_06b.set_clip_options({"layer": [0]})
        lyrics_embeds, _, extra_l = self.qwen3_06b.encode_token_weights(token_weight_pairs_lyrics)

-        lm_metadata = token_weight_pairs["lm_metadata"]
-        audio_codes = generate_audio_codes(getattr(self, self.lm_model, self.qwen3_06b), token_weight_pairs["lm_prompt"], token_weight_pairs["lm_prompt_negative"], min_tokens=lm_metadata["min_tokens"], max_tokens=lm_metadata["min_tokens"], seed=lm_metadata["seed"])
+        out = {"conditioning_lyrics": lyrics_embeds[:, 0]}

-        return base_out, None, {"conditioning_lyrics": lyrics_embeds[:, 0], "audio_codes": [audio_codes]}
+        lm_metadata = token_weight_pairs["lm_metadata"]
+        if lm_metadata["generate_audio_codes"]:
+            audio_codes = generate_audio_codes(getattr(self, self.lm_model, self.qwen3_06b), token_weight_pairs["lm_prompt"], token_weight_pairs["lm_prompt_negative"], min_tokens=lm_metadata["min_tokens"], max_tokens=lm_metadata["min_tokens"], seed=lm_metadata["seed"], cfg_scale=lm_metadata["cfg_scale"], temperature=lm_metadata["temperature"], top_p=lm_metadata["top_p"], top_k=lm_metadata["top_k"])
+            out["audio_codes"] = [audio_codes]
+
+        return base_out, None, out

    def set_clip_options(self, options):
        self.qwen3_06b.set_clip_options(options)
--- a/comfy_api/latest/_io.py
+++ b/comfy_api/latest/_io.py
@@ -1309,7 +1309,6 @@ class NodeInfoV1:
    api_node: bool=None
    price_badge: dict | None = None
    search_aliases: list[str]=None
-    main_category: str=None


@dataclass
@@ -1431,8 +1430,6 @@ class Schema:
    """Flags a node as expandable, allowing NodeOutput to include 'expand' property."""
    accept_all_inputs: bool=False
    """When True, all inputs from the prompt will be passed to the node as kwargs, even if not defined in the schema."""
-    main_category: str | None = None
-    """Optional main category for top-level tabs in the node library (e.g., 'Basic', 'Image Tools', 'Partner Nodes')."""

    def validate(self):
        '''Validate the schema:
@@ -1539,7 +1536,6 @@ class Schema:
            python_module=getattr(cls, "RELATIVE_PYTHON_MODULE", "nodes"),
            price_badge=self.price_badge.as_dict(self.inputs) if self.price_badge is not None else None,
            search_aliases=self.search_aliases if self.search_aliases else None,
-            main_category=self.main_category,
        )
        return info

--- a/comfy_api_nodes/nodes_hunyuan3d.py
+++ b/comfy_api_nodes/nodes_hunyuan3d.py
@@ -37,7 +37,6 @@ class TencentTextToModelNode(IO.ComfyNode):
            node_id="TencentTextToModelNode",
            display_name="Hunyuan3D: Text to Model (Pro)",
            category="api node/3d/Tencent",
-            main_category="3D",
            inputs=[
                IO.Combo.Input(
                    "model",
@@ -148,7 +147,6 @@ class TencentImageToModelNode(IO.ComfyNode):
            node_id="TencentImageToModelNode",
            display_name="Hunyuan3D: Image(s) to Model (Pro)",
            category="api node/3d/Tencent",
-            main_category="3D",
            inputs=[
                IO.Combo.Input(
                    "model",
--- a/comfy_api_nodes/nodes_kling.py
+++ b/comfy_api_nodes/nodes_kling.py
@@ -1936,7 +1936,6 @@ class KlingLipSyncAudioToVideoNode(IO.ComfyNode):
            node_id="KlingLipSyncAudioToVideoNode",
            display_name="Kling Lip Sync Video with Audio",
            category="api node/video/Kling",
-            main_category="Video Generation",
            description="Kling Lip Sync Audio to Video Node. Syncs mouth movements in a video file to the audio content of an audio file. When using, ensure that the audio contains clearly distinguishable vocals and that the video contains a distinct face. The audio file should not be larger than 5MB. The video file should not be larger than 100MB, should have height/width between 720px and 1920px, and should be between 2s and 10s in length.",
            inputs=[
                IO.Video.Input("video"),
--- a/comfy_api_nodes/nodes_openai.py
+++ b/comfy_api_nodes/nodes_openai.py
@@ -576,7 +576,6 @@ class OpenAIChatNode(IO.ComfyNode):
            node_id="OpenAIChatNode",
            display_name="OpenAI ChatGPT",
            category="api node/text/OpenAI",
-            main_category="Text Generation",
            description="Generate text responses from an OpenAI model.",
            inputs=[
                IO.String.Input(
--- a/comfy_api_nodes/nodes_recraft.py
+++ b/comfy_api_nodes/nodes_recraft.py
@@ -963,7 +963,6 @@ class RecraftRemoveBackgroundNode(IO.ComfyNode):
            node_id="RecraftRemoveBackgroundNode",
            display_name="Recraft Remove Background",
            category="api node/image/Recraft",
-            main_category="Image Tools",
            description="Remove background from image, and return processed image and mask.",
            inputs=[
                IO.Image.Input("image"),
--- a/comfy_api_nodes/nodes_stability.py
+++ b/comfy_api_nodes/nodes_stability.py
@@ -624,7 +624,6 @@ class StabilityTextToAudio(IO.ComfyNode):
            node_id="StabilityTextToAudio",
            display_name="Stability AI Text To Audio",
            category="api node/audio/Stability AI",
-            main_category="Audio",
            description=cleandoc(cls.__doc__ or ""),
            inputs=[
                IO.Combo.Input(
--- a/comfy_extras/nodes_ace.py
+++ b/comfy_extras/nodes_ace.py
@@ -44,13 +44,18 @@ class TextEncodeAceStepAudio15(io.ComfyNode):
                io.Combo.Input("timesignature", options=['2', '3', '4', '6']),
                io.Combo.Input("language", options=["en", "ja", "zh", "es", "de", "fr", "pt", "ru", "it", "nl", "pl", "tr", "vi", "cs", "fa", "id", "ko", "uk", "hu", "ar", "sv", "ro", "el"]),
                io.Combo.Input("keyscale", options=[f"{root} {quality}" for quality in ["major", "minor"] for root in ["C", "C#", "Db", "D", "D#", "Eb", "E", "F", "F#", "Gb", "G", "G#", "Ab", "A", "A#", "Bb", "B"]]),
+                io.Boolean.Input("generate_audio_codes", default=True, tooltip="Enable the LLM that generates audio codes. This can be slow but will increase the quality of the generated audio. Turn this off if you are giving the model an audio reference.", advanced=True),
+                io.Float.Input("cfg_scale", default=2.0, min=0.0, max=100.0, step=0.1, advanced=True),
+                io.Float.Input("temperature", default=0.85, min=0.0, max=2.0, step=0.01, advanced=True),
+                io.Float.Input("top_p", default=0.9, min=0.0, max=2000.0, step=0.01, advanced=True),
+                io.Int.Input("top_k", default=0, min=0, max=100, advanced=True),
            ],
            outputs=[io.Conditioning.Output()],
        )

    @classmethod
-    def execute(cls, clip, tags, lyrics, seed, bpm, duration, timesignature, language, keyscale) -> io.NodeOutput:
-        tokens = clip.tokenize(tags, lyrics=lyrics, bpm=bpm, duration=duration, timesignature=int(timesignature), language=language, keyscale=keyscale, seed=seed)
+    def execute(cls, clip, tags, lyrics, seed, bpm, duration, timesignature, language, keyscale, generate_audio_codes, cfg_scale, temperature, top_p, top_k) -> io.NodeOutput:  # Tokenize tags, lyrics and metadata with `clip`, encode them, and return the conditioning.
+        tokens = clip.tokenize(tags, lyrics=lyrics, bpm=bpm, duration=duration, timesignature=int(timesignature), language=language, keyscale=keyscale, seed=seed, generate_audio_codes=generate_audio_codes, cfg_scale=cfg_scale, temperature=temperature, top_p=top_p, top_k=top_k)  # timesignature arrives as a combo string and is converted to int; all other inputs are forwarded unchanged as keyword arguments
        conditioning = clip.encode_from_tokens_scheduled(tokens)
        return io.NodeOutput(conditioning)

@@ -100,14 +105,15 @@ class EmptyAceStep15LatentAudio(io.ComfyNode):
        latent = torch.zeros([batch_size, 64, length], device=comfy.model_management.intermediate_device())
        return io.NodeOutput({"samples": latent, "type": "audio"})

-class ReferenceTimbreAudio(io.ComfyNode):
+class ReferenceAudio(io.ComfyNode):
    @classmethod
    def define_schema(cls):
        return io.Schema(
            node_id="ReferenceTimbreAudio",
+            display_name="Reference Audio",
            category="advanced/conditioning/audio",
            is_experimental=True,
-            description="This node sets the reference audio for timbre (for ace step 1.5)",
+            description="This node sets the reference audio for ace step 1.5",
            inputs=[
                io.Conditioning.Input("conditioning"),
                io.Latent.Input("latent", optional=True),
@@ -131,7 +137,7 @@ class AceExtension(ComfyExtension):
            EmptyAceStepLatentAudio,
            TextEncodeAceStepAudio15,
            EmptyAceStep15LatentAudio,
-            ReferenceTimbreAudio,
+            ReferenceAudio,
        ]

 async def comfy_entrypoint() -> AceExtension:
--- a/comfy_extras/nodes_audio.py
+++ b/comfy_extras/nodes_audio.py
@@ -94,6 +94,19 @@ class VAEEncodeAudio(IO.ComfyNode):
    encode = execute  # TODO: remove


+def vae_decode_audio(vae, samples, tile=None, overlap=None):
+    if tile is not None:
+        audio = vae.decode_tiled(samples["samples"], tile_y=tile, overlap=overlap).movedim(-1, 1)
+    else:
+        audio = vae.decode(samples["samples"]).movedim(-1, 1)
+
+    std = torch.std(audio, dim=[1, 2], keepdim=True) * 5.0
+    std[std < 1.0] = 1.0
+    audio /= std
+    vae_sample_rate = getattr(vae, "audio_sample_rate", 44100)
+    return {"waveform": audio, "sample_rate": vae_sample_rate if "sample_rate" not in samples else samples["sample_rate"]}
+
+
 class VAEDecodeAudio(IO.ComfyNode):
    @classmethod
    def define_schema(cls):
@@ -111,16 +124,33 @@ class VAEDecodeAudio(IO.ComfyNode):

    @classmethod
    def execute(cls, vae, samples) -> IO.NodeOutput:
-        audio = vae.decode(samples["samples"]).movedim(-1, 1)
-        std = torch.std(audio, dim=[1,2], keepdim=True) * 5.0
-        std[std < 1.0] = 1.0
-        audio /= std
-        vae_sample_rate = getattr(vae, "audio_sample_rate", 44100)
-        return IO.NodeOutput({"waveform": audio, "sample_rate": vae_sample_rate if "sample_rate" not in samples else samples["sample_rate"]})
+        return IO.NodeOutput(vae_decode_audio(vae, samples))

    decode = execute  # TODO: remove


+class VAEDecodeAudioTiled(IO.ComfyNode):
+    @classmethod
+    def define_schema(cls):
+        return IO.Schema(
+            node_id="VAEDecodeAudioTiled",
+            search_aliases=["latent to audio"],
+            display_name="VAE Decode Audio (Tiled)",
+            category="latent/audio",
+            inputs=[
+                IO.Latent.Input("samples"),
+                IO.Vae.Input("vae"),
+                IO.Int.Input("tile_size", default=512, min=32, max=8192, step=8),
+                IO.Int.Input("overlap", default=64, min=0, max=1024, step=8),
+            ],
+            outputs=[IO.Audio.Output()],
+        )
+
+    @classmethod
+    def execute(cls, vae, samples, tile_size, overlap) -> IO.NodeOutput:
+        return IO.NodeOutput(vae_decode_audio(vae, samples, tile_size, overlap))
+
+
 class SaveAudio(IO.ComfyNode):
    @classmethod
    def define_schema(cls):
@@ -129,7 +159,6 @@ class SaveAudio(IO.ComfyNode):
            search_aliases=["export flac"],
            display_name="Save Audio (FLAC)",
            category="audio",
-            main_category="Audio",
            inputs=[
                IO.Audio.Input("audio"),
                IO.String.Input("filename_prefix", default="audio/ComfyUI"),
@@ -271,7 +300,6 @@ class LoadAudio(IO.ComfyNode):
            search_aliases=["import audio", "open audio", "audio file"],
            display_name="Load Audio",
            category="audio",
-            main_category="Audio",
            inputs=[
                IO.Combo.Input("audio", upload=IO.UploadType.audio, options=sorted(files)),
            ],
@@ -677,6 +705,7 @@ class AudioExtension(ComfyExtension):
            EmptyLatentAudio,
            VAEEncodeAudio,
            VAEDecodeAudio,
+            VAEDecodeAudioTiled,
            SaveAudio,
            SaveAudioMP3,
            SaveAudioOpus,
--- a/comfy_extras/nodes_canny.py
+++ b/comfy_extras/nodes_canny.py
@@ -12,7 +12,6 @@ class Canny(io.ComfyNode):
            node_id="Canny",
            search_aliases=["edge detection", "outline", "contour detection", "line art"],
            category="image/preprocessors",
-            main_category="Image Tools/Preprocessing",
            inputs=[
                io.Image.Input("image"),
                io.Float.Input("low_threshold", default=0.4, min=0.01, max=0.99, step=0.01),
--- a/comfy_extras/nodes_easycache.py
+++ b/comfy_extras/nodes_easycache.py
@@ -9,6 +9,14 @@ if TYPE_CHECKING:
    from uuid import UUID


+def _extract_tensor(data, output_channels):
+    """Extract tensor from data, handling both single tensors and lists."""
+    if isinstance(data, list):
+        # LTX2 AV tensors: [video, audio]
+        return data[0][:, :output_channels], data[1][:, :output_channels]
+    return data[:, :output_channels], None
+
+
 def easycache_forward_wrapper(executor, *args, **kwargs):
    # get values from args
    transformer_options: dict[str] = args[-1]
@@ -17,7 +25,7 @@ def easycache_forward_wrapper(executor, *args, **kwargs):
        if not transformer_options:
            transformer_options = args[-2]
    easycache: EasyCacheHolder = transformer_options["easycache"]
-    x: torch.Tensor = args[0][:, :easycache.output_channels]
+    x, ax = _extract_tensor(args[0], easycache.output_channels)
    sigmas = transformer_options["sigmas"]
    uuids = transformer_options["uuids"]
    if sigmas is not None and easycache.is_past_end_timestep(sigmas):
@@ -35,7 +43,11 @@ def easycache_forward_wrapper(executor, *args, **kwargs):
        if easycache.skip_current_step and can_apply_cache_diff:
            if easycache.verbose:
                logging.info(f"EasyCache [verbose] - was marked to skip this step by {easycache.first_cond_uuid}. Present uuids: {uuids}")
-            return easycache.apply_cache_diff(x, uuids)
+            result = easycache.apply_cache_diff(x, uuids)
+            if ax is not None:
+                result_audio = easycache.apply_cache_diff(ax, uuids, is_audio=True)
+                return [result, result_audio]
+            return result
        if easycache.initial_step:
            easycache.first_cond_uuid = uuids[0]
            has_first_cond_uuid = easycache.has_first_cond_uuid(uuids)
@@ -51,13 +63,18 @@ def easycache_forward_wrapper(executor, *args, **kwargs):
                        logging.info(f"EasyCache [verbose] - skipping step; cumulative_change_rate: {easycache.cumulative_change_rate}, reuse_threshold: {easycache.reuse_threshold}")
                    # other conds should also skip this step, and instead use their cached values
                    easycache.skip_current_step = True
-                    return easycache.apply_cache_diff(x, uuids)
+                    result = easycache.apply_cache_diff(x, uuids)
+                    if ax is not None:
+                        result_audio = easycache.apply_cache_diff(ax, uuids, is_audio=True)
+                        return [result, result_audio]
+                    return result
                else:
                    if easycache.verbose:
                        logging.info(f"EasyCache [verbose] - NOT skipping step; cumulative_change_rate: {easycache.cumulative_change_rate}, reuse_threshold: {easycache.reuse_threshold}")
                    easycache.cumulative_change_rate = 0.0

-    output: torch.Tensor = executor(*args, **kwargs)
+    full_output: torch.Tensor = executor(*args, **kwargs)
+    output, audio_output = _extract_tensor(full_output, easycache.output_channels)
    if has_first_cond_uuid and easycache.has_output_prev_norm():
        output_change = (easycache.subsample(output, uuids, clone=False) - easycache.output_prev_subsampled).flatten().abs().mean()
        if easycache.verbose:
@@ -74,13 +91,15 @@ def easycache_forward_wrapper(executor, *args, **kwargs):
            logging.info(f"EasyCache [verbose] - output_change_rate: {output_change_rate}")
    # TODO: allow cache_diff to be offloaded
    easycache.update_cache_diff(output, next_x_prev, uuids)
+    if audio_output is not None:
+        easycache.update_cache_diff(audio_output, ax, uuids, is_audio=True)
    if has_first_cond_uuid:
        easycache.x_prev_subsampled = easycache.subsample(next_x_prev, uuids)
        easycache.output_prev_subsampled = easycache.subsample(output, uuids)
        easycache.output_prev_norm = output.flatten().abs().mean()
        if easycache.verbose:
            logging.info(f"EasyCache [verbose] - x_prev_subsampled: {easycache.x_prev_subsampled.shape}")
-    return output
+    return full_output

 def lazycache_predict_noise_wrapper(executor, *args, **kwargs):
    # get values from args
@@ -89,8 +108,8 @@ def lazycache_predict_noise_wrapper(executor, *args, **kwargs):
    easycache: LazyCacheHolder = model_options["transformer_options"]["easycache"]
    if easycache.is_past_end_timestep(timestep):
        return executor(*args, **kwargs)
+    x: torch.Tensor = _extract_tensor(args[0], easycache.output_channels)
    # prepare next x_prev
-    x: torch.Tensor = args[0][:, :easycache.output_channels]
    next_x_prev = x
    input_change = None
    do_easycache = easycache.should_do_easycache(timestep)
@@ -197,6 +216,7 @@ class EasyCacheHolder:
        self.output_prev_subsampled: torch.Tensor = None
        self.output_prev_norm: torch.Tensor = None
        self.uuid_cache_diffs: dict[UUID, torch.Tensor] = {}
+        self.uuid_cache_diffs_audio: dict[UUID, torch.Tensor] = {}
        self.output_change_rates = []
        self.approx_output_change_rates = []
        self.total_steps_skipped = 0
@@ -245,20 +265,21 @@ class EasyCacheHolder:
    def can_apply_cache_diff(self, uuids: list[UUID]) -> bool:
        return all(uuid in self.uuid_cache_diffs for uuid in uuids)

-    def apply_cache_diff(self, x: torch.Tensor, uuids: list[UUID]):
-        if self.first_cond_uuid in uuids:
+    def apply_cache_diff(self, x: torch.Tensor, uuids: list[UUID], is_audio: bool = False):
+        if self.first_cond_uuid in uuids and not is_audio:
            self.total_steps_skipped += 1
+        cache_diffs = self.uuid_cache_diffs_audio if is_audio else self.uuid_cache_diffs
        batch_offset = x.shape[0] // len(uuids)
        for i, uuid in enumerate(uuids):
            # slice out only what is relevant to this cond
            batch_slice = [slice(i*batch_offset,(i+1)*batch_offset)]
            # if cached dims don't match x dims, cut off excess and hope for the best (cosmos world2video)
-            if x.shape[1:] != self.uuid_cache_diffs[uuid].shape[1:]:
+            if x.shape[1:] != cache_diffs[uuid].shape[1:]:
                if not self.allow_mismatch:
                    raise ValueError(f"Cached dims {self.uuid_cache_diffs[uuid].shape} don't match x dims {x.shape} - this is no good")
                slicing = []
                skip_this_dim = True
-                for dim_u, dim_x in zip(self.uuid_cache_diffs[uuid].shape, x.shape):
+                for dim_u, dim_x in zip(cache_diffs[uuid].shape, x.shape):
                    if skip_this_dim:
                        skip_this_dim = False
                        continue
@@ -270,10 +291,11 @@ class EasyCacheHolder:
                    else:
                        slicing.append(slice(None))
                batch_slice = batch_slice + slicing
-            x[tuple(batch_slice)] += self.uuid_cache_diffs[uuid].to(x.device)
+            x[tuple(batch_slice)] += cache_diffs[uuid].to(x.device)
        return x

-    def update_cache_diff(self, output: torch.Tensor, x: torch.Tensor, uuids: list[UUID]):
+    def update_cache_diff(self, output: torch.Tensor, x: torch.Tensor, uuids: list[UUID], is_audio: bool = False):
+        cache_diffs = self.uuid_cache_diffs_audio if is_audio else self.uuid_cache_diffs
        # if output dims don't match x dims, cut off excess and hope for the best (cosmos world2video)
        if output.shape[1:] != x.shape[1:]:
            if not self.allow_mismatch:
@@ -293,7 +315,7 @@ class EasyCacheHolder:
        diff = output - x
        batch_offset = diff.shape[0] // len(uuids)
        for i, uuid in enumerate(uuids):
-            self.uuid_cache_diffs[uuid] = diff[i*batch_offset:(i+1)*batch_offset, ...]
+            cache_diffs[uuid] = diff[i*batch_offset:(i+1)*batch_offset, ...]

    def has_first_cond_uuid(self, uuids: list[UUID]) -> bool:
        return self.first_cond_uuid in uuids
@@ -324,6 +346,8 @@ class EasyCacheHolder:
        self.output_prev_norm = None
        del self.uuid_cache_diffs
        self.uuid_cache_diffs = {}
+        del self.uuid_cache_diffs_audio
+        self.uuid_cache_diffs_audio = {}
        self.total_steps_skipped = 0
        self.state_metadata = None
        return self
--- a/comfy_extras/nodes_hunyuan3d.py
+++ b/comfy_extras/nodes_hunyuan3d.py
@@ -621,7 +621,6 @@ class SaveGLB(IO.ComfyNode):
            display_name="Save 3D Model",
            search_aliases=["export 3d model", "save mesh"],
            category="3d",
-            main_category="Basic",
            is_output_node=True,
            inputs=[
                IO.MultiType.Input(
--- a/comfy_extras/nodes_images.py
+++ b/comfy_extras/nodes_images.py
@@ -25,7 +25,6 @@ class ImageCrop(IO.ComfyNode):
            search_aliases=["trim"],
            display_name="Image Crop",
            category="image/transform",
-            main_category="Image Tools",
            inputs=[
                IO.Image.Input("image"),
                IO.Int.Input("width", default=512, min=1, max=nodes.MAX_RESOLUTION, step=1),
@@ -538,7 +537,6 @@ class ImageRotate(IO.ComfyNode):
            node_id="ImageRotate",
            search_aliases=["turn", "flip orientation"],
            category="image/transform",
-            main_category="Image Tools",
            inputs=[
                IO.Image.Input("image"),
                IO.Combo.Input("rotation", options=["none", "90 degrees", "180 degrees", "270 degrees"]),
--- a/comfy_extras/nodes_load_3d.py
+++ b/comfy_extras/nodes_load_3d.py
@@ -31,7 +31,6 @@ class Load3D(IO.ComfyNode):
            node_id="Load3D",
            display_name="Load 3D & Animation",
            category="3d",
-            main_category="Basic",
            is_experimental=True,
            inputs=[
                IO.Combo.Input("model_file", options=sorted(files), upload=IO.UploadType.model),
--- a/comfy_extras/nodes_mahiro.py
+++ b/comfy_extras/nodes_mahiro.py
@@ -10,7 +10,7 @@ class Mahiro(io.ComfyNode):
    def define_schema(cls):
        return io.Schema(
            node_id="Mahiro",
-            display_name="Mahiro CFG",
+            display_name="Similarity-Adaptive Guidance",
            category="_for_testing",
            description="Modify the guidance to scale more on the 'direction' of the positive prompt rather than the difference between the negative prompt.",
            inputs=[
@@ -20,6 +20,7 @@ class Mahiro(io.ComfyNode):
                io.Model.Output(display_name="patched_model"),
            ],
            is_experimental=True,
+            search_aliases=["mahiro", "mahiro cfg"],
        )

    @classmethod
--- a/comfy_extras/nodes_post_processing.py
+++ b/comfy_extras/nodes_post_processing.py
@@ -77,7 +77,6 @@ class Blur(io.ComfyNode):
        return io.Schema(
            node_id="ImageBlur",
            category="image/postprocessing",
-            main_category="Image Tools",
            inputs=[
                io.Image.Input("image"),
                io.Int.Input("blur_radius", default=1, min=1, max=31, step=1),
--- a/comfy_extras/nodes_toolkit.py
+++ b/comfy_extras/nodes_toolkit.py
@@ -0,0 +1,47 @@
+from __future__ import annotations
+from typing_extensions import override
+from comfy_api.latest import ComfyExtension, io
+
+
+class CreateList(io.ComfyNode):
+    @classmethod
+    def define_schema(cls):
+        template_matchtype = io.MatchType.Template("type")
+        template_autogrow = io.Autogrow.TemplatePrefix(
+            input=io.MatchType.Input("input", template=template_matchtype),
+            prefix="input",
+        )
+        return io.Schema(
+            node_id="CreateList",
+            display_name="Create List",
+            category="logic",
+            is_input_list=True,
+            search_aliases=["Image Iterator", "Text Iterator", "Iterator"],
+            inputs=[io.Autogrow.Input("inputs", template=template_autogrow)],
+            outputs=[
+                io.MatchType.Output(
+                    template=template_matchtype,
+                    is_output_list=True,
+                    display_name="list",
+                ),
+            ],
+        )
+
+    @classmethod
+    def execute(cls, inputs: io.Autogrow.Type) -> io.NodeOutput:
+        output_list = []
+        for input in inputs.values():
+            output_list += input
+        return io.NodeOutput(output_list)
+
+
+class ToolkitExtension(ComfyExtension):
+    @override
+    async def get_node_list(self) -> list[type[io.ComfyNode]]:
+        return [
+            CreateList,
+        ]
+
+
+async def comfy_entrypoint() -> ToolkitExtension:
+    return ToolkitExtension()
--- a/comfy_extras/nodes_video.py
+++ b/comfy_extras/nodes_video.py
@@ -73,7 +73,6 @@ class SaveVideo(io.ComfyNode):
            search_aliases=["export video"],
            display_name="Save Video",
            category="image/video",
-            main_category="Basic",
            description="Saves the input images to your ComfyUI output directory.",
            inputs=[
                io.Video.Input("video", tooltip="The video to save."),
@@ -147,7 +146,6 @@ class GetVideoComponents(io.ComfyNode):
            search_aliases=["extract frames", "split video", "video to images", "demux"],
            display_name="Get Video Components",
            category="image/video",
-            main_category="Video Tools",
            description="Extracts all components from a video: frames, audio, and framerate.",
            inputs=[
                io.Video.Input("video", tooltip="The video to extract components from."),
@@ -176,7 +174,6 @@ class LoadVideo(io.ComfyNode):
            search_aliases=["import video", "open video", "video file"],
            display_name="Load Video",
            category="image/video",
-            main_category="Basic",
            inputs=[
                io.Combo.Input("file", options=sorted(files), upload=io.UploadType.video),
            ],
--- a/comfyui_version.py
+++ b/comfyui_version.py
@@ -1,3 +1,3 @@
 # This file is automatically generated by the build process when version is
 # updated in pyproject.toml.
-__version__ = "0.12.2"
+__version__ = "0.12.3"
--- a/nodes.py
+++ b/nodes.py
@@ -69,7 +69,6 @@ class CLIPTextEncode(ComfyNodeABC):
    FUNCTION = "encode"

    CATEGORY = "conditioning"
-    MAIN_CATEGORY = "Basic"
    DESCRIPTION = "Encodes a text prompt using a CLIP model into an embedding that can be used to guide the diffusion model towards generating specific images."
    SEARCH_ALIASES = ["text", "prompt", "text prompt", "positive prompt", "negative prompt", "encode text", "text encoder", "encode prompt"]

@@ -668,8 +667,6 @@ class CLIPSetLastLayer:
        return (clip,)

 class LoraLoader:
-    MAIN_CATEGORY = "Image Generation"
-
    def __init__(self):
        self.loaded_lora = None

@@ -1651,7 +1648,6 @@ class SaveImage:
    OUTPUT_NODE = True

    CATEGORY = "image"
-    MAIN_CATEGORY = "Basic"
    DESCRIPTION = "Saves the input images to your ComfyUI output directory."
    SEARCH_ALIASES = ["save", "save image", "export image", "output image", "write image", "download"]

@@ -1710,7 +1706,6 @@ class LoadImage:
                }

    CATEGORY = "image"
-    MAIN_CATEGORY = "Basic"
    SEARCH_ALIASES = ["load image", "open image", "import image", "image input", "upload image", "read image", "image loader"]

    RETURN_TYPES = ("IMAGE", "MASK")
@@ -1868,7 +1863,6 @@ class ImageScale:
    FUNCTION = "upscale"

    CATEGORY = "image/upscaling"
-    MAIN_CATEGORY = "Image Tools"
    SEARCH_ALIASES = ["resize", "resize image", "scale image", "image resize", "zoom", "zoom in", "change size"]

    def upscale(self, image, upscale_method, width, height, crop):
@@ -1908,7 +1902,6 @@ class ImageScaleBy:

 class ImageInvert:
    SEARCH_ALIASES = ["reverse colors"]
-    MAIN_CATEGORY = "Image Tools"

    @classmethod
    def INPUT_TYPES(s):
@@ -1925,7 +1918,6 @@ class ImageInvert:

 class ImageBatch:
    SEARCH_ALIASES = ["combine images", "merge images", "stack images"]
-    MAIN_CATEGORY = "Image Tools"

    @classmethod
    def INPUT_TYPES(s):
@@ -2441,7 +2433,8 @@ async def init_builtin_extra_nodes():
        "nodes_image_compare.py",
        "nodes_zimage.py",
        "nodes_lora_debug.py",
-        "nodes_color.py"
+        "nodes_color.py",
+        "nodes_toolkit.py",
    ]

    import_failed = []
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "ComfyUI"
-version = "0.12.2"
+version = "0.12.3"
 readme = "README.md"
 license = { file = "LICENSE" }
 requires-python = ">=3.10"
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-comfyui-frontend-package==1.37.11
+comfyui-frontend-package==1.38.13
 comfyui-workflow-templates==0.8.31
 comfyui-embedded-docs==0.4.0
 torch
--- a/server.py
+++ b/server.py
@@ -687,10 +687,6 @@ class PromptServer():
                info['api_node'] = obj_class.API_NODE

            info['search_aliases'] = getattr(obj_class, 'SEARCH_ALIASES', [])
-
-            if hasattr(obj_class, 'MAIN_CATEGORY'):
-                info['main_category'] = obj_class.MAIN_CATEGORY
-
            return info

        @routes.get("/object_info")
Author	SHA1	Message	Date
bymyself	6253afa3a9	feat: add search aliases for old mahiro name Amp-Thread-ID: https://ampcode.com/threads/T-019c0d36-8b43-745f-b7b2-e35b53f17fa1	2026-02-06 00:59:06 -08:00
bymyself	d630dde8c8	refactor: rename Mahiro CFG to Similarity-Adaptive Guidance Rename the display name to better describe what the node does: adaptively blends guidance based on cosine similarity between positive and negative conditions. Amp-Thread-ID: https://ampcode.com/threads/T-019c0d36-8b43-745f-b7b2-e35b53f17fa1 Co-authored-by: Amp <amp@ampcode.com>	2026-02-06 00:58:53 -08:00
Jukka Seppänen	a1c101f861	EasyCache: Support LTX2 (#12231)	2026-02-06 00:43:09 -05:00
comfyanonymous	c2d7f07dbf	Fix issue when using disable_unet_model_creation (#12315)	2026-02-05 19:24:09 -05:00
comfyanonymous	458292fef0	Fix some lowvram stuff with ace step 1.5 (#12312)	2026-02-05 19:15:04 -05:00
comfyanonymous	6555dc65b8	Make ace step 1.5 work without the llm. (#12311)	2026-02-05 16:43:45 -05:00
AustinMroz	2b70ab9ad0	Add a Create List node (#12173)	2026-02-05 01:18:21 -05:00
Comfy Org PR Bot	00efcc6cd0	Bump comfyui-frontend-package to 1.38.13 (#12238)	2026-02-05 01:17:37 -05:00
comfyanonymous	cb459573c8	ComfyUI v0.12.3	2026-02-05 01:13:35 -05:00
comfyanonymous	35183543e0	Add VAE tiled decode node for audio. (#12299)	2026-02-05 01:12:04 -05:00
blepping	a246cc02b2	Improvements to ACE-Steps 1.5 text encoding (#12283)	2026-02-05 00:17:37 -05:00
comfyanonymous	a50c32d63f	Disable sage attention on ace step 1.5 (#12297)	2026-02-04 22:15:30 -05:00
comfyanonymous	6125b80979	Add llm sampling options and make reference audio work on ace step 1.5 (#12295)	2026-02-04 21:29:22 -05:00
comfyanonymous	c8fcbd66ee	Try to fix ace text encoder slowness on some configs. (#12290)	2026-02-04 19:37:05 -05:00