From a246cc02b274104d5f656b68ce505354c164aef8 Mon Sep 17 00:00:00 2001
From: blepping <157360029+blepping@users.noreply.github.com>
Date: Wed, 4 Feb 2026 22:17:37 -0700
Subject: [PATCH 01/33] Improvements to ACE-Step 1.5 text encoding (#12283)
---
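The new _metas_to_cot helper serializes the user metadata to YAML and wraps it
in the Qwen3 <think> block that the LM template expects. A minimal standalone
sketch of the intended behavior (same filtering rules, simplified: the real
helper also special-cases an empty metadata dict):

    import yaml

    def metas_to_cot(**kwargs):
        keys = ("bpm", "duration", "keyscale", "timesignature", "language", "caption")
        metas = {k: kwargs[k] for k in keys if k in kwargs}
        ts = metas.get("timesignature")
        if isinstance(ts, str) and ts.endswith("/4"):
            metas["timesignature"] = ts.rsplit("/", 1)[0]  # "3/4" -> "3"
        metas = {k: int(v) if isinstance(v, str) and v.isdigit() else v
                 for k, v in metas.items() if v not in {"unspecified", None}}
        return "<think>\n%s\n</think>" % yaml.dump(
            metas, allow_unicode=True, sort_keys=True).strip()

    # metas_to_cot(bpm="120", duration=30, timesignature="3/4") ->
    # '<think>\nbpm: 120\nduration: 30\ntimesignature: 3\n</think>'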
comfy/text_encoders/ace15.py | 56 +++++++++++++++++++++++++++++-------
1 file changed, 45 insertions(+), 11 deletions(-)
diff --git a/comfy/text_encoders/ace15.py b/comfy/text_encoders/ace15.py
index 74e62733e..00dd5ba90 100644
--- a/comfy/text_encoders/ace15.py
+++ b/comfy/text_encoders/ace15.py
@@ -3,6 +3,7 @@ import comfy.text_encoders.llama
from comfy import sd1_clip
import torch
import math
+import yaml
import comfy.utils
@@ -125,14 +126,43 @@ class ACE15Tokenizer(sd1_clip.SD1Tokenizer):
def __init__(self, embedding_directory=None, tokenizer_data={}):
super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, name="qwen3_06b", tokenizer=Qwen3Tokenizer)
+ def _metas_to_cot(self, *, return_yaml: bool = False, **kwargs) -> str:
+ user_metas = {
+ k: kwargs.pop(k)
+ for k in ("bpm", "duration", "keyscale", "timesignature", "language", "caption")
+ if k in kwargs
+ }
+ timesignature = user_metas.get("timesignature")
+ if isinstance(timesignature, str) and timesignature.endswith("/4"):
+ user_metas["timesignature"] = timesignature.rsplit("/", 1)[0]
+ user_metas = {
+ k: v if not isinstance(v, str) or not v.isdigit() else int(v)
+ for k, v in user_metas.items()
+ if v not in {"unspecified", None}
+ }
+ if len(user_metas):
+ meta_yaml = yaml.dump(user_metas, allow_unicode=True, sort_keys=True).strip()
+ else:
+ meta_yaml = ""
+ return f"<think>\n{meta_yaml}\n</think>" if not return_yaml else meta_yaml
+
+ def _metas_to_cap(self, **kwargs) -> str:
+ use_keys = ("bpm", "duration", "keyscale", "timesignature")
+ user_metas = { k: kwargs.pop(k, "N/A") for k in use_keys }
+ duration = user_metas["duration"]
+ if duration == "N/A":
+ user_metas["duration"] = "30 seconds"
+ elif isinstance(duration, (str, int, float)):
+ user_metas["duration"] = f"{math.ceil(float(duration))} seconds"
+ else:
+ raise TypeError("Unexpected type for duration key, must be str, int or float")
+ return "\n".join(f"- {k}: {user_metas[k]}" for k in use_keys)
+
def tokenize_with_weights(self, text, return_word_ids=False, **kwargs):
out = {}
lyrics = kwargs.get("lyrics", "")
- bpm = kwargs.get("bpm", 120)
duration = kwargs.get("duration", 120)
- keyscale = kwargs.get("keyscale", "C major")
- timesignature = kwargs.get("timesignature", 2)
- language = kwargs.get("language", "en")
+ language = kwargs.get("language")
seed = kwargs.get("seed", 0)
generate_audio_codes = kwargs.get("generate_audio_codes", True)
@@ -141,16 +171,20 @@ class ACE15Tokenizer(sd1_clip.SD1Tokenizer):
top_p = kwargs.get("top_p", 0.9)
top_k = kwargs.get("top_k", 0.0)
+
duration = math.ceil(duration)
- meta_lm = 'bpm: {}\nduration: {}\nkeyscale: {}\ntimesignature: {}'.format(bpm, duration, keyscale, timesignature)
- lm_template = "<|im_start|>system\n# Instruction\nGenerate audio semantic tokens based on the given conditions:\n\n<|im_end|>\n<|im_start|>user\n# Caption\n{}\n{}\n<|im_end|>\n<|im_start|>assistant\n<think>\n{}\n</think>\n\n<|im_end|>\n"
+ kwargs["duration"] = duration
- meta_cap = '- bpm: {}\n- timesignature: {}\n- keyscale: {}\n- duration: {}\n'.format(bpm, timesignature, keyscale, duration)
- out["lm_prompt"] = self.qwen3_06b.tokenize_with_weights(lm_template.format(text, lyrics, meta_lm), disable_weights=True)
- out["lm_prompt_negative"] = self.qwen3_06b.tokenize_with_weights(lm_template.format(text, lyrics, ""), disable_weights=True)
+ cot_text = self._metas_to_cot(caption = text, **kwargs)
+ meta_cap = self._metas_to_cap(**kwargs)
- out["lyrics"] = self.qwen3_06b.tokenize_with_weights("# Languages\n{}\n\n# Lyric{}<|endoftext|><|endoftext|>".format(language, lyrics), return_word_ids, disable_weights=True, **kwargs)
- out["qwen3_06b"] = self.qwen3_06b.tokenize_with_weights("# Instruction\nGenerate audio semantic tokens based on the given conditions:\n\n# Caption\n{}# Metas\n{}<|endoftext|>\n<|endoftext|>".format(text, meta_cap), return_word_ids, **kwargs)
+ lm_template = "<|im_start|>system\n# Instruction\nGenerate audio semantic tokens based on the given conditions:\n\n<|im_end|>\n<|im_start|>user\n# Caption\n{}\n# Lyric\n{}\n<|im_end|>\n<|im_start|>assistant\n{}\n<|im_end|>\n"
+
+ out["lm_prompt"] = self.qwen3_06b.tokenize_with_weights(lm_template.format(text, lyrics, cot_text), disable_weights=True)
+ out["lm_prompt_negative"] = self.qwen3_06b.tokenize_with_weights(lm_template.format(text, lyrics, "\n"), disable_weights=True)
+
+ out["lyrics"] = self.qwen3_06b.tokenize_with_weights("# Languages\n{}\n\n# Lyric\n{}<|endoftext|><|endoftext|>".format(language if language is not None else "", lyrics), return_word_ids, disable_weights=True, **kwargs)
+ out["qwen3_06b"] = self.qwen3_06b.tokenize_with_weights("# Instruction\nGenerate audio semantic tokens based on the given conditions:\n\n# Caption\n{}\n# Metas\n{}\n<|endoftext|>\n<|endoftext|>".format(text, meta_cap), return_word_ids, **kwargs)
out["lm_metadata"] = {"min_tokens": duration * 5,
"seed": seed,
"generate_audio_codes": generate_audio_codes,
From 35183543e004d8b7509c043e7a680bee07171622 Mon Sep 17 00:00:00 2001
From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com>
Date: Wed, 4 Feb 2026 22:12:04 -0800
Subject: [PATCH 02/33] Add VAE tiled decode node for audio. (#12299)
---
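The decode path is factored into vae_decode_audio so the new tiled variant can
share the loudness clamp: the decoded waveform is divided by five times its
per-item standard deviation, floored at 1.0 so quiet audio is never amplified.
In isolation:

    import torch

    def normalize_loudness(audio: torch.Tensor) -> torch.Tensor:
        # audio: [batch, channels, samples]
        std = torch.std(audio, dim=[1, 2], keepdim=True) * 5.0
        std[std < 1.0] = 1.0  # attenuate loud output only, never boost
        return audio / std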
comfy/sd.py | 2 +-
comfy_extras/nodes_audio.py | 43 +++++++++++++++++++++++++++++++------
2 files changed, 38 insertions(+), 7 deletions(-)
diff --git a/comfy/sd.py b/comfy/sd.py
index bc63d6ced..bc9407405 100644
--- a/comfy/sd.py
+++ b/comfy/sd.py
@@ -976,7 +976,7 @@ class VAE:
if overlap is not None:
args["overlap"] = overlap
- if dims == 1:
+ if dims == 1 or self.extra_1d_channel is not None:
args.pop("tile_y")
output = self.decode_tiled_1d(samples, **args)
elif dims == 2:
diff --git a/comfy_extras/nodes_audio.py b/comfy_extras/nodes_audio.py
index bef723dce..b63dd8e97 100644
--- a/comfy_extras/nodes_audio.py
+++ b/comfy_extras/nodes_audio.py
@@ -94,6 +94,19 @@ class VAEEncodeAudio(IO.ComfyNode):
encode = execute # TODO: remove
+def vae_decode_audio(vae, samples, tile=None, overlap=None):
+ if tile is not None:
+ audio = vae.decode_tiled(samples["samples"], tile_y=tile, overlap=overlap).movedim(-1, 1)
+ else:
+ audio = vae.decode(samples["samples"]).movedim(-1, 1)
+
+ std = torch.std(audio, dim=[1, 2], keepdim=True) * 5.0
+ std[std < 1.0] = 1.0
+ audio /= std
+ vae_sample_rate = getattr(vae, "audio_sample_rate", 44100)
+ return {"waveform": audio, "sample_rate": vae_sample_rate if "sample_rate" not in samples else samples["sample_rate"]}
+
+
class VAEDecodeAudio(IO.ComfyNode):
@classmethod
def define_schema(cls):
@@ -111,16 +124,33 @@ class VAEDecodeAudio(IO.ComfyNode):
@classmethod
def execute(cls, vae, samples) -> IO.NodeOutput:
- audio = vae.decode(samples["samples"]).movedim(-1, 1)
- std = torch.std(audio, dim=[1,2], keepdim=True) * 5.0
- std[std < 1.0] = 1.0
- audio /= std
- vae_sample_rate = getattr(vae, "audio_sample_rate", 44100)
- return IO.NodeOutput({"waveform": audio, "sample_rate": vae_sample_rate if "sample_rate" not in samples else samples["sample_rate"]})
+ return IO.NodeOutput(vae_decode_audio(vae, samples))
decode = execute # TODO: remove
+class VAEDecodeAudioTiled(IO.ComfyNode):
+ @classmethod
+ def define_schema(cls):
+ return IO.Schema(
+ node_id="VAEDecodeAudioTiled",
+ search_aliases=["latent to audio"],
+ display_name="VAE Decode Audio (Tiled)",
+ category="latent/audio",
+ inputs=[
+ IO.Latent.Input("samples"),
+ IO.Vae.Input("vae"),
+ IO.Int.Input("tile_size", default=512, min=32, max=8192, step=8),
+ IO.Int.Input("overlap", default=64, min=0, max=1024, step=8),
+ ],
+ outputs=[IO.Audio.Output()],
+ )
+
+ @classmethod
+ def execute(cls, vae, samples, tile_size, overlap) -> IO.NodeOutput:
+ return IO.NodeOutput(vae_decode_audio(vae, samples, tile_size, overlap))
+
+
class SaveAudio(IO.ComfyNode):
@classmethod
def define_schema(cls):
@@ -675,6 +705,7 @@ class AudioExtension(ComfyExtension):
EmptyLatentAudio,
VAEEncodeAudio,
VAEDecodeAudio,
+ VAEDecodeAudioTiled,
SaveAudio,
SaveAudioMP3,
SaveAudioOpus,
From cb459573c8fa025bbf9ecf312f6af376d659f567 Mon Sep 17 00:00:00 2001
From: comfyanonymous
Date: Thu, 5 Feb 2026 01:13:35 -0500
Subject: [PATCH 03/33] ComfyUI v0.12.3
---
comfyui_version.py | 2 +-
pyproject.toml | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/comfyui_version.py b/comfyui_version.py
index 5d296cd1b..706b37763 100644
--- a/comfyui_version.py
+++ b/comfyui_version.py
@@ -1,3 +1,3 @@
# This file is automatically generated by the build process when version is
# updated in pyproject.toml.
-__version__ = "0.12.2"
+__version__ = "0.12.3"
diff --git a/pyproject.toml b/pyproject.toml
index 1ddcc3596..f7925b92a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
[project]
name = "ComfyUI"
-version = "0.12.2"
+version = "0.12.3"
readme = "README.md"
license = { file = "LICENSE" }
requires-python = ">=3.10"
From 00efcc6cd028206ad81a90dec177c9a470a20a2a Mon Sep 17 00:00:00 2001
From: Comfy Org PR Bot
Date: Thu, 5 Feb 2026 15:17:37 +0900
Subject: [PATCH 04/33] Bump comfyui-frontend-package to 1.38.13 (#12238)
---
requirements.txt | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/requirements.txt b/requirements.txt
index 0c401873a..41cc9174b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-comfyui-frontend-package==1.37.11
+comfyui-frontend-package==1.38.13
comfyui-workflow-templates==0.8.31
comfyui-embedded-docs==0.4.0
torch
From 2b70ab9ad0fd6a38b11546a18c546ce40cc176a1 Mon Sep 17 00:00:00 2001
From: AustinMroz
Date: Wed, 4 Feb 2026 22:18:21 -0800
Subject: [PATCH 05/33] Add a Create List node (#12173)
---
comfy_extras/nodes_toolkit.py | 47 +++++++++++++++++++++++++++++++++++
nodes.py | 3 ++-
2 files changed, 49 insertions(+), 1 deletion(-)
create mode 100644 comfy_extras/nodes_toolkit.py
diff --git a/comfy_extras/nodes_toolkit.py b/comfy_extras/nodes_toolkit.py
new file mode 100644
index 000000000..71faf7226
--- /dev/null
+++ b/comfy_extras/nodes_toolkit.py
@@ -0,0 +1,47 @@
+from __future__ import annotations
+from typing_extensions import override
+from comfy_api.latest import ComfyExtension, io
+
+
+class CreateList(io.ComfyNode):
+ @classmethod
+ def define_schema(cls):
+ template_matchtype = io.MatchType.Template("type")
+ template_autogrow = io.Autogrow.TemplatePrefix(
+ input=io.MatchType.Input("input", template=template_matchtype),
+ prefix="input",
+ )
+ return io.Schema(
+ node_id="CreateList",
+ display_name="Create List",
+ category="logic",
+ is_input_list=True,
+ search_aliases=["Image Iterator", "Text Iterator", "Iterator"],
+ inputs=[io.Autogrow.Input("inputs", template=template_autogrow)],
+ outputs=[
+ io.MatchType.Output(
+ template=template_matchtype,
+ is_output_list=True,
+ display_name="list",
+ ),
+ ],
+ )
+
+ @classmethod
+ def execute(cls, inputs: io.Autogrow.Type) -> io.NodeOutput:
+ output_list = []
+ for input in inputs.values():
+ output_list += input
+ return io.NodeOutput(output_list)
+
+
+class ToolkitExtension(ComfyExtension):
+ @override
+ async def get_node_list(self) -> list[type[io.ComfyNode]]:
+ return [
+ CreateList,
+ ]
+
+
+async def comfy_entrypoint() -> ToolkitExtension:
+ return ToolkitExtension()
diff --git a/nodes.py b/nodes.py
index e11a8ed80..91de7a9d7 100644
--- a/nodes.py
+++ b/nodes.py
@@ -2433,7 +2433,8 @@ async def init_builtin_extra_nodes():
"nodes_image_compare.py",
"nodes_zimage.py",
"nodes_lora_debug.py",
- "nodes_color.py"
+ "nodes_color.py",
+ "nodes_toolkit.py",
]
import_failed = []
From 6555dc65b82c5f072dcad87f0dbccb4fc5f85e6b Mon Sep 17 00:00:00 2001
From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com>
Date: Thu, 5 Feb 2026 13:43:45 -0800
Subject: [PATCH 06/33] Make ace step 1.5 work without the llm. (#12311)
---
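get_silence_latent replaces the inline constant previously hardcoded in
model_base.py: it builds a [1, 64, length] latent from a fixed 4-frame lead-in
plus one steady silence frame repeated to the requested length. Shape-wise
(zeros standing in for the calibrated constants):

    import torch

    def silence_like(length: int, device="cpu") -> torch.Tensor:
        head = torch.zeros(1, 64, 4, device=device)   # 4 transient lead-in frames
        frame = torch.zeros(1, 64, 1, device=device)  # steady-state silence frame
        out = frame.repeat(1, 1, length)
        out[:, :, :head.shape[-1]] = head
        return out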
comfy/ldm/ace/ace_step15.py | 72 +++++++++++++++++++++++++++++++++----
comfy/model_base.py | 17 +++------
2 files changed, 70 insertions(+), 19 deletions(-)
diff --git a/comfy/ldm/ace/ace_step15.py b/comfy/ldm/ace/ace_step15.py
index f2b130bc1..7fc7f1e8e 100644
--- a/comfy/ldm/ace/ace_step15.py
+++ b/comfy/ldm/ace/ace_step15.py
@@ -7,6 +7,67 @@ from comfy.ldm.modules.attention import optimized_attention
import comfy.model_management
from comfy.ldm.flux.layers import timestep_embedding
+def get_silence_latent(length, device):
+ head = torch.tensor([[[ 0.5707, 0.0982, 0.6909, -0.5658, 0.6266, 0.6996, -0.1365, -0.1291,
+ -0.0776, -0.1171, -0.2743, -0.8422, -0.1168, 1.5539, -4.6936, 0.7436,
+ -1.1846, -0.2637, 0.6933, -6.7266, 0.0966, -0.1187, -0.3501, -1.1736,
+ 0.0587, -2.0517, -1.3651, 0.7508, -0.2490, -1.3548, -0.1290, -0.7261,
+ 1.1132, -0.3249, 0.2337, 0.3004, 0.6605, -0.0298, -0.1989, -0.4041,
+ 0.2843, -1.0963, -0.5519, 0.2639, -1.0436, -0.1183, 0.0640, 0.4460,
+ -1.1001, -0.6172, -1.3241, 1.1379, 0.5623, -0.1507, -0.1963, -0.4742,
+ -2.4697, 0.5302, 0.5381, 0.4636, -0.1782, -0.0687, 1.0333, 0.4202],
+ [ 0.3040, -0.1367, 0.6200, 0.0665, -0.0642, 0.4655, -0.1187, -0.0440,
+ 0.2941, -0.2753, 0.0173, -0.2421, -0.0147, 1.5603, -2.7025, 0.7907,
+ -0.9736, -0.0682, 0.1294, -5.0707, -0.2167, 0.3302, -0.1513, -0.8100,
+ -0.3894, -0.2884, -0.3149, 0.8660, -0.3817, -1.7061, 0.5824, -0.4840,
+ 0.6938, 0.1859, 0.1753, 0.3081, 0.0195, 0.1403, -0.0754, -0.2091,
+ 0.1251, -0.1578, -0.4968, -0.1052, -0.4554, -0.0320, 0.1284, 0.4974,
+ -1.1889, -0.0344, -0.8313, 0.2953, 0.5445, -0.6249, -0.1595, -0.0682,
+ -3.1412, 0.0484, 0.4153, 0.8260, -0.1526, -0.0625, 0.5366, 0.8473],
+ [ 5.3524e-02, -1.7534e-01, 5.4443e-01, -4.3501e-01, -2.1317e-03,
+ 3.7200e-01, -4.0143e-03, -1.5516e-01, -1.2968e-01, -1.5375e-01,
+ -7.7107e-02, -2.0593e-01, -3.2780e-01, 1.5142e+00, -2.6101e+00,
+ 5.8698e-01, -1.2716e+00, -2.4773e-01, -2.7933e-02, -5.0799e+00,
+ 1.1601e-01, 4.0987e-01, -2.2030e-02, -6.6495e-01, -2.0995e-01,
+ -6.3474e-01, -1.5893e-01, 8.2745e-01, -2.2992e-01, -1.6816e+00,
+ 5.4440e-01, -4.9579e-01, 5.5128e-01, 3.0477e-01, 8.3052e-02,
+ -6.1782e-02, 5.9036e-03, 2.9553e-01, -8.0645e-02, -1.0060e-01,
+ 1.9144e-01, -3.8124e-01, -7.2949e-01, 2.4520e-02, -5.0814e-01,
+ 2.3977e-01, 9.2943e-02, 3.9256e-01, -1.1993e+00, -3.2752e-01,
+ -7.2707e-01, 2.9476e-01, 4.3542e-01, -8.8597e-01, -4.1686e-01,
+ -8.5390e-02, -2.9018e+00, 6.4988e-02, 5.3945e-01, 9.1988e-01,
+ 5.8762e-02, -7.0098e-02, 6.4772e-01, 8.9118e-01],
+ [-3.2225e-02, -1.3195e-01, 5.6411e-01, -5.4766e-01, -5.2170e-03,
+ 3.1425e-01, -5.4367e-02, -1.9419e-01, -1.3059e-01, -1.3660e-01,
+ -9.0984e-02, -1.9540e-01, -2.5590e-01, 1.5440e+00, -2.6349e+00,
+ 6.8273e-01, -1.2532e+00, -1.9810e-01, -2.2793e-02, -5.0506e+00,
+ 1.8818e-01, 5.0109e-01, 7.3546e-03, -6.8771e-01, -3.0676e-01,
+ -7.3257e-01, -1.6687e-01, 9.2232e-01, -1.8987e-01, -1.7267e+00,
+ 5.3355e-01, -5.3179e-01, 4.4953e-01, 2.8820e-01, 1.3012e-01,
+ -2.0943e-01, -1.1348e-01, 3.3929e-01, -1.5069e-01, -1.2919e-01,
+ 1.8929e-01, -3.6166e-01, -8.0756e-01, 6.6387e-02, -5.8867e-01,
+ 1.6978e-01, 1.0134e-01, 3.3877e-01, -1.2133e+00, -3.2492e-01,
+ -8.1237e-01, 3.8101e-01, 4.3765e-01, -8.0596e-01, -4.4531e-01,
+ -4.7513e-02, -2.9266e+00, 1.1741e-03, 4.5123e-01, 9.3075e-01,
+ 5.3688e-02, -1.9621e-01, 6.4530e-01, 9.3870e-01]]], device=device).movedim(-1, 1)
+
+ silence_latent = torch.tensor([[[-1.3672e-01, -1.5820e-01, 5.8594e-01, -5.7422e-01, 3.0273e-02,
+ 2.7930e-01, -2.5940e-03, -2.0703e-01, -1.6113e-01, -1.4746e-01,
+ -2.7710e-02, -1.8066e-01, -2.9688e-01, 1.6016e+00, -2.6719e+00,
+ 7.7734e-01, -1.3516e+00, -1.9434e-01, -7.1289e-02, -5.0938e+00,
+ 2.4316e-01, 4.7266e-01, 4.6387e-02, -6.6406e-01, -2.1973e-01,
+ -6.7578e-01, -1.5723e-01, 9.5312e-01, -2.0020e-01, -1.7109e+00,
+ 5.8984e-01, -5.7422e-01, 5.1562e-01, 2.8320e-01, 1.4551e-01,
+ -1.8750e-01, -5.9814e-02, 3.6719e-01, -1.0059e-01, -1.5723e-01,
+ 2.0605e-01, -4.3359e-01, -8.2812e-01, 4.5654e-02, -6.6016e-01,
+ 1.4844e-01, 9.4727e-02, 3.8477e-01, -1.2578e+00, -3.3203e-01,
+ -8.5547e-01, 4.3359e-01, 4.2383e-01, -8.9453e-01, -5.0391e-01,
+ -5.6152e-02, -2.9219e+00, -2.4658e-02, 5.0391e-01, 9.8438e-01,
+ 7.2754e-02, -2.1582e-01, 6.3672e-01, 1.0000e+00]]], device=device).movedim(-1, 1).repeat(1, 1, length)
+ silence_latent[:, :, :head.shape[-1]] = head
+ return silence_latent
+
+
def get_layer_class(operations, layer_name):
if operations is not None and hasattr(operations, layer_name):
return getattr(operations, layer_name)
@@ -1040,22 +1101,21 @@ class AceStepConditionGenerationModel(nn.Module):
lm_hints = self.detokenizer(lm_hints_5Hz)
lm_hints = lm_hints[:, :src_latents.shape[1], :]
- if is_covers is None:
+ if is_covers is None or is_covers is True:
src_latents = lm_hints
- else:
- src_latents = torch.where(is_covers.unsqueeze(-1).unsqueeze(-1) > 0, lm_hints, src_latents)
+ elif is_covers is False:
+ src_latents = refer_audio_acoustic_hidden_states_packed
context_latents = torch.cat([src_latents, chunk_masks.to(src_latents.dtype)], dim=-1)
return encoder_hidden, encoder_mask, context_latents
- def forward(self, x, timestep, context, lyric_embed=None, refer_audio=None, audio_codes=None, **kwargs):
+ def forward(self, x, timestep, context, lyric_embed=None, refer_audio=None, audio_codes=None, is_covers=None, **kwargs):
text_attention_mask = None
lyric_attention_mask = None
refer_audio_order_mask = None
attention_mask = None
chunk_masks = None
- is_covers = None
src_latents = None
precomputed_lm_hints_25Hz = None
lyric_hidden_states = lyric_embed
@@ -1067,7 +1127,7 @@ class AceStepConditionGenerationModel(nn.Module):
if refer_audio_order_mask is None:
refer_audio_order_mask = torch.zeros((x.shape[0],), device=x.device, dtype=torch.long)
- if src_latents is None and is_covers is None:
+ if src_latents is None:
src_latents = x
if chunk_masks is None:
diff --git a/comfy/model_base.py b/comfy/model_base.py
index a2a34f191..dcbf12074 100644
--- a/comfy/model_base.py
+++ b/comfy/model_base.py
@@ -1560,22 +1560,11 @@ class ACEStep15(BaseModel):
refer_audio = kwargs.get("reference_audio_timbre_latents", None)
if refer_audio is None or len(refer_audio) == 0:
- refer_audio = torch.tensor([[[-1.3672e-01, -1.5820e-01, 5.8594e-01, -5.7422e-01, 3.0273e-02,
- 2.7930e-01, -2.5940e-03, -2.0703e-01, -1.6113e-01, -1.4746e-01,
- -2.7710e-02, -1.8066e-01, -2.9688e-01, 1.6016e+00, -2.6719e+00,
- 7.7734e-01, -1.3516e+00, -1.9434e-01, -7.1289e-02, -5.0938e+00,
- 2.4316e-01, 4.7266e-01, 4.6387e-02, -6.6406e-01, -2.1973e-01,
- -6.7578e-01, -1.5723e-01, 9.5312e-01, -2.0020e-01, -1.7109e+00,
- 5.8984e-01, -5.7422e-01, 5.1562e-01, 2.8320e-01, 1.4551e-01,
- -1.8750e-01, -5.9814e-02, 3.6719e-01, -1.0059e-01, -1.5723e-01,
- 2.0605e-01, -4.3359e-01, -8.2812e-01, 4.5654e-02, -6.6016e-01,
- 1.4844e-01, 9.4727e-02, 3.8477e-01, -1.2578e+00, -3.3203e-01,
- -8.5547e-01, 4.3359e-01, 4.2383e-01, -8.9453e-01, -5.0391e-01,
- -5.6152e-02, -2.9219e+00, -2.4658e-02, 5.0391e-01, 9.8438e-01,
- 7.2754e-02, -2.1582e-01, 6.3672e-01, 1.0000e+00]]], device=device).movedim(-1, 1).repeat(1, 1, noise.shape[2])
+ refer_audio = comfy.ldm.ace.ace_step15.get_silence_latent(noise.shape[2], device)
pass_audio_codes = True
else:
refer_audio = refer_audio[-1][:, :, :noise.shape[2]]
+ out['is_covers'] = comfy.conds.CONDConstant(True)
pass_audio_codes = False
if pass_audio_codes:
@@ -1583,6 +1572,8 @@ class ACEStep15(BaseModel):
if audio_codes is not None:
out['audio_codes'] = comfy.conds.CONDRegular(torch.tensor(audio_codes, device=device))
refer_audio = refer_audio[:, :, :750]
+ else:
+ out['is_covers'] = comfy.conds.CONDConstant(False)
out['refer_audio'] = comfy.conds.CONDRegular(refer_audio)
return out
From 458292fef0077470f5675ba52555e7bb4c28102e Mon Sep 17 00:00:00 2001
From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com>
Date: Thu, 5 Feb 2026 16:15:04 -0800
Subject: [PATCH 07/33] Fix some lowvram stuff with ace step 1.5 (#12312)
---
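All of these hunks are the same lowvram fix: with partial offloading, buffers
like special_token, _levels, _basis or the FSQ scales can still live on the CPU
while activations are on the GPU, so a bare .to(dtype) lands on the wrong
device. cast_to moves device and dtype together, e.g.:

    # before - fails once the buffer is offloaded to CPU:
    scale = scale.to(residual.dtype)
    # after - follows the activation's device as well as its dtype:
    scale = comfy.model_management.cast_to(scale, device=x.device, dtype=x.dtype)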
comfy/ldm/ace/ace_step15.py | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/comfy/ldm/ace/ace_step15.py b/comfy/ldm/ace/ace_step15.py
index 7fc7f1e8e..69338336d 100644
--- a/comfy/ldm/ace/ace_step15.py
+++ b/comfy/ldm/ace/ace_step15.py
@@ -738,7 +738,7 @@ class AttentionPooler(nn.Module):
def forward(self, x):
B, T, P, D = x.shape
x = self.embed_tokens(x)
- special = self.special_token.expand(B, T, 1, -1)
+ special = comfy.model_management.cast_to(self.special_token, device=x.device, dtype=x.dtype).expand(B, T, 1, -1)
x = torch.cat([special, x], dim=2)
x = x.view(B * T, P + 1, D)
@@ -789,7 +789,7 @@ class FSQ(nn.Module):
self.register_buffer('implicit_codebook', implicit_codebook, persistent=False)
def bound(self, z):
- levels_minus_1 = (self._levels - 1).to(z.dtype)
+ levels_minus_1 = (comfy.model_management.cast_to(self._levels, device=z.device, dtype=z.dtype) - 1)
scale = 2. / levels_minus_1
bracket = (levels_minus_1 * (torch.tanh(z) + 1) / 2.) + 0.5
@@ -804,8 +804,8 @@ class FSQ(nn.Module):
return codes_non_centered.float() * (2. / (self._levels.float() - 1)) - 1.
def codes_to_indices(self, zhat):
- zhat_normalized = (zhat + 1.) / (2. / (self._levels.to(zhat.dtype) - 1))
- return (zhat_normalized * self._basis.to(zhat.dtype)).sum(dim=-1).round().to(torch.int32)
+ zhat_normalized = (zhat + 1.) / (2. / (comfy.model_management.cast_to(self._levels, device=zhat.device, dtype=zhat.dtype) - 1))
+ return (zhat_normalized * comfy.model_management.cast_to(self._basis, device=zhat.device, dtype=zhat.dtype)).sum(dim=-1).round().to(torch.int32)
def forward(self, z):
orig_dtype = z.dtype
@@ -887,7 +887,7 @@ class ResidualFSQ(nn.Module):
x = self.project_in(x)
if hasattr(self, 'soft_clamp_input_value'):
- sc_val = self.soft_clamp_input_value.to(x.dtype)
+ sc_val = comfy.model_management.cast_to(self.soft_clamp_input_value, device=x.device, dtype=x.dtype)
x = (x / sc_val).tanh() * sc_val
quantized_out = torch.tensor(0., device=x.device, dtype=x.dtype)
@@ -895,7 +895,7 @@ class ResidualFSQ(nn.Module):
all_indices = []
for layer, scale in zip(self.layers, self.scales):
- scale = scale.to(residual.dtype)
+ scale = comfy.model_management.cast_to(scale, device=x.device, dtype=x.dtype)
quantized, indices = layer(residual / scale)
quantized = quantized * scale
From c2d7f07dbf312ef9034c65102f1a45c4a3355c1a Mon Sep 17 00:00:00 2001
From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com>
Date: Thu, 5 Feb 2026 16:24:09 -0800
Subject: [PATCH 08/33] Fix issue when using disable_unet_model_creation
(#12315)
---
comfy/model_base.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/comfy/model_base.py b/comfy/model_base.py
index dcbf12074..3bb54f59e 100644
--- a/comfy/model_base.py
+++ b/comfy/model_base.py
@@ -147,11 +147,11 @@ class BaseModel(torch.nn.Module):
self.diffusion_model.to(memory_format=torch.channels_last)
logging.debug("using channels last mode for diffusion model")
logging.info("model weight dtype {}, manual cast: {}".format(self.get_dtype(), self.manual_cast_dtype))
+ comfy.model_management.archive_model_dtypes(self.diffusion_model)
+
self.model_type = model_type
self.model_sampling = model_sampling(model_config, model_type)
- comfy.model_management.archive_model_dtypes(self.diffusion_model)
-
self.adm_channels = unet_config.get("adm_in_channels", None)
if self.adm_channels is None:
self.adm_channels = 0
From a1c101f861681ff18df5bdb0605e63c1ba9e8a96 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jukka=20Sepp=C3=A4nen?=
<40791699+kijai@users.noreply.github.com>
Date: Fri, 6 Feb 2026 07:43:09 +0200
Subject: [PATCH 09/33] EasyCache: Support LTX2 (#12231)
---
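LTX2's audio-video model passes latents as a two-element list [video, audio]
rather than a single tensor, so the wrapper now splits the pair with
_extract_tensor and keeps a parallel cache dict for the audio stream
(is_audio=True). A quick check of the helper's contract on dummy tensors:

    import torch

    video = torch.randn(1, 8, 4, 4)
    audio = torch.randn(1, 8, 16)
    v, a = _extract_tensor([video, audio], 4)  # AV list: both streams sliced
    assert v.shape[1] == 4 and a.shape[1] == 4
    v, a = _extract_tensor(video, 4)           # plain tensor: audio is None
    assert a is None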
comfy_extras/nodes_easycache.py | 50 ++++++++++++++++++++++++---------
1 file changed, 37 insertions(+), 13 deletions(-)
diff --git a/comfy_extras/nodes_easycache.py b/comfy_extras/nodes_easycache.py
index 90d730df6..51d1e5b9c 100644
--- a/comfy_extras/nodes_easycache.py
+++ b/comfy_extras/nodes_easycache.py
@@ -9,6 +9,14 @@ if TYPE_CHECKING:
from uuid import UUID
+def _extract_tensor(data, output_channels):
+ """Extract tensor from data, handling both single tensors and lists."""
+ if isinstance(data, list):
+ # LTX2 AV tensors: [video, audio]
+ return data[0][:, :output_channels], data[1][:, :output_channels]
+ return data[:, :output_channels], None
+
+
def easycache_forward_wrapper(executor, *args, **kwargs):
# get values from args
transformer_options: dict[str] = args[-1]
@@ -17,7 +25,7 @@ def easycache_forward_wrapper(executor, *args, **kwargs):
if not transformer_options:
transformer_options = args[-2]
easycache: EasyCacheHolder = transformer_options["easycache"]
- x: torch.Tensor = args[0][:, :easycache.output_channels]
+ x, ax = _extract_tensor(args[0], easycache.output_channels)
sigmas = transformer_options["sigmas"]
uuids = transformer_options["uuids"]
if sigmas is not None and easycache.is_past_end_timestep(sigmas):
@@ -35,7 +43,11 @@ def easycache_forward_wrapper(executor, *args, **kwargs):
if easycache.skip_current_step and can_apply_cache_diff:
if easycache.verbose:
logging.info(f"EasyCache [verbose] - was marked to skip this step by {easycache.first_cond_uuid}. Present uuids: {uuids}")
- return easycache.apply_cache_diff(x, uuids)
+ result = easycache.apply_cache_diff(x, uuids)
+ if ax is not None:
+ result_audio = easycache.apply_cache_diff(ax, uuids, is_audio=True)
+ return [result, result_audio]
+ return result
if easycache.initial_step:
easycache.first_cond_uuid = uuids[0]
has_first_cond_uuid = easycache.has_first_cond_uuid(uuids)
@@ -51,13 +63,18 @@ def easycache_forward_wrapper(executor, *args, **kwargs):
logging.info(f"EasyCache [verbose] - skipping step; cumulative_change_rate: {easycache.cumulative_change_rate}, reuse_threshold: {easycache.reuse_threshold}")
# other conds should also skip this step, and instead use their cached values
easycache.skip_current_step = True
- return easycache.apply_cache_diff(x, uuids)
+ result = easycache.apply_cache_diff(x, uuids)
+ if ax is not None:
+ result_audio = easycache.apply_cache_diff(ax, uuids, is_audio=True)
+ return [result, result_audio]
+ return result
else:
if easycache.verbose:
logging.info(f"EasyCache [verbose] - NOT skipping step; cumulative_change_rate: {easycache.cumulative_change_rate}, reuse_threshold: {easycache.reuse_threshold}")
easycache.cumulative_change_rate = 0.0
- output: torch.Tensor = executor(*args, **kwargs)
+ full_output: torch.Tensor = executor(*args, **kwargs)
+ output, audio_output = _extract_tensor(full_output, easycache.output_channels)
if has_first_cond_uuid and easycache.has_output_prev_norm():
output_change = (easycache.subsample(output, uuids, clone=False) - easycache.output_prev_subsampled).flatten().abs().mean()
if easycache.verbose:
@@ -74,13 +91,15 @@ def easycache_forward_wrapper(executor, *args, **kwargs):
logging.info(f"EasyCache [verbose] - output_change_rate: {output_change_rate}")
# TODO: allow cache_diff to be offloaded
easycache.update_cache_diff(output, next_x_prev, uuids)
+ if audio_output is not None:
+ easycache.update_cache_diff(audio_output, ax, uuids, is_audio=True)
if has_first_cond_uuid:
easycache.x_prev_subsampled = easycache.subsample(next_x_prev, uuids)
easycache.output_prev_subsampled = easycache.subsample(output, uuids)
easycache.output_prev_norm = output.flatten().abs().mean()
if easycache.verbose:
logging.info(f"EasyCache [verbose] - x_prev_subsampled: {easycache.x_prev_subsampled.shape}")
- return output
+ return full_output
def lazycache_predict_noise_wrapper(executor, *args, **kwargs):
# get values from args
@@ -89,8 +108,8 @@ def lazycache_predict_noise_wrapper(executor, *args, **kwargs):
easycache: LazyCacheHolder = model_options["transformer_options"]["easycache"]
if easycache.is_past_end_timestep(timestep):
return executor(*args, **kwargs)
+ x: torch.Tensor = _extract_tensor(args[0], easycache.output_channels)
# prepare next x_prev
- x: torch.Tensor = args[0][:, :easycache.output_channels]
next_x_prev = x
input_change = None
do_easycache = easycache.should_do_easycache(timestep)
@@ -197,6 +216,7 @@ class EasyCacheHolder:
self.output_prev_subsampled: torch.Tensor = None
self.output_prev_norm: torch.Tensor = None
self.uuid_cache_diffs: dict[UUID, torch.Tensor] = {}
+ self.uuid_cache_diffs_audio: dict[UUID, torch.Tensor] = {}
self.output_change_rates = []
self.approx_output_change_rates = []
self.total_steps_skipped = 0
@@ -245,20 +265,21 @@ class EasyCacheHolder:
def can_apply_cache_diff(self, uuids: list[UUID]) -> bool:
return all(uuid in self.uuid_cache_diffs for uuid in uuids)
- def apply_cache_diff(self, x: torch.Tensor, uuids: list[UUID]):
- if self.first_cond_uuid in uuids:
+ def apply_cache_diff(self, x: torch.Tensor, uuids: list[UUID], is_audio: bool = False):
+ if self.first_cond_uuid in uuids and not is_audio:
self.total_steps_skipped += 1
+ cache_diffs = self.uuid_cache_diffs_audio if is_audio else self.uuid_cache_diffs
batch_offset = x.shape[0] // len(uuids)
for i, uuid in enumerate(uuids):
# slice out only what is relevant to this cond
batch_slice = [slice(i*batch_offset,(i+1)*batch_offset)]
# if cached dims don't match x dims, cut off excess and hope for the best (cosmos world2video)
- if x.shape[1:] != self.uuid_cache_diffs[uuid].shape[1:]:
+ if x.shape[1:] != cache_diffs[uuid].shape[1:]:
if not self.allow_mismatch:
raise ValueError(f"Cached dims {self.uuid_cache_diffs[uuid].shape} don't match x dims {x.shape} - this is no good")
slicing = []
skip_this_dim = True
- for dim_u, dim_x in zip(self.uuid_cache_diffs[uuid].shape, x.shape):
+ for dim_u, dim_x in zip(cache_diffs[uuid].shape, x.shape):
if skip_this_dim:
skip_this_dim = False
continue
@@ -270,10 +291,11 @@ class EasyCacheHolder:
else:
slicing.append(slice(None))
batch_slice = batch_slice + slicing
- x[tuple(batch_slice)] += self.uuid_cache_diffs[uuid].to(x.device)
+ x[tuple(batch_slice)] += cache_diffs[uuid].to(x.device)
return x
- def update_cache_diff(self, output: torch.Tensor, x: torch.Tensor, uuids: list[UUID]):
+ def update_cache_diff(self, output: torch.Tensor, x: torch.Tensor, uuids: list[UUID], is_audio: bool = False):
+ cache_diffs = self.uuid_cache_diffs_audio if is_audio else self.uuid_cache_diffs
# if output dims don't match x dims, cut off excess and hope for the best (cosmos world2video)
if output.shape[1:] != x.shape[1:]:
if not self.allow_mismatch:
@@ -293,7 +315,7 @@ class EasyCacheHolder:
diff = output - x
batch_offset = diff.shape[0] // len(uuids)
for i, uuid in enumerate(uuids):
- self.uuid_cache_diffs[uuid] = diff[i*batch_offset:(i+1)*batch_offset, ...]
+ cache_diffs[uuid] = diff[i*batch_offset:(i+1)*batch_offset, ...]
def has_first_cond_uuid(self, uuids: list[UUID]) -> bool:
return self.first_cond_uuid in uuids
@@ -324,6 +346,8 @@ class EasyCacheHolder:
self.output_prev_norm = None
del self.uuid_cache_diffs
self.uuid_cache_diffs = {}
+ del self.uuid_cache_diffs_audio
+ self.uuid_cache_diffs_audio = {}
self.total_steps_skipped = 0
self.state_metadata = None
return self
From eba6c940fd04483fedec6b47bb93fa669e77fe8a Mon Sep 17 00:00:00 2001
From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com>
Date: Fri, 6 Feb 2026 16:14:56 -0800
Subject: [PATCH 10/33] Make ace step 1.5 base model work properly with default
workflow. (#12337)
---
comfy/ldm/ace/ace_step15.py | 5 ++++-
comfy/model_base.py | 2 ++
2 files changed, 6 insertions(+), 1 deletion(-)
diff --git a/comfy/ldm/ace/ace_step15.py b/comfy/ldm/ace/ace_step15.py
index 69338336d..1d7dc59a8 100644
--- a/comfy/ldm/ace/ace_step15.py
+++ b/comfy/ldm/ace/ace_step15.py
@@ -1110,7 +1110,7 @@ class AceStepConditionGenerationModel(nn.Module):
return encoder_hidden, encoder_mask, context_latents
- def forward(self, x, timestep, context, lyric_embed=None, refer_audio=None, audio_codes=None, is_covers=None, **kwargs):
+ def forward(self, x, timestep, context, lyric_embed=None, refer_audio=None, audio_codes=None, is_covers=None, replace_with_null_embeds=False, **kwargs):
text_attention_mask = None
lyric_attention_mask = None
refer_audio_order_mask = None
@@ -1140,6 +1140,9 @@ class AceStepConditionGenerationModel(nn.Module):
src_latents, chunk_masks, is_covers, precomputed_lm_hints_25Hz=precomputed_lm_hints_25Hz, audio_codes=audio_codes
)
+ if replace_with_null_embeds:
+ enc_hidden[:] = self.null_condition_emb.to(enc_hidden)
+
out = self.decoder(hidden_states=x,
timestep=timestep,
timestep_r=timestep,
diff --git a/comfy/model_base.py b/comfy/model_base.py
index 3bb54f59e..3aa345254 100644
--- a/comfy/model_base.py
+++ b/comfy/model_base.py
@@ -1552,6 +1552,8 @@ class ACEStep15(BaseModel):
cross_attn = kwargs.get("cross_attn", None)
if cross_attn is not None:
+ if torch.count_nonzero(cross_attn) == 0:
+ out['replace_with_null_embeds'] = comfy.conds.CONDConstant(True)
out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn)
conditioning_lyrics = kwargs.get("conditioning_lyrics", None)
From a831c19b703693f561e32780248514eeaa9e832e Mon Sep 17 00:00:00 2001
From: asagi4 <130366179+asagi4@users.noreply.github.com>
Date: Sat, 7 Feb 2026 02:38:04 +0200
Subject: [PATCH 11/33] Fix return_word_ids=True with Anima tokenizer (#12328)
---
comfy/text_encoders/anima.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/comfy/text_encoders/anima.py b/comfy/text_encoders/anima.py
index b6f58cb25..fcba097cb 100644
--- a/comfy/text_encoders/anima.py
+++ b/comfy/text_encoders/anima.py
@@ -23,7 +23,7 @@ class AnimaTokenizer:
def tokenize_with_weights(self, text:str, return_word_ids=False, **kwargs):
out = {}
qwen_ids = self.qwen3_06b.tokenize_with_weights(text, return_word_ids, **kwargs)
- out["qwen3_06b"] = [[(token, 1.0) for token, _ in inner_list] for inner_list in qwen_ids] # Set weights to 1.0
+ out["qwen3_06b"] = [[(token, 1.0, id) if return_word_ids else (token, 1.0) for token, _, id in inner_list] for inner_list in qwen_ids] # Set weights to 1.0
out["t5xxl"] = self.t5xxl.tokenize_with_weights(text, return_word_ids, **kwargs)
return out
From 204e65b8dcb2db2014f40e1b4c8def3a00150cde Mon Sep 17 00:00:00 2001
From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com>
Date: Fri, 6 Feb 2026 16:48:20 -0800
Subject: [PATCH 12/33] Fix bug with last pr (#12338)
---
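The previous fix unpacked three values per token entry, which breaks when
return_word_ids=False and the entries are plain (token, weight) pairs;
indexing handles both arities. On toy data:

    rows = [[(101, 0.9), (102, 1.1)]]                   # pairs
    [[(k[0], 1.0) for k in row] for row in rows]        # ok
    rows = [[(101, 0.9, 0), (102, 1.1, 1)]]             # triples
    [[(k[0], 1.0, k[2]) for k in row] for row in rows]  # ok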
comfy/text_encoders/anima.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/comfy/text_encoders/anima.py b/comfy/text_encoders/anima.py
index fcba097cb..d8c5a6f92 100644
--- a/comfy/text_encoders/anima.py
+++ b/comfy/text_encoders/anima.py
@@ -23,7 +23,7 @@ class AnimaTokenizer:
def tokenize_with_weights(self, text:str, return_word_ids=False, **kwargs):
out = {}
qwen_ids = self.qwen3_06b.tokenize_with_weights(text, return_word_ids, **kwargs)
- out["qwen3_06b"] = [[(token, 1.0, id) if return_word_ids else (token, 1.0) for token, _, id in inner_list] for inner_list in qwen_ids] # Set weights to 1.0
+ out["qwen3_06b"] = [[(k[0], 1.0, k[2]) if return_word_ids else (k[0], 1.0) for k in inner_list] for inner_list in qwen_ids] # Set weights to 1.0
out["t5xxl"] = self.t5xxl.tokenize_with_weights(text, return_word_ids, **kwargs)
return out
From 6a263288427a9998086603db0e7078ebcb56f0c4 Mon Sep 17 00:00:00 2001
From: tdrussell
Date: Fri, 6 Feb 2026 19:12:15 -0600
Subject: [PATCH 13/33] Support fp16 for Cosmos-Predict2 and Anima (#12249)
---
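The fp16 strategy, as the in-code comment below explains: this model's
residual stream overflows fp16, so the stream is promoted to fp32 while
self-attention, cross-attention and MLP still compute in fp16. The pattern on
a generic pre-norm block (hypothetical module names, not the Cosmos classes):

    import torch

    def block_forward(x, attn, mlp, norm1, norm2):
        # x: residual stream, kept in float32 to avoid overflow
        compute_dtype = torch.float16
        h = attn(norm1(x).to(compute_dtype))  # heavy math in fp16
        x = x + h.to(torch.float32)           # accumulate in fp32
        h = mlp(norm2(x).to(compute_dtype))
        return x + h.to(torch.float32)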
comfy/ldm/cosmos/predict2.py | 24 +++++++++++++++++-------
comfy/supported_models.py | 4 ++--
2 files changed, 19 insertions(+), 9 deletions(-)
diff --git a/comfy/ldm/cosmos/predict2.py b/comfy/ldm/cosmos/predict2.py
index c270e6333..6491e486b 100644
--- a/comfy/ldm/cosmos/predict2.py
+++ b/comfy/ldm/cosmos/predict2.py
@@ -335,7 +335,7 @@ class FinalLayer(nn.Module):
device=None, dtype=None, operations=None
):
super().__init__()
- self.layer_norm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
+ self.layer_norm = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
self.linear = operations.Linear(
hidden_size, spatial_patch_size * spatial_patch_size * temporal_patch_size * out_channels, bias=False, device=device, dtype=dtype
)
@@ -463,6 +463,8 @@ class Block(nn.Module):
extra_per_block_pos_emb: Optional[torch.Tensor] = None,
transformer_options: Optional[dict] = {},
) -> torch.Tensor:
+ residual_dtype = x_B_T_H_W_D.dtype
+ compute_dtype = emb_B_T_D.dtype
if extra_per_block_pos_emb is not None:
x_B_T_H_W_D = x_B_T_H_W_D + extra_per_block_pos_emb
@@ -512,7 +514,7 @@ class Block(nn.Module):
result_B_T_H_W_D = rearrange(
self.self_attn(
# normalized_x_B_T_HW_D,
- rearrange(normalized_x_B_T_H_W_D, "b t h w d -> b (t h w) d"),
+ rearrange(normalized_x_B_T_H_W_D.to(compute_dtype), "b t h w d -> b (t h w) d"),
None,
rope_emb=rope_emb_L_1_1_D,
transformer_options=transformer_options,
@@ -522,7 +524,7 @@ class Block(nn.Module):
h=H,
w=W,
)
- x_B_T_H_W_D = x_B_T_H_W_D + gate_self_attn_B_T_1_1_D * result_B_T_H_W_D
+ x_B_T_H_W_D = x_B_T_H_W_D + gate_self_attn_B_T_1_1_D.to(residual_dtype) * result_B_T_H_W_D.to(residual_dtype)
def _x_fn(
_x_B_T_H_W_D: torch.Tensor,
@@ -536,7 +538,7 @@ class Block(nn.Module):
)
_result_B_T_H_W_D = rearrange(
self.cross_attn(
- rearrange(_normalized_x_B_T_H_W_D, "b t h w d -> b (t h w) d"),
+ rearrange(_normalized_x_B_T_H_W_D.to(compute_dtype), "b t h w d -> b (t h w) d"),
crossattn_emb,
rope_emb=rope_emb_L_1_1_D,
transformer_options=transformer_options,
@@ -555,7 +557,7 @@ class Block(nn.Module):
shift_cross_attn_B_T_1_1_D,
transformer_options=transformer_options,
)
- x_B_T_H_W_D = result_B_T_H_W_D * gate_cross_attn_B_T_1_1_D + x_B_T_H_W_D
+ x_B_T_H_W_D = result_B_T_H_W_D.to(residual_dtype) * gate_cross_attn_B_T_1_1_D.to(residual_dtype) + x_B_T_H_W_D
normalized_x_B_T_H_W_D = _fn(
x_B_T_H_W_D,
@@ -563,8 +565,8 @@ class Block(nn.Module):
scale_mlp_B_T_1_1_D,
shift_mlp_B_T_1_1_D,
)
- result_B_T_H_W_D = self.mlp(normalized_x_B_T_H_W_D)
- x_B_T_H_W_D = x_B_T_H_W_D + gate_mlp_B_T_1_1_D * result_B_T_H_W_D
+ result_B_T_H_W_D = self.mlp(normalized_x_B_T_H_W_D.to(compute_dtype))
+ x_B_T_H_W_D = x_B_T_H_W_D + gate_mlp_B_T_1_1_D.to(residual_dtype) * result_B_T_H_W_D.to(residual_dtype)
return x_B_T_H_W_D
@@ -876,6 +878,14 @@ class MiniTrainDIT(nn.Module):
"extra_per_block_pos_emb": extra_pos_emb_B_T_H_W_D_or_T_H_W_B_D,
"transformer_options": kwargs.get("transformer_options", {}),
}
+
+ # The residual stream for this model has large values. To make fp16 compute_dtype work, we keep the residual stream
+ # in fp32, but run attention and MLP modules in fp16.
+ # An alternate method that clamps fp16 values "works" in the sense that it makes coherent images, but it causes noticeable
+ # quality degradation and visual artifacts.
+ if x_B_T_H_W_D.dtype == torch.float16:
+ x_B_T_H_W_D = x_B_T_H_W_D.float()
+
for block in self.blocks:
x_B_T_H_W_D = block(
x_B_T_H_W_D,
diff --git a/comfy/supported_models.py b/comfy/supported_models.py
index 77264ed28..56a21b0ef 100644
--- a/comfy/supported_models.py
+++ b/comfy/supported_models.py
@@ -993,7 +993,7 @@ class CosmosT2IPredict2(supported_models_base.BASE):
memory_usage_factor = 1.0
- supported_inference_dtypes = [torch.bfloat16, torch.float32]
+ supported_inference_dtypes = [torch.bfloat16, torch.float16, torch.float32]
def __init__(self, unet_config):
super().__init__(unet_config)
@@ -1023,7 +1023,7 @@ class Anima(supported_models_base.BASE):
memory_usage_factor = 1.0
- supported_inference_dtypes = [torch.bfloat16, torch.float32]
+ supported_inference_dtypes = [torch.bfloat16, torch.float16, torch.float32]
def __init__(self, unet_config):
super().__init__(unet_config)
From 039955c52744909107dc68ade0698a55d81a8886 Mon Sep 17 00:00:00 2001
From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com>
Date: Fri, 6 Feb 2026 17:14:52 -0800
Subject: [PATCH 14/33] Some fixes to previous pr. (#12339)
---
comfy/ldm/cosmos/predict2.py | 2 +-
comfy/supported_models.py | 10 ++++++----
2 files changed, 7 insertions(+), 5 deletions(-)
diff --git a/comfy/ldm/cosmos/predict2.py b/comfy/ldm/cosmos/predict2.py
index 6491e486b..2268bff38 100644
--- a/comfy/ldm/cosmos/predict2.py
+++ b/comfy/ldm/cosmos/predict2.py
@@ -894,6 +894,6 @@ class MiniTrainDIT(nn.Module):
**block_kwargs,
)
- x_B_T_H_W_O = self.final_layer(x_B_T_H_W_D, t_embedding_B_T_D, adaln_lora_B_T_3D=adaln_lora_B_T_3D)
+ x_B_T_H_W_O = self.final_layer(x_B_T_H_W_D.to(crossattn_emb.dtype), t_embedding_B_T_D, adaln_lora_B_T_3D=adaln_lora_B_T_3D)
x_B_C_Tt_Hp_Wp = self.unpatchify(x_B_T_H_W_O)[:, :, :orig_shape[-3], :orig_shape[-2], :orig_shape[-1]]
return x_B_C_Tt_Hp_Wp
diff --git a/comfy/supported_models.py b/comfy/supported_models.py
index 56a21b0ef..d33db7507 100644
--- a/comfy/supported_models.py
+++ b/comfy/supported_models.py
@@ -1025,10 +1025,6 @@ class Anima(supported_models_base.BASE):
supported_inference_dtypes = [torch.bfloat16, torch.float16, torch.float32]
- def __init__(self, unet_config):
- super().__init__(unet_config)
- self.memory_usage_factor = (unet_config.get("model_channels", 2048) / 2048) * 0.95
-
def get_model(self, state_dict, prefix="", device=None):
out = model_base.Anima(self, device=device)
return out
@@ -1038,6 +1034,12 @@ class Anima(supported_models_base.BASE):
detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen3_06b.transformer.".format(pref))
return supported_models_base.ClipTarget(comfy.text_encoders.anima.AnimaTokenizer, comfy.text_encoders.anima.te(**detect))
+ def set_inference_dtype(self, dtype, manual_cast_dtype, **kwargs):
+ self.memory_usage_factor = (self.unet_config.get("model_channels", 2048) / 2048) * 0.95
+ if dtype is torch.float16:
+ self.memory_usage_factor *= 1.4
+ return super().set_inference_dtype(dtype, manual_cast_dtype, **kwargs)
+
class CosmosI2VPredict2(CosmosT2IPredict2):
unet_config = {
"image_model": "cosmos_predict2",
From 17e7df43d19bde49efa46a32b89f5153b9cb0ded Mon Sep 17 00:00:00 2001
From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com>
Date: Fri, 6 Feb 2026 21:02:11 -0800
Subject: [PATCH 15/33] Pad ace step 1.5 ref audio if not long enough. (#12341)
---
comfy/model_base.py | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/comfy/model_base.py b/comfy/model_base.py
index 3aa345254..858789b30 100644
--- a/comfy/model_base.py
+++ b/comfy/model_base.py
@@ -1577,6 +1577,10 @@ class ACEStep15(BaseModel):
else:
out['is_covers'] = comfy.conds.CONDConstant(False)
+ if refer_audio.shape[2] < noise.shape[2]:
+ pad = comfy.ldm.ace.ace_step15.get_silence_latent(noise.shape[2], device)
+ refer_audio = torch.cat([refer_audio.to(pad), pad[:, :, refer_audio.shape[2]:]], dim=2)
+
out['refer_audio'] = comfy.conds.CONDRegular(refer_audio)
return out
From 5ff4fdedba2e72ffabf2948799a1c656d9002b52 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jukka=20Sepp=C3=A4nen?=
<40791699+kijai@users.noreply.github.com>
Date: Sat, 7 Feb 2026 21:25:30 +0200
Subject: [PATCH 16/33] Fix LazyCache (#12344)
---
comfy_extras/nodes_easycache.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/comfy_extras/nodes_easycache.py b/comfy_extras/nodes_easycache.py
index 51d1e5b9c..b1912392c 100644
--- a/comfy_extras/nodes_easycache.py
+++ b/comfy_extras/nodes_easycache.py
@@ -108,7 +108,7 @@ def lazycache_predict_noise_wrapper(executor, *args, **kwargs):
easycache: LazyCacheHolder = model_options["transformer_options"]["easycache"]
if easycache.is_past_end_timestep(timestep):
return executor(*args, **kwargs)
- x: torch.Tensor = _extract_tensor(args[0], easycache.output_channels)
+ x: torch.Tensor = args[0][:, :easycache.output_channels]
# prepare next x_prev
next_x_prev = x
input_change = None
From 9bf5aa54dbda5b5de36812cfc10b123ae0930283 Mon Sep 17 00:00:00 2001
From: chaObserv <154517000+chaObserv@users.noreply.github.com>
Date: Sun, 8 Feb 2026 06:38:51 +0800
Subject: [PATCH 17/33] Add search_aliases to sa-solver and seeds-2 node
(#12327)
---
comfy_extras/nodes_custom_sampler.py | 2 ++
1 file changed, 2 insertions(+)
diff --git a/comfy_extras/nodes_custom_sampler.py b/comfy_extras/nodes_custom_sampler.py
index 8afd13acf..61a234634 100644
--- a/comfy_extras/nodes_custom_sampler.py
+++ b/comfy_extras/nodes_custom_sampler.py
@@ -622,6 +622,7 @@ class SamplerSASolver(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="SamplerSASolver",
+ search_aliases=["sde"],
category="sampling/custom_sampling/samplers",
inputs=[
io.Model.Input("model"),
@@ -666,6 +667,7 @@ class SamplerSEEDS2(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="SamplerSEEDS2",
+ search_aliases=["sde", "exp heun"],
category="sampling/custom_sampling/samplers",
inputs=[
io.Combo.Input("solver_type", options=["phi_1", "phi_2"]),
From 3760d74005a6954f54657dc59d9e57fd4c44b3fd Mon Sep 17 00:00:00 2001
From: ComfyUI Wiki
Date: Sun, 8 Feb 2026 07:34:52 +0800
Subject: [PATCH 18/33] chore: update embedded docs to v0.4.1 (#12346)
---
requirements.txt | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/requirements.txt b/requirements.txt
index 41cc9174b..5e34a2a49 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,6 @@
comfyui-frontend-package==1.38.13
comfyui-workflow-templates==0.8.31
-comfyui-embedded-docs==0.4.0
+comfyui-embedded-docs==0.4.1
torch
torchsde
torchvision
From f350a842611f4d75da7104c2d2965f45989089b9 Mon Sep 17 00:00:00 2001
From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com>
Date: Sat, 7 Feb 2026 16:16:28 -0800
Subject: [PATCH 19/33] Disable prompt weights for ltxv2. (#12354)
---
comfy/text_encoders/lt.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/comfy/text_encoders/lt.py b/comfy/text_encoders/lt.py
index 26573fb12..3f87dfd6a 100644
--- a/comfy/text_encoders/lt.py
+++ b/comfy/text_encoders/lt.py
@@ -25,7 +25,7 @@ def ltxv_te(*args, **kwargs):
class Gemma3_12BTokenizer(sd1_clip.SDTokenizer):
def __init__(self, embedding_directory=None, tokenizer_data={}):
tokenizer = tokenizer_data.get("spiece_model", None)
- super().__init__(tokenizer, pad_with_end=False, embedding_size=3840, embedding_key='gemma3_12b', tokenizer_class=SPieceTokenizer, has_end_token=False, pad_to_max_length=False, max_length=99999999, min_length=1, tokenizer_args={"add_bos": True, "add_eos": False}, tokenizer_data=tokenizer_data)
+ super().__init__(tokenizer, pad_with_end=False, embedding_size=3840, embedding_key='gemma3_12b', tokenizer_class=SPieceTokenizer, has_end_token=False, pad_to_max_length=False, max_length=99999999, min_length=1, disable_weights=True, tokenizer_args={"add_bos": True, "add_eos": False}, tokenizer_data=tokenizer_data)
def state_dict(self):
return {"spiece_model": self.tokenizer.serialize_model()}
From a0302cc6a85dcb950a7308f7a31a224ef54f3d58 Mon Sep 17 00:00:00 2001
From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com>
Date: Sun, 8 Feb 2026 18:16:40 -0800
Subject: [PATCH 20/33] Make tonemap latent work on any dim latents. (#12363)
---
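The reduction axes were hard-coded for 4-D image latents; deriving them from
the tensor rank makes the operation work for audio and video latents alike:

    import torch

    t = torch.randn(2, 8, 16)        # e.g. an audio latent magnitude, ndim == 3
    dims = list(range(1, t.ndim))    # [1, 2] - every non-batch dimension
    mean = torch.mean(t, dim=dims, keepdim=True)  # shape [2, 1, 1]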
comfy_extras/nodes_latent.py | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/comfy_extras/nodes_latent.py b/comfy_extras/nodes_latent.py
index 6aecf1561..8d2d7297a 100644
--- a/comfy_extras/nodes_latent.py
+++ b/comfy_extras/nodes_latent.py
@@ -391,8 +391,9 @@ class LatentOperationTonemapReinhard(io.ComfyNode):
latent_vector_magnitude = (torch.linalg.vector_norm(latent, dim=(1)) + 0.0000000001)[:,None]
normalized_latent = latent / latent_vector_magnitude
- mean = torch.mean(latent_vector_magnitude, dim=(1,2,3), keepdim=True)
- std = torch.std(latent_vector_magnitude, dim=(1,2,3), keepdim=True)
+ dims = list(range(1, latent_vector_magnitude.ndim))
+ mean = torch.mean(latent_vector_magnitude, dim=dims, keepdim=True)
+ std = torch.std(latent_vector_magnitude, dim=dims, keepdim=True)
top = (std * 5 + mean) * multiplier
From 62315fbb15861e64b917d0a072dad5dc9a15173c Mon Sep 17 00:00:00 2001
From: rattus <46076784+rattus128@users.noreply.github.com>
Date: Mon, 9 Feb 2026 13:16:08 -0800
Subject: [PATCH 21/33] Dynamic VRAM fixes - Ace 1.5 performance + a VRAM leak
(#12368)
* revert threaded model loader change
This change was only needed to get around the pytorch 2.7 mempool bugs,
and should have been reverted along with #12260. This fixes a different
memory leak where pytorch gets confused about cache emptying.
* load non comfy weights
* MPDynamic: Pre-generate the tensors for vbars
Apparently this is an expensive operation that slows things down.
* bump to aimdo 1.8
New features:
watermark limit feature
logging enhancements
-O2 build on linux
---
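For reference, the reverted workaround exploited the fact that PyTorch mempool
contexts are thread-local: running the load in a short-lived thread allocates
from the default pool regardless of the mempool the caller entered. In
isolation (illustrative only, unnecessary once the pytorch 2.7 mempool bugs
are out of the picture):

    import threading

    def run_outside_mempool(fn, *args, **kwargs):
        # mempool contexts are thread-local, so a fresh thread escapes them
        t = threading.Thread(target=fn, args=args, kwargs=kwargs)
        t.start()
        t.join()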
comfy/model_management.py | 37 ++++++-------------------------------
comfy/model_patcher.py | 7 ++++++-
comfy/ops.py | 2 +-
execution.py | 7 ++++++-
requirements.txt | 2 +-
5 files changed, 20 insertions(+), 35 deletions(-)
diff --git a/comfy/model_management.py b/comfy/model_management.py
index b6291f340..6018c1ab6 100644
--- a/comfy/model_management.py
+++ b/comfy/model_management.py
@@ -19,7 +19,7 @@
import psutil
import logging
from enum import Enum
-from comfy.cli_args import args, PerformanceFeature, enables_dynamic_vram
+from comfy.cli_args import args, PerformanceFeature
import threading
import torch
import sys
@@ -651,7 +651,7 @@ def free_memory(memory_required, device, keep_loaded=[], for_dynamic=False, ram_
soft_empty_cache()
return unloaded_models
-def load_models_gpu_orig(models, memory_required=0, force_patch_weights=False, minimum_memory_required=None, force_full_load=False):
+def load_models_gpu(models, memory_required=0, force_patch_weights=False, minimum_memory_required=None, force_full_load=False):
cleanup_models_gc()
global vram_state
@@ -747,26 +747,6 @@ def load_models_gpu_orig(models, memory_required=0, force_patch_weights=False, m
current_loaded_models.insert(0, loaded_model)
return
-def load_models_gpu_thread(models, memory_required, force_patch_weights, minimum_memory_required, force_full_load):
- with torch.inference_mode():
- load_models_gpu_orig(models, memory_required, force_patch_weights, minimum_memory_required, force_full_load)
- soft_empty_cache()
-
-def load_models_gpu(models, memory_required=0, force_patch_weights=False, minimum_memory_required=None, force_full_load=False):
- #Deliberately load models outside of the Aimdo mempool so they can be retained across
- #nodes. Use a dummy thread to do it as pytorch documents that mempool contexts are
- #thread local. So exploit that to escape context
- if enables_dynamic_vram():
- t = threading.Thread(
- target=load_models_gpu_thread,
- args=(models, memory_required, force_patch_weights, minimum_memory_required, force_full_load)
- )
- t.start()
- t.join()
- else:
- load_models_gpu_orig(models, memory_required=memory_required, force_patch_weights=force_patch_weights,
- minimum_memory_required=minimum_memory_required, force_full_load=force_full_load)
-
def load_model_gpu(model):
return load_models_gpu([model])
@@ -1226,21 +1206,16 @@ def cast_to(weight, dtype=None, device=None, non_blocking=False, copy=False, str
if dtype is None:
dtype = weight._model_dtype
- r = torch.empty_like(weight, dtype=dtype, device=device)
-
signature = comfy_aimdo.model_vbar.vbar_fault(weight._v)
if signature is not None:
- raw_tensor = comfy_aimdo.torch.aimdo_to_tensor(weight._v, device)
- v_tensor = comfy.memory_management.interpret_gathered_like(cast_geometry, raw_tensor)[0]
+ v_tensor = comfy.memory_management.interpret_gathered_like(cast_geometry, weight._v_tensor)[0]
if not comfy_aimdo.model_vbar.vbar_signature_compare(signature, weight._v_signature):
weight._v_signature = signature
#Send it over
v_tensor.copy_(weight, non_blocking=non_blocking)
- #always take a deep copy even if _v is good, as we have no reasonable point to unpin
- #a non comfy weight
- r.copy_(v_tensor)
- comfy_aimdo.model_vbar.vbar_unpin(weight._v)
- return r
+ return v_tensor.to(dtype=dtype)
+
+ r = torch.empty_like(weight, dtype=dtype, device=device)
if weight.dtype != r.dtype and weight.dtype != weight._model_dtype:
#Offloaded casting could skip this, however it would make the quantizations
diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py
index d888dbcfb..b9a117a7c 100644
--- a/comfy/model_patcher.py
+++ b/comfy/model_patcher.py
@@ -1492,7 +1492,9 @@ class ModelPatcherDynamic(ModelPatcher):
if vbar is not None:
vbar.prioritize()
- #We have way more tools for acceleration on comfy weight offloading, so always
+ #We force reserve VRAM for the non-comfy weights so we don't have to deal
+ #with pin and unpin synchronization, which can be expensive for small weights
+ #with a high layer rate (e.g. autoregressive LLMs).
#prioritize the non-comfy weights (note the order reverse).
loading = self._load_list(prio_comfy_cast_weights=True)
loading.sort(reverse=True)
@@ -1541,6 +1543,7 @@ class ModelPatcherDynamic(ModelPatcher):
if vbar is not None and not hasattr(m, "_v"):
m._v = vbar.alloc(v_weight_size)
+ m._v_tensor = comfy_aimdo.torch.aimdo_to_tensor(m._v, device_to)
allocated_size += v_weight_size
else:
@@ -1555,8 +1558,10 @@ class ModelPatcherDynamic(ModelPatcher):
weight_size = geometry.numel() * geometry.element_size()
if vbar is not None and not hasattr(weight, "_v"):
weight._v = vbar.alloc(weight_size)
+ weight._v_tensor = comfy_aimdo.torch.aimdo_to_tensor(weight._v, device_to)
weight._model_dtype = model_dtype
allocated_size += weight_size
+ vbar.set_watermark_limit(allocated_size)
logging.info(f"Model {self.model.__class__.__name__} prepared for dynamic VRAM loading. {allocated_size // (1024 ** 2)}MB Staged. {num_patches} patches attached.")
diff --git a/comfy/ops.py b/comfy/ops.py
index 0f4eca7c7..ea0d70702 100644
--- a/comfy/ops.py
+++ b/comfy/ops.py
@@ -87,7 +87,7 @@ def cast_bias_weight_with_vbar(s, dtype, device, bias_dtype, non_blocking, compu
signature = comfy_aimdo.model_vbar.vbar_fault(s._v)
if signature is not None:
- xfer_dest = comfy_aimdo.torch.aimdo_to_tensor(s._v, device)
+ xfer_dest = s._v_tensor
resident = comfy_aimdo.model_vbar.vbar_signature_compare(signature, s._v_signature)
if not resident:
diff --git a/execution.py b/execution.py
index 3dbab82e6..896862c6b 100644
--- a/execution.py
+++ b/execution.py
@@ -13,8 +13,11 @@ from contextlib import nullcontext
import torch
+from comfy.cli_args import args
import comfy.memory_management
import comfy.model_management
+import comfy_aimdo.model_vbar
+
from latent_preview import set_preview_method
import nodes
from comfy_execution.caching import (
@@ -527,8 +530,10 @@ async def execute(server, dynprompt, caches, current_item, extra_data, executed,
output_data, output_ui, has_subgraph, has_pending_tasks = await get_output_data(prompt_id, unique_id, obj, input_data_all, execution_block_cb=execution_block_cb, pre_execute_cb=pre_execute_cb, v3_data=v3_data)
finally:
if allocator is not None:
+ if args.verbose == "DEBUG":
+ comfy_aimdo.model_vbar.vbars_analyze()
comfy.model_management.reset_cast_buffers()
- torch.cuda.synchronize()
+ comfy_aimdo.model_vbar.vbars_reset_watermark_limits()
if has_pending_tasks:
pending_async_nodes[unique_id] = output_data
diff --git a/requirements.txt b/requirements.txt
index 5e34a2a49..4fda07fde 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -22,7 +22,7 @@ alembic
SQLAlchemy
av>=14.2.0
comfy-kitchen>=0.2.7
-comfy-aimdo>=0.1.7
+comfy-aimdo>=0.1.8
requests
#non essential dependencies:
From baf8c874557f1522a99d47d94faad12b0257c8f1 Mon Sep 17 00:00:00 2001
From: blepping <157360029+blepping@users.noreply.github.com>
Date: Mon, 9 Feb 2026 17:41:49 -0700
Subject: [PATCH 22/33] Improvements to ACE-Step 1.5 text encoding (part 2)
(#12350)
---
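Among the changes below: classifier-free guidance is applied directly to the
next-token logits before top-k/top-p filtering, and the unconditional batch is
skipped entirely when cfg_scale == 1.0. The guidance step itself, in isolation:

    # logits: [2, vocab] - row 0 conditional, row 1 unconditional
    cond, uncond = logits[0:1], logits[1:2]
    cfg_logits = uncond + cfg_scale * (cond - uncond)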
comfy/text_encoders/ace15.py | 114 +++++++++++++++++++++++++----------
1 file changed, 81 insertions(+), 33 deletions(-)
diff --git a/comfy/text_encoders/ace15.py b/comfy/text_encoders/ace15.py
index 00dd5ba90..5dac644c2 100644
--- a/comfy/text_encoders/ace15.py
+++ b/comfy/text_encoders/ace15.py
@@ -3,6 +3,7 @@ import comfy.text_encoders.llama
from comfy import sd1_clip
import torch
import math
+from tqdm.auto import trange
import yaml
import comfy.utils
@@ -23,6 +24,8 @@ def sample_manual_loop_no_classes(
audio_end_id: int = 215669,
eos_token_id: int = 151645,
):
+ if ids is None:
+ return []
device = model.execution_device
if execution_dtype is None:
@@ -32,6 +35,7 @@ def sample_manual_loop_no_classes(
execution_dtype = torch.float32
embeds, attention_mask, num_tokens, embeds_info = model.process_tokens(ids, device)
+ embeds_batch = embeds.shape[0]
for i, t in enumerate(paddings):
attention_mask[i, :t] = 0
attention_mask[i, t:] = 1
@@ -41,22 +45,27 @@ def sample_manual_loop_no_classes(
generator = torch.Generator(device=device)
generator.manual_seed(seed)
model_config = model.transformer.model.config
+ past_kv_shape = [embeds_batch, model_config.num_key_value_heads, embeds.shape[1] + min_tokens, model_config.head_dim]
for x in range(model_config.num_hidden_layers):
- past_key_values.append((torch.empty([embeds.shape[0], model_config.num_key_value_heads, embeds.shape[1] + min_tokens, model_config.head_dim], device=device, dtype=execution_dtype), torch.empty([embeds.shape[0], model_config.num_key_value_heads, embeds.shape[1] + min_tokens, model_config.head_dim], device=device, dtype=execution_dtype), 0))
+ past_key_values.append((torch.empty(past_kv_shape, device=device, dtype=execution_dtype), torch.empty(past_kv_shape, device=device, dtype=execution_dtype), 0))
progress_bar = comfy.utils.ProgressBar(max_new_tokens)
- for step in range(max_new_tokens):
+ for step in trange(max_new_tokens, desc="LM sampling"):
outputs = model.transformer(None, attention_mask, embeds=embeds.to(execution_dtype), num_tokens=num_tokens, intermediate_output=None, dtype=execution_dtype, embeds_info=embeds_info, past_key_values=past_key_values)
next_token_logits = model.transformer.logits(outputs[0])[:, -1]
past_key_values = outputs[2]
- cond_logits = next_token_logits[0:1]
- uncond_logits = next_token_logits[1:2]
- cfg_logits = uncond_logits + cfg_scale * (cond_logits - uncond_logits)
+ if cfg_scale != 1.0:
+ cond_logits = next_token_logits[0:1]
+ uncond_logits = next_token_logits[1:2]
+ cfg_logits = uncond_logits + cfg_scale * (cond_logits - uncond_logits)
+ else:
+ cfg_logits = next_token_logits[0:1]
- if eos_token_id is not None and eos_token_id < audio_start_id and min_tokens < step:
+ use_eos_score = eos_token_id is not None and eos_token_id < audio_start_id and min_tokens < step
+ if use_eos_score:
eos_score = cfg_logits[:, eos_token_id].clone()
remove_logit_value = torch.finfo(cfg_logits.dtype).min
@@ -64,7 +73,7 @@ def sample_manual_loop_no_classes(
cfg_logits[:, :audio_start_id] = remove_logit_value
cfg_logits[:, audio_end_id:] = remove_logit_value
- if eos_token_id is not None and eos_token_id < audio_start_id and min_tokens < step:
+ if use_eos_score:
cfg_logits[:, eos_token_id] = eos_score
if top_k is not None and top_k > 0:
@@ -93,8 +102,8 @@ def sample_manual_loop_no_classes(
break
embed, _, _, _ = model.process_tokens([[token]], device)
- embeds = embed.repeat(2, 1, 1)
- attention_mask = torch.cat([attention_mask, torch.ones((2, 1), device=device, dtype=attention_mask.dtype)], dim=1)
+ embeds = embed.repeat(embeds_batch, 1, 1)
+ attention_mask = torch.cat([attention_mask, torch.ones((embeds_batch, 1), device=device, dtype=attention_mask.dtype)], dim=1)
output_audio_codes.append(token - audio_start_id)
progress_bar.update_absolute(step)
@@ -104,22 +113,29 @@ def sample_manual_loop_no_classes(
def generate_audio_codes(model, positive, negative, min_tokens=1, max_tokens=1024, seed=0, cfg_scale=2.0, temperature=0.85, top_p=0.9, top_k=0):
positive = [[token for token, _ in inner_list] for inner_list in positive]
- negative = [[token for token, _ in inner_list] for inner_list in negative]
positive = positive[0]
- negative = negative[0]
- neg_pad = 0
- if len(negative) < len(positive):
- neg_pad = (len(positive) - len(negative))
- negative = [model.special_tokens["pad"]] * neg_pad + negative
+ if cfg_scale != 1.0:
+ negative = [[token for token, _ in inner_list] for inner_list in negative]
+ negative = negative[0]
- pos_pad = 0
- if len(negative) > len(positive):
- pos_pad = (len(negative) - len(positive))
- positive = [model.special_tokens["pad"]] * pos_pad + positive
+ neg_pad = 0
+ if len(negative) < len(positive):
+ neg_pad = (len(positive) - len(negative))
+ negative = [model.special_tokens["pad"]] * neg_pad + negative
- paddings = [pos_pad, neg_pad]
- return sample_manual_loop_no_classes(model, [positive, negative], paddings, cfg_scale=cfg_scale, temperature=temperature, top_p=top_p, top_k=top_k, seed=seed, min_tokens=min_tokens, max_new_tokens=max_tokens)
+ pos_pad = 0
+ if len(negative) > len(positive):
+ pos_pad = (len(negative) - len(positive))
+ positive = [model.special_tokens["pad"]] * pos_pad + positive
+
+ paddings = [pos_pad, neg_pad]
+ ids = [positive, negative]
+ else:
+ paddings = []
+ ids = [positive]
+
+ return sample_manual_loop_no_classes(model, ids, paddings, cfg_scale=cfg_scale, temperature=temperature, top_p=top_p, top_k=top_k, seed=seed, min_tokens=min_tokens, max_new_tokens=max_tokens)
class ACE15Tokenizer(sd1_clip.SD1Tokenizer):
@@ -129,12 +145,12 @@ class ACE15Tokenizer(sd1_clip.SD1Tokenizer):
def _metas_to_cot(self, *, return_yaml: bool = False, **kwargs) -> str:
user_metas = {
k: kwargs.pop(k)
- for k in ("bpm", "duration", "keyscale", "timesignature", "language", "caption")
+ for k in ("bpm", "duration", "keyscale", "timesignature", "language")
if k in kwargs
}
timesignature = user_metas.get("timesignature")
if isinstance(timesignature, str) and timesignature.endswith("/4"):
- user_metas["timesignature"] = timesignature.rsplit("/", 1)[0]
+ user_metas["timesignature"] = timesignature[:-2]
user_metas = {
k: v if not isinstance(v, str) or not v.isdigit() else int(v)
for k, v in user_metas.items()
@@ -147,8 +163,11 @@ class ACE15Tokenizer(sd1_clip.SD1Tokenizer):
return f"\n{meta_yaml}\n" if not return_yaml else meta_yaml
def _metas_to_cap(self, **kwargs) -> str:
- use_keys = ("bpm", "duration", "keyscale", "timesignature")
+ use_keys = ("bpm", "timesignature", "keyscale", "duration")
user_metas = { k: kwargs.pop(k, "N/A") for k in use_keys }
+ timesignature = user_metas.get("timesignature")
+ if isinstance(timesignature, str) and timesignature.endswith("/4"):
+ user_metas["timesignature"] = timesignature[:-2]
duration = user_metas["duration"]
if duration == "N/A":
user_metas["duration"] = "30 seconds"
@@ -159,9 +178,13 @@ class ACE15Tokenizer(sd1_clip.SD1Tokenizer):
return "\n".join(f"- {k}: {user_metas[k]}" for k in use_keys)
def tokenize_with_weights(self, text, return_word_ids=False, **kwargs):
- out = {}
+ text = text.strip()
+ text_negative = kwargs.get("caption_negative", text).strip()
lyrics = kwargs.get("lyrics", "")
+ lyrics_negative = kwargs.get("lyrics_negative", lyrics)
duration = kwargs.get("duration", 120)
+ if isinstance(duration, str):
+ duration = float(duration.split(None, 1)[0])
language = kwargs.get("language")
seed = kwargs.get("seed", 0)
@@ -171,21 +194,46 @@ class ACE15Tokenizer(sd1_clip.SD1Tokenizer):
top_p = kwargs.get("top_p", 0.9)
top_k = kwargs.get("top_k", 0.0)
-
duration = math.ceil(duration)
kwargs["duration"] = duration
+ tokens_duration = duration * 5
+ min_tokens = int(kwargs.get("min_tokens", tokens_duration))
+ max_tokens = int(kwargs.get("max_tokens", tokens_duration))
+
+ metas_negative = {
+ k.rsplit("_", 1)[0]: kwargs.pop(k)
+ for k in ("bpm_negative", "duration_negative", "keyscale_negative", "timesignature_negative", "language_negative", "caption_negative")
+ if k in kwargs
+ }
+ if not kwargs.get("use_negative_caption"):
+ _ = metas_negative.pop("caption", None)
cot_text = self._metas_to_cot(caption = text, **kwargs)
+ cot_text_negative = "\n" if not metas_negative else self._metas_to_cot(**metas_negative)
meta_cap = self._metas_to_cap(**kwargs)
- lm_template = "<|im_start|>system\n# Instruction\nGenerate audio semantic tokens based on the given conditions:\n\n<|im_end|>\n<|im_start|>user\n# Caption\n{}\n# Lyric\n{}\n<|im_end|>\n<|im_start|>assistant\n{}\n<|im_end|>\n"
+ lm_template = "<|im_start|>system\n# Instruction\nGenerate audio semantic tokens based on the given conditions:\n\n<|im_end|>\n<|im_start|>user\n# Caption\n{}\n\n# Lyric\n{}\n<|im_end|>\n<|im_start|>assistant\n{}\n\n<|im_end|>\n"
+ lyrics_template = "# Languages\n{}\n\n# Lyric\n{}<|endoftext|><|endoftext|>"
+ qwen3_06b_template = "# Instruction\nGenerate audio semantic tokens based on the given conditions:\n\n# Caption\n{}\n\n# Metas\n{}\n<|endoftext|>\n<|endoftext|>"
- out["lm_prompt"] = self.qwen3_06b.tokenize_with_weights(lm_template.format(text, lyrics, cot_text), disable_weights=True)
- out["lm_prompt_negative"] = self.qwen3_06b.tokenize_with_weights(lm_template.format(text, lyrics, "\n"), disable_weights=True)
+ llm_prompts = {
+ "lm_prompt": lm_template.format(text, lyrics.strip(), cot_text),
+ "lm_prompt_negative": lm_template.format(text_negative, lyrics_negative.strip(), cot_text_negative),
+ "lyrics": lyrics_template.format(language if language is not None else "", lyrics),
+ "qwen3_06b": qwen3_06b_template.format(text, meta_cap),
+ }
- out["lyrics"] = self.qwen3_06b.tokenize_with_weights("# Languages\n{}\n\n# Lyric\n{}<|endoftext|><|endoftext|>".format(language if language is not None else "", lyrics), return_word_ids, disable_weights=True, **kwargs)
- out["qwen3_06b"] = self.qwen3_06b.tokenize_with_weights("# Instruction\nGenerate audio semantic tokens based on the given conditions:\n\n# Caption\n{}\n# Metas\n{}\n<|endoftext|>\n<|endoftext|>".format(text, meta_cap), return_word_ids, **kwargs)
- out["lm_metadata"] = {"min_tokens": duration * 5,
+ out = {
+ prompt_key: self.qwen3_06b.tokenize_with_weights(
+ prompt,
+ prompt_key == "qwen3_06b" and return_word_ids,
+ disable_weights = True,
+ **kwargs,
+ )
+ for prompt_key, prompt in llm_prompts.items()
+ }
+ out["lm_metadata"] = {"min_tokens": min_tokens,
+ "max_tokens": max_tokens,
"seed": seed,
"generate_audio_codes": generate_audio_codes,
"cfg_scale": cfg_scale,
@@ -252,7 +300,7 @@ class ACE15TEModel(torch.nn.Module):
lm_metadata = token_weight_pairs["lm_metadata"]
if lm_metadata["generate_audio_codes"]:
- audio_codes = generate_audio_codes(getattr(self, self.lm_model, self.qwen3_06b), token_weight_pairs["lm_prompt"], token_weight_pairs["lm_prompt_negative"], min_tokens=lm_metadata["min_tokens"], max_tokens=lm_metadata["min_tokens"], seed=lm_metadata["seed"], cfg_scale=lm_metadata["cfg_scale"], temperature=lm_metadata["temperature"], top_p=lm_metadata["top_p"], top_k=lm_metadata["top_k"])
+ audio_codes = generate_audio_codes(getattr(self, self.lm_model, self.qwen3_06b), token_weight_pairs["lm_prompt"], token_weight_pairs["lm_prompt_negative"], min_tokens=lm_metadata["min_tokens"], max_tokens=lm_metadata["max_tokens"], seed=lm_metadata["seed"], cfg_scale=lm_metadata["cfg_scale"], temperature=lm_metadata["temperature"], top_p=lm_metadata["top_p"], top_k=lm_metadata["top_k"])
out["audio_codes"] = [audio_codes]
return base_out, None, out
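For reference, the meta normalization that ACE15Tokenizer._metas_to_cot performs after this
patch can be reproduced standalone (a sketch assuming PyYAML; the key set and transforms are
taken from the diff above):

    import yaml

    def metas_to_yaml(**metas) -> str:
        keep = {k: metas[k] for k in ("bpm", "duration", "keyscale", "timesignature") if k in metas}
        ts = keep.get("timesignature")
        if isinstance(ts, str) and ts.endswith("/4"):
            keep["timesignature"] = ts[:-2]  # "3/4" -> "3"
        # Digit strings become ints; "unspecified"/None entries are dropped.
        keep = {
            k: int(v) if isinstance(v, str) and v.isdigit() else v
            for k, v in keep.items()
            if v not in {"unspecified", None}
        }
        return yaml.dump(keep, allow_unicode=True, sort_keys=True).strip() if keep else ""

    # e.g. metas_to_yaml(bpm="120", timesignature="3/4", keyscale="unspecified")
    # -> "bpm: 120\ntimesignature: 3"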
From a4be04c5d750cc5d62256f7f86bb5a7c0a78e28d Mon Sep 17 00:00:00 2001
From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com>
Date: Mon, 9 Feb 2026 16:45:56 -0800
Subject: [PATCH 23/33] Ace step prompts match now. (#12376)
---
comfy/text_encoders/ace15.py | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/comfy/text_encoders/ace15.py b/comfy/text_encoders/ace15.py
index 5dac644c2..73697b3c1 100644
--- a/comfy/text_encoders/ace15.py
+++ b/comfy/text_encoders/ace15.py
@@ -145,7 +145,7 @@ class ACE15Tokenizer(sd1_clip.SD1Tokenizer):
def _metas_to_cot(self, *, return_yaml: bool = False, **kwargs) -> str:
user_metas = {
k: kwargs.pop(k)
- for k in ("bpm", "duration", "keyscale", "timesignature", "language")
+ for k in ("bpm", "duration", "keyscale", "timesignature")
if k in kwargs
}
timesignature = user_metas.get("timesignature")
@@ -208,8 +208,8 @@ class ACE15Tokenizer(sd1_clip.SD1Tokenizer):
if not kwargs.get("use_negative_caption"):
_ = metas_negative.pop("caption", None)
- cot_text = self._metas_to_cot(caption = text, **kwargs)
- cot_text_negative = "\n" if not metas_negative else self._metas_to_cot(**metas_negative)
+ cot_text = self._metas_to_cot(caption=text, **kwargs)
+ cot_text_negative = "\n\n" if not metas_negative else self._metas_to_cot(**metas_negative)
meta_cap = self._metas_to_cap(**kwargs)
lm_template = "<|im_start|>system\n# Instruction\nGenerate audio semantic tokens based on the given conditions:\n\n<|im_end|>\n<|im_start|>user\n# Caption\n{}\n\n# Lyric\n{}\n<|im_end|>\n<|im_start|>assistant\n{}\n\n<|im_end|>\n"
From 349a636a2b0f15aba2930b9af905bb805d2fe30b Mon Sep 17 00:00:00 2001
From: ComfyUI Wiki
Date: Tue, 10 Feb 2026 10:25:34 +0800
Subject: [PATCH 24/33] chore: update workflow templates to v0.8.37 (#12377)
---
requirements.txt | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/requirements.txt b/requirements.txt
index 4fda07fde..4e2773f5d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
comfyui-frontend-package==1.38.13
-comfyui-workflow-templates==0.8.31
+comfyui-workflow-templates==0.8.37
comfyui-embedded-docs==0.4.1
torch
torchsde
From c1b63a7e78b606bc14cd49a02e9338274db28a60 Mon Sep 17 00:00:00 2001
From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com>
Date: Tue, 10 Feb 2026 04:58:27 +0200
Subject: [PATCH 25/33] fix(Moonvalley-API-Nodes): adjust "steps" parameter so it
 no longer raises an exception (#12370)
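The new minimums encode the backend constraint noted in the inline comments below: steps must
be at least cooldown_steps + warmup_steps for each endpoint. A hedged sketch of that check
(the real validation happens server-side and is not shown in this patch):

    def validate_steps(steps: int, cooldown_steps: int, warmup_steps: int) -> None:
        # Per the inline comments: img2video/txt2video use cooldown 75 + warmup 0,
        # video2video uses cooldown 36 + warmup 24.
        if steps < cooldown_steps + warmup_steps:
            raise ValueError(
                f"steps ({steps}) must be >= cooldown_steps ({cooldown_steps}) "
                f"+ warmup_steps ({warmup_steps})"
            )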
---
comfy_api_nodes/nodes_moonvalley.py | 14 +++++++-------
1 file changed, 7 insertions(+), 7 deletions(-)
diff --git a/comfy_api_nodes/nodes_moonvalley.py b/comfy_api_nodes/nodes_moonvalley.py
index 08315fa2b..78a230529 100644
--- a/comfy_api_nodes/nodes_moonvalley.py
+++ b/comfy_api_nodes/nodes_moonvalley.py
@@ -219,8 +219,8 @@ class MoonvalleyImg2VideoNode(IO.ComfyNode):
),
IO.Int.Input(
"steps",
- default=33,
- min=1,
+ default=80,
+ min=75, # steps should be greater than or equal to cooldown_steps(75) + warmup_steps(0)
max=100,
step=1,
tooltip="Number of denoising steps",
@@ -340,8 +340,8 @@ class MoonvalleyVideo2VideoNode(IO.ComfyNode):
),
IO.Int.Input(
"steps",
- default=33,
- min=1,
+ default=60,
+ min=60, # steps should be greater than or equal to cooldown_steps(36) + warmup_steps(24)
max=100,
step=1,
display_mode=IO.NumberDisplay.number,
@@ -370,7 +370,7 @@ class MoonvalleyVideo2VideoNode(IO.ComfyNode):
video: Input.Video | None = None,
control_type: str = "Motion Transfer",
motion_intensity: int | None = 100,
- steps=33,
+ steps=60,
prompt_adherence=4.5,
) -> IO.NodeOutput:
validated_video = validate_video_to_video_input(video)
@@ -465,8 +465,8 @@ class MoonvalleyTxt2VideoNode(IO.ComfyNode):
),
IO.Int.Input(
"steps",
- default=33,
- min=1,
+ default=80,
+ min=75, # steps should be greater than or equal to cooldown_steps(75) + warmup_steps(0)
max=100,
step=1,
tooltip="Inference steps",
From 8ca842a8edb26006e730e631ec1153cd42f46d3b Mon Sep 17 00:00:00 2001
From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com>
Date: Tue, 10 Feb 2026 19:34:54 +0200
Subject: [PATCH 26/33] feat(api-nodes-Kling): add new models (V3, O3) (#12389)
* feat(api-nodes-Kling): add new models (V3, O3)
* remove storyboard from VideoToVideo node
* added check for total duration of storyboards (see the sketch after this list)
* fixed other small things
* updated display name for nodes
* added "fake" seed
---
comfy_api_nodes/apis/__init__.py | 8 +-
comfy_api_nodes/apis/kling.py | 46 +-
comfy_api_nodes/nodes_kling.py | 764 ++++++++++++++++++++++++++++---
3 files changed, 750 insertions(+), 68 deletions(-)
diff --git a/comfy_api_nodes/apis/__init__.py b/comfy_api_nodes/apis/__init__.py
index ee2aa1ce6..46a583b5e 100644
--- a/comfy_api_nodes/apis/__init__.py
+++ b/comfy_api_nodes/apis/__init__.py
@@ -1197,12 +1197,6 @@ class KlingImageGenImageReferenceType(str, Enum):
face = 'face'
-class KlingImageGenModelName(str, Enum):
- kling_v1 = 'kling-v1'
- kling_v1_5 = 'kling-v1-5'
- kling_v2 = 'kling-v2'
-
-
class KlingImageGenerationsRequest(BaseModel):
aspect_ratio: Optional[KlingImageGenAspectRatio] = '16:9'
callback_url: Optional[AnyUrl] = Field(
@@ -1218,7 +1212,7 @@ class KlingImageGenerationsRequest(BaseModel):
0.5, description='Reference intensity for user-uploaded images', ge=0.0, le=1.0
)
image_reference: Optional[KlingImageGenImageReferenceType] = None
- model_name: Optional[KlingImageGenModelName] = 'kling-v1'
+ model_name: str = Field(...)
n: Optional[int] = Field(1, description='Number of generated images', ge=1, le=9)
negative_prompt: Optional[str] = Field(
None, description='Negative text prompt', max_length=200
diff --git a/comfy_api_nodes/apis/kling.py b/comfy_api_nodes/apis/kling.py
index bf54ede3e..9c0446075 100644
--- a/comfy_api_nodes/apis/kling.py
+++ b/comfy_api_nodes/apis/kling.py
@@ -1,12 +1,22 @@
from pydantic import BaseModel, Field
+class MultiPromptEntry(BaseModel):
+ index: int = Field(...)
+ prompt: str = Field(...)
+ duration: str = Field(...)
+
+
class OmniProText2VideoRequest(BaseModel):
model_name: str = Field(..., description="kling-video-o1")
aspect_ratio: str = Field(..., description="'16:9', '9:16' or '1:1'")
duration: str = Field(..., description="'5' or '10'")
prompt: str = Field(...)
mode: str = Field("pro")
+ multi_shot: bool | None = Field(None)
+ multi_prompt: list[MultiPromptEntry] | None = Field(None)
+ shot_type: str | None = Field(None)
+ sound: str = Field(..., description="'on' or 'off'")
class OmniParamImage(BaseModel):
@@ -26,6 +36,10 @@ class OmniProFirstLastFrameRequest(BaseModel):
duration: str = Field(..., description="'5' or '10'")
prompt: str = Field(...)
mode: str = Field("pro")
+ sound: str | None = Field(None, description="'on' or 'off'")
+ multi_shot: bool | None = Field(None)
+ multi_prompt: list[MultiPromptEntry] | None = Field(None)
+ shot_type: str | None = Field(None)
class OmniProReferences2VideoRequest(BaseModel):
@@ -38,6 +52,10 @@ class OmniProReferences2VideoRequest(BaseModel):
duration: str | None = Field(..., description="From 3 to 10.")
prompt: str = Field(...)
mode: str = Field("pro")
+ sound: str | None = Field(None, description="'on' or 'off'")
+ multi_shot: bool | None = Field(None)
+ multi_prompt: list[MultiPromptEntry] | None = Field(None)
+ shot_type: str | None = Field(None)
class TaskStatusVideoResult(BaseModel):
@@ -54,6 +72,7 @@ class TaskStatusImageResult(BaseModel):
class TaskStatusResults(BaseModel):
videos: list[TaskStatusVideoResult] | None = Field(None)
images: list[TaskStatusImageResult] | None = Field(None)
+ series_images: list[TaskStatusImageResult] | None = Field(None)
class TaskStatusResponseData(BaseModel):
@@ -77,31 +96,42 @@ class OmniImageParamImage(BaseModel):
class OmniProImageRequest(BaseModel):
- model_name: str = Field(..., description="kling-image-o1")
- resolution: str = Field(..., description="'1k' or '2k'")
+ model_name: str = Field(...)
+ resolution: str = Field(...)
aspect_ratio: str | None = Field(...)
prompt: str = Field(...)
mode: str = Field("pro")
n: int | None = Field(1, le=9)
image_list: list[OmniImageParamImage] | None = Field(..., max_length=10)
+ result_type: str | None = Field(None, description="Set to 'series' for series generation")
+ series_amount: int | None = Field(None, ge=2, le=9, description="Number of images in a series")
class TextToVideoWithAudioRequest(BaseModel):
- model_name: str = Field(..., description="kling-v2-6")
+ model_name: str = Field(...)
aspect_ratio: str = Field(..., description="'16:9', '9:16' or '1:1'")
- duration: str = Field(..., description="'5' or '10'")
- prompt: str = Field(...)
+ duration: str = Field(...)
+ prompt: str | None = Field(...)
+ negative_prompt: str | None = Field(None)
mode: str = Field("pro")
sound: str = Field(..., description="'on' or 'off'")
+ multi_shot: bool | None = Field(None)
+ multi_prompt: list[MultiPromptEntry] | None = Field(None)
+ shot_type: str | None = Field(None)
class ImageToVideoWithAudioRequest(BaseModel):
- model_name: str = Field(..., description="kling-v2-6")
+ model_name: str = Field(...)
image: str = Field(...)
- duration: str = Field(..., description="'5' or '10'")
- prompt: str = Field(...)
+ image_tail: str | None = Field(None)
+ duration: str = Field(...)
+ prompt: str | None = Field(...)
+ negative_prompt: str | None = Field(None)
mode: str = Field("pro")
sound: str = Field(..., description="'on' or 'off'")
+ multi_shot: bool | None = Field(None)
+ multi_prompt: list[MultiPromptEntry] | None = Field(None)
+ shot_type: str | None = Field(None)
class MotionControlRequest(BaseModel):
diff --git a/comfy_api_nodes/nodes_kling.py b/comfy_api_nodes/nodes_kling.py
index 739fe1855..b89c85561 100644
--- a/comfy_api_nodes/nodes_kling.py
+++ b/comfy_api_nodes/nodes_kling.py
@@ -38,7 +38,6 @@ from comfy_api_nodes.apis import (
KlingImageGenerationsRequest,
KlingImageGenerationsResponse,
KlingImageGenImageReferenceType,
- KlingImageGenModelName,
KlingImageGenAspectRatio,
KlingVideoEffectsRequest,
KlingVideoEffectsResponse,
@@ -52,6 +51,7 @@ from comfy_api_nodes.apis import (
from comfy_api_nodes.apis.kling import (
ImageToVideoWithAudioRequest,
MotionControlRequest,
+ MultiPromptEntry,
OmniImageParamImage,
OmniParamImage,
OmniParamVideo,
@@ -71,6 +71,7 @@ from comfy_api_nodes.util import (
sync_op,
tensor_to_base64_string,
upload_audio_to_comfyapi,
+ upload_image_to_comfyapi,
upload_images_to_comfyapi,
upload_video_to_comfyapi,
validate_image_aspect_ratio,
@@ -80,6 +81,31 @@ from comfy_api_nodes.util import (
validate_video_duration,
)
+
+def _generate_storyboard_inputs(count: int) -> list:
+ inputs = []
+ for i in range(1, count + 1):
+ inputs.extend(
+ [
+ IO.String.Input(
+ f"storyboard_{i}_prompt",
+ multiline=True,
+ default="",
+ tooltip=f"Prompt for storyboard segment {i}. Max 512 characters.",
+ ),
+ IO.Int.Input(
+ f"storyboard_{i}_duration",
+ default=4,
+ min=1,
+ max=15,
+ display_mode=IO.NumberDisplay.slider,
+ tooltip=f"Duration for storyboard segment {i} in seconds.",
+ ),
+ ]
+ )
+ return inputs
+
+
KLING_API_VERSION = "v1"
PATH_TEXT_TO_VIDEO = f"/proxy/kling/{KLING_API_VERSION}/videos/text2video"
PATH_IMAGE_TO_VIDEO = f"/proxy/kling/{KLING_API_VERSION}/videos/image2video"
@@ -820,20 +846,48 @@ class OmniProTextToVideoNode(IO.ComfyNode):
def define_schema(cls) -> IO.Schema:
return IO.Schema(
node_id="KlingOmniProTextToVideoNode",
- display_name="Kling Omni Text to Video (Pro)",
+ display_name="Kling 3.0 Omni Text to Video",
category="api node/video/Kling",
description="Use text prompts to generate videos with the latest Kling model.",
inputs=[
- IO.Combo.Input("model_name", options=["kling-video-o1"]),
+ IO.Combo.Input("model_name", options=["kling-v3-omni", "kling-video-o1"]),
IO.String.Input(
"prompt",
multiline=True,
tooltip="A text prompt describing the video content. "
- "This can include both positive and negative descriptions.",
+ "This can include both positive and negative descriptions. "
+ "Ignored when storyboards are enabled.",
),
IO.Combo.Input("aspect_ratio", options=["16:9", "9:16", "1:1"]),
- IO.Combo.Input("duration", options=[5, 10]),
+ IO.Int.Input("duration", default=5, min=3, max=15, display_mode=IO.NumberDisplay.slider),
IO.Combo.Input("resolution", options=["1080p", "720p"], optional=True),
+ IO.DynamicCombo.Input(
+ "storyboards",
+ options=[
+ IO.DynamicCombo.Option("disabled", []),
+ IO.DynamicCombo.Option("1 storyboard", _generate_storyboard_inputs(1)),
+ IO.DynamicCombo.Option("2 storyboards", _generate_storyboard_inputs(2)),
+ IO.DynamicCombo.Option("3 storyboards", _generate_storyboard_inputs(3)),
+ IO.DynamicCombo.Option("4 storyboards", _generate_storyboard_inputs(4)),
+ IO.DynamicCombo.Option("5 storyboards", _generate_storyboard_inputs(5)),
+ IO.DynamicCombo.Option("6 storyboards", _generate_storyboard_inputs(6)),
+ ],
+ tooltip="Generate a series of video segments with individual prompts and durations. "
+ "Ignored for o1 model.",
+ optional=True,
+ ),
+ IO.Boolean.Input("generate_audio", default=False, optional=True),
+ IO.Int.Input(
+ "seed",
+ default=0,
+ min=0,
+ max=2147483647,
+ display_mode=IO.NumberDisplay.number,
+ control_after_generate=True,
+ tooltip="Seed controls whether the node should re-run; "
+ "results are non-deterministic regardless of seed.",
+ optional=True,
+ ),
],
outputs=[
IO.Video.Output(),
@@ -845,11 +899,15 @@ class OmniProTextToVideoNode(IO.ComfyNode):
],
is_api_node=True,
price_badge=IO.PriceBadge(
- depends_on=IO.PriceBadgeDepends(widgets=["duration", "resolution"]),
+ depends_on=IO.PriceBadgeDepends(widgets=["duration", "resolution", "model_name", "generate_audio"]),
expr="""
(
$mode := (widgets.resolution = "720p") ? "std" : "pro";
- $rates := {"std": 0.084, "pro": 0.112};
+ $isV3 := $contains(widgets.model_name, "v3");
+ $audio := $isV3 and widgets.generate_audio;
+ $rates := $audio
+ ? {"std": 0.112, "pro": 0.14}
+ : {"std": 0.084, "pro": 0.112};
{"type":"usd","usd": $lookup($rates, $mode) * widgets.duration}
)
""",
@@ -864,8 +922,45 @@ class OmniProTextToVideoNode(IO.ComfyNode):
aspect_ratio: str,
duration: int,
resolution: str = "1080p",
+ storyboards: dict | None = None,
+ generate_audio: bool = False,
+ seed: int = 0,
) -> IO.NodeOutput:
- validate_string(prompt, min_length=1, max_length=2500)
+ _ = seed
+ if model_name == "kling-video-o1":
+ if duration not in (5, 10):
+ raise ValueError("kling-video-o1 only supports durations of 5 or 10 seconds.")
+ if generate_audio:
+ raise ValueError("kling-video-o1 does not support audio generation.")
+ stories_enabled = storyboards is not None and storyboards["storyboards"] != "disabled"
+ if stories_enabled and model_name == "kling-video-o1":
+ raise ValueError("kling-video-o1 does not support storyboards.")
+ validate_string(prompt, strip_whitespace=True, min_length=0 if stories_enabled else 1, max_length=2500)
+
+ multi_shot = None
+ multi_prompt_list = None
+ if stories_enabled:
+ count = int(storyboards["storyboards"].split()[0])
+ multi_shot = True
+ multi_prompt_list = []
+ for i in range(1, count + 1):
+ sb_prompt = storyboards[f"storyboard_{i}_prompt"]
+ sb_duration = storyboards[f"storyboard_{i}_duration"]
+ validate_string(sb_prompt, field_name=f"storyboard_{i}_prompt", min_length=1, max_length=512)
+ multi_prompt_list.append(
+ MultiPromptEntry(
+ index=i,
+ prompt=sb_prompt,
+ duration=str(sb_duration),
+ )
+ )
+ total_storyboard_duration = sum(int(e.duration) for e in multi_prompt_list)
+ if total_storyboard_duration != duration:
+ raise ValueError(
+ f"Total storyboard duration ({total_storyboard_duration}s) "
+ f"must equal the global duration ({duration}s)."
+ )
+
response = await sync_op(
cls,
ApiEndpoint(path="/proxy/kling/v1/videos/omni-video", method="POST"),
@@ -876,6 +971,10 @@ class OmniProTextToVideoNode(IO.ComfyNode):
aspect_ratio=aspect_ratio,
duration=str(duration),
mode="pro" if resolution == "1080p" else "std",
+ multi_shot=multi_shot,
+ multi_prompt=multi_prompt_list,
+ shot_type="customize" if multi_shot else None,
+ sound="on" if generate_audio else "off",
),
)
return await finish_omni_video_task(cls, response)
@@ -887,24 +986,26 @@ class OmniProFirstLastFrameNode(IO.ComfyNode):
def define_schema(cls) -> IO.Schema:
return IO.Schema(
node_id="KlingOmniProFirstLastFrameNode",
- display_name="Kling Omni First-Last-Frame to Video (Pro)",
+ display_name="Kling 3.0 Omni First-Last-Frame to Video",
category="api node/video/Kling",
description="Use a start frame, an optional end frame, or reference images with the latest Kling model.",
inputs=[
- IO.Combo.Input("model_name", options=["kling-video-o1"]),
+ IO.Combo.Input("model_name", options=["kling-v3-omni", "kling-video-o1"]),
IO.String.Input(
"prompt",
multiline=True,
tooltip="A text prompt describing the video content. "
- "This can include both positive and negative descriptions.",
+ "This can include both positive and negative descriptions. "
+ "Ignored when storyboards are enabled.",
),
- IO.Int.Input("duration", default=5, min=3, max=10, display_mode=IO.NumberDisplay.slider),
+ IO.Int.Input("duration", default=5, min=3, max=15, display_mode=IO.NumberDisplay.slider),
IO.Image.Input("first_frame"),
IO.Image.Input(
"end_frame",
optional=True,
tooltip="An optional end frame for the video. "
- "This cannot be used simultaneously with 'reference_images'.",
+ "This cannot be used simultaneously with 'reference_images'. "
+ "Does not work with storyboards.",
),
IO.Image.Input(
"reference_images",
@@ -912,6 +1013,38 @@ class OmniProFirstLastFrameNode(IO.ComfyNode):
tooltip="Up to 6 additional reference images.",
),
IO.Combo.Input("resolution", options=["1080p", "720p"], optional=True),
+ IO.DynamicCombo.Input(
+ "storyboards",
+ options=[
+ IO.DynamicCombo.Option("disabled", []),
+ IO.DynamicCombo.Option("1 storyboard", _generate_storyboard_inputs(1)),
+ IO.DynamicCombo.Option("2 storyboards", _generate_storyboard_inputs(2)),
+ IO.DynamicCombo.Option("3 storyboards", _generate_storyboard_inputs(3)),
+ IO.DynamicCombo.Option("4 storyboards", _generate_storyboard_inputs(4)),
+ IO.DynamicCombo.Option("5 storyboards", _generate_storyboard_inputs(5)),
+ IO.DynamicCombo.Option("6 storyboards", _generate_storyboard_inputs(6)),
+ ],
+ tooltip="Generate a series of video segments with individual prompts and durations. "
+ "Only supported for kling-v3-omni.",
+ optional=True,
+ ),
+ IO.Boolean.Input(
+ "generate_audio",
+ default=False,
+ optional=True,
+ tooltip="Generate audio for the video. Only supported for kling-v3-omni.",
+ ),
+ IO.Int.Input(
+ "seed",
+ default=0,
+ min=0,
+ max=2147483647,
+ display_mode=IO.NumberDisplay.number,
+ control_after_generate=True,
+ tooltip="Seed controls whether the node should re-run; "
+ "results are non-deterministic regardless of seed.",
+ optional=True,
+ ),
],
outputs=[
IO.Video.Output(),
@@ -923,11 +1056,15 @@ class OmniProFirstLastFrameNode(IO.ComfyNode):
],
is_api_node=True,
price_badge=IO.PriceBadge(
- depends_on=IO.PriceBadgeDepends(widgets=["duration", "resolution"]),
+ depends_on=IO.PriceBadgeDepends(widgets=["duration", "resolution", "model_name", "generate_audio"]),
expr="""
(
$mode := (widgets.resolution = "720p") ? "std" : "pro";
- $rates := {"std": 0.084, "pro": 0.112};
+ $isV3 := $contains(widgets.model_name, "v3");
+ $audio := $isV3 and widgets.generate_audio;
+ $rates := $audio
+ ? {"std": 0.112, "pro": 0.14}
+ : {"std": 0.084, "pro": 0.112};
{"type":"usd","usd": $lookup($rates, $mode) * widgets.duration}
)
""",
@@ -944,15 +1081,59 @@ class OmniProFirstLastFrameNode(IO.ComfyNode):
end_frame: Input.Image | None = None,
reference_images: Input.Image | None = None,
resolution: str = "1080p",
+ storyboards: dict | None = None,
+ generate_audio: bool = False,
+ seed: int = 0,
) -> IO.NodeOutput:
+ _ = seed
+ if model_name == "kling-video-o1":
+ if duration > 10:
+ raise ValueError("kling-video-o1 does not support durations greater than 10 seconds.")
+ if generate_audio:
+ raise ValueError("kling-video-o1 does not support audio generation.")
+ stories_enabled = storyboards is not None and storyboards["storyboards"] != "disabled"
+ if stories_enabled and model_name == "kling-video-o1":
+ raise ValueError("kling-video-o1 does not support storyboards.")
prompt = normalize_omni_prompt_references(prompt)
- validate_string(prompt, min_length=1, max_length=2500)
+ validate_string(prompt, strip_whitespace=True, min_length=0 if stories_enabled else 1, max_length=2500)
if end_frame is not None and reference_images is not None:
raise ValueError("The 'end_frame' input cannot be used simultaneously with 'reference_images'.")
- if duration not in (5, 10) and end_frame is None and reference_images is None:
+ if end_frame is not None and stories_enabled:
+ raise ValueError("The 'end_frame' input cannot be used simultaneously with storyboards.")
+ if (
+ model_name == "kling-video-o1"
+ and duration not in (5, 10)
+ and end_frame is None
+ and reference_images is None
+ ):
raise ValueError(
"Duration is only supported for 5 or 10 seconds if there is no end frame or reference images."
)
+
+ multi_shot = None
+ multi_prompt_list = None
+ if stories_enabled:
+ count = int(storyboards["storyboards"].split()[0])
+ multi_shot = True
+ multi_prompt_list = []
+ for i in range(1, count + 1):
+ sb_prompt = storyboards[f"storyboard_{i}_prompt"]
+ sb_duration = storyboards[f"storyboard_{i}_duration"]
+ validate_string(sb_prompt, field_name=f"storyboard_{i}_prompt", min_length=1, max_length=512)
+ multi_prompt_list.append(
+ MultiPromptEntry(
+ index=i,
+ prompt=sb_prompt,
+ duration=str(sb_duration),
+ )
+ )
+ total_storyboard_duration = sum(int(e.duration) for e in multi_prompt_list)
+ if total_storyboard_duration != duration:
+ raise ValueError(
+ f"Total storyboard duration ({total_storyboard_duration}s) "
+ f"must equal the global duration ({duration}s)."
+ )
+
validate_image_dimensions(first_frame, min_width=300, min_height=300)
validate_image_aspect_ratio(first_frame, (1, 2.5), (2.5, 1))
image_list: list[OmniParamImage] = [
@@ -988,6 +1169,10 @@ class OmniProFirstLastFrameNode(IO.ComfyNode):
duration=str(duration),
image_list=image_list,
mode="pro" if resolution == "1080p" else "std",
+ sound="on" if generate_audio else "off",
+ multi_shot=multi_shot,
+ multi_prompt=multi_prompt_list,
+ shot_type="customize" if multi_shot else None,
),
)
return await finish_omni_video_task(cls, response)
@@ -999,24 +1184,57 @@ class OmniProImageToVideoNode(IO.ComfyNode):
def define_schema(cls) -> IO.Schema:
return IO.Schema(
node_id="KlingOmniProImageToVideoNode",
- display_name="Kling Omni Image to Video (Pro)",
+ display_name="Kling 3.0 Omni Image to Video",
category="api node/video/Kling",
description="Use up to 7 reference images to generate a video with the latest Kling model.",
inputs=[
- IO.Combo.Input("model_name", options=["kling-video-o1"]),
+ IO.Combo.Input("model_name", options=["kling-v3-omni", "kling-video-o1"]),
IO.String.Input(
"prompt",
multiline=True,
tooltip="A text prompt describing the video content. "
- "This can include both positive and negative descriptions.",
+ "This can include both positive and negative descriptions. "
+ "Ignored when storyboards are enabled.",
),
IO.Combo.Input("aspect_ratio", options=["16:9", "9:16", "1:1"]),
- IO.Int.Input("duration", default=3, min=3, max=10, display_mode=IO.NumberDisplay.slider),
+ IO.Int.Input("duration", default=5, min=3, max=15, display_mode=IO.NumberDisplay.slider),
IO.Image.Input(
"reference_images",
tooltip="Up to 7 reference images.",
),
IO.Combo.Input("resolution", options=["1080p", "720p"], optional=True),
+ IO.DynamicCombo.Input(
+ "storyboards",
+ options=[
+ IO.DynamicCombo.Option("disabled", []),
+ IO.DynamicCombo.Option("1 storyboard", _generate_storyboard_inputs(1)),
+ IO.DynamicCombo.Option("2 storyboards", _generate_storyboard_inputs(2)),
+ IO.DynamicCombo.Option("3 storyboards", _generate_storyboard_inputs(3)),
+ IO.DynamicCombo.Option("4 storyboards", _generate_storyboard_inputs(4)),
+ IO.DynamicCombo.Option("5 storyboards", _generate_storyboard_inputs(5)),
+ IO.DynamicCombo.Option("6 storyboards", _generate_storyboard_inputs(6)),
+ ],
+ tooltip="Generate a series of video segments with individual prompts and durations. "
+ "Only supported for kling-v3-omni.",
+ optional=True,
+ ),
+ IO.Boolean.Input(
+ "generate_audio",
+ default=False,
+ optional=True,
+ tooltip="Generate audio for the video. Only supported for kling-v3-omni.",
+ ),
+ IO.Int.Input(
+ "seed",
+ default=0,
+ min=0,
+ max=2147483647,
+ display_mode=IO.NumberDisplay.number,
+ control_after_generate=True,
+ tooltip="Seed controls whether the node should re-run; "
+ "results are non-deterministic regardless of seed.",
+ optional=True,
+ ),
],
outputs=[
IO.Video.Output(),
@@ -1028,11 +1246,15 @@ class OmniProImageToVideoNode(IO.ComfyNode):
],
is_api_node=True,
price_badge=IO.PriceBadge(
- depends_on=IO.PriceBadgeDepends(widgets=["duration", "resolution"]),
+ depends_on=IO.PriceBadgeDepends(widgets=["duration", "resolution", "model_name", "generate_audio"]),
expr="""
(
$mode := (widgets.resolution = "720p") ? "std" : "pro";
- $rates := {"std": 0.084, "pro": 0.112};
+ $isV3 := $contains(widgets.model_name, "v3");
+ $audio := $isV3 and widgets.generate_audio;
+ $rates := $audio
+ ? {"std": 0.112, "pro": 0.14}
+ : {"std": 0.084, "pro": 0.112};
{"type":"usd","usd": $lookup($rates, $mode) * widgets.duration}
)
""",
@@ -1048,9 +1270,46 @@ class OmniProImageToVideoNode(IO.ComfyNode):
duration: int,
reference_images: Input.Image,
resolution: str = "1080p",
+ storyboards: dict | None = None,
+ generate_audio: bool = False,
+ seed: int = 0,
) -> IO.NodeOutput:
+ _ = seed
+ if model_name == "kling-video-o1":
+ if duration > 10:
+ raise ValueError("kling-video-o1 does not support durations greater than 10 seconds.")
+ if generate_audio:
+ raise ValueError("kling-video-o1 does not support audio generation.")
+ stories_enabled = storyboards is not None and storyboards["storyboards"] != "disabled"
+ if stories_enabled and model_name == "kling-video-o1":
+ raise ValueError("kling-video-o1 does not support storyboards.")
prompt = normalize_omni_prompt_references(prompt)
- validate_string(prompt, min_length=1, max_length=2500)
+ validate_string(prompt, strip_whitespace=True, min_length=0 if stories_enabled else 1, max_length=2500)
+
+ multi_shot = None
+ multi_prompt_list = None
+ if stories_enabled:
+ count = int(storyboards["storyboards"].split()[0])
+ multi_shot = True
+ multi_prompt_list = []
+ for i in range(1, count + 1):
+ sb_prompt = storyboards[f"storyboard_{i}_prompt"]
+ sb_duration = storyboards[f"storyboard_{i}_duration"]
+ validate_string(sb_prompt, field_name=f"storyboard_{i}_prompt", min_length=1, max_length=512)
+ multi_prompt_list.append(
+ MultiPromptEntry(
+ index=i,
+ prompt=sb_prompt,
+ duration=str(sb_duration),
+ )
+ )
+ total_storyboard_duration = sum(int(e.duration) for e in multi_prompt_list)
+ if total_storyboard_duration != duration:
+ raise ValueError(
+ f"Total storyboard duration ({total_storyboard_duration}s) "
+ f"must equal the global duration ({duration}s)."
+ )
+
if get_number_of_images(reference_images) > 7:
raise ValueError("The maximum number of reference images is 7.")
for i in reference_images:
@@ -1070,6 +1329,10 @@ class OmniProImageToVideoNode(IO.ComfyNode):
duration=str(duration),
image_list=image_list,
mode="pro" if resolution == "1080p" else "std",
+ sound="on" if generate_audio else "off",
+ multi_shot=multi_shot,
+ multi_prompt=multi_prompt_list,
+ shot_type="customize" if multi_shot else None,
),
)
return await finish_omni_video_task(cls, response)
@@ -1081,11 +1344,11 @@ class OmniProVideoToVideoNode(IO.ComfyNode):
def define_schema(cls) -> IO.Schema:
return IO.Schema(
node_id="KlingOmniProVideoToVideoNode",
- display_name="Kling Omni Video to Video (Pro)",
+ display_name="Kling 3.0 Omni Video to Video",
category="api node/video/Kling",
description="Use a video and up to 4 reference images to generate a video with the latest Kling model.",
inputs=[
- IO.Combo.Input("model_name", options=["kling-video-o1"]),
+ IO.Combo.Input("model_name", options=["kling-v3-omni", "kling-video-o1"]),
IO.String.Input(
"prompt",
multiline=True,
@@ -1102,6 +1365,17 @@ class OmniProVideoToVideoNode(IO.ComfyNode):
optional=True,
),
IO.Combo.Input("resolution", options=["1080p", "720p"], optional=True),
+ IO.Int.Input(
+ "seed",
+ default=0,
+ min=0,
+ max=2147483647,
+ display_mode=IO.NumberDisplay.number,
+ control_after_generate=True,
+ tooltip="Seed controls whether the node should re-run; "
+ "results are non-deterministic regardless of seed.",
+ optional=True,
+ ),
],
outputs=[
IO.Video.Output(),
@@ -1135,7 +1409,9 @@ class OmniProVideoToVideoNode(IO.ComfyNode):
keep_original_sound: bool,
reference_images: Input.Image | None = None,
resolution: str = "1080p",
+ seed: int = 0,
) -> IO.NodeOutput:
+ _ = seed
prompt = normalize_omni_prompt_references(prompt)
validate_string(prompt, min_length=1, max_length=2500)
validate_video_duration(reference_video, min_duration=3.0, max_duration=10.05)
@@ -1179,11 +1455,11 @@ class OmniProEditVideoNode(IO.ComfyNode):
def define_schema(cls) -> IO.Schema:
return IO.Schema(
node_id="KlingOmniProEditVideoNode",
- display_name="Kling Omni Edit Video (Pro)",
+ display_name="Kling 3.0 Omni Edit Video",
category="api node/video/Kling",
description="Edit an existing video with the latest model from Kling.",
inputs=[
- IO.Combo.Input("model_name", options=["kling-video-o1"]),
+ IO.Combo.Input("model_name", options=["kling-v3-omni", "kling-video-o1"]),
IO.String.Input(
"prompt",
multiline=True,
@@ -1198,6 +1474,17 @@ class OmniProEditVideoNode(IO.ComfyNode):
optional=True,
),
IO.Combo.Input("resolution", options=["1080p", "720p"], optional=True),
+ IO.Int.Input(
+ "seed",
+ default=0,
+ min=0,
+ max=2147483647,
+ display_mode=IO.NumberDisplay.number,
+ control_after_generate=True,
+ tooltip="Seed controls whether the node should re-run; "
+ "results are non-deterministic regardless of seed.",
+ optional=True,
+ ),
],
outputs=[
IO.Video.Output(),
@@ -1229,7 +1516,9 @@ class OmniProEditVideoNode(IO.ComfyNode):
keep_original_sound: bool,
reference_images: Input.Image | None = None,
resolution: str = "1080p",
+ seed: int = 0,
) -> IO.NodeOutput:
+ _ = seed
prompt = normalize_omni_prompt_references(prompt)
validate_string(prompt, min_length=1, max_length=2500)
validate_video_duration(video, min_duration=3.0, max_duration=10.05)
@@ -1273,27 +1562,43 @@ class OmniProImageNode(IO.ComfyNode):
def define_schema(cls) -> IO.Schema:
return IO.Schema(
node_id="KlingOmniProImageNode",
- display_name="Kling Omni Image (Pro)",
+ display_name="Kling 3.0 Omni Image",
category="api node/image/Kling",
description="Create or edit images with the latest model from Kling.",
inputs=[
- IO.Combo.Input("model_name", options=["kling-image-o1"]),
+ IO.Combo.Input("model_name", options=["kling-v3-omni", "kling-image-o1"]),
IO.String.Input(
"prompt",
multiline=True,
tooltip="A text prompt describing the image content. "
"This can include both positive and negative descriptions.",
),
- IO.Combo.Input("resolution", options=["1K", "2K"]),
+ IO.Combo.Input("resolution", options=["1K", "2K", "4K"]),
IO.Combo.Input(
"aspect_ratio",
options=["16:9", "9:16", "1:1", "4:3", "3:4", "3:2", "2:3", "21:9"],
),
+ IO.Combo.Input(
+ "series_amount",
+ options=["disabled", "2", "3", "4", "5", "6", "7", "8", "9"],
+ tooltip="Generate a series of images. Not supported for kling-image-o1.",
+ ),
IO.Image.Input(
"reference_images",
tooltip="Up to 10 additional reference images.",
optional=True,
),
+ IO.Int.Input(
+ "seed",
+ default=0,
+ min=0,
+ max=2147483647,
+ display_mode=IO.NumberDisplay.number,
+ control_after_generate=True,
+ tooltip="Seed controls whether the node should re-run; "
+ "results are non-deterministic regardless of seed.",
+ optional=True,
+ ),
],
outputs=[
IO.Image.Output(),
@@ -1305,7 +1610,16 @@ class OmniProImageNode(IO.ComfyNode):
],
is_api_node=True,
price_badge=IO.PriceBadge(
- expr="""{"type":"usd","usd":0.028}""",
+ depends_on=IO.PriceBadgeDepends(widgets=["resolution", "series_amount", "model_name"]),
+ expr="""
+ (
+ $prices := {"1K": 0.028, "2K": 0.028, "4K": 0.056};
+ $base := $lookup($prices, widgets.resolution);
+ $isO1 := widgets.model_name = "kling-image-o1";
+ $mult := ($isO1 or widgets.series_amount = "disabled") ? 1 : $number(widgets.series_amount);
+ {"type":"usd","usd": $base * $mult}
+ )
+ """,
),
)
@@ -1316,8 +1630,13 @@ class OmniProImageNode(IO.ComfyNode):
prompt: str,
resolution: str,
aspect_ratio: str,
+ series_amount: str = "disabled",
reference_images: Input.Image | None = None,
+ seed: int = 0,
) -> IO.NodeOutput:
+ _ = seed
+ if model_name == "kling-image-o1" and resolution == "4K":
+ raise ValueError("4K resolution is not supported for kling-image-o1 model.")
prompt = normalize_omni_prompt_references(prompt)
validate_string(prompt, min_length=1, max_length=2500)
image_list: list[OmniImageParamImage] = []
@@ -1329,6 +1648,9 @@ class OmniProImageNode(IO.ComfyNode):
validate_image_aspect_ratio(i, (1, 2.5), (2.5, 1))
for i in await upload_images_to_comfyapi(cls, reference_images, wait_label="Uploading reference image"):
image_list.append(OmniImageParamImage(image=i))
+ use_series = series_amount != "disabled"
+ if use_series and model_name == "kling-image-o1":
+ raise ValueError("kling-image-o1 does not support series generation.")
response = await sync_op(
cls,
ApiEndpoint(path="/proxy/kling/v1/images/omni-image", method="POST"),
@@ -1339,6 +1661,8 @@ class OmniProImageNode(IO.ComfyNode):
resolution=resolution.lower(),
aspect_ratio=aspect_ratio,
image_list=image_list if image_list else None,
+ result_type="series" if use_series else None,
+ series_amount=int(series_amount) if use_series else None,
),
)
if response.code:
@@ -1351,7 +1675,9 @@ class OmniProImageNode(IO.ComfyNode):
response_model=TaskStatusResponse,
status_extractor=lambda r: (r.data.task_status if r.data else None),
)
- return IO.NodeOutput(await download_url_to_image_tensor(final_response.data.task_result.images[0].url))
+ images = final_response.data.task_result.series_images or final_response.data.task_result.images
+ tensors = [await download_url_to_image_tensor(img.url) for img in images]
+ return IO.NodeOutput(torch.cat(tensors, dim=0))
class KlingCameraControlT2VNode(IO.ComfyNode):
@@ -2119,7 +2445,7 @@ class KlingImageGenerationNode(IO.ComfyNode):
def define_schema(cls) -> IO.Schema:
return IO.Schema(
node_id="KlingImageGenerationNode",
- display_name="Kling Image Generation",
+ display_name="Kling 3.0 Image",
category="api node/image/Kling",
description="Kling Image Generation Node. Generate an image from a text prompt with an optional reference image.",
inputs=[
@@ -2147,11 +2473,7 @@ class KlingImageGenerationNode(IO.ComfyNode):
display_mode=IO.NumberDisplay.slider,
tooltip="Subject reference similarity",
),
- IO.Combo.Input(
- "model_name",
- options=[i.value for i in KlingImageGenModelName],
- default="kling-v2",
- ),
+ IO.Combo.Input("model_name", options=["kling-v3", "kling-v2", "kling-v1-5"]),
IO.Combo.Input(
"aspect_ratio",
options=[i.value for i in KlingImageGenAspectRatio],
@@ -2165,6 +2487,17 @@ class KlingImageGenerationNode(IO.ComfyNode):
tooltip="Number of generated images",
),
IO.Image.Input("image", optional=True),
+ IO.Int.Input(
+ "seed",
+ default=0,
+ min=0,
+ max=2147483647,
+ display_mode=IO.NumberDisplay.number,
+ control_after_generate=True,
+ tooltip="Seed controls whether the node should re-run; "
+ "results are non-deterministic regardless of seed.",
+ optional=True,
+ ),
],
outputs=[
IO.Image.Output(),
@@ -2183,7 +2516,7 @@ class KlingImageGenerationNode(IO.ComfyNode):
$base :=
$contains($m,"kling-v1-5")
? (inputs.image.connected ? 0.028 : 0.014)
- : ($contains($m,"kling-v1") ? 0.0035 : 0.014);
+ : $contains($m,"kling-v3") ? 0.028 : 0.014;
{"type":"usd","usd": $base * widgets.n}
)
""",
@@ -2193,7 +2526,7 @@ class KlingImageGenerationNode(IO.ComfyNode):
@classmethod
async def execute(
cls,
- model_name: KlingImageGenModelName,
+ model_name: str,
prompt: str,
negative_prompt: str,
image_type: KlingImageGenImageReferenceType,
@@ -2202,17 +2535,11 @@ class KlingImageGenerationNode(IO.ComfyNode):
n: int,
aspect_ratio: KlingImageGenAspectRatio,
image: torch.Tensor | None = None,
+ seed: int = 0,
) -> IO.NodeOutput:
+ _ = seed
validate_string(prompt, field_name="prompt", min_length=1, max_length=MAX_PROMPT_LENGTH_IMAGE_GEN)
validate_string(negative_prompt, field_name="negative_prompt", max_length=MAX_PROMPT_LENGTH_IMAGE_GEN)
-
- if image is None:
- image_type = None
- elif model_name == KlingImageGenModelName.kling_v1:
- raise ValueError(f"The model {KlingImageGenModelName.kling_v1.value} does not support reference images.")
- else:
- image = tensor_to_base64_string(image)
-
task_creation_response = await sync_op(
cls,
ApiEndpoint(path=PATH_IMAGE_GENERATIONS, method="POST"),
@@ -2221,8 +2548,8 @@ class KlingImageGenerationNode(IO.ComfyNode):
model_name=model_name,
prompt=prompt,
negative_prompt=negative_prompt,
- image=image,
- image_reference=image_type,
+ image=tensor_to_base64_string(image) if image is not None else None,
+ image_reference=image_type if image is not None else None,
image_fidelity=image_fidelity,
human_fidelity=human_fidelity,
n=n,
@@ -2252,7 +2579,7 @@ class TextToVideoWithAudio(IO.ComfyNode):
def define_schema(cls) -> IO.Schema:
return IO.Schema(
node_id="KlingTextToVideoWithAudio",
- display_name="Kling Text to Video with Audio",
+ display_name="Kling 2.6 Text to Video with Audio",
category="api node/video/Kling",
inputs=[
IO.Combo.Input("model_name", options=["kling-v2-6"]),
@@ -2320,7 +2647,7 @@ class ImageToVideoWithAudio(IO.ComfyNode):
def define_schema(cls) -> IO.Schema:
return IO.Schema(
node_id="KlingImageToVideoWithAudio",
- display_name="Kling Image(First Frame) to Video with Audio",
+ display_name="Kling 2.6 Image(First Frame) to Video with Audio",
category="api node/video/Kling",
inputs=[
IO.Combo.Input("model_name", options=["kling-v2-6"]),
@@ -2478,6 +2805,335 @@ class MotionControl(IO.ComfyNode):
return IO.NodeOutput(await download_url_to_video_output(final_response.data.task_result.videos[0].url))
+class KlingVideoNode(IO.ComfyNode):
+
+ @classmethod
+ def define_schema(cls) -> IO.Schema:
+ return IO.Schema(
+ node_id="KlingVideoNode",
+ display_name="Kling 3.0 Video",
+ category="api node/video/Kling",
+ description="Generate videos with Kling V3. "
+ "Supports text-to-video and image-to-video with optional storyboard multi-prompt and audio generation.",
+ inputs=[
+ IO.DynamicCombo.Input(
+ "multi_shot",
+ options=[
+ IO.DynamicCombo.Option(
+ "disabled",
+ [
+ IO.String.Input("prompt", multiline=True, default=""),
+ IO.String.Input("negative_prompt", multiline=True, default=""),
+ IO.Int.Input(
+ "duration",
+ default=5,
+ min=3,
+ max=15,
+ display_mode=IO.NumberDisplay.slider,
+ ),
+ ],
+ ),
+ IO.DynamicCombo.Option("1 storyboard", _generate_storyboard_inputs(1)),
+ IO.DynamicCombo.Option("2 storyboards", _generate_storyboard_inputs(2)),
+ IO.DynamicCombo.Option("3 storyboards", _generate_storyboard_inputs(3)),
+ IO.DynamicCombo.Option("4 storyboards", _generate_storyboard_inputs(4)),
+ IO.DynamicCombo.Option("5 storyboards", _generate_storyboard_inputs(5)),
+ IO.DynamicCombo.Option("6 storyboards", _generate_storyboard_inputs(6)),
+ ],
+ tooltip="Generate a series of video segments with individual prompts and durations.",
+ ),
+ IO.Boolean.Input("generate_audio", default=True),
+ IO.DynamicCombo.Input(
+ "model",
+ options=[
+ IO.DynamicCombo.Option(
+ "kling-v3",
+ [
+ IO.Combo.Input("resolution", options=["1080p", "720p"]),
+ IO.Combo.Input(
+ "aspect_ratio",
+ options=["16:9", "9:16", "1:1"],
+ tooltip="Ignored in image-to-video mode.",
+ ),
+ ],
+ ),
+ ],
+ tooltip="Model and generation settings.",
+ ),
+ IO.Int.Input(
+ "seed",
+ default=0,
+ min=0,
+ max=2147483647,
+ display_mode=IO.NumberDisplay.number,
+ control_after_generate=True,
+ tooltip="Seed controls whether the node should re-run; "
+ "results are non-deterministic regardless of seed.",
+ ),
+ IO.Image.Input(
+ "start_frame",
+ optional=True,
+ tooltip="Optional start frame image. When connected, switches to image-to-video mode.",
+ ),
+ ],
+ outputs=[
+ IO.Video.Output(),
+ ],
+ hidden=[
+ IO.Hidden.auth_token_comfy_org,
+ IO.Hidden.api_key_comfy_org,
+ IO.Hidden.unique_id,
+ ],
+ is_api_node=True,
+ price_badge=IO.PriceBadge(
+ depends_on=IO.PriceBadgeDepends(
+ widgets=[
+ "model.resolution",
+ "generate_audio",
+ "multi_shot",
+ "multi_shot.duration",
+ "multi_shot.storyboard_1_duration",
+ "multi_shot.storyboard_2_duration",
+ "multi_shot.storyboard_3_duration",
+ "multi_shot.storyboard_4_duration",
+ "multi_shot.storyboard_5_duration",
+ "multi_shot.storyboard_6_duration",
+ ],
+ ),
+ expr="""
+ (
+ $rates := {"1080p": {"off": 0.112, "on": 0.168}, "720p": {"off": 0.084, "on": 0.126}};
+ $res := $lookup(widgets, "model.resolution");
+ $audio := widgets.generate_audio ? "on" : "off";
+ $rate := $lookup($lookup($rates, $res), $audio);
+ $ms := widgets.multi_shot;
+ $isSb := $ms != "disabled";
+ $n := $isSb ? $number($substring($ms, 0, 1)) : 0;
+ $d1 := $lookup(widgets, "multi_shot.storyboard_1_duration");
+ $d2 := $n >= 2 ? $lookup(widgets, "multi_shot.storyboard_2_duration") : 0;
+ $d3 := $n >= 3 ? $lookup(widgets, "multi_shot.storyboard_3_duration") : 0;
+ $d4 := $n >= 4 ? $lookup(widgets, "multi_shot.storyboard_4_duration") : 0;
+ $d5 := $n >= 5 ? $lookup(widgets, "multi_shot.storyboard_5_duration") : 0;
+ $d6 := $n >= 6 ? $lookup(widgets, "multi_shot.storyboard_6_duration") : 0;
+ $dur := $isSb ? $d1 + $d2 + $d3 + $d4 + $d5 + $d6 : $lookup(widgets, "multi_shot.duration");
+ {"type":"usd","usd": $rate * $dur}
+ )
+ """,
+ ),
+ )
+
+ @classmethod
+ async def execute(
+ cls,
+ multi_shot: dict,
+ generate_audio: bool,
+ model: dict,
+ seed: int,
+ start_frame: Input.Image | None = None,
+ ) -> IO.NodeOutput:
+ _ = seed
+ mode = "pro" if model["resolution"] == "1080p" else "std"
+ custom_multi_shot = False
+ if multi_shot["multi_shot"] == "disabled":
+ shot_type = None
+ else:
+ shot_type = "customize"
+ custom_multi_shot = True
+
+ multi_prompt_list = None
+ if shot_type == "customize":
+ count = int(multi_shot["multi_shot"].split()[0])
+ multi_prompt_list = []
+ for i in range(1, count + 1):
+ sb_prompt = multi_shot[f"storyboard_{i}_prompt"]
+ sb_duration = multi_shot[f"storyboard_{i}_duration"]
+ validate_string(sb_prompt, field_name=f"storyboard_{i}_prompt", min_length=1, max_length=512)
+ multi_prompt_list.append(
+ MultiPromptEntry(
+ index=i,
+ prompt=sb_prompt,
+ duration=str(sb_duration),
+ )
+ )
+ duration = sum(int(e.duration) for e in multi_prompt_list)
+ if duration < 3 or duration > 15:
+ raise ValueError(
+ f"Total storyboard duration ({duration}s) must be between 3 and 15 seconds."
+ )
+ else:
+ duration = multi_shot["duration"]
+ validate_string(multi_shot["prompt"], min_length=1, max_length=2500)
+
+ if start_frame is not None:
+ validate_image_dimensions(start_frame, min_width=300, min_height=300)
+ validate_image_aspect_ratio(start_frame, (1, 2.5), (2.5, 1))
+ image_url = await upload_image_to_comfyapi(cls, start_frame, wait_label="Uploading start frame")
+ response = await sync_op(
+ cls,
+ ApiEndpoint(path="/proxy/kling/v1/videos/image2video", method="POST"),
+ response_model=TaskStatusResponse,
+ data=ImageToVideoWithAudioRequest(
+ model_name=model["model"],
+ image=image_url,
+ prompt=None if custom_multi_shot else multi_shot["prompt"],
+ negative_prompt=None if custom_multi_shot else multi_shot["negative_prompt"],
+ mode=mode,
+ duration=str(duration),
+ sound="on" if generate_audio else "off",
+ multi_shot=True if shot_type else None,
+ multi_prompt=multi_prompt_list,
+ shot_type=shot_type,
+ ),
+ )
+ poll_path = f"/proxy/kling/v1/videos/image2video/{response.data.task_id}"
+ else:
+ response = await sync_op(
+ cls,
+ ApiEndpoint(path="/proxy/kling/v1/videos/text2video", method="POST"),
+ response_model=TaskStatusResponse,
+ data=TextToVideoWithAudioRequest(
+ model_name=model["model"],
+ aspect_ratio=model["aspect_ratio"],
+ prompt=None if custom_multi_shot else multi_shot["prompt"],
+ negative_prompt=None if custom_multi_shot else multi_shot["negative_prompt"],
+ mode=mode,
+ duration=str(duration),
+ sound="on" if generate_audio else "off",
+ multi_shot=True if shot_type else None,
+ multi_prompt=multi_prompt_list,
+ shot_type=shot_type,
+ ),
+ )
+ poll_path = f"/proxy/kling/v1/videos/text2video/{response.data.task_id}"
+
+ if response.code:
+ raise RuntimeError(
+ f"Kling request failed. Code: {response.code}, Message: {response.message}, Data: {response.data}"
+ )
+ final_response = await poll_op(
+ cls,
+ ApiEndpoint(path=poll_path),
+ response_model=TaskStatusResponse,
+ status_extractor=lambda r: (r.data.task_status if r.data else None),
+ )
+ return IO.NodeOutput(await download_url_to_video_output(final_response.data.task_result.videos[0].url))
+
+
+class KlingFirstLastFrameNode(IO.ComfyNode):
+
+ @classmethod
+ def define_schema(cls) -> IO.Schema:
+ return IO.Schema(
+ node_id="KlingFirstLastFrameNode",
+ display_name="Kling 3.0 First-Last-Frame to Video",
+ category="api node/video/Kling",
+ description="Generate videos with Kling V3 using first and last frames.",
+ inputs=[
+ IO.String.Input("prompt", multiline=True, default=""),
+ IO.Int.Input(
+ "duration",
+ default=5,
+ min=3,
+ max=15,
+ display_mode=IO.NumberDisplay.slider,
+ ),
+ IO.Image.Input("first_frame"),
+ IO.Image.Input("end_frame"),
+ IO.Boolean.Input("generate_audio", default=True),
+ IO.DynamicCombo.Input(
+ "model",
+ options=[
+ IO.DynamicCombo.Option(
+ "kling-v3",
+ [
+ IO.Combo.Input("resolution", options=["1080p", "720p"]),
+ ],
+ ),
+ ],
+ tooltip="Model and generation settings.",
+ ),
+ IO.Int.Input(
+ "seed",
+ default=0,
+ min=0,
+ max=2147483647,
+ display_mode=IO.NumberDisplay.number,
+ control_after_generate=True,
+ tooltip="Seed controls whether the node should re-run; "
+ "results are non-deterministic regardless of seed.",
+ ),
+ ],
+ outputs=[
+ IO.Video.Output(),
+ ],
+ hidden=[
+ IO.Hidden.auth_token_comfy_org,
+ IO.Hidden.api_key_comfy_org,
+ IO.Hidden.unique_id,
+ ],
+ is_api_node=True,
+ price_badge=IO.PriceBadge(
+ depends_on=IO.PriceBadgeDepends(
+ widgets=["model.resolution", "generate_audio", "duration"],
+ ),
+ expr="""
+ (
+ $rates := {"1080p": {"off": 0.112, "on": 0.168}, "720p": {"off": 0.084, "on": 0.126}};
+ $res := $lookup(widgets, "model.resolution");
+ $audio := widgets.generate_audio ? "on" : "off";
+ $rate := $lookup($lookup($rates, $res), $audio);
+ {"type":"usd","usd": $rate * widgets.duration}
+ )
+ """,
+ ),
+ )
+
+ @classmethod
+ async def execute(
+ cls,
+ prompt: str,
+ duration: int,
+ first_frame: Input.Image,
+ end_frame: Input.Image,
+ generate_audio: bool,
+ model: dict,
+ seed: int,
+ ) -> IO.NodeOutput:
+ _ = seed
+ validate_string(prompt, min_length=1, max_length=2500)
+ validate_image_dimensions(first_frame, min_width=300, min_height=300)
+ validate_image_aspect_ratio(first_frame, (1, 2.5), (2.5, 1))
+ validate_image_dimensions(end_frame, min_width=300, min_height=300)
+ validate_image_aspect_ratio(end_frame, (1, 2.5), (2.5, 1))
+ image_url = await upload_image_to_comfyapi(cls, first_frame, wait_label="Uploading first frame")
+ image_tail_url = await upload_image_to_comfyapi(cls, end_frame, wait_label="Uploading end frame")
+ response = await sync_op(
+ cls,
+ ApiEndpoint(path="/proxy/kling/v1/videos/image2video", method="POST"),
+ response_model=TaskStatusResponse,
+ data=ImageToVideoWithAudioRequest(
+ model_name=model["model"],
+ image=image_url,
+ image_tail=image_tail_url,
+ prompt=prompt,
+ mode="pro" if model["resolution"] == "1080p" else "std",
+ duration=str(duration),
+ sound="on" if generate_audio else "off",
+ ),
+ )
+ if response.code:
+ raise RuntimeError(
+ f"Kling request failed. Code: {response.code}, Message: {response.message}, Data: {response.data}"
+ )
+ final_response = await poll_op(
+ cls,
+ ApiEndpoint(path=f"/proxy/kling/v1/videos/image2video/{response.data.task_id}"),
+ response_model=TaskStatusResponse,
+ status_extractor=lambda r: (r.data.task_status if r.data else None),
+ )
+ return IO.NodeOutput(await download_url_to_video_output(final_response.data.task_result.videos[0].url))
+
+
class KlingExtension(ComfyExtension):
@override
async def get_node_list(self) -> list[type[IO.ComfyNode]]:
@@ -2504,6 +3160,8 @@ class KlingExtension(ComfyExtension):
TextToVideoWithAudio,
ImageToVideoWithAudio,
MotionControl,
+ KlingVideoNode,
+ KlingFirstLastFrameNode,
]
From 6615db925c9f84843e29db118852e14b643a1a03 Mon Sep 17 00:00:00 2001
From: ComfyUI Wiki
Date: Wed, 11 Feb 2026 02:24:56 +0800
Subject: [PATCH 27/33] chore: update workflow templates to v0.8.38 (#12394)
---
requirements.txt | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/requirements.txt b/requirements.txt
index 4e2773f5d..7de6a413c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
comfyui-frontend-package==1.38.13
-comfyui-workflow-templates==0.8.37
+comfyui-workflow-templates==0.8.38
comfyui-embedded-docs==0.4.1
torch
torchsde
From 6648ab68bc934a185c90a2a872c87dc64d093751 Mon Sep 17 00:00:00 2001
From: comfyanonymous
Date: Tue, 10 Feb 2026 13:26:29 -0500
Subject: [PATCH 28/33] ComfyUI v0.13.0
---
comfyui_version.py | 2 +-
pyproject.toml | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/comfyui_version.py b/comfyui_version.py
index 706b37763..cf4e89816 100644
--- a/comfyui_version.py
+++ b/comfyui_version.py
@@ -1,3 +1,3 @@
# This file is automatically generated by the build process when version is
# updated in pyproject.toml.
-__version__ = "0.12.3"
+__version__ = "0.13.0"
diff --git a/pyproject.toml b/pyproject.toml
index f7925b92a..9dab9a50c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
[project]
name = "ComfyUI"
-version = "0.12.3"
+version = "0.13.0"
readme = "README.md"
license = { file = "LICENSE" }
requires-python = ">=3.10"
From fe053ba5eb34c8abcc5d17a25c114340af1833aa Mon Sep 17 00:00:00 2001
From: rattus <46076784+rattus128@users.noreply.github.com>
Date: Tue, 10 Feb 2026 10:37:17 -0800
Subject: [PATCH 29/33] mp: don't deep-clone objects from model_options (#12382)
If there are non-trivial Python objects nested in model_options, deep-cloning
them causes all sorts of issues. Traverse lists and dicts so clones can safely
override settings and bring their own objects, but stop the deep copy there.
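
For illustration, the intended semantics of the helper added below (the nested
callable here is a hypothetical stand-in for any non-trivial object):

    opts = {"cfg_fn": lambda x: x, "transformer_options": {"patches": [1, 2]}}
    clone = deepcopy_list_dict(opts)
    assert clone is not opts  # dict container copied
    assert clone["transformer_options"]["patches"] is not opts["transformer_options"]["patches"]
    assert clone["cfg_fn"] is opts["cfg_fn"]  # leaf object shared, not cloned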
---
comfy/model_patcher.py | 3 +--
comfy/utils.py | 18 ++++++++++++++++++
2 files changed, 19 insertions(+), 2 deletions(-)
diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py
index b9a117a7c..19c9031ea 100644
--- a/comfy/model_patcher.py
+++ b/comfy/model_patcher.py
@@ -19,7 +19,6 @@
from __future__ import annotations
import collections
-import copy
import inspect
import logging
import math
@@ -317,7 +316,7 @@ class ModelPatcher:
n.object_patches = self.object_patches.copy()
n.weight_wrapper_patches = self.weight_wrapper_patches.copy()
- n.model_options = copy.deepcopy(self.model_options)
+ n.model_options = comfy.utils.deepcopy_list_dict(self.model_options)
n.backup = self.backup
n.object_patches_backup = self.object_patches_backup
n.parent = self
diff --git a/comfy/utils.py b/comfy/utils.py
index 1337e2205..edd80cebe 100644
--- a/comfy/utils.py
+++ b/comfy/utils.py
@@ -1376,3 +1376,21 @@ def string_to_seed(data):
else:
crc >>= 1
return crc ^ 0xFFFFFFFF
+
+def deepcopy_list_dict(obj, memo=None):
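+ """Copy dicts and lists recursively; share all other objects.
+
+ Unlike copy.deepcopy, leaf objects (tensors, models, callables) are not
+ cloned. Memo entries are recorded after recursion, so this assumes the
+ structure contains no reference cycles.
+ """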
+ if memo is None:
+ memo = {}
+
+ obj_id = id(obj)
+ if obj_id in memo:
+ return memo[obj_id]
+
+ if isinstance(obj, dict):
+ res = {deepcopy_list_dict(k, memo): deepcopy_list_dict(v, memo) for k, v in obj.items()}
+ elif isinstance(obj, list):
+ res = [deepcopy_list_dict(i, memo) for i in obj]
+ else:
+ res = obj
+
+ memo[obj_id] = res
+ return res
From f719f9c06266e7944683009b403e995d4c61d5f0 Mon Sep 17 00:00:00 2001
From: rattus <46076784+rattus128@users.noreply.github.com>
Date: Tue, 10 Feb 2026 10:37:46 -0800
Subject: [PATCH 30/33] sd: delay VAE dtype archive until after override
(#12388)
VAEs have host-specific dtype logic that should override the dynamic
_model_dtype. Defer archiving the model dtypes until after that override has
been applied.
---
comfy/sd.py | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/comfy/sd.py b/comfy/sd.py
index bc9407405..f65e7cadd 100644
--- a/comfy/sd.py
+++ b/comfy/sd.py
@@ -793,8 +793,6 @@ class VAE:
self.first_stage_model = AutoencoderKL(**(config['params']))
self.first_stage_model = self.first_stage_model.eval()
- model_management.archive_model_dtypes(self.first_stage_model)
-
if device is None:
device = model_management.vae_device()
self.device = device
@@ -803,6 +801,7 @@ class VAE:
dtype = model_management.vae_dtype(self.device, self.working_dtypes)
self.vae_dtype = dtype
self.first_stage_model.to(self.vae_dtype)
+ model_management.archive_model_dtypes(self.first_stage_model)
self.output_device = model_management.intermediate_device()
mp = comfy.model_patcher.CoreModelPatcher
From 123a7874a97c4a8b8f06d4b7c2b1a566b8f0d057 Mon Sep 17 00:00:00 2001
From: rattus <46076784+rattus128@users.noreply.github.com>
Date: Tue, 10 Feb 2026 10:38:28 -0800
Subject: [PATCH 31/33] ops: Fix vanilla-fp8 loaded lora quality (#12390)
This was missing the stochastic rounding required for fp8 downcast
to be consistent with model_patcher.patch_weight_to_device.
Missed in testing as I spent too much time with quantized tensors
and overlooked the simpler ones.
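
For context, stochastic rounding rounds a value to one of its two neighboring
representable values with probability proportional to proximity, so the result
is unbiased in expectation. A minimal sketch of the idea (integer grid for
clarity; not the actual comfy.float.stochastic_rounding implementation):

    import torch

    def stochastic_round_sketch(x: torch.Tensor, seed: int) -> torch.Tensor:
        # Round x down or up so that E[result] == x: the chance of rounding
        # up equals the fractional distance to the next integer.
        g = torch.Generator(device=x.device).manual_seed(seed)
        noise = torch.rand(x.shape, generator=g, device=x.device)
        return torch.floor(x + noise)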
---
comfy/ops.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/comfy/ops.py b/comfy/ops.py
index ea0d70702..33803b223 100644
--- a/comfy/ops.py
+++ b/comfy/ops.py
@@ -169,8 +169,8 @@ def cast_bias_weight_with_vbar(s, dtype, device, bias_dtype, non_blocking, compu
if orig.dtype == dtype and len(fns) == 0:
#The layer actually wants our freshly saved QT
x = y
- else:
- y = x
+ elif update_weight:
+ y = comfy.float.stochastic_rounding(x, orig.dtype, seed=comfy.utils.string_to_seed(s.seed_key))
if update_weight:
orig.copy_(y)
for f in fns:
From 00fff6019ecf0f4306005579e93cef0cd51a3a1c Mon Sep 17 00:00:00 2001
From: guill
Date: Tue, 10 Feb 2026 14:37:14 -0800
Subject: [PATCH 32/33] feat(jobs): add 3d to PREVIEWABLE_MEDIA_TYPES for
first-class 3D output support (#12381)
Co-authored-by: Jedrzej Kosinski
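
In practice this means a string output from a Preview3D-style node, e.g.
{'result': ['preview3d_abc123.glb', None, None]}, is normalized to a single
file dict:

    {'filename': 'preview3d_abc123.glb', 'type': 'output', 'subfolder': '', 'mediaType': '3d'}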
---
comfy_execution/jobs.py | 79 +++++++++++--
tests/execution/test_jobs.py | 208 ++++++++++++++++++++++++++++++++++-
2 files changed, 271 insertions(+), 16 deletions(-)
diff --git a/comfy_execution/jobs.py b/comfy_execution/jobs.py
index bf091a448..370014fb6 100644
--- a/comfy_execution/jobs.py
+++ b/comfy_execution/jobs.py
@@ -20,10 +20,60 @@ class JobStatus:
# Media types that can be previewed in the frontend
-PREVIEWABLE_MEDIA_TYPES = frozenset({'images', 'video', 'audio'})
+PREVIEWABLE_MEDIA_TYPES = frozenset({'images', 'video', 'audio', '3d'})
# 3D file extensions for preview fallback (no dedicated media_type exists)
-THREE_D_EXTENSIONS = frozenset({'.obj', '.fbx', '.gltf', '.glb'})
+THREE_D_EXTENSIONS = frozenset({'.obj', '.fbx', '.gltf', '.glb', '.usdz'})
+
+
+def has_3d_extension(filename: str) -> bool:
+ lower = filename.lower()
+ return any(lower.endswith(ext) for ext in THREE_D_EXTENSIONS)
+
+
+def normalize_output_item(item):
+ """Normalize a single output list item for the jobs API.
+
+ Returns the normalized item, or None to exclude it.
+ String items with 3D extensions become {filename, type, subfolder} dicts.
+ """
+ if item is None:
+ return None
+ if isinstance(item, str):
+ if has_3d_extension(item):
+ return {'filename': item, 'type': 'output', 'subfolder': '', 'mediaType': '3d'}
+ return None
+ if isinstance(item, dict):
+ return item
+ return None
+
+
+def normalize_outputs(outputs: dict) -> dict:
+ """Normalize raw node outputs for the jobs API.
+
+ Transforms string 3D filenames into file output dicts and removes
+ None items. All other items (non-3D strings, dicts, etc.) are
+ preserved as-is.
+ """
+ normalized = {}
+ for node_id, node_outputs in outputs.items():
+ if not isinstance(node_outputs, dict):
+ normalized[node_id] = node_outputs
+ continue
+ normalized_node = {}
+ for media_type, items in node_outputs.items():
+ if media_type == 'animated' or not isinstance(items, list):
+ normalized_node[media_type] = items
+ continue
+ normalized_items = []
+ for item in items:
+ if item is None:
+ continue
+ norm = normalize_output_item(item)
+ normalized_items.append(norm if norm is not None else item)
+ normalized_node[media_type] = normalized_items
+ normalized[node_id] = normalized_node
+ return normalized
def _extract_job_metadata(extra_data: dict) -> tuple[Optional[int], Optional[str]]:
@@ -45,9 +95,9 @@ def is_previewable(media_type: str, item: dict) -> bool:
Maintains backwards compatibility with existing logic.
Priority:
- 1. media_type is 'images', 'video', or 'audio'
+ 1. media_type is 'images', 'video', 'audio', or '3d'
2. format field starts with 'video/' or 'audio/'
- 3. filename has a 3D extension (.obj, .fbx, .gltf, .glb)
+ 3. filename has a 3D extension (.obj, .fbx, .gltf, .glb, .usdz)
"""
if media_type in PREVIEWABLE_MEDIA_TYPES:
return True
@@ -139,7 +189,7 @@ def normalize_history_item(prompt_id: str, history_item: dict, include_outputs:
})
if include_outputs:
- job['outputs'] = outputs
+ job['outputs'] = normalize_outputs(outputs)
job['execution_status'] = status_info
job['workflow'] = {
'prompt': prompt,
@@ -171,18 +221,23 @@ def get_outputs_summary(outputs: dict) -> tuple[int, Optional[dict]]:
continue
for item in items:
- count += 1
-
- if not isinstance(item, dict):
+ normalized = normalize_output_item(item)
+ if normalized is None:
continue
- if preview_output is None and is_previewable(media_type, item):
+ count += 1
+
+ if preview_output is not None:
+ continue
+
+ if isinstance(normalized, dict) and is_previewable(media_type, normalized):
enriched = {
- **item,
+ **normalized,
'nodeId': node_id,
- 'mediaType': media_type
}
- if item.get('type') == 'output':
+ if 'mediaType' not in normalized:
+ enriched['mediaType'] = media_type
+ if normalized.get('type') == 'output':
preview_output = enriched
elif fallback_preview is None:
fallback_preview = enriched
diff --git a/tests/execution/test_jobs.py b/tests/execution/test_jobs.py
index 4d2f9ed36..83c36fe48 100644
--- a/tests/execution/test_jobs.py
+++ b/tests/execution/test_jobs.py
@@ -5,8 +5,11 @@ from comfy_execution.jobs import (
is_previewable,
normalize_queue_item,
normalize_history_item,
+ normalize_output_item,
+ normalize_outputs,
get_outputs_summary,
apply_sorting,
+ has_3d_extension,
)
@@ -35,8 +38,8 @@ class TestIsPreviewable:
"""Unit tests for is_previewable()"""
def test_previewable_media_types(self):
- """Images, video, audio media types should be previewable."""
- for media_type in ['images', 'video', 'audio']:
+ """Images, video, audio, 3d media types should be previewable."""
+ for media_type in ['images', 'video', 'audio', '3d']:
assert is_previewable(media_type, {}) is True
def test_non_previewable_media_types(self):
@@ -46,7 +49,7 @@ class TestIsPreviewable:
def test_3d_extensions_previewable(self):
"""3D file extensions should be previewable regardless of media_type."""
- for ext in ['.obj', '.fbx', '.gltf', '.glb']:
+ for ext in ['.obj', '.fbx', '.gltf', '.glb', '.usdz']:
item = {'filename': f'model{ext}'}
assert is_previewable('files', item) is True
@@ -160,7 +163,7 @@ class TestGetOutputsSummary:
def test_3d_files_previewable(self):
"""3D file extensions should be previewable."""
- for ext in ['.obj', '.fbx', '.gltf', '.glb']:
+ for ext in ['.obj', '.fbx', '.gltf', '.glb', '.usdz']:
outputs = {
'node1': {
'files': [{'filename': f'model{ext}', 'type': 'output'}]
@@ -192,6 +195,64 @@ class TestGetOutputsSummary:
assert preview['mediaType'] == 'images'
assert preview['subfolder'] == 'outputs'
+ def test_string_3d_filename_creates_preview(self):
+ """String items with 3D extensions should synthesize a preview (Preview3D node output).
+ Only the .glb counts; the None items are excluded."""
+ outputs = {
+ 'node1': {
+ 'result': ['preview3d_abc123.glb', None, None]
+ }
+ }
+ count, preview = get_outputs_summary(outputs)
+ assert count == 1
+ assert preview is not None
+ assert preview['filename'] == 'preview3d_abc123.glb'
+ assert preview['mediaType'] == '3d'
+ assert preview['nodeId'] == 'node1'
+ assert preview['type'] == 'output'
+
+ def test_string_non_3d_filename_no_preview(self):
+ """String items without 3D extensions should not create a preview."""
+ outputs = {
+ 'node1': {
+ 'result': ['data.json', None]
+ }
+ }
+ count, preview = get_outputs_summary(outputs)
+ assert count == 0
+ assert preview is None
+
+ def test_string_3d_filename_used_as_fallback(self):
+ """String 3D preview should be used when no dict items are previewable."""
+ outputs = {
+ 'node1': {
+ 'latents': [{'filename': 'latent.safetensors'}],
+ },
+ 'node2': {
+ 'result': ['model.glb', None]
+ }
+ }
+ count, preview = get_outputs_summary(outputs)
+ assert preview is not None
+ assert preview['filename'] == 'model.glb'
+ assert preview['mediaType'] == '3d'
+
+
+class TestHas3DExtension:
+ """Unit tests for has_3d_extension()"""
+
+ def test_recognized_extensions(self):
+ for ext in ['.obj', '.fbx', '.gltf', '.glb', '.usdz']:
+ assert has_3d_extension(f'model{ext}') is True
+
+ def test_case_insensitive(self):
+ assert has_3d_extension('MODEL.GLB') is True
+ assert has_3d_extension('Scene.GLTF') is True
+
+ def test_non_3d_extensions(self):
+ for name in ['photo.png', 'video.mp4', 'data.json', 'model']:
+ assert has_3d_extension(name) is False
+
class TestApplySorting:
"""Unit tests for apply_sorting()"""
@@ -395,3 +456,142 @@ class TestNormalizeHistoryItem:
'prompt': {'nodes': {'1': {}}},
'extra_data': {'create_time': 1234567890, 'client_id': 'abc'},
}
+
+ def test_include_outputs_normalizes_3d_strings(self):
+ """Detail view should transform string 3D filenames into file output dicts."""
+ history_item = {
+ 'prompt': (
+ 5,
+ 'prompt-3d',
+ {'nodes': {}},
+ {'create_time': 1234567890},
+ ['node1'],
+ ),
+ 'status': {'status_str': 'success', 'completed': True, 'messages': []},
+ 'outputs': {
+ 'node1': {
+ 'result': ['preview3d_abc123.glb', None, None]
+ }
+ },
+ }
+ job = normalize_history_item('prompt-3d', history_item, include_outputs=True)
+
+ assert job['outputs_count'] == 1
+ result_items = job['outputs']['node1']['result']
+ assert len(result_items) == 1
+ assert result_items[0] == {
+ 'filename': 'preview3d_abc123.glb',
+ 'type': 'output',
+ 'subfolder': '',
+ 'mediaType': '3d',
+ }
+
+ def test_include_outputs_preserves_dict_items(self):
+ """Detail view normalization should pass dict items through unchanged."""
+ history_item = {
+ 'prompt': (
+ 5,
+ 'prompt-img',
+ {'nodes': {}},
+ {'create_time': 1234567890},
+ ['node1'],
+ ),
+ 'status': {'status_str': 'success', 'completed': True, 'messages': []},
+ 'outputs': {
+ 'node1': {
+ 'images': [
+ {'filename': 'photo.png', 'type': 'output', 'subfolder': ''},
+ ]
+ }
+ },
+ }
+ job = normalize_history_item('prompt-img', history_item, include_outputs=True)
+
+ assert job['outputs_count'] == 1
+ assert job['outputs']['node1']['images'] == [
+ {'filename': 'photo.png', 'type': 'output', 'subfolder': ''},
+ ]
+
+
+class TestNormalizeOutputItem:
+ """Unit tests for normalize_output_item()"""
+
+ def test_none_returns_none(self):
+ assert normalize_output_item(None) is None
+
+ def test_string_3d_extension_synthesizes_dict(self):
+ result = normalize_output_item('model.glb')
+ assert result == {'filename': 'model.glb', 'type': 'output', 'subfolder': '', 'mediaType': '3d'}
+
+ def test_string_non_3d_extension_returns_none(self):
+ assert normalize_output_item('data.json') is None
+
+ def test_string_no_extension_returns_none(self):
+ assert normalize_output_item('camera_info_string') is None
+
+ def test_dict_passes_through(self):
+ item = {'filename': 'test.png', 'type': 'output'}
+ assert normalize_output_item(item) is item
+
+ def test_other_types_return_none(self):
+ assert normalize_output_item(42) is None
+ assert normalize_output_item(True) is None
+
+
+class TestNormalizeOutputs:
+ """Unit tests for normalize_outputs()"""
+
+ def test_empty_outputs(self):
+ assert normalize_outputs({}) == {}
+
+ def test_dict_items_pass_through(self):
+ outputs = {
+ 'node1': {
+ 'images': [{'filename': 'a.png', 'type': 'output'}],
+ }
+ }
+ result = normalize_outputs(outputs)
+ assert result == outputs
+
+ def test_3d_string_synthesized(self):
+ outputs = {
+ 'node1': {
+ 'result': ['model.glb', None, None],
+ }
+ }
+ result = normalize_outputs(outputs)
+ assert result == {
+ 'node1': {
+ 'result': [
+ {'filename': 'model.glb', 'type': 'output', 'subfolder': '', 'mediaType': '3d'},
+ ],
+ }
+ }
+
+ def test_animated_key_preserved(self):
+ outputs = {
+ 'node1': {
+ 'images': [{'filename': 'a.png', 'type': 'output'}],
+ 'animated': [True],
+ }
+ }
+ result = normalize_outputs(outputs)
+ assert result['node1']['animated'] == [True]
+
+ def test_non_dict_node_outputs_preserved(self):
+ outputs = {'node1': 'unexpected_value'}
+ result = normalize_outputs(outputs)
+ assert result == {'node1': 'unexpected_value'}
+
+ def test_none_items_filtered_but_other_types_preserved(self):
+ outputs = {
+ 'node1': {
+ 'result': ['data.json', None, [1, 2, 3]],
+ }
+ }
+ result = normalize_outputs(outputs)
+ assert result == {
+ 'node1': {
+ 'result': ['data.json', [1, 2, 3]],
+ }
+ }
From dbe70b6821994ce92d9cf211cc685862d0b6c0ca Mon Sep 17 00:00:00 2001
From: AustinMroz
Date: Tue, 10 Feb 2026 14:42:21 -0800
Subject: [PATCH 33/33] Add a VideoSlice node (#12107)
* Base TrimVideo implementation
* Raise error if as_trimmed call fails
* Bigger max start_time, tooltips, and formatting
* Count packets unless codec has subframes
* Remove incorrect nested decode
* Add null check for audio streams
* Support non-strict duration
* Added strict_duration bool to node definition
* Empty commit for approval
* Fix duration
* Support 5.1 audio layout on save
---------
Co-authored-by: Jedrzej Kosinski
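
A quick usage sketch of the new trimming API (the input path is hypothetical):

    from comfy_api.latest._input_impl.video_types import VideoFromFile

    video = VideoFromFile("input.mp4")
    # Keep 5 seconds starting 2 seconds in. With strict_duration, a source
    # shorter than 7 seconds yields None instead of a shorter clip.
    clip = video.as_trimmed(start_time=2.0, duration=5.0, strict_duration=True)
    if clip is None:
        raise ValueError("Source video is too short for the requested slice")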
---
comfy_api/latest/_input/video_types.py | 15 ++
comfy_api/latest/_input_impl/video_types.py | 201 ++++++++++++++------
comfy_extras/nodes_video.py | 51 +++++
3 files changed, 207 insertions(+), 60 deletions(-)
diff --git a/comfy_api/latest/_input/video_types.py b/comfy_api/latest/_input/video_types.py
index e634a0311..451e9526e 100644
--- a/comfy_api/latest/_input/video_types.py
+++ b/comfy_api/latest/_input/video_types.py
@@ -34,6 +34,21 @@ class VideoInput(ABC):
"""
pass
+ @abstractmethod
+ def as_trimmed(
+ self,
+ start_time: float | None = None,
+ duration: float | None = None,
+ strict_duration: bool = False,
+ ) -> VideoInput | None:
+ """
+ Create a new VideoInput trimmed to the given start_time and duration.
+
+ Returns:
+ A new VideoInput, or None if the requested slice cannot be produced
+ (for example, when strict_duration is set and the source is too short)
+ """
+ pass
+
def get_stream_source(self) -> Union[str, io.BytesIO]:
"""
Get a streamable source for the video. This allows processing without
diff --git a/comfy_api/latest/_input_impl/video_types.py b/comfy_api/latest/_input_impl/video_types.py
index 1405d0b81..3463ed1c9 100644
--- a/comfy_api/latest/_input_impl/video_types.py
+++ b/comfy_api/latest/_input_impl/video_types.py
@@ -6,6 +6,7 @@ from typing import Optional
from .._input import AudioInput, VideoInput
import av
import io
+import itertools
import json
import numpy as np
import math
@@ -29,7 +30,6 @@ def container_to_output_format(container_format: str | None) -> str | None:
formats = container_format.split(",")
return formats[0]
-
def get_open_write_kwargs(
dest: str | io.BytesIO, container_format: str, to_format: str | None
) -> dict:
@@ -57,12 +57,14 @@ class VideoFromFile(VideoInput):
Class representing video input from a file.
"""
- def __init__(self, file: str | io.BytesIO):
+ def __init__(self, file: str | io.BytesIO, *, start_time: float = 0, duration: float = 0):
"""
Initialize the VideoFromFile object based off of either a path on disk or a BytesIO object
containing the file contents.
"""
self.__file = file
+ self.__start_time = start_time
+ self.__duration = duration
def get_stream_source(self) -> str | io.BytesIO:
"""
@@ -96,6 +98,16 @@ class VideoFromFile(VideoInput):
Returns:
Duration in seconds
"""
+ raw_duration = self._get_raw_duration()
+ if self.__start_time < 0:
+ duration_from_start = min(raw_duration, -self.__start_time)
+ else:
+ duration_from_start = raw_duration - self.__start_time
+ if self.__duration:
+ return min(self.__duration, duration_from_start)
+ return duration_from_start
+
+ def _get_raw_duration(self) -> float:
if isinstance(self.__file, io.BytesIO):
self.__file.seek(0)
with av.open(self.__file, mode="r") as container:
@@ -113,9 +125,13 @@ class VideoFromFile(VideoInput):
if video_stream and video_stream.average_rate:
frame_count = 0
container.seek(0)
- for packet in container.demux(video_stream):
- for _ in packet.decode():
- frame_count += 1
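+ # Decode when the codec can emit multiple frames per packet (capability
+ # bit 0x100, assumed to be AV_CODEC_CAP_SUBFRAMES); otherwise counting
+ # demuxed packets is enough and avoids a full decode.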
+ frame_iterator = (
+ container.decode(video_stream)
+ if video_stream.codec.capabilities & 0x100
+ else container.demux(video_stream)
+ )
+ for packet in frame_iterator:
+ frame_count += 1
if frame_count > 0:
return float(frame_count / video_stream.average_rate)
@@ -131,36 +147,54 @@ class VideoFromFile(VideoInput):
with av.open(self.__file, mode="r") as container:
video_stream = self._get_first_video_stream(container)
- # 1. Prefer the frames field if available
- if video_stream.frames and video_stream.frames > 0:
+ # 1. Prefer the frames field if available and usable
+ if (
+ video_stream.frames
+ and video_stream.frames > 0
+ and not self.__start_time
+ and not self.__duration
+ ):
return int(video_stream.frames)
# 2. Try to estimate from duration and average_rate using only metadata
- if container.duration is not None and video_stream.average_rate:
- duration_seconds = float(container.duration / av.time_base)
- estimated_frames = int(round(duration_seconds * float(video_stream.average_rate)))
- if estimated_frames > 0:
- return estimated_frames
-
if (
getattr(video_stream, "duration", None) is not None
and getattr(video_stream, "time_base", None) is not None
and video_stream.average_rate
):
- duration_seconds = float(video_stream.duration * video_stream.time_base)
+ raw_duration = float(video_stream.duration * video_stream.time_base)
+ if self.__start_time < 0:
+ duration_from_start = min(raw_duration, -self.__start_time)
+ else:
+ duration_from_start = raw_duration - self.__start_time
+ duration_seconds = min(self.__duration, duration_from_start) if self.__duration else duration_from_start
estimated_frames = int(round(duration_seconds * float(video_stream.average_rate)))
if estimated_frames > 0:
return estimated_frames
# 3. Last resort: decode frames and count them (streaming)
- frame_count = 0
- container.seek(0)
- for packet in container.demux(video_stream):
- for _ in packet.decode():
- frame_count += 1
-
- if frame_count == 0:
- raise ValueError(f"Could not determine frame count for file '{self.__file}'")
+ if self.__start_time < 0:
+ start_time = max(self._get_raw_duration() + self.__start_time, 0)
+ else:
+ start_time = self.__start_time
+ frame_count = 1
+ start_pts = int(start_time / video_stream.time_base)
+ end_pts = int((start_time + self.__duration) / video_stream.time_base) if self.__duration else float("inf")
+ container.seek(start_pts, stream=video_stream)
+ frame_iterator = (
+ container.decode(video_stream)
+ if video_stream.codec.capabilities & 0x100
+ else container.demux(video_stream)
+ )
+ for frame in frame_iterator:
+ if frame.pts >= start_pts:
+ break
+ else:
+ raise ValueError(f"Could not determine frame count for file '{self.__file}'\nNo frames exist for start_time {self.__start_time}")
+ for frame in frame_iterator:
+ if frame.pts >= end_pts:
+ break
+ frame_count += 1
return frame_count
def get_frame_rate(self) -> Fraction:
@@ -199,9 +233,21 @@ class VideoFromFile(VideoInput):
return container.format.name
def get_components_internal(self, container: InputContainer) -> VideoComponents:
+ video_stream = self._get_first_video_stream(container)
+ if self.__start_time < 0:
+ start_time = max(self._get_raw_duration() + self.__start_time, 0)
+ else:
+ start_time = self.__start_time
# Get video frames
frames = []
- for frame in container.decode(video=0):
+ start_pts = int(start_time / video_stream.time_base)
+ end_pts = int((start_time + self.__duration) / video_stream.time_base)
+ container.seek(start_pts, stream=video_stream)
+ for frame in container.decode(video_stream):
+ if frame.pts < start_pts:
+ continue
+ if self.__duration and frame.pts >= end_pts:
+ break
img = frame.to_ndarray(format='rgb24') # shape: (H, W, 3)
img = torch.from_numpy(img) / 255.0 # shape: (H, W, 3)
frames.append(img)
@@ -209,31 +255,44 @@ class VideoFromFile(VideoInput):
images = torch.stack(frames) if len(frames) > 0 else torch.zeros(0, 3, 0, 0)
# Get frame rate
- video_stream = next(s for s in container.streams if s.type == 'video')
- frame_rate = Fraction(video_stream.average_rate) if video_stream and video_stream.average_rate else Fraction(1)
+ frame_rate = Fraction(video_stream.average_rate) if video_stream.average_rate else Fraction(1)
# Get audio if available
audio = None
- try:
- container.seek(0) # Reset the container to the beginning
- for stream in container.streams:
- if stream.type != 'audio':
- continue
- assert isinstance(stream, av.AudioStream)
- audio_frames = []
- for packet in container.demux(stream):
- for frame in packet.decode():
- assert isinstance(frame, av.AudioFrame)
- audio_frames.append(frame.to_ndarray()) # shape: (channels, samples)
- if len(audio_frames) > 0:
- audio_data = np.concatenate(audio_frames, axis=1) # shape: (channels, total_samples)
- audio_tensor = torch.from_numpy(audio_data).unsqueeze(0) # shape: (1, channels, total_samples)
- audio = AudioInput({
- "waveform": audio_tensor,
- "sample_rate": int(stream.sample_rate) if stream.sample_rate else 1,
- })
- except StopIteration:
- pass # No audio stream
+ container.seek(start_pts, stream=video_stream)
+ # Use last stream for consistency
+ if len(container.streams.audio):
+ audio_stream = container.streams.audio[-1]
+ audio_frames = []
+ resample = av.audio.resampler.AudioResampler(format='fltp').resample
+ frames = itertools.chain.from_iterable(
+ map(resample, container.decode(audio_stream))
+ )
+
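+ # Find the first audio frame overlapping start_time and drop the leading
+ # samples before it; subsequent frames are appended whole until the
+ # requested duration is covered.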
+ has_first_frame = False
+ for frame in frames:
+ offset_seconds = start_time - frame.pts * audio_stream.time_base
+ to_skip = max(int(offset_seconds * audio_stream.sample_rate), 0)
+ if to_skip < frame.samples:
+ has_first_frame = True
+ break
+ if has_first_frame:
+ audio_frames.append(frame.to_ndarray()[..., to_skip:])
+
+ for frame in frames:
+ if self.__duration and frame.time > start_time + self.__duration:
+ break
+ audio_frames.append(frame.to_ndarray()) # shape: (channels, samples)
+ if len(audio_frames) > 0:
+ audio_data = np.concatenate(audio_frames, axis=1) # shape: (channels, total_samples)
+ if self.__duration:
+ audio_data = audio_data[..., :int(self.__duration * audio_stream.sample_rate)]
+
+ audio_tensor = torch.from_numpy(audio_data).unsqueeze(0) # shape: (1, channels, total_samples)
+ audio = AudioInput({
+ "waveform": audio_tensor,
+ "sample_rate": int(audio_stream.sample_rate) if audio_stream.sample_rate else 1,
+ })
metadata = container.metadata
return VideoComponents(images=images, audio=audio, frame_rate=frame_rate, metadata=metadata)
@@ -250,7 +309,7 @@ class VideoFromFile(VideoInput):
path: str | io.BytesIO,
format: VideoContainer = VideoContainer.AUTO,
codec: VideoCodec = VideoCodec.AUTO,
- metadata: Optional[dict] = None
+ metadata: Optional[dict] = None,
):
if isinstance(self.__file, io.BytesIO):
self.__file.seek(0) # Reset the BytesIO object to the beginning
@@ -262,15 +321,14 @@ class VideoFromFile(VideoInput):
reuse_streams = False
if codec != VideoCodec.AUTO and codec != video_encoding and video_encoding is not None:
reuse_streams = False
+ if self.__start_time or self.__duration:
+ reuse_streams = False
if not reuse_streams:
components = self.get_components_internal(container)
video = VideoFromComponents(components)
return video.save_to(
- path,
- format=format,
- codec=codec,
- metadata=metadata
+ path, format=format, codec=codec, metadata=metadata
)
streams = container.streams
@@ -304,10 +362,21 @@ class VideoFromFile(VideoInput):
output_container.mux(packet)
def _get_first_video_stream(self, container: InputContainer):
- video_stream = next((s for s in container.streams if s.type == "video"), None)
- if video_stream is None:
- raise ValueError(f"No video stream found in file '{self.__file}'")
- return video_stream
+ if len(container.streams.video):
+ return container.streams.video[0]
+ raise ValueError(f"No video stream found in file '{self.__file}'")
+
+ def as_trimmed(
+ self, start_time: float = 0, duration: float = 0, strict_duration: bool = True
+ ) -> VideoInput | None:
+ trimmed = VideoFromFile(
+ self.get_stream_source(),
+ start_time=start_time + self.__start_time,
+ duration=duration,
+ )
+ if trimmed.get_duration() < duration and strict_duration:
+ return None
+ return trimmed
class VideoFromComponents(VideoInput):
@@ -322,7 +391,7 @@ class VideoFromComponents(VideoInput):
return VideoComponents(
images=self.__components.images,
audio=self.__components.audio,
- frame_rate=self.__components.frame_rate
+ frame_rate=self.__components.frame_rate,
)
def save_to(
@@ -330,7 +399,7 @@ class VideoFromComponents(VideoInput):
path: str,
format: VideoContainer = VideoContainer.AUTO,
codec: VideoCodec = VideoCodec.AUTO,
- metadata: Optional[dict] = None
+ metadata: Optional[dict] = None,
):
if format != VideoContainer.AUTO and format != VideoContainer.MP4:
raise ValueError("Only MP4 format is supported for now")
@@ -357,7 +426,10 @@ class VideoFromComponents(VideoInput):
audio_stream: Optional[av.AudioStream] = None
if self.__components.audio:
audio_sample_rate = int(self.__components.audio['sample_rate'])
- audio_stream = output.add_stream('aac', rate=audio_sample_rate)
+ waveform = self.__components.audio['waveform']
+ waveform = waveform[0, :, :math.ceil((audio_sample_rate / frame_rate) * self.__components.images.shape[0])]
+ layout = {1: 'mono', 2: 'stereo', 6: '5.1'}.get(waveform.shape[0], 'stereo')
+ audio_stream = output.add_stream('aac', rate=audio_sample_rate, layout=layout)
# Encode video
for i, frame in enumerate(self.__components.images):
@@ -372,12 +444,21 @@ class VideoFromComponents(VideoInput):
output.mux(packet)
if audio_stream and self.__components.audio:
- waveform = self.__components.audio['waveform']
- waveform = waveform[:, :, :math.ceil((audio_sample_rate / frame_rate) * self.__components.images.shape[0])]
- frame = av.AudioFrame.from_ndarray(waveform.movedim(2, 1).reshape(1, -1).float().cpu().numpy(), format='flt', layout='mono' if waveform.shape[1] == 1 else 'stereo')
+ frame = av.AudioFrame.from_ndarray(waveform.float().cpu().numpy(), format='fltp', layout=layout)
frame.sample_rate = audio_sample_rate
frame.pts = 0
output.mux(audio_stream.encode(frame))
# Flush encoder
output.mux(audio_stream.encode(None))
+
+ def as_trimmed(
+ self,
+ start_time: float | None = None,
+ duration: float | None = None,
+ strict_duration: bool = True,
+ ) -> VideoInput | None:
+ start_time = start_time or 0.0
+ duration = duration or 0.0
+ if self.get_duration() < start_time + duration:
+ return None
+ # TODO: Consider tracking duration and trimming at time of save?
+ return VideoFromFile(self.get_stream_source(), start_time=start_time, duration=duration)
diff --git a/comfy_extras/nodes_video.py b/comfy_extras/nodes_video.py
index ccf7b63d3..cd765a7c1 100644
--- a/comfy_extras/nodes_video.py
+++ b/comfy_extras/nodes_video.py
@@ -202,6 +202,56 @@ class LoadVideo(io.ComfyNode):
return True
+class VideoSlice(io.ComfyNode):
+ @classmethod
+ def define_schema(cls):
+ return io.Schema(
+ node_id="Video Slice",
+ display_name="Video Slice",
+ search_aliases=[
+ "trim video duration",
+ "skip first frames",
+ "frame load cap",
+ "start time",
+ ],
+ category="image/video",
+ inputs=[
+ io.Video.Input("video"),
+ io.Float.Input(
+ "start_time",
+ default=0.0,
+ max=1e5,
+ min=-1e5,
+ step=0.001,
+ tooltip="Start time in seconds",
+ ),
+ io.Float.Input(
+ "duration",
+ default=0.0,
+ min=0.0,
+ step=0.001,
+ tooltip="Duration in seconds, or 0 for unlimited duration",
+ ),
+ io.Boolean.Input(
+ "strict_duration",
+ default=False,
+ tooltip="If True, when the specified duration is not possible, an error will be raised.",
+ ),
+ ],
+ outputs=[
+ io.Video.Output(),
+ ],
+ )
+
+ @classmethod
+ def execute(cls, video: io.Video.Type, start_time: float, duration: float, strict_duration: bool) -> io.NodeOutput:
+ trimmed = video.as_trimmed(start_time, duration, strict_duration=strict_duration)
+ if trimmed is not None:
+ return io.NodeOutput(trimmed)
+ raise ValueError(
+ f"Failed to slice video:\nSource duration: {video.get_duration()}\nStart time: {start_time}\nTarget duration: {duration}"
+ )
+
class VideoExtension(ComfyExtension):
@override
@@ -212,6 +262,7 @@ class VideoExtension(ComfyExtension):
CreateVideo,
GetVideoComponents,
LoadVideo,
+ VideoSlice,
]
async def comfy_entrypoint() -> VideoExtension: