Merge branch 'master' into worksplit-multigpu

2026-03-06 13:50:21 +00:00 · 2025-09-18 14:20:27 -07:00
parent efcd8280d6 dd611a7700
commit bb44c2ecb9
58 changed files with 3122 additions and 1050 deletions
--- a/comfy_extras/nodes_camera_trajectory.py
+++ b/comfy_extras/nodes_camera_trajectory.py
@@ -2,12 +2,12 @@ import nodes
 import torch
 import numpy as np
 from einops import rearrange
+from typing_extensions import override
 import comfy.model_management

+from comfy_api.latest import ComfyExtension, io


-MAX_RESOLUTION = nodes.MAX_RESOLUTION
-
 CAMERA_DICT = {
    "base_T_norm": 1.5,
    "base_angle": np.pi/3,
@@ -148,32 +148,47 @@ def get_camera_motion(angle, T, speed, n=81):
    RT = np.stack(RT)
    return RT

-class WanCameraEmbedding:
+class WanCameraEmbedding(io.ComfyNode):
    @classmethod
-    def INPUT_TYPES(cls):
-        return {
-            "required": {
-                "camera_pose":(["Static","Pan Up","Pan Down","Pan Left","Pan Right","Zoom In","Zoom Out","Anti Clockwise (ACW)", "ClockWise (CW)"],{"default":"Static"}),
-                "width": ("INT", {"default": 832, "min": 16, "max": MAX_RESOLUTION, "step": 16}),
-                "height": ("INT", {"default": 480, "min": 16, "max": MAX_RESOLUTION, "step": 16}),
-                "length": ("INT", {"default": 81, "min": 1, "max": MAX_RESOLUTION, "step": 4}),
-            },
-            "optional":{
-                "speed":("FLOAT",{"default":1.0, "min": 0, "max": 10.0, "step": 0.1}),
-                "fx":("FLOAT",{"default":0.5, "min": 0, "max": 1, "step": 0.000000001}),
-                "fy":("FLOAT",{"default":0.5, "min": 0, "max": 1, "step": 0.000000001}),
-                "cx":("FLOAT",{"default":0.5, "min": 0, "max": 1, "step": 0.01}),
-                "cy":("FLOAT",{"default":0.5, "min": 0, "max": 1, "step": 0.01}),
-            }
+    def define_schema(cls):  # Declares this node's id, category, inputs and outputs as an io.Schema.
+        return io.Schema(
+            node_id="WanCameraEmbedding",
+            category="camera",
+            inputs=[
+                io.Combo.Input(
+                    "camera_pose",
+                    options=[
+                        "Static",
+                        "Pan Up",
+                        "Pan Down",
+                        "Pan Left",
+                        "Pan Right",
+                        "Zoom In",
+                        "Zoom Out",
+                        "Anti Clockwise (ACW)",
+                        "ClockWise (CW)",
+                    ],
+                    default="Static",
+                ),
+                io.Int.Input("width", default=832, min=16, max=nodes.MAX_RESOLUTION, step=16),
+                io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16),
+                io.Int.Input("length", default=81, min=1, max=nodes.MAX_RESOLUTION, step=4),
+                io.Float.Input("speed", default=1.0, min=0, max=10.0, step=0.1, optional=True),  # optional inputs mirror the keyword defaults of execute()
+                io.Float.Input("fx", default=0.5, min=0, max=1, step=0.000000001, optional=True),
+                io.Float.Input("fy", default=0.5, min=0, max=1, step=0.000000001, optional=True),
+                io.Float.Input("cx", default=0.5, min=0, max=1, step=0.01, optional=True),
+                io.Float.Input("cy", default=0.5, min=0, max=1, step=0.01, optional=True),
+            ],
+            outputs=[  # four outputs, in the order execute() passes them to io.NodeOutput
+                io.WanCameraEmbedding.Output(display_name="camera_embedding"),
+                io.Int.Output(display_name="width"),
+                io.Int.Output(display_name="height"),
+                io.Int.Output(display_name="length"),
+            ],
+        )

-        }
-
-    RETURN_TYPES = ("WAN_CAMERA_EMBEDDING","INT","INT","INT")
-    RETURN_NAMES = ("camera_embedding","width","height","length")
-    FUNCTION = "run"
-    CATEGORY = "camera"
-
-    def run(self, camera_pose, width, height, length, speed=1.0,  fx=0.5, fy=0.5, cx=0.5, cy=0.5):
+    @classmethod
+    def execute(cls, camera_pose, width, height, length, speed=1.0, fx=0.5, fy=0.5, cx=0.5, cy=0.5) -> io.NodeOutput:
        """
        Use Camera trajectory as extrinsic parameters to calculate Plücker embeddings (Sitzmannet al., 2021)
        Adapted from https://github.com/aigc-apps/VideoX-Fun/blob/main/comfyui/comfyui_nodes.py
@@ -210,9 +225,15 @@ class WanCameraEmbedding:
        control_camera_video = control_camera_video.contiguous().view(b, f // 4, 4, c, h, w).transpose(2, 3)
        control_camera_video = control_camera_video.contiguous().view(b, f // 4, c * 4, h, w).transpose(1, 2)

-        return (control_camera_video, width, height, length)
+        return io.NodeOutput(control_camera_video, width, height, length)


-NODE_CLASS_MAPPINGS = {
-    "WanCameraEmbedding": WanCameraEmbedding,
-}
+class CameraTrajectoryExtension(ComfyExtension):  # ComfyExtension subclass for this module's camera node.
+    @override
+    async def get_node_list(self) -> list[type[io.ComfyNode]]:  # Returns the node classes this extension provides.
+        return [
+            WanCameraEmbedding,
+        ]
+
+async def comfy_entrypoint() -> CameraTrajectoryExtension:  # Async factory: builds and returns a new CameraTrajectoryExtension.
+    return CameraTrajectoryExtension()
--- a/comfy_extras/nodes_canny.py
+++ b/comfy_extras/nodes_canny.py
@@ -1,25 +1,41 @@
 from kornia.filters import canny
+from typing_extensions import override
+
 import comfy.model_management
+from comfy_api.latest import ComfyExtension, io


-class Canny:
+class Canny(io.ComfyNode):
    @classmethod
-    def INPUT_TYPES(s):
-        return {"required": {"image": ("IMAGE",),
-                                "low_threshold": ("FLOAT", {"default": 0.4, "min": 0.01, "max": 0.99, "step": 0.01}),
-                                "high_threshold": ("FLOAT", {"default": 0.8, "min": 0.01, "max": 0.99, "step": 0.01})
-                                }}
+    def define_schema(cls):  # Declares this node's id, category, inputs and outputs as an io.Schema.
+        return io.Schema(
+            node_id="Canny",
+            category="image/preprocessors",
+            inputs=[
+                io.Image.Input("image"),
+                io.Float.Input("low_threshold", default=0.4, min=0.01, max=0.99, step=0.01),  # both thresholds are forwarded to kornia's canny() by execute()
+                io.Float.Input("high_threshold", default=0.8, min=0.01, max=0.99, step=0.01),
+            ],
+            outputs=[io.Image.Output()],
+        )

-    RETURN_TYPES = ("IMAGE",)
-    FUNCTION = "detect_edge"
+    @classmethod
+    def detect_edge(cls, image, low_threshold, high_threshold):
+        # Deprecated alias under the pre-V3 method name; forwards its arguments unchanged to `execute`.
+        return cls.execute(image, low_threshold, high_threshold)

-    CATEGORY = "image/preprocessors"
-
-    def detect_edge(self, image, low_threshold, high_threshold):
+    @classmethod
+    def execute(cls, image, low_threshold, high_threshold) -> io.NodeOutput:
        output = canny(image.to(comfy.model_management.get_torch_device()).movedim(-1, 1), low_threshold, high_threshold)
        img_out = output[1].to(comfy.model_management.intermediate_device()).repeat(1, 3, 1, 1).movedim(1, -1)
-        return (img_out,)
+        return io.NodeOutput(img_out)

-NODE_CLASS_MAPPINGS = {
-    "Canny": Canny,
-}
+
+class CannyExtension(ComfyExtension):  # ComfyExtension subclass for this module's Canny node.
+    @override
+    async def get_node_list(self) -> list[type[io.ComfyNode]]:  # Returns the node classes this extension provides.
+        return [Canny]
+
+
+async def comfy_entrypoint() -> CannyExtension:  # Async factory: builds and returns a new CannyExtension.
+    return CannyExtension()
--- a/comfy_extras/nodes_cfg.py
+++ b/comfy_extras/nodes_cfg.py
@@ -1,5 +1,10 @@
+from typing_extensions import override
+
 import torch

+from comfy_api.latest import ComfyExtension, io
+
+
 # https://github.com/WeichenFan/CFG-Zero-star
 def optimized_scale(positive, negative):
    positive_flat = positive.reshape(positive.shape[0], -1)
@@ -16,17 +21,20 @@ def optimized_scale(positive, negative):

    return st_star.reshape([positive.shape[0]] + [1] * (positive.ndim - 1))

-class CFGZeroStar:
+class CFGZeroStar(io.ComfyNode):
    @classmethod
-    def INPUT_TYPES(s):
-        return {"required": {"model": ("MODEL",),
-                            }}
-    RETURN_TYPES = ("MODEL",)
-    RETURN_NAMES = ("patched_model",)
-    FUNCTION = "patch"
-    CATEGORY = "advanced/guidance"
+    def define_schema(cls) -> io.Schema:  # Declares this node's id, category, inputs and outputs.
+        return io.Schema(
+            node_id="CFGZeroStar",
+            category="advanced/guidance",
+            inputs=[
+                io.Model.Input("model"),
+            ],
+            outputs=[io.Model.Output(display_name="patched_model")],  # same output name as the removed RETURN_NAMES
+        )

-    def patch(self, model):
+    @classmethod
+    def execute(cls, model) -> io.NodeOutput:
        m = model.clone()
        def cfg_zero_star(args):
            guidance_scale = args['cond_scale']
@@ -38,21 +46,24 @@ class CFGZeroStar:

            return out + uncond_p * (alpha - 1.0)  + guidance_scale * uncond_p * (1.0 - alpha)
        m.set_model_sampler_post_cfg_function(cfg_zero_star)
-        return (m, )
+        return io.NodeOutput(m)

-class CFGNorm:
+class CFGNorm(io.ComfyNode):
    @classmethod
-    def INPUT_TYPES(s):
-        return {"required": {"model": ("MODEL",),
-                             "strength": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 100.0, "step": 0.01}),
-                            }}
-    RETURN_TYPES = ("MODEL",)
-    RETURN_NAMES = ("patched_model",)
-    FUNCTION = "patch"
-    CATEGORY = "advanced/guidance"
-    EXPERIMENTAL = True
+    def define_schema(cls) -> io.Schema:  # Declares this node's id, category, inputs and outputs.
+        return io.Schema(
+            node_id="CFGNorm",
+            category="advanced/guidance",
+            inputs=[
+                io.Model.Input("model"),
+                io.Float.Input("strength", default=1.0, min=0.0, max=100.0, step=0.01),
+            ],
+            outputs=[io.Model.Output(display_name="patched_model")],  # same output name as the removed RETURN_NAMES
+            is_experimental=True,  # takes the place of the removed `EXPERIMENTAL = True` class attribute
+        )

-    def patch(self, model, strength):
+    @classmethod
+    def execute(cls, model, strength) -> io.NodeOutput:
        m = model.clone()
        def cfg_norm(args):
            cond_p = args['cond_denoised']
@@ -64,9 +75,17 @@ class CFGNorm:
            return pred_text_ * scale * strength

        m.set_model_sampler_post_cfg_function(cfg_norm)
-        return (m, )
+        return io.NodeOutput(m)

-NODE_CLASS_MAPPINGS = {
-    "CFGZeroStar": CFGZeroStar,
-    "CFGNorm": CFGNorm,
-}
+
+class CfgExtension(ComfyExtension):  # ComfyExtension subclass for this module's two CFG nodes.
+    @override
+    async def get_node_list(self) -> list[type[io.ComfyNode]]:  # Returns the node classes this extension provides.
+        return [
+            CFGZeroStar,
+            CFGNorm,
+        ]
+
+
+async def comfy_entrypoint() -> CfgExtension:  # Async factory: builds and returns a new CfgExtension.
+    return CfgExtension()
--- a/comfy_extras/nodes_chroma_radiance.py
+++ b/comfy_extras/nodes_chroma_radiance.py
@@ -0,0 +1,114 @@
+from typing_extensions import override
+from typing import Callable
+
+import torch
+
+import comfy.model_management
+from comfy_api.latest import ComfyExtension, io
+
+import nodes
+
+class EmptyChromaRadianceLatentImage(io.ComfyNode):
+    # Produces a zero-filled latent of shape (batch_size, 3, height, width) on the intermediate device.
+    @classmethod
+    def define_schema(cls) -> io.Schema:
+        return io.Schema(
+            node_id="EmptyChromaRadianceLatentImage",
+            category="latent/chroma_radiance",
+            inputs=[
+                io.Int.Input(id="width", default=1024, min=16, max=nodes.MAX_RESOLUTION, step=16),
+                io.Int.Input(id="height", default=1024, min=16, max=nodes.MAX_RESOLUTION, step=16),
+                io.Int.Input(id="batch_size", default=1, min=1, max=4096),
+            ],
+            outputs=[io.Latent.Output()],  # Output is reached through the type itself; no Latent instance is created
+        )
+
+    @classmethod
+    def execute(cls, *, width: int, height: int, batch_size: int=1) -> io.NodeOutput:
+        return io.NodeOutput({"samples": torch.zeros((batch_size, 3, height, width), device=comfy.model_management.intermediate_device())})
+
+
+class ChromaRadianceOptions(io.ComfyNode):
+    # Model-patching node: installs a wrapper that adds "chroma_radiance_options" to
+    # transformer_options whenever the current sigma lies within [end_sigma, start_sigma].
+    @classmethod
+    def define_schema(cls) -> io.Schema:
+        # Inputs are built first, in the same order the schema lists them.
+        model_in = io.Model.Input(id="model")
+        keep_wrapper_in = io.Boolean.Input(
+            id="preserve_wrapper",
+            default=True,
+            tooltip="When enabled, will delegate to an existing model function wrapper if it exists. Generally should be left enabled.",
+        )
+        start_in = io.Float.Input(
+            id="start_sigma", default=1.0, min=0.0, max=1.0,
+            tooltip="First sigma that these options will be in effect.",
+        )
+        end_in = io.Float.Input(
+            id="end_sigma", default=0.0, min=0.0, max=1.0,
+            tooltip="Last sigma that these options will be in effect.",
+        )
+        tile_in = io.Int.Input(
+            id="nerf_tile_size", default=-1, min=-1,
+            tooltip="Allows overriding the default NeRF tile size. -1 means use the default (32). 0 means use non-tiling mode (may require a lot of VRAM).",
+        )
+
+        return io.Schema(
+            node_id="ChromaRadianceOptions",
+            category="model_patches/chroma_radiance",
+            description="Allows setting advanced options for the Chroma Radiance model.",
+            inputs=[model_in, keep_wrapper_in, start_in, end_in, tile_in],
+            outputs=[io.Model.Output()],
+        )
+
+    @classmethod
+    def execute(
+        cls,
+        *,
+        model: io.Model.Type,
+        preserve_wrapper: bool,
+        start_sigma: float,
+        end_sigma: float,
+        nerf_tile_size: int,
+    ) -> io.NodeOutput:
+        # Hands back the input model as-is when no option is set; otherwise a patched clone.
+        # A negative tile size means "not set", so it contributes nothing.
+        overrides = {"nerf_tile_size": nerf_tile_size} if nerf_tile_size >= 0 else {}
+        # Nothing to override: skip cloning and wrapping entirely.
+        if len(overrides) == 0:
+            return io.NodeOutput(model)
+
+        # Wrapper already recorded in the incoming model's options, if any.
+        previous = model.model_options.get("model_function_wrapper")
+
+        def model_function_wrapper(apply_model: Callable, args: dict) -> torch.Tensor:
+            cond = args["c"].copy()
+            # Maximum of args["timestep"], pulled to the CPU as a Python number.
+            current = args["timestep"].max().detach().cpu().item()
+            if end_sigma <= current <= start_sigma:
+                # Work on copies so neither the incoming dict nor `overrides` is shared.
+                patched = cond.get("transformer_options", {}).copy()
+                patched["chroma_radiance_options"] = overrides.copy()
+                cond["transformer_options"] = patched
+            if preserve_wrapper and previous:
+                # Delegate to the earlier wrapper, passing along the updated conditioning.
+                return previous(apply_model, args | {"c": cond})
+            return apply_model(args["input"], args["timestep"], **cond)
+
+        # Install the wrapper on a clone rather than on the incoming model object.
+        patched_model = model.clone()
+        patched_model.set_model_unet_function_wrapper(model_function_wrapper)
+        return io.NodeOutput(patched_model)
+
+
+class ChromaRadianceExtension(ComfyExtension):  # ComfyExtension subclass for this module's two Chroma Radiance nodes.
+    @override
+    async def get_node_list(self) -> list[type[io.ComfyNode]]:  # Returns the node classes this extension provides.
+        return [
+            EmptyChromaRadianceLatentImage,
+            ChromaRadianceOptions,
+        ]
+
+
+async def comfy_entrypoint() -> ChromaRadianceExtension:  # Async factory: builds and returns a new ChromaRadianceExtension.
+    return ChromaRadianceExtension()
--- a/comfy_extras/nodes_cond.py
+++ b/comfy_extras/nodes_cond.py
@@ -1,15 +1,25 @@
+from typing_extensions import override
+
+from comfy_api.latest import ComfyExtension, io


-class CLIPTextEncodeControlnet:
+class CLIPTextEncodeControlnet(io.ComfyNode):
    @classmethod
-    def INPUT_TYPES(s):
-        return {"required": {"clip": ("CLIP", ), "conditioning": ("CONDITIONING", ), "text": ("STRING", {"multiline": True, "dynamicPrompts": True})}}
-    RETURN_TYPES = ("CONDITIONING",)
-    FUNCTION = "encode"
+    def define_schema(cls) -> io.Schema:  # Declares this node's id, category, inputs and outputs.
+        return io.Schema(
+            node_id="CLIPTextEncodeControlnet",
+            category="_for_testing/conditioning",
+            inputs=[
+                io.Clip.Input("clip"),
+                io.Conditioning.Input("conditioning"),
+                io.String.Input("text", multiline=True, dynamic_prompts=True),  # carries over the old "multiline"/"dynamicPrompts" options
+            ],
+            outputs=[io.Conditioning.Output()],
+            is_experimental=True,
+        )

-    CATEGORY = "_for_testing/conditioning"
-
-    def encode(self, clip, conditioning, text):
+    @classmethod
+    def execute(cls, clip, conditioning, text) -> io.NodeOutput:
        tokens = clip.tokenize(text)
        cond, pooled = clip.encode_from_tokens(tokens, return_pooled=True)
        c = []
@@ -18,32 +28,41 @@ class CLIPTextEncodeControlnet:
            n[1]['cross_attn_controlnet'] = cond
            n[1]['pooled_output_controlnet'] = pooled
            c.append(n)
-        return (c, )
+        return io.NodeOutput(c)

-class T5TokenizerOptions:
+class T5TokenizerOptions(io.ComfyNode):
    @classmethod
-    def INPUT_TYPES(s):
-        return {
-            "required": {
-                "clip": ("CLIP", ),
-                "min_padding": ("INT", {"default": 0, "min": 0, "max": 10000, "step": 1}),
-                "min_length": ("INT", {"default": 0, "min": 0, "max": 10000, "step": 1}),
-            }
-        }
+    def define_schema(cls) -> io.Schema:  # Declares this node's id, category, inputs and outputs.
+        return io.Schema(
+            node_id="T5TokenizerOptions",
+            category="_for_testing/conditioning",
+            inputs=[
+                io.Clip.Input("clip"),
+                io.Int.Input("min_padding", default=0, min=0, max=10000, step=1),  # execute() stores both ints as per-tokenizer options on a clone of `clip`
+                io.Int.Input("min_length", default=0, min=0, max=10000, step=1),
+            ],
+            outputs=[io.Clip.Output()],
+            is_experimental=True,
+        )

-    CATEGORY = "_for_testing/conditioning"
-    RETURN_TYPES = ("CLIP",)
-    FUNCTION = "set_options"
-
-    def set_options(self, clip, min_padding, min_length):
+    @classmethod
+    def execute(cls, clip, min_padding, min_length) -> io.NodeOutput:
        clip = clip.clone()
        for t5_type in ["t5xxl", "pile_t5xl", "t5base", "mt5xl", "umt5xxl"]:
            clip.set_tokenizer_option("{}_min_padding".format(t5_type), min_padding)
            clip.set_tokenizer_option("{}_min_length".format(t5_type), min_length)

-        return (clip, )
+        return io.NodeOutput(clip)

-NODE_CLASS_MAPPINGS = {
-    "CLIPTextEncodeControlnet": CLIPTextEncodeControlnet,
-    "T5TokenizerOptions": T5TokenizerOptions,
-}
+
+class CondExtension(ComfyExtension):  # ComfyExtension subclass for this module's two conditioning nodes.
+    @override
+    async def get_node_list(self) -> list[type[io.ComfyNode]]:  # Returns the node classes this extension provides.
+        return [
+            CLIPTextEncodeControlnet,
+            T5TokenizerOptions,
+        ]
+
+
+async def comfy_entrypoint() -> CondExtension:  # Async factory: builds and returns a new CondExtension.
+    return CondExtension()
--- a/comfy_extras/nodes_cosmos.py
+++ b/comfy_extras/nodes_cosmos.py
@@ -1,25 +1,32 @@
+from typing_extensions import override
 import nodes
 import torch
 import comfy.model_management
 import comfy.utils
 import comfy.latent_formats

+from comfy_api.latest import ComfyExtension, io

-class EmptyCosmosLatentVideo:
+
+class EmptyCosmosLatentVideo(io.ComfyNode):
    @classmethod
-    def INPUT_TYPES(s):
-        return {"required": { "width": ("INT", {"default": 1280, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
-                              "height": ("INT", {"default": 704, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
-                              "length": ("INT", {"default": 121, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 8}),
-                              "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096})}}
-    RETURN_TYPES = ("LATENT",)
-    FUNCTION = "generate"
+    def define_schema(cls) -> io.Schema:  # Declares this node's id, category, inputs and outputs.
+        return io.Schema(
+            node_id="EmptyCosmosLatentVideo",
+            category="latent/video",
+            inputs=[
+                io.Int.Input("width", default=1280, min=16, max=nodes.MAX_RESOLUTION, step=16),
+                io.Int.Input("height", default=704, min=16, max=nodes.MAX_RESOLUTION, step=16),
+                io.Int.Input("length", default=121, min=1, max=nodes.MAX_RESOLUTION, step=8),  # execute() maps this to ((length - 1) // 8) + 1 latent frames
+                io.Int.Input("batch_size", default=1, min=1, max=4096),
+            ],
+            outputs=[io.Latent.Output()],
+        )

-    CATEGORY = "latent/video"
-
-    def generate(self, width, height, length, batch_size=1):
+    @classmethod
+    def execute(cls, width, height, length, batch_size=1) -> io.NodeOutput:
        latent = torch.zeros([batch_size, 16, ((length - 1) // 8) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device())
-        return ({"samples": latent}, )
+        return io.NodeOutput({"samples": latent})


 def vae_encode_with_padding(vae, image, width, height, length, padding=0):
@@ -33,31 +40,31 @@ def vae_encode_with_padding(vae, image, width, height, length, padding=0):
    return latent_temp[:, :, :latent_len]


-class CosmosImageToVideoLatent:
+class CosmosImageToVideoLatent(io.ComfyNode):
    @classmethod
-    def INPUT_TYPES(s):
-        return {"required": {"vae": ("VAE", ),
-                             "width": ("INT", {"default": 1280, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
-                             "height": ("INT", {"default": 704, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
-                             "length": ("INT", {"default": 121, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 8}),
-                             "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}),
-                },
-                "optional": {"start_image": ("IMAGE", ),
-                             "end_image": ("IMAGE", ),
-                }}
+    def define_schema(cls) -> io.Schema:  # Declares this node's id, category, inputs and outputs.
+        return io.Schema(
+            node_id="CosmosImageToVideoLatent",
+            category="conditioning/inpaint",
+            inputs=[
+                io.Vae.Input("vae"),
+                io.Int.Input("width", default=1280, min=16, max=nodes.MAX_RESOLUTION, step=16),
+                io.Int.Input("height", default=704, min=16, max=nodes.MAX_RESOLUTION, step=16),
+                io.Int.Input("length", default=121, min=1, max=nodes.MAX_RESOLUTION, step=8),
+                io.Int.Input("batch_size", default=1, min=1, max=4096),
+                io.Image.Input("start_image", optional=True),  # execute() defaults both images to None
+                io.Image.Input("end_image", optional=True),
+            ],
+            outputs=[io.Latent.Output()],
+        )

-
-    RETURN_TYPES = ("LATENT",)
-    FUNCTION = "encode"
-
-    CATEGORY = "conditioning/inpaint"
-
-    def encode(self, vae, width, height, length, batch_size, start_image=None, end_image=None):
+    @classmethod
+    def execute(cls, vae, width, height, length, batch_size, start_image=None, end_image=None) -> io.NodeOutput:
        latent = torch.zeros([1, 16, ((length - 1) // 8) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device())
        if start_image is None and end_image is None:
            out_latent = {}
            out_latent["samples"] = latent
-            return (out_latent,)
+            return io.NodeOutput(out_latent)

        mask = torch.ones([latent.shape[0], 1, ((length - 1) // 8) + 1, latent.shape[-2], latent.shape[-1]], device=comfy.model_management.intermediate_device())

@@ -74,33 +81,33 @@ class CosmosImageToVideoLatent:
        out_latent = {}
        out_latent["samples"] = latent.repeat((batch_size, ) + (1,) * (latent.ndim - 1))
        out_latent["noise_mask"] = mask.repeat((batch_size, ) + (1,) * (mask.ndim - 1))
-        return (out_latent,)
+        return io.NodeOutput(out_latent)

-class CosmosPredict2ImageToVideoLatent:
+class CosmosPredict2ImageToVideoLatent(io.ComfyNode):
    @classmethod
-    def INPUT_TYPES(s):
-        return {"required": {"vae": ("VAE", ),
-                             "width": ("INT", {"default": 848, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
-                             "height": ("INT", {"default": 480, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
-                             "length": ("INT", {"default": 93, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 4}),
-                             "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}),
-                },
-                "optional": {"start_image": ("IMAGE", ),
-                             "end_image": ("IMAGE", ),
-                }}
+    def define_schema(cls) -> io.Schema:  # Declares this node's id, category, inputs and outputs.
+        return io.Schema(
+            node_id="CosmosPredict2ImageToVideoLatent",
+            category="conditioning/inpaint",
+            inputs=[
+                io.Vae.Input("vae"),
+                io.Int.Input("width", default=848, min=16, max=nodes.MAX_RESOLUTION, step=16),
+                io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16),
+                io.Int.Input("length", default=93, min=1, max=nodes.MAX_RESOLUTION, step=4),  # execute() maps this to ((length - 1) // 4) + 1 latent frames
+                io.Int.Input("batch_size", default=1, min=1, max=4096),
+                io.Image.Input("start_image", optional=True),  # execute() defaults both images to None
+                io.Image.Input("end_image", optional=True),
+            ],
+            outputs=[io.Latent.Output()],
+        )

-
-    RETURN_TYPES = ("LATENT",)
-    FUNCTION = "encode"
-
-    CATEGORY = "conditioning/inpaint"
-
-    def encode(self, vae, width, height, length, batch_size, start_image=None, end_image=None):
+    @classmethod
+    def execute(cls, vae, width, height, length, batch_size, start_image=None, end_image=None) -> io.NodeOutput:
        latent = torch.zeros([1, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device())
        if start_image is None and end_image is None:
            out_latent = {}
            out_latent["samples"] = latent
-            return (out_latent,)
+            return io.NodeOutput(out_latent)

        mask = torch.ones([latent.shape[0], 1, ((length - 1) // 4) + 1, latent.shape[-2], latent.shape[-1]], device=comfy.model_management.intermediate_device())

@@ -119,10 +126,18 @@ class CosmosPredict2ImageToVideoLatent:
        latent = latent_format.process_out(latent) * mask + latent * (1.0 - mask)
        out_latent["samples"] = latent.repeat((batch_size, ) + (1,) * (latent.ndim - 1))
        out_latent["noise_mask"] = mask.repeat((batch_size, ) + (1,) * (mask.ndim - 1))
-        return (out_latent,)
+        return io.NodeOutput(out_latent)

-NODE_CLASS_MAPPINGS = {
-    "EmptyCosmosLatentVideo": EmptyCosmosLatentVideo,
-    "CosmosImageToVideoLatent": CosmosImageToVideoLatent,
-    "CosmosPredict2ImageToVideoLatent": CosmosPredict2ImageToVideoLatent,
-}
+
+class CosmosExtension(ComfyExtension):  # ComfyExtension subclass for this module's three Cosmos nodes.
+    @override
+    async def get_node_list(self) -> list[type[io.ComfyNode]]:  # Returns the node classes this extension provides.
+        return [
+            EmptyCosmosLatentVideo,
+            CosmosImageToVideoLatent,
+            CosmosPredict2ImageToVideoLatent,
+        ]
+
+
+async def comfy_entrypoint() -> CosmosExtension:  # Async factory: builds and returns a new CosmosExtension.
+    return CosmosExtension()
--- a/comfy_extras/nodes_hunyuan.py
+++ b/comfy_extras/nodes_hunyuan.py
@@ -128,6 +128,28 @@ class EmptyHunyuanImageLatent:
        latent = torch.zeros([batch_size, 64, height // 32, width // 32], device=comfy.model_management.intermediate_device())
        return ({"samples":latent}, )

+class HunyuanRefinerLatent:
+    # Tags both conditionings with the input latent and a noise level, and emits a zero latent with 32 channels.
+    RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT")
+    RETURN_NAMES = ("positive", "negative", "latent")
+    FUNCTION = "execute"
+
+    @classmethod
+    def INPUT_TYPES(s):
+        return {"required": {
+            "positive": ("CONDITIONING", ),
+            "negative": ("CONDITIONING", ),
+            "latent": ("LATENT", ),
+            "noise_augmentation": ("FLOAT", {"default": 0.10, "min": 0.0, "max": 1.0, "step": 0.01}),
+        }}
+
+    def execute(self, positive, negative, latent, noise_augmentation):
+        samples = latent["samples"]
+        extra = {"concat_latent_image": samples, "noise_augmentation": noise_augmentation}
+        positive, negative = (node_helpers.conditioning_set_values(c, dict(extra)) for c in (positive, negative))
+        zeros = torch.zeros([samples.shape[0], 32, samples.shape[-3], samples.shape[-2], samples.shape[-1]], device=comfy.model_management.intermediate_device())
+        return (positive, negative, {"samples": zeros})
+

 NODE_CLASS_MAPPINGS = {
    "CLIPTextEncodeHunyuanDiT": CLIPTextEncodeHunyuanDiT,
@@ -135,4 +157,5 @@ NODE_CLASS_MAPPINGS = {
    "EmptyHunyuanLatentVideo": EmptyHunyuanLatentVideo,
    "HunyuanImageToVideo": HunyuanImageToVideo,
    "EmptyHunyuanImageLatent": EmptyHunyuanImageLatent,
+    "HunyuanRefinerLatent": HunyuanRefinerLatent,
 }
--- a/comfy_extras/nodes_wan.py
+++ b/comfy_extras/nodes_wan.py
@@ -1015,6 +1015,103 @@ class WanSoundImageToVideoExtend(io.ComfyNode):
        return io.NodeOutput(positive, negative, out_latent)


+def get_audio_emb_window(audio_emb, frame_num, frame0_idx, audio_shift=2):
+    # Builds 1 + (frame_num - 1) // 4 windows of rows taken along dim 0 of `audio_emb`; out-of-range rows are zeros.
+    feat_shape = (audio_emb.shape[1], audio_emb.shape[2])
+    blank = torch.zeros(feat_shape, dtype=audio_emb.dtype, device=audio_emb.device)
+    lead_pad = torch.zeros((3, *feat_shape), dtype=audio_emb.dtype, device=audio_emb.device)
+    total = audio_emb.shape[0]
+
+    def gather(lo, hi):
+        # Stack rows lo..hi-1, using the zero row wherever the index falls outside audio_emb.
+        return torch.stack([audio_emb[j] if 0 <= j < total else blank for j in range(lo, hi)], dim=0)
+
+    windows = []
+    for step in range(1 + (frame_num - 1) // 4):
+        if step > 0:
+            # Later windows span 4 indices, widened by `audio_shift` on both sides.
+            first = frame0_idx + 1 + 4 * (step - 1) - audio_shift
+            last = frame0_idx + 1 + 4 * step + audio_shift
+            windows.append(gather(first, last))
+        else:
+            # First window: indices frame0_idx-2 .. frame0_idx+2, preceded by 3 zero rows.
+            first, last = frame0_idx - 2, frame0_idx + 3
+            windows.append(torch.cat((lead_pad, gather(first, last)), dim=0))
+    stacked = torch.stack(windows, dim=0)
+
+    return stacked, last - audio_shift
+
+
+class WanHuMoImageToVideo(io.ComfyNode):  # Adds reference-latent and audio-embedding values to a conditioning pair.
+    @classmethod
+    def define_schema(cls):  # Declares this node's id, category, inputs and outputs as an io.Schema.
+        return io.Schema(
+            node_id="WanHuMoImageToVideo",
+            category="conditioning/video_models",
+            inputs=[
+                io.Conditioning.Input("positive"),
+                io.Conditioning.Input("negative"),
+                io.Vae.Input("vae"),
+                io.Int.Input("width", default=832, min=16, max=nodes.MAX_RESOLUTION, step=16),
+                io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16),
+                io.Int.Input("length", default=97, min=1, max=nodes.MAX_RESOLUTION, step=4),
+                io.Int.Input("batch_size", default=1, min=1, max=4096),
+                io.AudioEncoderOutput.Input("audio_encoder_output", optional=True),
+                io.Image.Input("ref_image", optional=True),
+            ],
+            outputs=[
+                io.Conditioning.Output(display_name="positive"),
+                io.Conditioning.Output(display_name="negative"),
+                io.Latent.Output(display_name="latent"),
+            ],
+            is_experimental=True,
+        )
+
+    @classmethod
+    def execute(cls, positive, negative, vae, width, height, length, batch_size, ref_image=None, audio_encoder_output=None) -> io.NodeOutput:
+        # Returns (positive, negative, latent): both conditionings gain "reference_latents" and "audio_embed"; latent is all zeros.
+        latent_t = ((length - 1) // 4) + 1
+        latent = torch.zeros([batch_size, 16, latent_t, height // 8, width // 8], device=comfy.model_management.intermediate_device())
+
+        if ref_image is not None:
+            ref_image = comfy.utils.common_upscale(ref_image[:1].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)
+            ref_latent = vae.encode(ref_image[:, :, :, :3])
+            positive = node_helpers.conditioning_set_values(positive, {"reference_latents": [ref_latent]}, append=True)
+            negative = node_helpers.conditioning_set_values(negative, {"reference_latents": [torch.zeros_like(ref_latent)]}, append=True)
+        else:
+            # Bind ref_latent on this path too: the audio branch below reads ref_latent.shape[2] even when no image was given.
+            ref_latent = torch.zeros([batch_size, 16, 1, height // 8, width // 8], device=comfy.model_management.intermediate_device())
+            positive = node_helpers.conditioning_set_values(positive, {"reference_latents": [ref_latent]}, append=True)
+            negative = node_helpers.conditioning_set_values(negative, {"reference_latents": [ref_latent]}, append=True)
+
+        if audio_encoder_output is not None:
+            audio_emb = torch.stack(audio_encoder_output["encoded_audio_all_layers"], dim=2)
+            audio_len = audio_encoder_output["audio_samples"] // 640
+            audio_emb = audio_emb[:, :audio_len * 2]
+
+            feat0 = linear_interpolation(audio_emb[:, :, 0: 8].mean(dim=2), 50, 25)  # layers are averaged in groups of 8 along dim 2
+            feat1 = linear_interpolation(audio_emb[:, :, 8: 16].mean(dim=2), 50, 25)
+            feat2 = linear_interpolation(audio_emb[:, :, 16: 24].mean(dim=2), 50, 25)
+            feat3 = linear_interpolation(audio_emb[:, :, 24: 32].mean(dim=2), 50, 25)
+            feat4 = linear_interpolation(audio_emb[:, :, 32], 50, 25)  # index 32 is used on its own
+            audio_emb = torch.stack([feat0, feat1, feat2, feat3, feat4], dim=2)[0]  # [T, 5, 1280]
+            audio_emb, _ = get_audio_emb_window(audio_emb, length, frame0_idx=0)
+
+            # pad for ref latent: one zero window per reference-latent frame (ref_latent.shape[2])
+            zero_audio_pad = torch.zeros(ref_latent.shape[2], *audio_emb.shape[1:], device=audio_emb.device, dtype=audio_emb.dtype)
+            audio_emb = torch.cat([audio_emb, zero_audio_pad], dim=0)
+
+            audio_emb = audio_emb.unsqueeze(0)
+            audio_emb_neg = torch.zeros_like(audio_emb)  # negative conditioning gets an all-zero embedding of the same shape
+            positive = node_helpers.conditioning_set_values(positive, {"audio_embed": audio_emb})
+            negative = node_helpers.conditioning_set_values(negative, {"audio_embed": audio_emb_neg})
+        else:
+            zero_audio = torch.zeros([batch_size, latent_t + 1, 8, 5, 1280], device=comfy.model_management.intermediate_device())
+            positive = node_helpers.conditioning_set_values(positive, {"audio_embed": zero_audio})
+            negative = node_helpers.conditioning_set_values(negative, {"audio_embed": zero_audio})
+
+        return io.NodeOutput(positive, negative, {"samples": latent})
+
 class Wan22ImageToVideoLatent(io.ComfyNode):
    @classmethod
    def define_schema(cls):
@@ -1075,6 +1172,7 @@ class WanExtension(ComfyExtension):
            WanPhantomSubjectToVideo,
            WanSoundImageToVideo,
            WanSoundImageToVideoExtend,
+            WanHuMoImageToVideo,
            Wan22ImageToVideoLatent,
        ]