mirror of
https://github.com/comfyanonymous/ComfyUI.git
synced 2026-03-06 13:50:21 +00:00
Merge branch 'master' into worksplit-multigpu
This commit is contained in:
@@ -2,12 +2,12 @@ import nodes
|
||||
import torch
|
||||
import numpy as np
|
||||
from einops import rearrange
|
||||
from typing_extensions import override
|
||||
import comfy.model_management
|
||||
|
||||
from comfy_api.latest import ComfyExtension, io
|
||||
|
||||
|
||||
MAX_RESOLUTION = nodes.MAX_RESOLUTION
|
||||
|
||||
CAMERA_DICT = {
|
||||
"base_T_norm": 1.5,
|
||||
"base_angle": np.pi/3,
|
||||
@@ -148,32 +148,47 @@ def get_camera_motion(angle, T, speed, n=81):
|
||||
RT = np.stack(RT)
|
||||
return RT
|
||||
|
||||
class WanCameraEmbedding:
|
||||
class WanCameraEmbedding(io.ComfyNode):
|
||||
@classmethod
|
||||
def INPUT_TYPES(cls):
|
||||
return {
|
||||
"required": {
|
||||
"camera_pose":(["Static","Pan Up","Pan Down","Pan Left","Pan Right","Zoom In","Zoom Out","Anti Clockwise (ACW)", "ClockWise (CW)"],{"default":"Static"}),
|
||||
"width": ("INT", {"default": 832, "min": 16, "max": MAX_RESOLUTION, "step": 16}),
|
||||
"height": ("INT", {"default": 480, "min": 16, "max": MAX_RESOLUTION, "step": 16}),
|
||||
"length": ("INT", {"default": 81, "min": 1, "max": MAX_RESOLUTION, "step": 4}),
|
||||
},
|
||||
"optional":{
|
||||
"speed":("FLOAT",{"default":1.0, "min": 0, "max": 10.0, "step": 0.1}),
|
||||
"fx":("FLOAT",{"default":0.5, "min": 0, "max": 1, "step": 0.000000001}),
|
||||
"fy":("FLOAT",{"default":0.5, "min": 0, "max": 1, "step": 0.000000001}),
|
||||
"cx":("FLOAT",{"default":0.5, "min": 0, "max": 1, "step": 0.01}),
|
||||
"cy":("FLOAT",{"default":0.5, "min": 0, "max": 1, "step": 0.01}),
|
||||
}
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="WanCameraEmbedding",
|
||||
category="camera",
|
||||
inputs=[
|
||||
io.Combo.Input(
|
||||
"camera_pose",
|
||||
options=[
|
||||
"Static",
|
||||
"Pan Up",
|
||||
"Pan Down",
|
||||
"Pan Left",
|
||||
"Pan Right",
|
||||
"Zoom In",
|
||||
"Zoom Out",
|
||||
"Anti Clockwise (ACW)",
|
||||
"ClockWise (CW)",
|
||||
],
|
||||
default="Static",
|
||||
),
|
||||
io.Int.Input("width", default=832, min=16, max=nodes.MAX_RESOLUTION, step=16),
|
||||
io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16),
|
||||
io.Int.Input("length", default=81, min=1, max=nodes.MAX_RESOLUTION, step=4),
|
||||
io.Float.Input("speed", default=1.0, min=0, max=10.0, step=0.1, optional=True),
|
||||
io.Float.Input("fx", default=0.5, min=0, max=1, step=0.000000001, optional=True),
|
||||
io.Float.Input("fy", default=0.5, min=0, max=1, step=0.000000001, optional=True),
|
||||
io.Float.Input("cx", default=0.5, min=0, max=1, step=0.01, optional=True),
|
||||
io.Float.Input("cy", default=0.5, min=0, max=1, step=0.01, optional=True),
|
||||
],
|
||||
outputs=[
|
||||
io.WanCameraEmbedding.Output(display_name="camera_embedding"),
|
||||
io.Int.Output(display_name="width"),
|
||||
io.Int.Output(display_name="height"),
|
||||
io.Int.Output(display_name="length"),
|
||||
],
|
||||
)
|
||||
|
||||
}
|
||||
|
||||
RETURN_TYPES = ("WAN_CAMERA_EMBEDDING","INT","INT","INT")
|
||||
RETURN_NAMES = ("camera_embedding","width","height","length")
|
||||
FUNCTION = "run"
|
||||
CATEGORY = "camera"
|
||||
|
||||
def run(self, camera_pose, width, height, length, speed=1.0, fx=0.5, fy=0.5, cx=0.5, cy=0.5):
|
||||
@classmethod
|
||||
def execute(cls, camera_pose, width, height, length, speed=1.0, fx=0.5, fy=0.5, cx=0.5, cy=0.5) -> io.NodeOutput:
|
||||
"""
|
||||
Use Camera trajectory as extrinsic parameters to calculate Plücker embeddings (Sitzmannet al., 2021)
|
||||
Adapted from https://github.com/aigc-apps/VideoX-Fun/blob/main/comfyui/comfyui_nodes.py
|
||||
@@ -210,9 +225,15 @@ class WanCameraEmbedding:
|
||||
control_camera_video = control_camera_video.contiguous().view(b, f // 4, 4, c, h, w).transpose(2, 3)
|
||||
control_camera_video = control_camera_video.contiguous().view(b, f // 4, c * 4, h, w).transpose(1, 2)
|
||||
|
||||
return (control_camera_video, width, height, length)
|
||||
return io.NodeOutput(control_camera_video, width, height, length)
|
||||
|
||||
|
||||
NODE_CLASS_MAPPINGS = {
|
||||
"WanCameraEmbedding": WanCameraEmbedding,
|
||||
}
|
||||
class CameraTrajectoryExtension(ComfyExtension):
|
||||
@override
|
||||
async def get_node_list(self) -> list[type[io.ComfyNode]]:
|
||||
return [
|
||||
WanCameraEmbedding,
|
||||
]
|
||||
|
||||
async def comfy_entrypoint() -> CameraTrajectoryExtension:
|
||||
return CameraTrajectoryExtension()
|
||||
|
||||
@@ -1,25 +1,41 @@
|
||||
from kornia.filters import canny
|
||||
from typing_extensions import override
|
||||
|
||||
import comfy.model_management
|
||||
from comfy_api.latest import ComfyExtension, io
|
||||
|
||||
|
||||
class Canny:
|
||||
class Canny(io.ComfyNode):
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
return {"required": {"image": ("IMAGE",),
|
||||
"low_threshold": ("FLOAT", {"default": 0.4, "min": 0.01, "max": 0.99, "step": 0.01}),
|
||||
"high_threshold": ("FLOAT", {"default": 0.8, "min": 0.01, "max": 0.99, "step": 0.01})
|
||||
}}
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="Canny",
|
||||
category="image/preprocessors",
|
||||
inputs=[
|
||||
io.Image.Input("image"),
|
||||
io.Float.Input("low_threshold", default=0.4, min=0.01, max=0.99, step=0.01),
|
||||
io.Float.Input("high_threshold", default=0.8, min=0.01, max=0.99, step=0.01),
|
||||
],
|
||||
outputs=[io.Image.Output()],
|
||||
)
|
||||
|
||||
RETURN_TYPES = ("IMAGE",)
|
||||
FUNCTION = "detect_edge"
|
||||
@classmethod
|
||||
def detect_edge(cls, image, low_threshold, high_threshold):
|
||||
# Deprecated: use the V3 schema's `execute` method instead of this.
|
||||
return cls.execute(image, low_threshold, high_threshold)
|
||||
|
||||
CATEGORY = "image/preprocessors"
|
||||
|
||||
def detect_edge(self, image, low_threshold, high_threshold):
|
||||
@classmethod
|
||||
def execute(cls, image, low_threshold, high_threshold) -> io.NodeOutput:
|
||||
output = canny(image.to(comfy.model_management.get_torch_device()).movedim(-1, 1), low_threshold, high_threshold)
|
||||
img_out = output[1].to(comfy.model_management.intermediate_device()).repeat(1, 3, 1, 1).movedim(1, -1)
|
||||
return (img_out,)
|
||||
return io.NodeOutput(img_out)
|
||||
|
||||
NODE_CLASS_MAPPINGS = {
|
||||
"Canny": Canny,
|
||||
}
|
||||
|
||||
class CannyExtension(ComfyExtension):
|
||||
@override
|
||||
async def get_node_list(self) -> list[type[io.ComfyNode]]:
|
||||
return [Canny]
|
||||
|
||||
|
||||
async def comfy_entrypoint() -> CannyExtension:
|
||||
return CannyExtension()
|
||||
|
||||
@@ -1,5 +1,10 @@
|
||||
from typing_extensions import override
|
||||
|
||||
import torch
|
||||
|
||||
from comfy_api.latest import ComfyExtension, io
|
||||
|
||||
|
||||
# https://github.com/WeichenFan/CFG-Zero-star
|
||||
def optimized_scale(positive, negative):
|
||||
positive_flat = positive.reshape(positive.shape[0], -1)
|
||||
@@ -16,17 +21,20 @@ def optimized_scale(positive, negative):
|
||||
|
||||
return st_star.reshape([positive.shape[0]] + [1] * (positive.ndim - 1))
|
||||
|
||||
class CFGZeroStar:
|
||||
class CFGZeroStar(io.ComfyNode):
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
return {"required": {"model": ("MODEL",),
|
||||
}}
|
||||
RETURN_TYPES = ("MODEL",)
|
||||
RETURN_NAMES = ("patched_model",)
|
||||
FUNCTION = "patch"
|
||||
CATEGORY = "advanced/guidance"
|
||||
def define_schema(cls) -> io.Schema:
|
||||
return io.Schema(
|
||||
node_id="CFGZeroStar",
|
||||
category="advanced/guidance",
|
||||
inputs=[
|
||||
io.Model.Input("model"),
|
||||
],
|
||||
outputs=[io.Model.Output(display_name="patched_model")],
|
||||
)
|
||||
|
||||
def patch(self, model):
|
||||
@classmethod
|
||||
def execute(cls, model) -> io.NodeOutput:
|
||||
m = model.clone()
|
||||
def cfg_zero_star(args):
|
||||
guidance_scale = args['cond_scale']
|
||||
@@ -38,21 +46,24 @@ class CFGZeroStar:
|
||||
|
||||
return out + uncond_p * (alpha - 1.0) + guidance_scale * uncond_p * (1.0 - alpha)
|
||||
m.set_model_sampler_post_cfg_function(cfg_zero_star)
|
||||
return (m, )
|
||||
return io.NodeOutput(m)
|
||||
|
||||
class CFGNorm:
|
||||
class CFGNorm(io.ComfyNode):
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
return {"required": {"model": ("MODEL",),
|
||||
"strength": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 100.0, "step": 0.01}),
|
||||
}}
|
||||
RETURN_TYPES = ("MODEL",)
|
||||
RETURN_NAMES = ("patched_model",)
|
||||
FUNCTION = "patch"
|
||||
CATEGORY = "advanced/guidance"
|
||||
EXPERIMENTAL = True
|
||||
def define_schema(cls) -> io.Schema:
|
||||
return io.Schema(
|
||||
node_id="CFGNorm",
|
||||
category="advanced/guidance",
|
||||
inputs=[
|
||||
io.Model.Input("model"),
|
||||
io.Float.Input("strength", default=1.0, min=0.0, max=100.0, step=0.01),
|
||||
],
|
||||
outputs=[io.Model.Output(display_name="patched_model")],
|
||||
is_experimental=True,
|
||||
)
|
||||
|
||||
def patch(self, model, strength):
|
||||
@classmethod
|
||||
def execute(cls, model, strength) -> io.NodeOutput:
|
||||
m = model.clone()
|
||||
def cfg_norm(args):
|
||||
cond_p = args['cond_denoised']
|
||||
@@ -64,9 +75,17 @@ class CFGNorm:
|
||||
return pred_text_ * scale * strength
|
||||
|
||||
m.set_model_sampler_post_cfg_function(cfg_norm)
|
||||
return (m, )
|
||||
return io.NodeOutput(m)
|
||||
|
||||
NODE_CLASS_MAPPINGS = {
|
||||
"CFGZeroStar": CFGZeroStar,
|
||||
"CFGNorm": CFGNorm,
|
||||
}
|
||||
|
||||
class CfgExtension(ComfyExtension):
|
||||
@override
|
||||
async def get_node_list(self) -> list[type[io.ComfyNode]]:
|
||||
return [
|
||||
CFGZeroStar,
|
||||
CFGNorm,
|
||||
]
|
||||
|
||||
|
||||
async def comfy_entrypoint() -> CfgExtension:
|
||||
return CfgExtension()
|
||||
|
||||
114
comfy_extras/nodes_chroma_radiance.py
Normal file
114
comfy_extras/nodes_chroma_radiance.py
Normal file
@@ -0,0 +1,114 @@
|
||||
from typing_extensions import override
|
||||
from typing import Callable
|
||||
|
||||
import torch
|
||||
|
||||
import comfy.model_management
|
||||
from comfy_api.latest import ComfyExtension, io
|
||||
|
||||
import nodes
|
||||
|
||||
class EmptyChromaRadianceLatentImage(io.ComfyNode):
|
||||
@classmethod
|
||||
def define_schema(cls) -> io.Schema:
|
||||
return io.Schema(
|
||||
node_id="EmptyChromaRadianceLatentImage",
|
||||
category="latent/chroma_radiance",
|
||||
inputs=[
|
||||
io.Int.Input(id="width", default=1024, min=16, max=nodes.MAX_RESOLUTION, step=16),
|
||||
io.Int.Input(id="height", default=1024, min=16, max=nodes.MAX_RESOLUTION, step=16),
|
||||
io.Int.Input(id="batch_size", default=1, min=1, max=4096),
|
||||
],
|
||||
outputs=[io.Latent().Output()],
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def execute(cls, *, width: int, height: int, batch_size: int=1) -> io.NodeOutput:
|
||||
latent = torch.zeros((batch_size, 3, height, width), device=comfy.model_management.intermediate_device())
|
||||
return io.NodeOutput({"samples":latent})
|
||||
|
||||
|
||||
class ChromaRadianceOptions(io.ComfyNode):
|
||||
@classmethod
|
||||
def define_schema(cls) -> io.Schema:
|
||||
return io.Schema(
|
||||
node_id="ChromaRadianceOptions",
|
||||
category="model_patches/chroma_radiance",
|
||||
description="Allows setting advanced options for the Chroma Radiance model.",
|
||||
inputs=[
|
||||
io.Model.Input(id="model"),
|
||||
io.Boolean.Input(
|
||||
id="preserve_wrapper",
|
||||
default=True,
|
||||
tooltip="When enabled, will delegate to an existing model function wrapper if it exists. Generally should be left enabled.",
|
||||
),
|
||||
io.Float.Input(
|
||||
id="start_sigma",
|
||||
default=1.0,
|
||||
min=0.0,
|
||||
max=1.0,
|
||||
tooltip="First sigma that these options will be in effect.",
|
||||
),
|
||||
io.Float.Input(
|
||||
id="end_sigma",
|
||||
default=0.0,
|
||||
min=0.0,
|
||||
max=1.0,
|
||||
tooltip="Last sigma that these options will be in effect.",
|
||||
),
|
||||
io.Int.Input(
|
||||
id="nerf_tile_size",
|
||||
default=-1,
|
||||
min=-1,
|
||||
tooltip="Allows overriding the default NeRF tile size. -1 means use the default (32). 0 means use non-tiling mode (may require a lot of VRAM).",
|
||||
),
|
||||
],
|
||||
outputs=[io.Model.Output()],
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def execute(
|
||||
cls,
|
||||
*,
|
||||
model: io.Model.Type,
|
||||
preserve_wrapper: bool,
|
||||
start_sigma: float,
|
||||
end_sigma: float,
|
||||
nerf_tile_size: int,
|
||||
) -> io.NodeOutput:
|
||||
radiance_options = {}
|
||||
if nerf_tile_size >= 0:
|
||||
radiance_options["nerf_tile_size"] = nerf_tile_size
|
||||
|
||||
if not radiance_options:
|
||||
return io.NodeOutput(model)
|
||||
|
||||
old_wrapper = model.model_options.get("model_function_wrapper")
|
||||
|
||||
def model_function_wrapper(apply_model: Callable, args: dict) -> torch.Tensor:
|
||||
c = args["c"].copy()
|
||||
sigma = args["timestep"].max().detach().cpu().item()
|
||||
if end_sigma <= sigma <= start_sigma:
|
||||
transformer_options = c.get("transformer_options", {}).copy()
|
||||
transformer_options["chroma_radiance_options"] = radiance_options.copy()
|
||||
c["transformer_options"] = transformer_options
|
||||
if not (preserve_wrapper and old_wrapper):
|
||||
return apply_model(args["input"], args["timestep"], **c)
|
||||
return old_wrapper(apply_model, args | {"c": c})
|
||||
|
||||
model = model.clone()
|
||||
model.set_model_unet_function_wrapper(model_function_wrapper)
|
||||
return io.NodeOutput(model)
|
||||
|
||||
|
||||
class ChromaRadianceExtension(ComfyExtension):
|
||||
@override
|
||||
async def get_node_list(self) -> list[type[io.ComfyNode]]:
|
||||
return [
|
||||
EmptyChromaRadianceLatentImage,
|
||||
ChromaRadianceOptions,
|
||||
]
|
||||
|
||||
|
||||
async def comfy_entrypoint() -> ChromaRadianceExtension:
|
||||
return ChromaRadianceExtension()
|
||||
@@ -1,15 +1,25 @@
|
||||
from typing_extensions import override
|
||||
|
||||
from comfy_api.latest import ComfyExtension, io
|
||||
|
||||
|
||||
class CLIPTextEncodeControlnet:
|
||||
class CLIPTextEncodeControlnet(io.ComfyNode):
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
return {"required": {"clip": ("CLIP", ), "conditioning": ("CONDITIONING", ), "text": ("STRING", {"multiline": True, "dynamicPrompts": True})}}
|
||||
RETURN_TYPES = ("CONDITIONING",)
|
||||
FUNCTION = "encode"
|
||||
def define_schema(cls) -> io.Schema:
|
||||
return io.Schema(
|
||||
node_id="CLIPTextEncodeControlnet",
|
||||
category="_for_testing/conditioning",
|
||||
inputs=[
|
||||
io.Clip.Input("clip"),
|
||||
io.Conditioning.Input("conditioning"),
|
||||
io.String.Input("text", multiline=True, dynamic_prompts=True),
|
||||
],
|
||||
outputs=[io.Conditioning.Output()],
|
||||
is_experimental=True,
|
||||
)
|
||||
|
||||
CATEGORY = "_for_testing/conditioning"
|
||||
|
||||
def encode(self, clip, conditioning, text):
|
||||
@classmethod
|
||||
def execute(cls, clip, conditioning, text) -> io.NodeOutput:
|
||||
tokens = clip.tokenize(text)
|
||||
cond, pooled = clip.encode_from_tokens(tokens, return_pooled=True)
|
||||
c = []
|
||||
@@ -18,32 +28,41 @@ class CLIPTextEncodeControlnet:
|
||||
n[1]['cross_attn_controlnet'] = cond
|
||||
n[1]['pooled_output_controlnet'] = pooled
|
||||
c.append(n)
|
||||
return (c, )
|
||||
return io.NodeOutput(c)
|
||||
|
||||
class T5TokenizerOptions:
|
||||
class T5TokenizerOptions(io.ComfyNode):
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
return {
|
||||
"required": {
|
||||
"clip": ("CLIP", ),
|
||||
"min_padding": ("INT", {"default": 0, "min": 0, "max": 10000, "step": 1}),
|
||||
"min_length": ("INT", {"default": 0, "min": 0, "max": 10000, "step": 1}),
|
||||
}
|
||||
}
|
||||
def define_schema(cls) -> io.Schema:
|
||||
return io.Schema(
|
||||
node_id="T5TokenizerOptions",
|
||||
category="_for_testing/conditioning",
|
||||
inputs=[
|
||||
io.Clip.Input("clip"),
|
||||
io.Int.Input("min_padding", default=0, min=0, max=10000, step=1),
|
||||
io.Int.Input("min_length", default=0, min=0, max=10000, step=1),
|
||||
],
|
||||
outputs=[io.Clip.Output()],
|
||||
is_experimental=True,
|
||||
)
|
||||
|
||||
CATEGORY = "_for_testing/conditioning"
|
||||
RETURN_TYPES = ("CLIP",)
|
||||
FUNCTION = "set_options"
|
||||
|
||||
def set_options(self, clip, min_padding, min_length):
|
||||
@classmethod
|
||||
def execute(cls, clip, min_padding, min_length) -> io.NodeOutput:
|
||||
clip = clip.clone()
|
||||
for t5_type in ["t5xxl", "pile_t5xl", "t5base", "mt5xl", "umt5xxl"]:
|
||||
clip.set_tokenizer_option("{}_min_padding".format(t5_type), min_padding)
|
||||
clip.set_tokenizer_option("{}_min_length".format(t5_type), min_length)
|
||||
|
||||
return (clip, )
|
||||
return io.NodeOutput(clip)
|
||||
|
||||
NODE_CLASS_MAPPINGS = {
|
||||
"CLIPTextEncodeControlnet": CLIPTextEncodeControlnet,
|
||||
"T5TokenizerOptions": T5TokenizerOptions,
|
||||
}
|
||||
|
||||
class CondExtension(ComfyExtension):
|
||||
@override
|
||||
async def get_node_list(self) -> list[type[io.ComfyNode]]:
|
||||
return [
|
||||
CLIPTextEncodeControlnet,
|
||||
T5TokenizerOptions,
|
||||
]
|
||||
|
||||
|
||||
async def comfy_entrypoint() -> CondExtension:
|
||||
return CondExtension()
|
||||
|
||||
@@ -1,25 +1,32 @@
|
||||
from typing_extensions import override
|
||||
import nodes
|
||||
import torch
|
||||
import comfy.model_management
|
||||
import comfy.utils
|
||||
import comfy.latent_formats
|
||||
|
||||
from comfy_api.latest import ComfyExtension, io
|
||||
|
||||
class EmptyCosmosLatentVideo:
|
||||
|
||||
class EmptyCosmosLatentVideo(io.ComfyNode):
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
return {"required": { "width": ("INT", {"default": 1280, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
|
||||
"height": ("INT", {"default": 704, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
|
||||
"length": ("INT", {"default": 121, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 8}),
|
||||
"batch_size": ("INT", {"default": 1, "min": 1, "max": 4096})}}
|
||||
RETURN_TYPES = ("LATENT",)
|
||||
FUNCTION = "generate"
|
||||
def define_schema(cls) -> io.Schema:
|
||||
return io.Schema(
|
||||
node_id="EmptyCosmosLatentVideo",
|
||||
category="latent/video",
|
||||
inputs=[
|
||||
io.Int.Input("width", default=1280, min=16, max=nodes.MAX_RESOLUTION, step=16),
|
||||
io.Int.Input("height", default=704, min=16, max=nodes.MAX_RESOLUTION, step=16),
|
||||
io.Int.Input("length", default=121, min=1, max=nodes.MAX_RESOLUTION, step=8),
|
||||
io.Int.Input("batch_size", default=1, min=1, max=4096),
|
||||
],
|
||||
outputs=[io.Latent.Output()],
|
||||
)
|
||||
|
||||
CATEGORY = "latent/video"
|
||||
|
||||
def generate(self, width, height, length, batch_size=1):
|
||||
@classmethod
|
||||
def execute(cls, width, height, length, batch_size=1) -> io.NodeOutput:
|
||||
latent = torch.zeros([batch_size, 16, ((length - 1) // 8) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device())
|
||||
return ({"samples": latent}, )
|
||||
return io.NodeOutput({"samples": latent})
|
||||
|
||||
|
||||
def vae_encode_with_padding(vae, image, width, height, length, padding=0):
|
||||
@@ -33,31 +40,31 @@ def vae_encode_with_padding(vae, image, width, height, length, padding=0):
|
||||
return latent_temp[:, :, :latent_len]
|
||||
|
||||
|
||||
class CosmosImageToVideoLatent:
|
||||
class CosmosImageToVideoLatent(io.ComfyNode):
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
return {"required": {"vae": ("VAE", ),
|
||||
"width": ("INT", {"default": 1280, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
|
||||
"height": ("INT", {"default": 704, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
|
||||
"length": ("INT", {"default": 121, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 8}),
|
||||
"batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}),
|
||||
},
|
||||
"optional": {"start_image": ("IMAGE", ),
|
||||
"end_image": ("IMAGE", ),
|
||||
}}
|
||||
def define_schema(cls) -> io.Schema:
|
||||
return io.Schema(
|
||||
node_id="CosmosImageToVideoLatent",
|
||||
category="conditioning/inpaint",
|
||||
inputs=[
|
||||
io.Vae.Input("vae"),
|
||||
io.Int.Input("width", default=1280, min=16, max=nodes.MAX_RESOLUTION, step=16),
|
||||
io.Int.Input("height", default=704, min=16, max=nodes.MAX_RESOLUTION, step=16),
|
||||
io.Int.Input("length", default=121, min=1, max=nodes.MAX_RESOLUTION, step=8),
|
||||
io.Int.Input("batch_size", default=1, min=1, max=4096),
|
||||
io.Image.Input("start_image", optional=True),
|
||||
io.Image.Input("end_image", optional=True),
|
||||
],
|
||||
outputs=[io.Latent.Output()],
|
||||
)
|
||||
|
||||
|
||||
RETURN_TYPES = ("LATENT",)
|
||||
FUNCTION = "encode"
|
||||
|
||||
CATEGORY = "conditioning/inpaint"
|
||||
|
||||
def encode(self, vae, width, height, length, batch_size, start_image=None, end_image=None):
|
||||
@classmethod
|
||||
def execute(cls, vae, width, height, length, batch_size, start_image=None, end_image=None) -> io.NodeOutput:
|
||||
latent = torch.zeros([1, 16, ((length - 1) // 8) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device())
|
||||
if start_image is None and end_image is None:
|
||||
out_latent = {}
|
||||
out_latent["samples"] = latent
|
||||
return (out_latent,)
|
||||
return io.NodeOutput(out_latent)
|
||||
|
||||
mask = torch.ones([latent.shape[0], 1, ((length - 1) // 8) + 1, latent.shape[-2], latent.shape[-1]], device=comfy.model_management.intermediate_device())
|
||||
|
||||
@@ -74,33 +81,33 @@ class CosmosImageToVideoLatent:
|
||||
out_latent = {}
|
||||
out_latent["samples"] = latent.repeat((batch_size, ) + (1,) * (latent.ndim - 1))
|
||||
out_latent["noise_mask"] = mask.repeat((batch_size, ) + (1,) * (mask.ndim - 1))
|
||||
return (out_latent,)
|
||||
return io.NodeOutput(out_latent)
|
||||
|
||||
class CosmosPredict2ImageToVideoLatent:
|
||||
class CosmosPredict2ImageToVideoLatent(io.ComfyNode):
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
return {"required": {"vae": ("VAE", ),
|
||||
"width": ("INT", {"default": 848, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
|
||||
"height": ("INT", {"default": 480, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
|
||||
"length": ("INT", {"default": 93, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 4}),
|
||||
"batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}),
|
||||
},
|
||||
"optional": {"start_image": ("IMAGE", ),
|
||||
"end_image": ("IMAGE", ),
|
||||
}}
|
||||
def define_schema(cls) -> io.Schema:
|
||||
return io.Schema(
|
||||
node_id="CosmosPredict2ImageToVideoLatent",
|
||||
category="conditioning/inpaint",
|
||||
inputs=[
|
||||
io.Vae.Input("vae"),
|
||||
io.Int.Input("width", default=848, min=16, max=nodes.MAX_RESOLUTION, step=16),
|
||||
io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16),
|
||||
io.Int.Input("length", default=93, min=1, max=nodes.MAX_RESOLUTION, step=4),
|
||||
io.Int.Input("batch_size", default=1, min=1, max=4096),
|
||||
io.Image.Input("start_image", optional=True),
|
||||
io.Image.Input("end_image", optional=True),
|
||||
],
|
||||
outputs=[io.Latent.Output()],
|
||||
)
|
||||
|
||||
|
||||
RETURN_TYPES = ("LATENT",)
|
||||
FUNCTION = "encode"
|
||||
|
||||
CATEGORY = "conditioning/inpaint"
|
||||
|
||||
def encode(self, vae, width, height, length, batch_size, start_image=None, end_image=None):
|
||||
@classmethod
|
||||
def execute(cls, vae, width, height, length, batch_size, start_image=None, end_image=None) -> io.NodeOutput:
|
||||
latent = torch.zeros([1, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device())
|
||||
if start_image is None and end_image is None:
|
||||
out_latent = {}
|
||||
out_latent["samples"] = latent
|
||||
return (out_latent,)
|
||||
return io.NodeOutput(out_latent)
|
||||
|
||||
mask = torch.ones([latent.shape[0], 1, ((length - 1) // 4) + 1, latent.shape[-2], latent.shape[-1]], device=comfy.model_management.intermediate_device())
|
||||
|
||||
@@ -119,10 +126,18 @@ class CosmosPredict2ImageToVideoLatent:
|
||||
latent = latent_format.process_out(latent) * mask + latent * (1.0 - mask)
|
||||
out_latent["samples"] = latent.repeat((batch_size, ) + (1,) * (latent.ndim - 1))
|
||||
out_latent["noise_mask"] = mask.repeat((batch_size, ) + (1,) * (mask.ndim - 1))
|
||||
return (out_latent,)
|
||||
return io.NodeOutput(out_latent)
|
||||
|
||||
NODE_CLASS_MAPPINGS = {
|
||||
"EmptyCosmosLatentVideo": EmptyCosmosLatentVideo,
|
||||
"CosmosImageToVideoLatent": CosmosImageToVideoLatent,
|
||||
"CosmosPredict2ImageToVideoLatent": CosmosPredict2ImageToVideoLatent,
|
||||
}
|
||||
|
||||
class CosmosExtension(ComfyExtension):
|
||||
@override
|
||||
async def get_node_list(self) -> list[type[io.ComfyNode]]:
|
||||
return [
|
||||
EmptyCosmosLatentVideo,
|
||||
CosmosImageToVideoLatent,
|
||||
CosmosPredict2ImageToVideoLatent,
|
||||
]
|
||||
|
||||
|
||||
async def comfy_entrypoint() -> CosmosExtension:
|
||||
return CosmosExtension()
|
||||
|
||||
@@ -128,6 +128,28 @@ class EmptyHunyuanImageLatent:
|
||||
latent = torch.zeros([batch_size, 64, height // 32, width // 32], device=comfy.model_management.intermediate_device())
|
||||
return ({"samples":latent}, )
|
||||
|
||||
class HunyuanRefinerLatent:
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
return {"required": {"positive": ("CONDITIONING", ),
|
||||
"negative": ("CONDITIONING", ),
|
||||
"latent": ("LATENT", ),
|
||||
"noise_augmentation": ("FLOAT", {"default": 0.10, "min": 0.0, "max": 1.0, "step": 0.01}),
|
||||
}}
|
||||
|
||||
RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT")
|
||||
RETURN_NAMES = ("positive", "negative", "latent")
|
||||
|
||||
FUNCTION = "execute"
|
||||
|
||||
def execute(self, positive, negative, latent, noise_augmentation):
|
||||
latent = latent["samples"]
|
||||
positive = node_helpers.conditioning_set_values(positive, {"concat_latent_image": latent, "noise_augmentation": noise_augmentation})
|
||||
negative = node_helpers.conditioning_set_values(negative, {"concat_latent_image": latent, "noise_augmentation": noise_augmentation})
|
||||
out_latent = {}
|
||||
out_latent["samples"] = torch.zeros([latent.shape[0], 32, latent.shape[-3], latent.shape[-2], latent.shape[-1]], device=comfy.model_management.intermediate_device())
|
||||
return (positive, negative, out_latent)
|
||||
|
||||
|
||||
NODE_CLASS_MAPPINGS = {
|
||||
"CLIPTextEncodeHunyuanDiT": CLIPTextEncodeHunyuanDiT,
|
||||
@@ -135,4 +157,5 @@ NODE_CLASS_MAPPINGS = {
|
||||
"EmptyHunyuanLatentVideo": EmptyHunyuanLatentVideo,
|
||||
"HunyuanImageToVideo": HunyuanImageToVideo,
|
||||
"EmptyHunyuanImageLatent": EmptyHunyuanImageLatent,
|
||||
"HunyuanRefinerLatent": HunyuanRefinerLatent,
|
||||
}
|
||||
|
||||
@@ -1015,6 +1015,103 @@ class WanSoundImageToVideoExtend(io.ComfyNode):
|
||||
return io.NodeOutput(positive, negative, out_latent)
|
||||
|
||||
|
||||
def get_audio_emb_window(audio_emb, frame_num, frame0_idx, audio_shift=2):
|
||||
zero_audio_embed = torch.zeros((audio_emb.shape[1], audio_emb.shape[2]), dtype=audio_emb.dtype, device=audio_emb.device)
|
||||
zero_audio_embed_3 = torch.zeros((3, audio_emb.shape[1], audio_emb.shape[2]), dtype=audio_emb.dtype, device=audio_emb.device) # device=audio_emb.device
|
||||
iter_ = 1 + (frame_num - 1) // 4
|
||||
audio_emb_wind = []
|
||||
for lt_i in range(iter_):
|
||||
if lt_i == 0:
|
||||
st = frame0_idx + lt_i - 2
|
||||
ed = frame0_idx + lt_i + 3
|
||||
wind_feat = torch.stack([
|
||||
audio_emb[i] if (0 <= i < audio_emb.shape[0]) else zero_audio_embed
|
||||
for i in range(st, ed)
|
||||
], dim=0)
|
||||
wind_feat = torch.cat((zero_audio_embed_3, wind_feat), dim=0)
|
||||
else:
|
||||
st = frame0_idx + 1 + 4 * (lt_i - 1) - audio_shift
|
||||
ed = frame0_idx + 1 + 4 * lt_i + audio_shift
|
||||
wind_feat = torch.stack([
|
||||
audio_emb[i] if (0 <= i < audio_emb.shape[0]) else zero_audio_embed
|
||||
for i in range(st, ed)
|
||||
], dim=0)
|
||||
audio_emb_wind.append(wind_feat)
|
||||
audio_emb_wind = torch.stack(audio_emb_wind, dim=0)
|
||||
|
||||
return audio_emb_wind, ed - audio_shift
|
||||
|
||||
|
||||
class WanHuMoImageToVideo(io.ComfyNode):
|
||||
@classmethod
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="WanHuMoImageToVideo",
|
||||
category="conditioning/video_models",
|
||||
inputs=[
|
||||
io.Conditioning.Input("positive"),
|
||||
io.Conditioning.Input("negative"),
|
||||
io.Vae.Input("vae"),
|
||||
io.Int.Input("width", default=832, min=16, max=nodes.MAX_RESOLUTION, step=16),
|
||||
io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16),
|
||||
io.Int.Input("length", default=97, min=1, max=nodes.MAX_RESOLUTION, step=4),
|
||||
io.Int.Input("batch_size", default=1, min=1, max=4096),
|
||||
io.AudioEncoderOutput.Input("audio_encoder_output", optional=True),
|
||||
io.Image.Input("ref_image", optional=True),
|
||||
],
|
||||
outputs=[
|
||||
io.Conditioning.Output(display_name="positive"),
|
||||
io.Conditioning.Output(display_name="negative"),
|
||||
io.Latent.Output(display_name="latent"),
|
||||
],
|
||||
is_experimental=True,
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def execute(cls, positive, negative, vae, width, height, length, batch_size, ref_image=None, audio_encoder_output=None) -> io.NodeOutput:
|
||||
latent_t = ((length - 1) // 4) + 1
|
||||
latent = torch.zeros([batch_size, 16, latent_t, height // 8, width // 8], device=comfy.model_management.intermediate_device())
|
||||
|
||||
if ref_image is not None:
|
||||
ref_image = comfy.utils.common_upscale(ref_image[:1].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)
|
||||
ref_latent = vae.encode(ref_image[:, :, :, :3])
|
||||
positive = node_helpers.conditioning_set_values(positive, {"reference_latents": [ref_latent]}, append=True)
|
||||
negative = node_helpers.conditioning_set_values(negative, {"reference_latents": [torch.zeros_like(ref_latent)]}, append=True)
|
||||
else:
|
||||
zero_latent = torch.zeros([batch_size, 16, 1, height // 8, width // 8], device=comfy.model_management.intermediate_device())
|
||||
positive = node_helpers.conditioning_set_values(positive, {"reference_latents": [zero_latent]}, append=True)
|
||||
negative = node_helpers.conditioning_set_values(negative, {"reference_latents": [zero_latent]}, append=True)
|
||||
|
||||
if audio_encoder_output is not None:
|
||||
audio_emb = torch.stack(audio_encoder_output["encoded_audio_all_layers"], dim=2)
|
||||
audio_len = audio_encoder_output["audio_samples"] // 640
|
||||
audio_emb = audio_emb[:, :audio_len * 2]
|
||||
|
||||
feat0 = linear_interpolation(audio_emb[:, :, 0: 8].mean(dim=2), 50, 25)
|
||||
feat1 = linear_interpolation(audio_emb[:, :, 8: 16].mean(dim=2), 50, 25)
|
||||
feat2 = linear_interpolation(audio_emb[:, :, 16: 24].mean(dim=2), 50, 25)
|
||||
feat3 = linear_interpolation(audio_emb[:, :, 24: 32].mean(dim=2), 50, 25)
|
||||
feat4 = linear_interpolation(audio_emb[:, :, 32], 50, 25)
|
||||
audio_emb = torch.stack([feat0, feat1, feat2, feat3, feat4], dim=2)[0] # [T, 5, 1280]
|
||||
audio_emb, _ = get_audio_emb_window(audio_emb, length, frame0_idx=0)
|
||||
|
||||
# pad for ref latent
|
||||
zero_audio_pad = torch.zeros(ref_latent.shape[2], *audio_emb.shape[1:], device=audio_emb.device, dtype=audio_emb.dtype)
|
||||
audio_emb = torch.cat([audio_emb, zero_audio_pad], dim=0)
|
||||
|
||||
audio_emb = audio_emb.unsqueeze(0)
|
||||
audio_emb_neg = torch.zeros_like(audio_emb)
|
||||
positive = node_helpers.conditioning_set_values(positive, {"audio_embed": audio_emb})
|
||||
negative = node_helpers.conditioning_set_values(negative, {"audio_embed": audio_emb_neg})
|
||||
else:
|
||||
zero_audio = torch.zeros([batch_size, latent_t + 1, 8, 5, 1280], device=comfy.model_management.intermediate_device())
|
||||
positive = node_helpers.conditioning_set_values(positive, {"audio_embed": zero_audio})
|
||||
negative = node_helpers.conditioning_set_values(negative, {"audio_embed": zero_audio})
|
||||
|
||||
out_latent = {}
|
||||
out_latent["samples"] = latent
|
||||
return io.NodeOutput(positive, negative, out_latent)
|
||||
|
||||
class Wan22ImageToVideoLatent(io.ComfyNode):
|
||||
@classmethod
|
||||
def define_schema(cls):
|
||||
@@ -1075,6 +1172,7 @@ class WanExtension(ComfyExtension):
|
||||
WanPhantomSubjectToVideo,
|
||||
WanSoundImageToVideo,
|
||||
WanSoundImageToVideoExtend,
|
||||
WanHuMoImageToVideo,
|
||||
Wan22ImageToVideoLatent,
|
||||
]
|
||||
|
||||
|
||||
Reference in New Issue
Block a user