mirror of
https://github.com/comfyanonymous/ComfyUI.git
synced 2026-04-22 23:49:05 +00:00
Compare commits
24 Commits
feature/de
...
void-model
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b68042b5c4 | ||
|
|
7609381243 | ||
|
|
6c4b1cebd1 | ||
|
|
fe8906144c | ||
|
|
f4240442b6 | ||
|
|
c176f951e8 | ||
|
|
b4518ea697 | ||
|
|
a0a69c9b34 | ||
|
|
2a45d487d4 | ||
|
|
52156edbee | ||
|
|
dff15d7e5f | ||
|
|
c8a843e240 | ||
|
|
9ca7cdb17e | ||
|
|
e962c3f846 | ||
|
|
3e961f9960 | ||
|
|
174f68885c | ||
|
|
541f26ae23 | ||
|
|
cee57f6827 | ||
|
|
9904f4d73f | ||
|
|
73bd1dd2c8 | ||
|
|
220a044fab | ||
|
|
92571c7fe5 | ||
|
|
6841484cde | ||
|
|
1112d597c8 |
1
.gitignore
vendored
1
.gitignore
vendored
@@ -24,4 +24,3 @@ web_custom_versions/
|
||||
openapi.yaml
|
||||
filtered-openapi.yaml
|
||||
uv.lock
|
||||
.comfy_environment
|
||||
|
||||
@@ -195,9 +195,7 @@ The portable above currently comes with python 3.13 and pytorch cuda 13.0. Updat
|
||||
|
||||
#### Alternative Downloads:
|
||||
|
||||
[Portable for AMD GPUs](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_amd.7z)
|
||||
|
||||
[Experimental portable for Intel GPUs](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_intel.7z)
|
||||
[Experimental portable for AMD GPUs](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_amd.7z)
|
||||
|
||||
[Portable with pytorch cuda 12.6 and python 3.12](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_nvidia_cu126.7z) (Supports Nvidia 10 series and older GPUs).
|
||||
|
||||
|
||||
@@ -67,7 +67,7 @@ class InternalRoutes:
|
||||
(entry for entry in os.scandir(directory) if is_visible_file(entry)),
|
||||
key=lambda entry: -entry.stat().st_mtime
|
||||
)
|
||||
return web.json_response([f"{entry.name} [{directory_type}]" for entry in sorted_files], status=200)
|
||||
return web.json_response([entry.name for entry in sorted_files], status=200)
|
||||
|
||||
|
||||
def get_app(self):
|
||||
|
||||
@@ -1,33 +0,0 @@
|
||||
import logging
|
||||
import os
|
||||
|
||||
import folder_paths
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_DEFAULT_DEPLOY_ENV = "local_git"
|
||||
_ENV_FILENAME = ".comfy_environment"
|
||||
|
||||
_cached_value: str | None = None
|
||||
|
||||
|
||||
def get_deploy_environment() -> str:
|
||||
global _cached_value
|
||||
if _cached_value is not None:
|
||||
return _cached_value
|
||||
|
||||
env_file = os.path.join(folder_paths.base_path, _ENV_FILENAME)
|
||||
try:
|
||||
with open(env_file, encoding="utf-8") as f:
|
||||
first_line = f.readline().strip()
|
||||
value = "".join(c for c in first_line if 32 <= ord(c) < 127)
|
||||
if value:
|
||||
_cached_value = value
|
||||
return _cached_value
|
||||
except FileNotFoundError:
|
||||
pass
|
||||
except Exception as e:
|
||||
logger.error("Failed to read %s: %s", env_file, e)
|
||||
|
||||
_cached_value = _DEFAULT_DEPLOY_ENV
|
||||
return _cached_value
|
||||
@@ -783,3 +783,28 @@ class ZImagePixelSpace(ChromaRadiance):
|
||||
No VAE encoding/decoding — the model operates directly on RGB pixels.
|
||||
"""
|
||||
pass
|
||||
|
||||
class CogVideoX(LatentFormat):
|
||||
"""Latent format for CogVideoX-2b (THUDM/CogVideoX-2b).
|
||||
|
||||
scale_factor matches the vae/config.json scaling_factor for the 2b variant.
|
||||
The 5b-class checkpoints (CogVideoX-5b, CogVideoX-1.5-5B, CogVideoX-Fun-V1.5-*)
|
||||
use a different value; see CogVideoX1_5 below.
|
||||
"""
|
||||
latent_channels = 16
|
||||
latent_dimensions = 3
|
||||
|
||||
def __init__(self):
|
||||
self.scale_factor = 1.15258426
|
||||
|
||||
|
||||
class CogVideoX1_5(CogVideoX):
|
||||
"""Latent format for 5b-class CogVideoX checkpoints.
|
||||
|
||||
Covers THUDM/CogVideoX-5b, THUDM/CogVideoX-1.5-5B, and the CogVideoX-Fun
|
||||
V1.5-5b family (including VOID inpainting). All of these have
|
||||
scaling_factor=0.7 in their vae/config.json. Auto-selected in
|
||||
supported_models.CogVideoX_T2V based on transformer hidden dim.
|
||||
"""
|
||||
def __init__(self):
|
||||
self.scale_factor = 0.7
|
||||
|
||||
573
comfy/ldm/cogvideo/model.py
Normal file
573
comfy/ldm/cogvideo/model.py
Normal file
@@ -0,0 +1,573 @@
|
||||
# CogVideoX 3D Transformer - ported to ComfyUI native ops
|
||||
# Architecture reference: diffusers CogVideoXTransformer3DModel
|
||||
# Style reference: comfy/ldm/wan/model.py
|
||||
|
||||
import math
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
from comfy.ldm.modules.attention import optimized_attention
|
||||
import comfy.patcher_extension
|
||||
import comfy.ldm.common_dit
|
||||
|
||||
|
||||
def _get_1d_rotary_pos_embed(dim, pos, theta=10000.0):
|
||||
"""Returns (cos, sin) each with shape [seq_len, dim].
|
||||
|
||||
Frequencies are computed at dim//2 resolution then repeat_interleaved
|
||||
to full dim, matching CogVideoX's interleaved (real, imag) pair format.
|
||||
"""
|
||||
freqs = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float32, device=pos.device) / dim))
|
||||
angles = torch.outer(pos.float(), freqs.float())
|
||||
cos = angles.cos().repeat_interleave(2, dim=-1).float()
|
||||
sin = angles.sin().repeat_interleave(2, dim=-1).float()
|
||||
return (cos, sin)
|
||||
|
||||
|
||||
def apply_rotary_emb(x, freqs_cos_sin):
|
||||
"""Apply CogVideoX rotary embedding to query or key tensor.
|
||||
|
||||
x: [B, heads, seq_len, head_dim]
|
||||
freqs_cos_sin: (cos, sin) each [seq_len, head_dim//2]
|
||||
|
||||
Uses interleaved pair rotation (same as diffusers CogVideoX/Flux).
|
||||
head_dim is reshaped to (-1, 2) pairs, rotated, then flattened back.
|
||||
"""
|
||||
cos, sin = freqs_cos_sin
|
||||
cos = cos[None, None, :, :].to(x.device)
|
||||
sin = sin[None, None, :, :].to(x.device)
|
||||
|
||||
# Interleaved pairs: [B, H, S, D] -> [B, H, S, D//2, 2] -> (real, imag)
|
||||
x_real, x_imag = x.reshape(*x.shape[:-1], -1, 2).unbind(-1)
|
||||
x_rotated = torch.stack([-x_imag, x_real], dim=-1).flatten(3)
|
||||
|
||||
return (x.float() * cos + x_rotated.float() * sin).to(x.dtype)
|
||||
|
||||
|
||||
def get_timestep_embedding(timesteps, dim, flip_sin_to_cos=True, downscale_freq_shift=0, scale=1, max_period=10000):
|
||||
half = dim // 2
|
||||
freqs = torch.exp(-math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32, device=timesteps.device) / half)
|
||||
args = timesteps[:, None].float() * freqs[None] * scale
|
||||
embedding = torch.cat([torch.sin(args), torch.cos(args)], dim=-1)
|
||||
if flip_sin_to_cos:
|
||||
embedding = torch.cat([embedding[:, half:], embedding[:, :half]], dim=-1)
|
||||
if dim % 2:
|
||||
embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
|
||||
return embedding
|
||||
|
||||
|
||||
def get_3d_sincos_pos_embed(embed_dim, spatial_size, temporal_size, spatial_interpolation_scale=1.0, temporal_interpolation_scale=1.0, device=None):
|
||||
if isinstance(spatial_size, int):
|
||||
spatial_size = (spatial_size, spatial_size)
|
||||
|
||||
grid_w = torch.arange(spatial_size[0], dtype=torch.float32, device=device) / spatial_interpolation_scale
|
||||
grid_h = torch.arange(spatial_size[1], dtype=torch.float32, device=device) / spatial_interpolation_scale
|
||||
grid_t = torch.arange(temporal_size, dtype=torch.float32, device=device) / temporal_interpolation_scale
|
||||
|
||||
grid_t, grid_h, grid_w = torch.meshgrid(grid_t, grid_h, grid_w, indexing="ij")
|
||||
|
||||
embed_dim_spatial = 2 * (embed_dim // 3)
|
||||
embed_dim_temporal = embed_dim // 3
|
||||
|
||||
pos_embed_spatial = _get_2d_sincos_pos_embed(embed_dim_spatial, grid_h, grid_w, device=device)
|
||||
pos_embed_temporal = _get_1d_sincos_pos_embed(embed_dim_temporal, grid_t[:, 0, 0], device=device)
|
||||
|
||||
T, H, W = grid_t.shape
|
||||
pos_embed_temporal = pos_embed_temporal.unsqueeze(1).unsqueeze(1).expand(-1, H, W, -1)
|
||||
pos_embed = torch.cat([pos_embed_temporal, pos_embed_spatial], dim=-1)
|
||||
|
||||
return pos_embed
|
||||
|
||||
|
||||
def _get_2d_sincos_pos_embed(embed_dim, grid_h, grid_w, device=None):
|
||||
T, H, W = grid_h.shape
|
||||
half_dim = embed_dim // 2
|
||||
pos_h = _get_1d_sincos_pos_embed(half_dim, grid_h.reshape(-1), device=device).reshape(T, H, W, half_dim)
|
||||
pos_w = _get_1d_sincos_pos_embed(half_dim, grid_w.reshape(-1), device=device).reshape(T, H, W, half_dim)
|
||||
return torch.cat([pos_h, pos_w], dim=-1)
|
||||
|
||||
|
||||
def _get_1d_sincos_pos_embed(embed_dim, pos, device=None):
|
||||
half = embed_dim // 2
|
||||
freqs = torch.exp(-math.log(10000.0) * torch.arange(start=0, end=half, dtype=torch.float32, device=device) / half)
|
||||
args = pos.float().reshape(-1)[:, None] * freqs[None]
|
||||
embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
|
||||
if embed_dim % 2:
|
||||
embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
|
||||
return embedding
|
||||
|
||||
|
||||
|
||||
class CogVideoXPatchEmbed(nn.Module):
|
||||
def __init__(self, patch_size=2, patch_size_t=None, in_channels=16, dim=1920,
|
||||
text_dim=4096, bias=True, sample_width=90, sample_height=60,
|
||||
sample_frames=49, temporal_compression_ratio=4,
|
||||
max_text_seq_length=226, spatial_interpolation_scale=1.875,
|
||||
temporal_interpolation_scale=1.0, use_positional_embeddings=True,
|
||||
use_learned_positional_embeddings=True,
|
||||
device=None, dtype=None, operations=None):
|
||||
super().__init__()
|
||||
self.patch_size = patch_size
|
||||
self.patch_size_t = patch_size_t
|
||||
self.dim = dim
|
||||
self.sample_height = sample_height
|
||||
self.sample_width = sample_width
|
||||
self.sample_frames = sample_frames
|
||||
self.temporal_compression_ratio = temporal_compression_ratio
|
||||
self.max_text_seq_length = max_text_seq_length
|
||||
self.spatial_interpolation_scale = spatial_interpolation_scale
|
||||
self.temporal_interpolation_scale = temporal_interpolation_scale
|
||||
self.use_positional_embeddings = use_positional_embeddings
|
||||
self.use_learned_positional_embeddings = use_learned_positional_embeddings
|
||||
|
||||
if patch_size_t is None:
|
||||
self.proj = operations.Conv2d(in_channels, dim, kernel_size=patch_size, stride=patch_size, bias=bias, device=device, dtype=dtype)
|
||||
else:
|
||||
self.proj = operations.Linear(in_channels * patch_size * patch_size * patch_size_t, dim, device=device, dtype=dtype)
|
||||
|
||||
self.text_proj = operations.Linear(text_dim, dim, device=device, dtype=dtype)
|
||||
|
||||
if use_positional_embeddings or use_learned_positional_embeddings:
|
||||
persistent = use_learned_positional_embeddings
|
||||
pos_embedding = self._get_positional_embeddings(sample_height, sample_width, sample_frames)
|
||||
self.register_buffer("pos_embedding", pos_embedding, persistent=persistent)
|
||||
|
||||
def _get_positional_embeddings(self, sample_height, sample_width, sample_frames, device=None):
|
||||
post_patch_height = sample_height // self.patch_size
|
||||
post_patch_width = sample_width // self.patch_size
|
||||
post_time_compression_frames = (sample_frames - 1) // self.temporal_compression_ratio + 1
|
||||
if self.patch_size_t is not None:
|
||||
post_time_compression_frames = post_time_compression_frames // self.patch_size_t
|
||||
num_patches = post_patch_height * post_patch_width * post_time_compression_frames
|
||||
|
||||
pos_embedding = get_3d_sincos_pos_embed(
|
||||
self.dim,
|
||||
(post_patch_width, post_patch_height),
|
||||
post_time_compression_frames,
|
||||
self.spatial_interpolation_scale,
|
||||
self.temporal_interpolation_scale,
|
||||
device=device,
|
||||
)
|
||||
pos_embedding = pos_embedding.reshape(-1, self.dim)
|
||||
joint_pos_embedding = pos_embedding.new_zeros(
|
||||
1, self.max_text_seq_length + num_patches, self.dim, requires_grad=False
|
||||
)
|
||||
joint_pos_embedding.data[:, self.max_text_seq_length:].copy_(pos_embedding)
|
||||
return joint_pos_embedding
|
||||
|
||||
def forward(self, text_embeds, image_embeds):
|
||||
input_dtype = text_embeds.dtype
|
||||
text_embeds = self.text_proj(text_embeds.to(self.text_proj.weight.dtype)).to(input_dtype)
|
||||
batch_size, num_frames, channels, height, width = image_embeds.shape
|
||||
|
||||
proj_dtype = self.proj.weight.dtype
|
||||
if self.patch_size_t is None:
|
||||
image_embeds = image_embeds.reshape(-1, channels, height, width)
|
||||
image_embeds = self.proj(image_embeds.to(proj_dtype)).to(input_dtype)
|
||||
image_embeds = image_embeds.view(batch_size, num_frames, *image_embeds.shape[1:])
|
||||
image_embeds = image_embeds.flatten(3).transpose(2, 3)
|
||||
image_embeds = image_embeds.flatten(1, 2)
|
||||
else:
|
||||
p = self.patch_size
|
||||
p_t = self.patch_size_t
|
||||
image_embeds = image_embeds.permute(0, 1, 3, 4, 2)
|
||||
image_embeds = image_embeds.reshape(
|
||||
batch_size, num_frames // p_t, p_t, height // p, p, width // p, p, channels
|
||||
)
|
||||
image_embeds = image_embeds.permute(0, 1, 3, 5, 7, 2, 4, 6).flatten(4, 7).flatten(1, 3)
|
||||
image_embeds = self.proj(image_embeds.to(proj_dtype)).to(input_dtype)
|
||||
|
||||
embeds = torch.cat([text_embeds, image_embeds], dim=1).contiguous()
|
||||
|
||||
if self.use_positional_embeddings or self.use_learned_positional_embeddings:
|
||||
text_seq_length = text_embeds.shape[1]
|
||||
num_image_patches = image_embeds.shape[1]
|
||||
|
||||
if self.use_learned_positional_embeddings:
|
||||
image_pos = self.pos_embedding[
|
||||
:, self.max_text_seq_length:self.max_text_seq_length + num_image_patches
|
||||
].to(device=embeds.device, dtype=embeds.dtype)
|
||||
else:
|
||||
image_pos = get_3d_sincos_pos_embed(
|
||||
self.dim,
|
||||
(width // self.patch_size, height // self.patch_size),
|
||||
num_image_patches // ((height // self.patch_size) * (width // self.patch_size)),
|
||||
self.spatial_interpolation_scale,
|
||||
self.temporal_interpolation_scale,
|
||||
device=embeds.device,
|
||||
).reshape(1, num_image_patches, self.dim).to(dtype=embeds.dtype)
|
||||
|
||||
# Build joint: zeros for text + sincos for image
|
||||
joint_pos = torch.zeros(1, text_seq_length + num_image_patches, self.dim, device=embeds.device, dtype=embeds.dtype)
|
||||
joint_pos[:, text_seq_length:] = image_pos
|
||||
embeds = embeds + joint_pos
|
||||
|
||||
return embeds
|
||||
|
||||
|
||||
class CogVideoXLayerNormZero(nn.Module):
|
||||
def __init__(self, time_dim, dim, elementwise_affine=True, eps=1e-5, bias=True,
|
||||
device=None, dtype=None, operations=None):
|
||||
super().__init__()
|
||||
self.silu = nn.SiLU()
|
||||
self.linear = operations.Linear(time_dim, 6 * dim, bias=bias, device=device, dtype=dtype)
|
||||
self.norm = operations.LayerNorm(dim, eps=eps, elementwise_affine=elementwise_affine, device=device, dtype=dtype)
|
||||
|
||||
def forward(self, hidden_states, encoder_hidden_states, temb):
|
||||
shift, scale, gate, enc_shift, enc_scale, enc_gate = self.linear(self.silu(temb)).chunk(6, dim=1)
|
||||
hidden_states = self.norm(hidden_states) * (1 + scale)[:, None, :] + shift[:, None, :]
|
||||
encoder_hidden_states = self.norm(encoder_hidden_states) * (1 + enc_scale)[:, None, :] + enc_shift[:, None, :]
|
||||
return hidden_states, encoder_hidden_states, gate[:, None, :], enc_gate[:, None, :]
|
||||
|
||||
|
||||
class CogVideoXAdaLayerNorm(nn.Module):
|
||||
def __init__(self, time_dim, dim, elementwise_affine=True, eps=1e-5,
|
||||
device=None, dtype=None, operations=None):
|
||||
super().__init__()
|
||||
self.silu = nn.SiLU()
|
||||
self.linear = operations.Linear(time_dim, 2 * dim, device=device, dtype=dtype)
|
||||
self.norm = operations.LayerNorm(dim, eps=eps, elementwise_affine=elementwise_affine, device=device, dtype=dtype)
|
||||
|
||||
def forward(self, x, temb):
|
||||
temb = self.linear(self.silu(temb))
|
||||
shift, scale = temb.chunk(2, dim=1)
|
||||
x = self.norm(x) * (1 + scale)[:, None, :] + shift[:, None, :]
|
||||
return x
|
||||
|
||||
|
||||
class CogVideoXBlock(nn.Module):
|
||||
def __init__(self, dim, num_heads, head_dim, time_dim,
|
||||
eps=1e-5, ff_inner_dim=None, ff_bias=True,
|
||||
device=None, dtype=None, operations=None):
|
||||
super().__init__()
|
||||
self.dim = dim
|
||||
self.num_heads = num_heads
|
||||
self.head_dim = head_dim
|
||||
|
||||
self.norm1 = CogVideoXLayerNormZero(time_dim, dim, eps=eps, device=device, dtype=dtype, operations=operations)
|
||||
|
||||
# Self-attention (joint text + latent)
|
||||
self.q = operations.Linear(dim, dim, bias=True, device=device, dtype=dtype)
|
||||
self.k = operations.Linear(dim, dim, bias=True, device=device, dtype=dtype)
|
||||
self.v = operations.Linear(dim, dim, bias=True, device=device, dtype=dtype)
|
||||
self.norm_q = operations.LayerNorm(head_dim, eps=1e-6, elementwise_affine=True, device=device, dtype=dtype)
|
||||
self.norm_k = operations.LayerNorm(head_dim, eps=1e-6, elementwise_affine=True, device=device, dtype=dtype)
|
||||
self.attn_out = operations.Linear(dim, dim, bias=True, device=device, dtype=dtype)
|
||||
|
||||
self.norm2 = CogVideoXLayerNormZero(time_dim, dim, eps=eps, device=device, dtype=dtype, operations=operations)
|
||||
|
||||
# Feed-forward (GELU approximate)
|
||||
inner_dim = ff_inner_dim or dim * 4
|
||||
self.ff_proj = operations.Linear(dim, inner_dim, bias=ff_bias, device=device, dtype=dtype)
|
||||
self.ff_out = operations.Linear(inner_dim, dim, bias=ff_bias, device=device, dtype=dtype)
|
||||
|
||||
def forward(self, hidden_states, encoder_hidden_states, temb, image_rotary_emb=None, transformer_options=None):
|
||||
if transformer_options is None:
|
||||
transformer_options = {}
|
||||
text_seq_length = encoder_hidden_states.size(1)
|
||||
|
||||
# Norm & modulate
|
||||
norm_hidden, norm_encoder, gate_msa, enc_gate_msa = self.norm1(hidden_states, encoder_hidden_states, temb)
|
||||
|
||||
# Joint self-attention
|
||||
qkv_input = torch.cat([norm_encoder, norm_hidden], dim=1)
|
||||
b, s, _ = qkv_input.shape
|
||||
n, d = self.num_heads, self.head_dim
|
||||
|
||||
q = self.q(qkv_input).view(b, s, n, d)
|
||||
k = self.k(qkv_input).view(b, s, n, d)
|
||||
v = self.v(qkv_input)
|
||||
|
||||
q = self.norm_q(q).view(b, s, n, d)
|
||||
k = self.norm_k(k).view(b, s, n, d)
|
||||
|
||||
# Apply rotary embeddings to image tokens only (diffusers format: [B, heads, seq, head_dim])
|
||||
if image_rotary_emb is not None:
|
||||
q_img = q[:, text_seq_length:].transpose(1, 2) # [B, heads, img_seq, head_dim]
|
||||
k_img = k[:, text_seq_length:].transpose(1, 2)
|
||||
q_img = apply_rotary_emb(q_img, image_rotary_emb)
|
||||
k_img = apply_rotary_emb(k_img, image_rotary_emb)
|
||||
q = torch.cat([q[:, :text_seq_length], q_img.transpose(1, 2)], dim=1)
|
||||
k = torch.cat([k[:, :text_seq_length], k_img.transpose(1, 2)], dim=1)
|
||||
|
||||
attn_out = optimized_attention(
|
||||
q.reshape(b, s, n * d),
|
||||
k.reshape(b, s, n * d),
|
||||
v,
|
||||
heads=self.num_heads,
|
||||
transformer_options=transformer_options,
|
||||
)
|
||||
|
||||
attn_out = self.attn_out(attn_out)
|
||||
|
||||
attn_encoder, attn_hidden = attn_out.split([text_seq_length, s - text_seq_length], dim=1)
|
||||
|
||||
hidden_states = hidden_states + gate_msa * attn_hidden
|
||||
encoder_hidden_states = encoder_hidden_states + enc_gate_msa * attn_encoder
|
||||
|
||||
# Norm & modulate for FF
|
||||
norm_hidden, norm_encoder, gate_ff, enc_gate_ff = self.norm2(hidden_states, encoder_hidden_states, temb)
|
||||
|
||||
# Feed-forward (GELU on concatenated text + latent)
|
||||
ff_input = torch.cat([norm_encoder, norm_hidden], dim=1)
|
||||
ff_output = self.ff_out(F.gelu(self.ff_proj(ff_input), approximate="tanh"))
|
||||
|
||||
hidden_states = hidden_states + gate_ff * ff_output[:, text_seq_length:]
|
||||
encoder_hidden_states = encoder_hidden_states + enc_gate_ff * ff_output[:, :text_seq_length]
|
||||
|
||||
return hidden_states, encoder_hidden_states
|
||||
|
||||
|
||||
class CogVideoXTransformer3DModel(nn.Module):
|
||||
def __init__(self,
|
||||
num_attention_heads=30,
|
||||
attention_head_dim=64,
|
||||
in_channels=16,
|
||||
out_channels=16,
|
||||
flip_sin_to_cos=True,
|
||||
freq_shift=0,
|
||||
time_embed_dim=512,
|
||||
ofs_embed_dim=None,
|
||||
text_embed_dim=4096,
|
||||
num_layers=30,
|
||||
dropout=0.0,
|
||||
attention_bias=True,
|
||||
sample_width=90,
|
||||
sample_height=60,
|
||||
sample_frames=49,
|
||||
patch_size=2,
|
||||
patch_size_t=None,
|
||||
temporal_compression_ratio=4,
|
||||
max_text_seq_length=226,
|
||||
spatial_interpolation_scale=1.875,
|
||||
temporal_interpolation_scale=1.0,
|
||||
use_rotary_positional_embeddings=False,
|
||||
use_learned_positional_embeddings=False,
|
||||
patch_bias=True,
|
||||
image_model=None,
|
||||
device=None,
|
||||
dtype=None,
|
||||
operations=None,
|
||||
):
|
||||
super().__init__()
|
||||
self.dtype = dtype
|
||||
dim = num_attention_heads * attention_head_dim
|
||||
self.dim = dim
|
||||
self.num_attention_heads = num_attention_heads
|
||||
self.attention_head_dim = attention_head_dim
|
||||
self.in_channels = in_channels
|
||||
self.out_channels = out_channels
|
||||
self.patch_size = patch_size
|
||||
self.patch_size_t = patch_size_t
|
||||
self.max_text_seq_length = max_text_seq_length
|
||||
self.use_rotary_positional_embeddings = use_rotary_positional_embeddings
|
||||
|
||||
# 1. Patch embedding
|
||||
self.patch_embed = CogVideoXPatchEmbed(
|
||||
patch_size=patch_size,
|
||||
patch_size_t=patch_size_t,
|
||||
in_channels=in_channels,
|
||||
dim=dim,
|
||||
text_dim=text_embed_dim,
|
||||
bias=patch_bias,
|
||||
sample_width=sample_width,
|
||||
sample_height=sample_height,
|
||||
sample_frames=sample_frames,
|
||||
temporal_compression_ratio=temporal_compression_ratio,
|
||||
max_text_seq_length=max_text_seq_length,
|
||||
spatial_interpolation_scale=spatial_interpolation_scale,
|
||||
temporal_interpolation_scale=temporal_interpolation_scale,
|
||||
use_positional_embeddings=not use_rotary_positional_embeddings,
|
||||
use_learned_positional_embeddings=use_learned_positional_embeddings,
|
||||
device=device, dtype=torch.float32, operations=operations,
|
||||
)
|
||||
|
||||
# 2. Time embedding
|
||||
self.time_proj_dim = dim
|
||||
self.time_proj_flip = flip_sin_to_cos
|
||||
self.time_proj_shift = freq_shift
|
||||
self.time_embedding_linear_1 = operations.Linear(dim, time_embed_dim, device=device, dtype=dtype)
|
||||
self.time_embedding_act = nn.SiLU()
|
||||
self.time_embedding_linear_2 = operations.Linear(time_embed_dim, time_embed_dim, device=device, dtype=dtype)
|
||||
|
||||
# Optional OFS embedding (CogVideoX 1.5 I2V)
|
||||
self.ofs_proj_dim = ofs_embed_dim
|
||||
if ofs_embed_dim:
|
||||
self.ofs_embedding_linear_1 = operations.Linear(ofs_embed_dim, ofs_embed_dim, device=device, dtype=dtype)
|
||||
self.ofs_embedding_act = nn.SiLU()
|
||||
self.ofs_embedding_linear_2 = operations.Linear(ofs_embed_dim, ofs_embed_dim, device=device, dtype=dtype)
|
||||
else:
|
||||
self.ofs_embedding_linear_1 = None
|
||||
|
||||
# 3. Transformer blocks
|
||||
self.blocks = nn.ModuleList([
|
||||
CogVideoXBlock(
|
||||
dim=dim,
|
||||
num_heads=num_attention_heads,
|
||||
head_dim=attention_head_dim,
|
||||
time_dim=time_embed_dim,
|
||||
eps=1e-5,
|
||||
device=device, dtype=dtype, operations=operations,
|
||||
)
|
||||
for _ in range(num_layers)
|
||||
])
|
||||
|
||||
self.norm_final = operations.LayerNorm(dim, eps=1e-5, elementwise_affine=True, device=device, dtype=dtype)
|
||||
|
||||
# 4. Output
|
||||
self.norm_out = CogVideoXAdaLayerNorm(
|
||||
time_dim=time_embed_dim, dim=dim, eps=1e-5,
|
||||
device=device, dtype=dtype, operations=operations,
|
||||
)
|
||||
|
||||
if patch_size_t is None:
|
||||
output_dim = patch_size * patch_size * out_channels
|
||||
else:
|
||||
output_dim = patch_size * patch_size * patch_size_t * out_channels
|
||||
|
||||
self.proj_out = operations.Linear(dim, output_dim, device=device, dtype=dtype)
|
||||
|
||||
self.spatial_interpolation_scale = spatial_interpolation_scale
|
||||
self.temporal_interpolation_scale = temporal_interpolation_scale
|
||||
self.temporal_compression_ratio = temporal_compression_ratio
|
||||
|
||||
def forward(self, x, timestep, context, ofs=None, transformer_options=None, **kwargs):
|
||||
if transformer_options is None:
|
||||
transformer_options = {}
|
||||
return comfy.patcher_extension.WrapperExecutor.new_class_executor(
|
||||
self._forward,
|
||||
self,
|
||||
comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, transformer_options)
|
||||
).execute(x, timestep, context, ofs, transformer_options, **kwargs)
|
||||
|
||||
def _forward(self, x, timestep, context, ofs=None, transformer_options=None, **kwargs):
|
||||
if transformer_options is None:
|
||||
transformer_options = {}
|
||||
# ComfyUI passes [B, C, T, H, W]
|
||||
batch_size, channels, t, h, w = x.shape
|
||||
|
||||
# Pad to patch size (temporal + spatial), same pattern as WAN
|
||||
p_t = self.patch_size_t if self.patch_size_t is not None else 1
|
||||
x = comfy.ldm.common_dit.pad_to_patch_size(x, (p_t, self.patch_size, self.patch_size))
|
||||
|
||||
# CogVideoX expects [B, T, C, H, W]
|
||||
x = x.permute(0, 2, 1, 3, 4)
|
||||
batch_size, num_frames, channels, height, width = x.shape
|
||||
|
||||
# Time embedding
|
||||
t_emb = get_timestep_embedding(timestep, self.time_proj_dim, self.time_proj_flip, self.time_proj_shift)
|
||||
t_emb = t_emb.to(dtype=x.dtype)
|
||||
emb = self.time_embedding_linear_2(self.time_embedding_act(self.time_embedding_linear_1(t_emb)))
|
||||
|
||||
if self.ofs_embedding_linear_1 is not None and ofs is not None:
|
||||
ofs_emb = get_timestep_embedding(ofs, self.ofs_proj_dim, self.time_proj_flip, self.time_proj_shift)
|
||||
ofs_emb = ofs_emb.to(dtype=x.dtype)
|
||||
ofs_emb = self.ofs_embedding_linear_2(self.ofs_embedding_act(self.ofs_embedding_linear_1(ofs_emb)))
|
||||
emb = emb + ofs_emb
|
||||
|
||||
# Patch embedding
|
||||
hidden_states = self.patch_embed(context, x)
|
||||
|
||||
text_seq_length = context.shape[1]
|
||||
encoder_hidden_states = hidden_states[:, :text_seq_length]
|
||||
hidden_states = hidden_states[:, text_seq_length:]
|
||||
|
||||
# Rotary embeddings (if used)
|
||||
image_rotary_emb = None
|
||||
if self.use_rotary_positional_embeddings:
|
||||
post_patch_height = height // self.patch_size
|
||||
post_patch_width = width // self.patch_size
|
||||
if self.patch_size_t is None:
|
||||
post_time = num_frames
|
||||
else:
|
||||
post_time = num_frames // self.patch_size_t
|
||||
image_rotary_emb = self._get_rotary_emb(post_patch_height, post_patch_width, post_time, device=x.device)
|
||||
|
||||
# Transformer blocks
|
||||
for i, block in enumerate(self.blocks):
|
||||
hidden_states, encoder_hidden_states = block(
|
||||
hidden_states=hidden_states,
|
||||
encoder_hidden_states=encoder_hidden_states,
|
||||
temb=emb,
|
||||
image_rotary_emb=image_rotary_emb,
|
||||
transformer_options=transformer_options,
|
||||
)
|
||||
|
||||
hidden_states = self.norm_final(hidden_states)
|
||||
|
||||
# Output projection
|
||||
hidden_states = self.norm_out(hidden_states, temb=emb)
|
||||
hidden_states = self.proj_out(hidden_states)
|
||||
|
||||
# Unpatchify
|
||||
p = self.patch_size
|
||||
p_t = self.patch_size_t
|
||||
|
||||
if p_t is None:
|
||||
output = hidden_states.reshape(batch_size, num_frames, height // p, width // p, -1, p, p)
|
||||
output = output.permute(0, 1, 4, 2, 5, 3, 6).flatten(5, 6).flatten(3, 4)
|
||||
else:
|
||||
output = hidden_states.reshape(
|
||||
batch_size, (num_frames + p_t - 1) // p_t, height // p, width // p, -1, p_t, p, p
|
||||
)
|
||||
output = output.permute(0, 1, 5, 4, 2, 6, 3, 7).flatten(6, 7).flatten(4, 5).flatten(1, 2)
|
||||
|
||||
# Back to ComfyUI format [B, C, T, H, W] and crop padding
|
||||
output = output.permute(0, 2, 1, 3, 4)[:, :, :t, :h, :w]
|
||||
return output
|
||||
|
||||
def _get_rotary_emb(self, h, w, t, device):
|
||||
"""Compute CogVideoX 3D rotary positional embeddings.
|
||||
|
||||
For CogVideoX 1.5 (patch_size_t != None): uses "slice" mode — grid positions
|
||||
are integer arange computed at max_size, then sliced to actual size.
|
||||
For CogVideoX 1.0 (patch_size_t == None): uses "linspace" mode with crop coords
|
||||
scaled by spatial_interpolation_scale.
|
||||
"""
|
||||
d = self.attention_head_dim
|
||||
dim_t = d // 4
|
||||
dim_h = d // 8 * 3
|
||||
dim_w = d // 8 * 3
|
||||
|
||||
if self.patch_size_t is not None:
|
||||
# CogVideoX 1.5: "slice" mode — positions are simple integer indices
|
||||
# Compute at max(sample_size, actual_size) then slice to actual
|
||||
base_h = self.patch_embed.sample_height // self.patch_size
|
||||
base_w = self.patch_embed.sample_width // self.patch_size
|
||||
max_h = max(base_h, h)
|
||||
max_w = max(base_w, w)
|
||||
|
||||
grid_h = torch.arange(max_h, device=device, dtype=torch.float32)
|
||||
grid_w = torch.arange(max_w, device=device, dtype=torch.float32)
|
||||
grid_t = torch.arange(t, device=device, dtype=torch.float32)
|
||||
else:
|
||||
# CogVideoX 1.0: "linspace" mode with interpolation scale
|
||||
grid_h = torch.linspace(0, h - 1, h, device=device, dtype=torch.float32) * self.spatial_interpolation_scale
|
||||
grid_w = torch.linspace(0, w - 1, w, device=device, dtype=torch.float32) * self.spatial_interpolation_scale
|
||||
grid_t = torch.arange(t, device=device, dtype=torch.float32)
|
||||
|
||||
freqs_t = _get_1d_rotary_pos_embed(dim_t, grid_t)
|
||||
freqs_h = _get_1d_rotary_pos_embed(dim_h, grid_h)
|
||||
freqs_w = _get_1d_rotary_pos_embed(dim_w, grid_w)
|
||||
|
||||
t_cos, t_sin = freqs_t
|
||||
h_cos, h_sin = freqs_h
|
||||
w_cos, w_sin = freqs_w
|
||||
|
||||
# Slice to actual size (for "slice" mode where grids may be larger)
|
||||
t_cos, t_sin = t_cos[:t], t_sin[:t]
|
||||
h_cos, h_sin = h_cos[:h], h_sin[:h]
|
||||
w_cos, w_sin = w_cos[:w], w_sin[:w]
|
||||
|
||||
# Broadcast and concatenate into [T*H*W, head_dim]
|
||||
t_cos = t_cos[:, None, None, :].expand(-1, h, w, -1)
|
||||
t_sin = t_sin[:, None, None, :].expand(-1, h, w, -1)
|
||||
h_cos = h_cos[None, :, None, :].expand(t, -1, w, -1)
|
||||
h_sin = h_sin[None, :, None, :].expand(t, -1, w, -1)
|
||||
w_cos = w_cos[None, None, :, :].expand(t, h, -1, -1)
|
||||
w_sin = w_sin[None, None, :, :].expand(t, h, -1, -1)
|
||||
|
||||
cos = torch.cat([t_cos, h_cos, w_cos], dim=-1).reshape(t * h * w, -1)
|
||||
sin = torch.cat([t_sin, h_sin, w_sin], dim=-1).reshape(t * h * w, -1)
|
||||
return (cos, sin)
|
||||
566
comfy/ldm/cogvideo/vae.py
Normal file
566
comfy/ldm/cogvideo/vae.py
Normal file
@@ -0,0 +1,566 @@
|
||||
# CogVideoX VAE - ported to ComfyUI native ops
|
||||
# Architecture reference: diffusers AutoencoderKLCogVideoX
|
||||
# Style reference: comfy/ldm/wan/vae.py
|
||||
|
||||
import numpy as np
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
import comfy.ops
|
||||
ops = comfy.ops.disable_weight_init
|
||||
|
||||
|
||||
class CausalConv3d(nn.Module):
|
||||
"""Causal 3D convolution with temporal padding.
|
||||
|
||||
Uses comfy.ops.Conv3d with autopad='causal_zero' fast path: when input has
|
||||
a single temporal frame and no cache, the 3D conv weight is sliced to act
|
||||
as a 2D conv, avoiding computation on zero-padded temporal dimensions.
|
||||
"""
|
||||
def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, pad_mode="constant"):
|
||||
super().__init__()
|
||||
if isinstance(kernel_size, int):
|
||||
kernel_size = (kernel_size,) * 3
|
||||
|
||||
time_kernel, height_kernel, width_kernel = kernel_size
|
||||
self.time_kernel_size = time_kernel
|
||||
self.pad_mode = pad_mode
|
||||
|
||||
height_pad = (height_kernel - 1) // 2
|
||||
width_pad = (width_kernel - 1) // 2
|
||||
self.time_causal_padding = (width_pad, width_pad, height_pad, height_pad, time_kernel - 1, 0)
|
||||
|
||||
stride = stride if isinstance(stride, tuple) else (stride, 1, 1)
|
||||
dilation = (dilation, 1, 1)
|
||||
self.conv = ops.Conv3d(
|
||||
in_channels, out_channels, kernel_size,
|
||||
stride=stride, dilation=dilation,
|
||||
padding=(0, height_pad, width_pad),
|
||||
)
|
||||
|
||||
def forward(self, x, conv_cache=None):
|
||||
if self.pad_mode == "replicate":
|
||||
x = F.pad(x, self.time_causal_padding, mode="replicate")
|
||||
conv_cache = None
|
||||
else:
|
||||
kernel_t = self.time_kernel_size
|
||||
if kernel_t > 1:
|
||||
if conv_cache is None and x.shape[2] == 1:
|
||||
# Fast path: single frame, no cache. All temporal padding
|
||||
# frames are copies of the input (replicate-style), so the
|
||||
# 3D conv reduces to a 2D conv with summed temporal kernel.
|
||||
w = comfy.ops.cast_to_input(self.conv.weight, x)
|
||||
b = comfy.ops.cast_to_input(self.conv.bias, x) if self.conv.bias is not None else None
|
||||
w2d = w.sum(dim=2, keepdim=True)
|
||||
out = F.conv3d(x, w2d, b,
|
||||
self.conv.stride, self.conv.padding,
|
||||
self.conv.dilation, self.conv.groups)
|
||||
return out, None
|
||||
cached = [conv_cache] if conv_cache is not None else [x[:, :, :1]] * (kernel_t - 1)
|
||||
x = torch.cat(cached + [x], dim=2)
|
||||
conv_cache = x[:, :, -self.time_kernel_size + 1:].clone() if self.time_kernel_size > 1 else None
|
||||
|
||||
out = self.conv(x)
|
||||
return out, conv_cache
|
||||
|
||||
|
||||
def _interpolate_zq(zq, target_size):
|
||||
"""Interpolate latent z to target (T, H, W), matching CogVideoX's first-frame-special handling."""
|
||||
t = target_size[0]
|
||||
if t > 1 and t % 2 == 1:
|
||||
z_first = F.interpolate(zq[:, :, :1], size=(1, target_size[1], target_size[2]))
|
||||
z_rest = F.interpolate(zq[:, :, 1:], size=(t - 1, target_size[1], target_size[2]))
|
||||
return torch.cat([z_first, z_rest], dim=2)
|
||||
return F.interpolate(zq, size=target_size)
|
||||
|
||||
|
||||
class SpatialNorm3D(nn.Module):
|
||||
"""Spatially conditioned normalization."""
|
||||
def __init__(self, f_channels, zq_channels, groups=32):
|
||||
super().__init__()
|
||||
self.norm_layer = ops.GroupNorm(num_channels=f_channels, num_groups=groups, eps=1e-6, affine=True)
|
||||
self.conv_y = CausalConv3d(zq_channels, f_channels, kernel_size=1, stride=1)
|
||||
self.conv_b = CausalConv3d(zq_channels, f_channels, kernel_size=1, stride=1)
|
||||
|
||||
def forward(self, f, zq, conv_cache=None):
|
||||
new_cache = {}
|
||||
conv_cache = conv_cache or {}
|
||||
|
||||
if zq.shape[-3:] != f.shape[-3:]:
|
||||
zq = _interpolate_zq(zq, f.shape[-3:])
|
||||
|
||||
conv_y, new_cache["conv_y"] = self.conv_y(zq, conv_cache=conv_cache.get("conv_y"))
|
||||
conv_b, new_cache["conv_b"] = self.conv_b(zq, conv_cache=conv_cache.get("conv_b"))
|
||||
|
||||
return self.norm_layer(f) * conv_y + conv_b, new_cache
|
||||
|
||||
|
||||
class ResnetBlock3D(nn.Module):
|
||||
"""3D ResNet block with optional spatial norm."""
|
||||
def __init__(self, in_channels, out_channels=None, temb_channels=512, groups=32,
|
||||
eps=1e-6, act_fn="silu", spatial_norm_dim=None, pad_mode="first"):
|
||||
super().__init__()
|
||||
out_channels = out_channels or in_channels
|
||||
self.in_channels = in_channels
|
||||
self.out_channels = out_channels
|
||||
self.spatial_norm_dim = spatial_norm_dim
|
||||
|
||||
if act_fn == "silu":
|
||||
self.nonlinearity = nn.SiLU()
|
||||
elif act_fn == "swish":
|
||||
self.nonlinearity = nn.SiLU()
|
||||
else:
|
||||
self.nonlinearity = nn.SiLU()
|
||||
|
||||
if spatial_norm_dim is None:
|
||||
self.norm1 = ops.GroupNorm(num_channels=in_channels, num_groups=groups, eps=eps)
|
||||
self.norm2 = ops.GroupNorm(num_channels=out_channels, num_groups=groups, eps=eps)
|
||||
else:
|
||||
self.norm1 = SpatialNorm3D(in_channels, spatial_norm_dim, groups=groups)
|
||||
self.norm2 = SpatialNorm3D(out_channels, spatial_norm_dim, groups=groups)
|
||||
|
||||
self.conv1 = CausalConv3d(in_channels, out_channels, kernel_size=3, pad_mode=pad_mode)
|
||||
|
||||
if temb_channels > 0:
|
||||
self.temb_proj = ops.Linear(temb_channels, out_channels)
|
||||
|
||||
self.conv2 = CausalConv3d(out_channels, out_channels, kernel_size=3, pad_mode=pad_mode)
|
||||
|
||||
if in_channels != out_channels:
|
||||
self.conv_shortcut = ops.Conv3d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
|
||||
else:
|
||||
self.conv_shortcut = None
|
||||
|
||||
def forward(self, x, temb=None, zq=None, conv_cache=None):
|
||||
new_cache = {}
|
||||
conv_cache = conv_cache or {}
|
||||
residual = x
|
||||
|
||||
if zq is not None:
|
||||
x, new_cache["norm1"] = self.norm1(x, zq, conv_cache=conv_cache.get("norm1"))
|
||||
else:
|
||||
x = self.norm1(x)
|
||||
|
||||
x = self.nonlinearity(x)
|
||||
x, new_cache["conv1"] = self.conv1(x, conv_cache=conv_cache.get("conv1"))
|
||||
|
||||
if temb is not None and hasattr(self, "temb_proj"):
|
||||
x = x + self.temb_proj(self.nonlinearity(temb))[:, :, None, None, None]
|
||||
|
||||
if zq is not None:
|
||||
x, new_cache["norm2"] = self.norm2(x, zq, conv_cache=conv_cache.get("norm2"))
|
||||
else:
|
||||
x = self.norm2(x)
|
||||
|
||||
x = self.nonlinearity(x)
|
||||
x, new_cache["conv2"] = self.conv2(x, conv_cache=conv_cache.get("conv2"))
|
||||
|
||||
if self.conv_shortcut is not None:
|
||||
residual = self.conv_shortcut(residual)
|
||||
|
||||
return x + residual, new_cache
|
||||
|
||||
|
||||
class Downsample3D(nn.Module):
|
||||
"""3D downsampling with optional temporal compression."""
|
||||
def __init__(self, in_channels, out_channels, kernel_size=3, stride=2, padding=0, compress_time=False):
|
||||
super().__init__()
|
||||
self.conv = ops.Conv2d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding)
|
||||
self.compress_time = compress_time
|
||||
|
||||
def forward(self, x):
|
||||
if self.compress_time:
|
||||
b, c, t, h, w = x.shape
|
||||
x = x.permute(0, 3, 4, 1, 2).reshape(b * h * w, c, t)
|
||||
if t % 2 == 1:
|
||||
x_first, x_rest = x[..., 0], x[..., 1:]
|
||||
if x_rest.shape[-1] > 0:
|
||||
x_rest = F.avg_pool1d(x_rest, kernel_size=2, stride=2)
|
||||
x = torch.cat([x_first[..., None], x_rest], dim=-1)
|
||||
x = x.reshape(b, h, w, c, x.shape[-1]).permute(0, 3, 4, 1, 2)
|
||||
else:
|
||||
x = F.avg_pool1d(x, kernel_size=2, stride=2)
|
||||
x = x.reshape(b, h, w, c, x.shape[-1]).permute(0, 3, 4, 1, 2)
|
||||
|
||||
pad = (0, 1, 0, 1)
|
||||
x = F.pad(x, pad, mode="constant", value=0)
|
||||
b, c, t, h, w = x.shape
|
||||
x = x.permute(0, 2, 1, 3, 4).reshape(b * t, c, h, w)
|
||||
x = self.conv(x)
|
||||
x = x.reshape(b, t, x.shape[1], x.shape[2], x.shape[3]).permute(0, 2, 1, 3, 4)
|
||||
return x
|
||||
|
||||
|
||||
class Upsample3D(nn.Module):
|
||||
"""3D upsampling with optional temporal decompression."""
|
||||
def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, padding=1, compress_time=False):
|
||||
super().__init__()
|
||||
self.conv = ops.Conv2d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding)
|
||||
self.compress_time = compress_time
|
||||
|
||||
def forward(self, x):
|
||||
if self.compress_time:
|
||||
if x.shape[2] > 1 and x.shape[2] % 2 == 1:
|
||||
x_first, x_rest = x[:, :, 0], x[:, :, 1:]
|
||||
x_first = F.interpolate(x_first, scale_factor=2.0)
|
||||
x_rest = F.interpolate(x_rest, scale_factor=2.0)
|
||||
x = torch.cat([x_first[:, :, None, :, :], x_rest], dim=2)
|
||||
elif x.shape[2] > 1:
|
||||
x = F.interpolate(x, scale_factor=2.0)
|
||||
else:
|
||||
x = x.squeeze(2)
|
||||
x = F.interpolate(x, scale_factor=2.0)
|
||||
x = x[:, :, None, :, :]
|
||||
else:
|
||||
b, c, t, h, w = x.shape
|
||||
x = x.permute(0, 2, 1, 3, 4).reshape(b * t, c, h, w)
|
||||
x = F.interpolate(x, scale_factor=2.0)
|
||||
x = x.reshape(b, t, c, *x.shape[2:]).permute(0, 2, 1, 3, 4)
|
||||
|
||||
b, c, t, h, w = x.shape
|
||||
x = x.permute(0, 2, 1, 3, 4).reshape(b * t, c, h, w)
|
||||
x = self.conv(x)
|
||||
x = x.reshape(b, t, *x.shape[1:]).permute(0, 2, 1, 3, 4)
|
||||
return x
|
||||
|
||||
|
||||
class DownBlock3D(nn.Module):
|
||||
def __init__(self, in_channels, out_channels, temb_channels=0, num_layers=1,
|
||||
eps=1e-6, act_fn="silu", groups=32, add_downsample=True,
|
||||
compress_time=False, pad_mode="first"):
|
||||
super().__init__()
|
||||
self.resnets = nn.ModuleList([
|
||||
ResnetBlock3D(
|
||||
in_channels=in_channels if i == 0 else out_channels,
|
||||
out_channels=out_channels,
|
||||
temb_channels=temb_channels,
|
||||
groups=groups, eps=eps, act_fn=act_fn, pad_mode=pad_mode,
|
||||
)
|
||||
for i in range(num_layers)
|
||||
])
|
||||
self.downsamplers = nn.ModuleList([Downsample3D(out_channels, out_channels, compress_time=compress_time)]) if add_downsample else None
|
||||
|
||||
def forward(self, x, temb=None, zq=None, conv_cache=None):
|
||||
new_cache = {}
|
||||
conv_cache = conv_cache or {}
|
||||
for i, resnet in enumerate(self.resnets):
|
||||
x, new_cache[f"resnet_{i}"] = resnet(x, temb, zq, conv_cache=conv_cache.get(f"resnet_{i}"))
|
||||
if self.downsamplers is not None:
|
||||
for ds in self.downsamplers:
|
||||
x = ds(x)
|
||||
return x, new_cache
|
||||
|
||||
|
||||
class MidBlock3D(nn.Module):
|
||||
def __init__(self, in_channels, temb_channels=0, num_layers=1,
|
||||
eps=1e-6, act_fn="silu", groups=32, spatial_norm_dim=None, pad_mode="first"):
|
||||
super().__init__()
|
||||
self.resnets = nn.ModuleList([
|
||||
ResnetBlock3D(
|
||||
in_channels=in_channels, out_channels=in_channels,
|
||||
temb_channels=temb_channels, groups=groups, eps=eps,
|
||||
act_fn=act_fn, spatial_norm_dim=spatial_norm_dim, pad_mode=pad_mode,
|
||||
)
|
||||
for _ in range(num_layers)
|
||||
])
|
||||
|
||||
def forward(self, x, temb=None, zq=None, conv_cache=None):
|
||||
new_cache = {}
|
||||
conv_cache = conv_cache or {}
|
||||
for i, resnet in enumerate(self.resnets):
|
||||
x, new_cache[f"resnet_{i}"] = resnet(x, temb, zq, conv_cache=conv_cache.get(f"resnet_{i}"))
|
||||
return x, new_cache
|
||||
|
||||
|
||||
class UpBlock3D(nn.Module):
|
||||
def __init__(self, in_channels, out_channels, temb_channels=0, num_layers=1,
|
||||
eps=1e-6, act_fn="silu", groups=32, spatial_norm_dim=16,
|
||||
add_upsample=True, compress_time=False, pad_mode="first"):
|
||||
super().__init__()
|
||||
self.resnets = nn.ModuleList([
|
||||
ResnetBlock3D(
|
||||
in_channels=in_channels if i == 0 else out_channels,
|
||||
out_channels=out_channels,
|
||||
temb_channels=temb_channels, groups=groups, eps=eps,
|
||||
act_fn=act_fn, spatial_norm_dim=spatial_norm_dim, pad_mode=pad_mode,
|
||||
)
|
||||
for i in range(num_layers)
|
||||
])
|
||||
self.upsamplers = nn.ModuleList([Upsample3D(out_channels, out_channels, compress_time=compress_time)]) if add_upsample else None
|
||||
|
||||
def forward(self, x, temb=None, zq=None, conv_cache=None):
|
||||
new_cache = {}
|
||||
conv_cache = conv_cache or {}
|
||||
for i, resnet in enumerate(self.resnets):
|
||||
x, new_cache[f"resnet_{i}"] = resnet(x, temb, zq, conv_cache=conv_cache.get(f"resnet_{i}"))
|
||||
if self.upsamplers is not None:
|
||||
for us in self.upsamplers:
|
||||
x = us(x)
|
||||
return x, new_cache
|
||||
|
||||
|
||||
class Encoder3D(nn.Module):
|
||||
def __init__(self, in_channels=3, out_channels=16,
|
||||
block_out_channels=(128, 256, 256, 512),
|
||||
layers_per_block=3, act_fn="silu",
|
||||
eps=1e-6, groups=32, pad_mode="first",
|
||||
temporal_compression_ratio=4):
|
||||
super().__init__()
|
||||
temporal_compress_level = int(np.log2(temporal_compression_ratio))
|
||||
|
||||
self.conv_in = CausalConv3d(in_channels, block_out_channels[0], kernel_size=3, pad_mode=pad_mode)
|
||||
|
||||
self.down_blocks = nn.ModuleList()
|
||||
output_channel = block_out_channels[0]
|
||||
for i in range(len(block_out_channels)):
|
||||
input_channel = output_channel
|
||||
output_channel = block_out_channels[i]
|
||||
is_final = i == len(block_out_channels) - 1
|
||||
compress_time = i < temporal_compress_level
|
||||
|
||||
self.down_blocks.append(DownBlock3D(
|
||||
in_channels=input_channel, out_channels=output_channel,
|
||||
temb_channels=0, num_layers=layers_per_block,
|
||||
eps=eps, act_fn=act_fn, groups=groups,
|
||||
add_downsample=not is_final, compress_time=compress_time,
|
||||
))
|
||||
|
||||
self.mid_block = MidBlock3D(
|
||||
in_channels=block_out_channels[-1], temb_channels=0,
|
||||
num_layers=2, eps=eps, act_fn=act_fn, groups=groups, pad_mode=pad_mode,
|
||||
)
|
||||
|
||||
self.norm_out = ops.GroupNorm(groups, block_out_channels[-1], eps=1e-6)
|
||||
self.conv_act = nn.SiLU()
|
||||
self.conv_out = CausalConv3d(block_out_channels[-1], 2 * out_channels, kernel_size=3, pad_mode=pad_mode)
|
||||
|
||||
def forward(self, x, conv_cache=None):
|
||||
new_cache = {}
|
||||
conv_cache = conv_cache or {}
|
||||
|
||||
x, new_cache["conv_in"] = self.conv_in(x, conv_cache=conv_cache.get("conv_in"))
|
||||
|
||||
for i, block in enumerate(self.down_blocks):
|
||||
key = f"down_block_{i}"
|
||||
x, new_cache[key] = block(x, None, None, conv_cache.get(key))
|
||||
|
||||
x, new_cache["mid_block"] = self.mid_block(x, None, None, conv_cache=conv_cache.get("mid_block"))
|
||||
|
||||
x = self.norm_out(x)
|
||||
x = self.conv_act(x)
|
||||
x, new_cache["conv_out"] = self.conv_out(x, conv_cache=conv_cache.get("conv_out"))
|
||||
|
||||
return x, new_cache
|
||||
|
||||
|
||||
class Decoder3D(nn.Module):
|
||||
def __init__(self, in_channels=16, out_channels=3,
|
||||
block_out_channels=(128, 256, 256, 512),
|
||||
layers_per_block=3, act_fn="silu",
|
||||
eps=1e-6, groups=32, pad_mode="first",
|
||||
temporal_compression_ratio=4):
|
||||
super().__init__()
|
||||
reversed_channels = list(reversed(block_out_channels))
|
||||
temporal_compress_level = int(np.log2(temporal_compression_ratio))
|
||||
|
||||
self.conv_in = CausalConv3d(in_channels, reversed_channels[0], kernel_size=3, pad_mode=pad_mode)
|
||||
|
||||
self.mid_block = MidBlock3D(
|
||||
in_channels=reversed_channels[0], temb_channels=0,
|
||||
num_layers=2, eps=eps, act_fn=act_fn, groups=groups,
|
||||
spatial_norm_dim=in_channels, pad_mode=pad_mode,
|
||||
)
|
||||
|
||||
self.up_blocks = nn.ModuleList()
|
||||
output_channel = reversed_channels[0]
|
||||
for i in range(len(block_out_channels)):
|
||||
prev_channel = output_channel
|
||||
output_channel = reversed_channels[i]
|
||||
is_final = i == len(block_out_channels) - 1
|
||||
compress_time = i < temporal_compress_level
|
||||
|
||||
self.up_blocks.append(UpBlock3D(
|
||||
in_channels=prev_channel, out_channels=output_channel,
|
||||
temb_channels=0, num_layers=layers_per_block + 1,
|
||||
eps=eps, act_fn=act_fn, groups=groups,
|
||||
spatial_norm_dim=in_channels,
|
||||
add_upsample=not is_final, compress_time=compress_time,
|
||||
))
|
||||
|
||||
self.norm_out = SpatialNorm3D(reversed_channels[-1], in_channels, groups=groups)
|
||||
self.conv_act = nn.SiLU()
|
||||
self.conv_out = CausalConv3d(reversed_channels[-1], out_channels, kernel_size=3, pad_mode=pad_mode)
|
||||
|
||||
def forward(self, sample, conv_cache=None):
|
||||
new_cache = {}
|
||||
conv_cache = conv_cache or {}
|
||||
|
||||
x, new_cache["conv_in"] = self.conv_in(sample, conv_cache=conv_cache.get("conv_in"))
|
||||
|
||||
x, new_cache["mid_block"] = self.mid_block(x, None, sample, conv_cache=conv_cache.get("mid_block"))
|
||||
|
||||
for i, block in enumerate(self.up_blocks):
|
||||
key = f"up_block_{i}"
|
||||
x, new_cache[key] = block(x, None, sample, conv_cache=conv_cache.get(key))
|
||||
|
||||
x, new_cache["norm_out"] = self.norm_out(x, sample, conv_cache=conv_cache.get("norm_out"))
|
||||
x = self.conv_act(x)
|
||||
x, new_cache["conv_out"] = self.conv_out(x, conv_cache=conv_cache.get("conv_out"))
|
||||
|
||||
return x, new_cache
|
||||
|
||||
|
||||
|
||||
class AutoencoderKLCogVideoX(nn.Module):
|
||||
"""CogVideoX VAE. Spatial tiling/slicing handled by ComfyUI's VAE wrapper.
|
||||
|
||||
Uses rolling temporal decode: conv_in + mid_block + temporal up_blocks run
|
||||
on the full (low-res) tensor, then the expensive spatial-only up_blocks +
|
||||
norm_out + conv_out are processed in small temporal chunks with conv_cache
|
||||
carrying causal state between chunks. This keeps peak VRAM proportional to
|
||||
chunk_size rather than total frame count.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
in_channels=3, out_channels=3,
|
||||
block_out_channels=(128, 256, 256, 512),
|
||||
latent_channels=16, layers_per_block=3,
|
||||
act_fn="silu", eps=1e-6, groups=32,
|
||||
temporal_compression_ratio=4,
|
||||
):
|
||||
super().__init__()
|
||||
self.latent_channels = latent_channels
|
||||
self.temporal_compression_ratio = temporal_compression_ratio
|
||||
|
||||
self.encoder = Encoder3D(
|
||||
in_channels=in_channels, out_channels=latent_channels,
|
||||
block_out_channels=block_out_channels, layers_per_block=layers_per_block,
|
||||
act_fn=act_fn, eps=eps, groups=groups,
|
||||
temporal_compression_ratio=temporal_compression_ratio,
|
||||
)
|
||||
self.decoder = Decoder3D(
|
||||
in_channels=latent_channels, out_channels=out_channels,
|
||||
block_out_channels=block_out_channels, layers_per_block=layers_per_block,
|
||||
act_fn=act_fn, eps=eps, groups=groups,
|
||||
temporal_compression_ratio=temporal_compression_ratio,
|
||||
)
|
||||
|
||||
self.num_latent_frames_batch_size = 2
|
||||
self.num_sample_frames_batch_size = 8
|
||||
|
||||
def encode(self, x):
|
||||
t = x.shape[2]
|
||||
frame_batch = self.num_sample_frames_batch_size
|
||||
remainder = t % frame_batch
|
||||
conv_cache = None
|
||||
enc = []
|
||||
|
||||
# Process remainder frames first so only the first chunk can have an
|
||||
# odd temporal dimension — where Downsample3D's first-frame-special
|
||||
# handling in temporal compression is actually correct.
|
||||
if remainder > 0:
|
||||
chunk, conv_cache = self.encoder(x[:, :, :remainder], conv_cache=conv_cache)
|
||||
enc.append(chunk.to(x.device))
|
||||
|
||||
for start in range(remainder, t, frame_batch):
|
||||
chunk, conv_cache = self.encoder(x[:, :, start:start + frame_batch], conv_cache=conv_cache)
|
||||
enc.append(chunk.to(x.device))
|
||||
|
||||
enc = torch.cat(enc, dim=2)
|
||||
mean, _ = enc.chunk(2, dim=1)
|
||||
return mean
|
||||
|
||||
def decode(self, z):
|
||||
return self._decode_rolling(z)
|
||||
|
||||
def _decode_batched(self, z):
|
||||
"""Original batched decode - processes 2 latent frames through full decoder."""
|
||||
t = z.shape[2]
|
||||
frame_batch = self.num_latent_frames_batch_size
|
||||
num_batches = max(t // frame_batch, 1)
|
||||
conv_cache = None
|
||||
dec = []
|
||||
for i in range(num_batches):
|
||||
remaining = t % frame_batch
|
||||
start = frame_batch * i + (0 if i == 0 else remaining)
|
||||
end = frame_batch * (i + 1) + remaining
|
||||
chunk, conv_cache = self.decoder(z[:, :, start:end], conv_cache=conv_cache)
|
||||
dec.append(chunk.cpu())
|
||||
return torch.cat(dec, dim=2).to(z.device)
|
||||
|
||||
def _decode_rolling(self, z):
|
||||
"""Rolling decode - processes low-res layers on full tensor, then rolls
|
||||
through expensive high-res layers in temporal chunks."""
|
||||
decoder = self.decoder
|
||||
device = z.device
|
||||
|
||||
# Determine which up_blocks have temporal upsample vs spatial-only.
|
||||
# Temporal up_blocks are cheap (low res), spatial-only are expensive.
|
||||
temporal_compress_level = int(np.log2(self.temporal_compression_ratio))
|
||||
split_at = temporal_compress_level # first N up_blocks do temporal upsample
|
||||
|
||||
# Phase 1: conv_in + mid_block + temporal up_blocks on full tensor (low/medium res)
|
||||
x, _ = decoder.conv_in(z)
|
||||
x, _ = decoder.mid_block(x, None, z)
|
||||
|
||||
for i in range(split_at):
|
||||
x, _ = decoder.up_blocks[i](x, None, z)
|
||||
|
||||
# Phase 2: remaining spatial-only up_blocks + norm_out + conv_out in temporal chunks
|
||||
remaining_blocks = list(range(split_at, len(decoder.up_blocks)))
|
||||
chunk_size = 4 # pixel frames per chunk through high-res layers
|
||||
t_expanded = x.shape[2]
|
||||
|
||||
if t_expanded <= chunk_size or len(remaining_blocks) == 0:
|
||||
# Small enough to process in one go
|
||||
for i in remaining_blocks:
|
||||
x, _ = decoder.up_blocks[i](x, None, z)
|
||||
x, _ = decoder.norm_out(x, z)
|
||||
x = decoder.conv_act(x)
|
||||
x, _ = decoder.conv_out(x)
|
||||
return x
|
||||
|
||||
# Expand z temporally once to match Phase 2's time dimension.
|
||||
# z stays at latent spatial resolution so this is small (~16 MB vs ~1.3 GB
|
||||
# for the old approach of pre-interpolating to every pixel resolution).
|
||||
z_time_expanded = _interpolate_zq(z, (t_expanded, z.shape[3], z.shape[4]))
|
||||
|
||||
# Process in temporal chunks, interpolating spatially per-chunk to avoid
|
||||
# allocating full [B, C, t_expanded, H, W] tensors at each resolution.
|
||||
dec_out = []
|
||||
conv_caches = {}
|
||||
|
||||
for chunk_start in range(0, t_expanded, chunk_size):
|
||||
chunk_end = min(chunk_start + chunk_size, t_expanded)
|
||||
x_chunk = x[:, :, chunk_start:chunk_end]
|
||||
z_t_chunk = z_time_expanded[:, :, chunk_start:chunk_end]
|
||||
z_spatial_cache = {}
|
||||
|
||||
for i in remaining_blocks:
|
||||
block = decoder.up_blocks[i]
|
||||
cache_key = f"up_block_{i}"
|
||||
hw_key = (x_chunk.shape[3], x_chunk.shape[4])
|
||||
if hw_key not in z_spatial_cache:
|
||||
if z_t_chunk.shape[3] == hw_key[0] and z_t_chunk.shape[4] == hw_key[1]:
|
||||
z_spatial_cache[hw_key] = z_t_chunk
|
||||
else:
|
||||
z_spatial_cache[hw_key] = F.interpolate(z_t_chunk, size=(z_t_chunk.shape[2], hw_key[0], hw_key[1]))
|
||||
x_chunk, new_cache = block(x_chunk, None, z_spatial_cache[hw_key], conv_cache=conv_caches.get(cache_key))
|
||||
conv_caches[cache_key] = new_cache
|
||||
|
||||
hw_key = (x_chunk.shape[3], x_chunk.shape[4])
|
||||
if hw_key not in z_spatial_cache:
|
||||
z_spatial_cache[hw_key] = F.interpolate(z_t_chunk, size=(z_t_chunk.shape[2], hw_key[0], hw_key[1]))
|
||||
x_chunk, new_cache = decoder.norm_out(x_chunk, z_spatial_cache[hw_key], conv_cache=conv_caches.get("norm_out"))
|
||||
conv_caches["norm_out"] = new_cache
|
||||
x_chunk = decoder.conv_act(x_chunk)
|
||||
x_chunk, new_cache = decoder.conv_out(x_chunk, conv_cache=conv_caches.get("conv_out"))
|
||||
conv_caches["conv_out"] = new_cache
|
||||
|
||||
dec_out.append(x_chunk.cpu())
|
||||
del z_spatial_cache
|
||||
|
||||
del x, z_time_expanded
|
||||
return torch.cat(dec_out, dim=2).to(device)
|
||||
@@ -15,7 +15,7 @@ def rope(pos: torch.Tensor, dim: int, theta: int) -> torch.Tensor:
|
||||
|
||||
scale = torch.arange(0, dim, 2, dtype=torch.float64, device=device) / dim
|
||||
omega = 1.0 / (theta**scale)
|
||||
out = torch.einsum("...n,d->...nd", pos.to(device), omega)
|
||||
out = torch.einsum("...n,d->...nd", pos, omega)
|
||||
out = torch.stack([torch.cos(out), torch.sin(out)], dim=0)
|
||||
return out.to(dtype=torch.float32, device=pos.device)
|
||||
|
||||
@@ -118,6 +118,8 @@ class ErnieImageAttention(nn.Module):
|
||||
query = apply_rotary_emb(query, image_rotary_emb)
|
||||
key = apply_rotary_emb(key, image_rotary_emb)
|
||||
|
||||
query, key = query.to(x.dtype), key.to(x.dtype)
|
||||
|
||||
q_flat = query.reshape(B, S, -1)
|
||||
k_flat = key.reshape(B, S, -1)
|
||||
|
||||
@@ -159,16 +161,16 @@ class ErnieImageSharedAdaLNBlock(nn.Module):
|
||||
|
||||
residual = x
|
||||
x_norm = self.adaLN_sa_ln(x)
|
||||
x_norm = x_norm * (1 + scale_msa) + shift_msa
|
||||
x_norm = (x_norm.float() * (1 + scale_msa.float()) + shift_msa.float()).to(x.dtype)
|
||||
|
||||
attn_out = self.self_attention(x_norm, attention_mask=attention_mask, image_rotary_emb=rotary_pos_emb)
|
||||
x = residual + gate_msa * attn_out
|
||||
x = residual + (gate_msa.float() * attn_out.float()).to(x.dtype)
|
||||
|
||||
residual = x
|
||||
x_norm = self.adaLN_mlp_ln(x)
|
||||
x_norm = x_norm * (1 + scale_mlp) + shift_mlp
|
||||
x_norm = (x_norm.float() * (1 + scale_mlp.float()) + shift_mlp.float()).to(x.dtype)
|
||||
|
||||
return residual + gate_mlp * self.mlp(x_norm)
|
||||
return residual + (gate_mlp.float() * self.mlp(x_norm).float()).to(x.dtype)
|
||||
|
||||
class ErnieImageAdaLNContinuous(nn.Module):
|
||||
def __init__(self, hidden_size: int, eps: float = 1e-6, operations=None, device=None, dtype=None):
|
||||
@@ -181,7 +183,7 @@ class ErnieImageAdaLNContinuous(nn.Module):
|
||||
def forward(self, x: torch.Tensor, conditioning: torch.Tensor) -> torch.Tensor:
|
||||
scale, shift = self.linear(conditioning).chunk(2, dim=-1)
|
||||
x = self.norm(x)
|
||||
x = torch.addcmul(shift.unsqueeze(1), x, 1 + scale.unsqueeze(1))
|
||||
x = x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
|
||||
return x
|
||||
|
||||
class ErnieImageModel(nn.Module):
|
||||
|
||||
@@ -4,6 +4,9 @@ import math
|
||||
import torch
|
||||
import torchaudio
|
||||
|
||||
import comfy.model_management
|
||||
import comfy.model_patcher
|
||||
import comfy.utils as utils
|
||||
from comfy.ldm.mmaudio.vae.distributions import DiagonalGaussianDistribution
|
||||
from comfy.ldm.lightricks.symmetric_patchifier import AudioPatchifier
|
||||
from comfy.ldm.lightricks.vae.causal_audio_autoencoder import (
|
||||
@@ -40,6 +43,30 @@ class AudioVAEComponentConfig:
|
||||
|
||||
return cls(autoencoder=audio_config, vocoder=vocoder_config)
|
||||
|
||||
|
||||
class ModelDeviceManager:
|
||||
"""Manages device placement and GPU residency for the composed model."""
|
||||
|
||||
def __init__(self, module: torch.nn.Module):
|
||||
load_device = comfy.model_management.get_torch_device()
|
||||
offload_device = comfy.model_management.vae_offload_device()
|
||||
self.patcher = comfy.model_patcher.ModelPatcher(module, load_device, offload_device)
|
||||
|
||||
def ensure_model_loaded(self) -> None:
|
||||
comfy.model_management.free_memory(
|
||||
self.patcher.model_size(),
|
||||
self.patcher.load_device,
|
||||
)
|
||||
comfy.model_management.load_model_gpu(self.patcher)
|
||||
|
||||
def move_to_load_device(self, tensor: torch.Tensor) -> torch.Tensor:
|
||||
return tensor.to(self.patcher.load_device)
|
||||
|
||||
@property
|
||||
def load_device(self):
|
||||
return self.patcher.load_device
|
||||
|
||||
|
||||
class AudioLatentNormalizer:
|
||||
"""Applies per-channel statistics in patch space and restores original layout."""
|
||||
|
||||
@@ -105,17 +132,23 @@ class AudioPreprocessor:
|
||||
class AudioVAE(torch.nn.Module):
|
||||
"""High-level Audio VAE wrapper exposing encode and decode entry points."""
|
||||
|
||||
def __init__(self, metadata: dict):
|
||||
def __init__(self, state_dict: dict, metadata: dict):
|
||||
super().__init__()
|
||||
|
||||
component_config = AudioVAEComponentConfig.from_metadata(metadata)
|
||||
|
||||
vae_sd = utils.state_dict_prefix_replace(state_dict, {"audio_vae.": ""}, filter_keys=True)
|
||||
vocoder_sd = utils.state_dict_prefix_replace(state_dict, {"vocoder.": ""}, filter_keys=True)
|
||||
|
||||
self.autoencoder = CausalAudioAutoencoder(config=component_config.autoencoder)
|
||||
if "bwe" in component_config.vocoder:
|
||||
self.vocoder = VocoderWithBWE(config=component_config.vocoder)
|
||||
else:
|
||||
self.vocoder = Vocoder(config=component_config.vocoder)
|
||||
|
||||
self.autoencoder.load_state_dict(vae_sd, strict=False)
|
||||
self.vocoder.load_state_dict(vocoder_sd, strict=False)
|
||||
|
||||
autoencoder_config = self.autoencoder.get_config()
|
||||
self.normalizer = AudioLatentNormalizer(
|
||||
AudioPatchifier(
|
||||
@@ -135,12 +168,18 @@ class AudioVAE(torch.nn.Module):
|
||||
n_fft=autoencoder_config["n_fft"],
|
||||
)
|
||||
|
||||
def encode(self, audio, sample_rate=44100) -> torch.Tensor:
|
||||
self.device_manager = ModelDeviceManager(self)
|
||||
|
||||
def encode(self, audio: dict) -> torch.Tensor:
|
||||
"""Encode a waveform dictionary into normalized latent tensors."""
|
||||
|
||||
waveform = audio
|
||||
waveform_sample_rate = sample_rate
|
||||
waveform = audio["waveform"]
|
||||
waveform_sample_rate = audio["sample_rate"]
|
||||
input_device = waveform.device
|
||||
# Ensure that Audio VAE is loaded on the correct device.
|
||||
self.device_manager.ensure_model_loaded()
|
||||
|
||||
waveform = self.device_manager.move_to_load_device(waveform)
|
||||
expected_channels = self.autoencoder.encoder.in_channels
|
||||
if waveform.shape[1] != expected_channels:
|
||||
if waveform.shape[1] == 1:
|
||||
@@ -151,7 +190,7 @@ class AudioVAE(torch.nn.Module):
|
||||
)
|
||||
|
||||
mel_spec = self.preprocessor.waveform_to_mel(
|
||||
waveform, waveform_sample_rate, device=waveform.device
|
||||
waveform, waveform_sample_rate, device=self.device_manager.load_device
|
||||
)
|
||||
|
||||
latents = self.autoencoder.encode(mel_spec)
|
||||
@@ -165,13 +204,17 @@ class AudioVAE(torch.nn.Module):
|
||||
"""Decode normalized latent tensors into an audio waveform."""
|
||||
original_shape = latents.shape
|
||||
|
||||
# Ensure that Audio VAE is loaded on the correct device.
|
||||
self.device_manager.ensure_model_loaded()
|
||||
|
||||
latents = self.device_manager.move_to_load_device(latents)
|
||||
latents = self.normalizer.denormalize(latents)
|
||||
|
||||
target_shape = self.target_shape_from_latents(original_shape)
|
||||
mel_spec = self.autoencoder.decode(latents, target_shape=target_shape)
|
||||
|
||||
waveform = self.run_vocoder(mel_spec)
|
||||
return waveform
|
||||
return self.device_manager.move_to_load_device(waveform)
|
||||
|
||||
def target_shape_from_latents(self, latents_shape):
|
||||
batch, _, time, _ = latents_shape
|
||||
|
||||
@@ -34,16 +34,6 @@ class TimestepBlock(nn.Module):
|
||||
#This is needed because accelerate makes a copy of transformer_options which breaks "transformer_index"
|
||||
def forward_timestep_embed(ts, x, emb, context=None, transformer_options={}, output_shape=None, time_context=None, num_video_frames=None, image_only_indicator=None):
|
||||
for layer in ts:
|
||||
if "patches" in transformer_options and "forward_timestep_embed_patch" in transformer_options["patches"]:
|
||||
found_patched = False
|
||||
for class_type, handler in transformer_options["patches"]["forward_timestep_embed_patch"]:
|
||||
if isinstance(layer, class_type):
|
||||
x = handler(layer, x, emb, context, transformer_options, output_shape, time_context, num_video_frames, image_only_indicator)
|
||||
found_patched = True
|
||||
break
|
||||
if found_patched:
|
||||
continue
|
||||
|
||||
if isinstance(layer, VideoResBlock):
|
||||
x = layer(x, emb, num_video_frames, image_only_indicator)
|
||||
elif isinstance(layer, TimestepBlock):
|
||||
@@ -59,6 +49,15 @@ def forward_timestep_embed(ts, x, emb, context=None, transformer_options={}, out
|
||||
elif isinstance(layer, Upsample):
|
||||
x = layer(x, output_shape=output_shape)
|
||||
else:
|
||||
if "patches" in transformer_options and "forward_timestep_embed_patch" in transformer_options["patches"]:
|
||||
found_patched = False
|
||||
for class_type, handler in transformer_options["patches"]["forward_timestep_embed_patch"]:
|
||||
if isinstance(layer, class_type):
|
||||
x = handler(layer, x, emb, context, transformer_options, output_shape, time_context, num_video_frames, image_only_indicator)
|
||||
found_patched = True
|
||||
break
|
||||
if found_patched:
|
||||
continue
|
||||
x = layer(x)
|
||||
return x
|
||||
|
||||
@@ -895,12 +894,6 @@ class UNetModel(nn.Module):
|
||||
h = forward_timestep_embed(self.middle_block, h, emb, context, transformer_options, time_context=time_context, num_video_frames=num_video_frames, image_only_indicator=image_only_indicator)
|
||||
h = apply_control(h, control, 'middle')
|
||||
|
||||
if "middle_block_after_patch" in transformer_patches:
|
||||
patch = transformer_patches["middle_block_after_patch"]
|
||||
for p in patch:
|
||||
out = p({"h": h, "x": x, "emb": emb, "context": context, "y": y,
|
||||
"timesteps": timesteps, "transformer_options": transformer_options})
|
||||
h = out["h"]
|
||||
|
||||
for id, module in enumerate(self.output_blocks):
|
||||
transformer_options["block"] = ("output", id)
|
||||
@@ -912,7 +905,6 @@ class UNetModel(nn.Module):
|
||||
for p in patch:
|
||||
h, hsp = p(h, hsp, transformer_options)
|
||||
|
||||
if hsp is not None:
|
||||
h = th.cat([h, hsp], dim=1)
|
||||
del hsp
|
||||
if len(hs) > 0:
|
||||
|
||||
@@ -1,226 +0,0 @@
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
from comfy.ldm.modules.diffusionmodules.util import timestep_embedding
|
||||
from comfy.ldm.modules.diffusionmodules.openaimodel import Downsample, TimestepEmbedSequential, ResBlock, SpatialTransformer
|
||||
from comfy.ldm.modules.attention import optimized_attention
|
||||
|
||||
|
||||
class ZeroSFT(nn.Module):
|
||||
def __init__(self, label_nc, norm_nc, concat_channels=0, dtype=None, device=None, operations=None):
|
||||
super().__init__()
|
||||
|
||||
ks = 3
|
||||
pw = ks // 2
|
||||
|
||||
self.param_free_norm = operations.GroupNorm(32, norm_nc + concat_channels, dtype=dtype, device=device)
|
||||
|
||||
nhidden = 128
|
||||
|
||||
self.mlp_shared = nn.Sequential(
|
||||
operations.Conv2d(label_nc, nhidden, kernel_size=ks, padding=pw, dtype=dtype, device=device),
|
||||
nn.SiLU()
|
||||
)
|
||||
self.zero_mul = operations.Conv2d(nhidden, norm_nc + concat_channels, kernel_size=ks, padding=pw, dtype=dtype, device=device)
|
||||
self.zero_add = operations.Conv2d(nhidden, norm_nc + concat_channels, kernel_size=ks, padding=pw, dtype=dtype, device=device)
|
||||
|
||||
self.zero_conv = operations.Conv2d(label_nc, norm_nc, 1, 1, 0, dtype=dtype, device=device)
|
||||
self.pre_concat = bool(concat_channels != 0)
|
||||
|
||||
def forward(self, c, h, h_ori=None, control_scale=1):
|
||||
if h_ori is not None and self.pre_concat:
|
||||
h_raw = torch.cat([h_ori, h], dim=1)
|
||||
else:
|
||||
h_raw = h
|
||||
|
||||
h = h + self.zero_conv(c)
|
||||
if h_ori is not None and self.pre_concat:
|
||||
h = torch.cat([h_ori, h], dim=1)
|
||||
actv = self.mlp_shared(c)
|
||||
gamma = self.zero_mul(actv)
|
||||
beta = self.zero_add(actv)
|
||||
h = self.param_free_norm(h)
|
||||
h = torch.addcmul(h + beta, h, gamma)
|
||||
if h_ori is not None and not self.pre_concat:
|
||||
h = torch.cat([h_ori, h], dim=1)
|
||||
return torch.lerp(h_raw, h, control_scale)
|
||||
|
||||
|
||||
class _CrossAttnInner(nn.Module):
|
||||
"""Inner cross-attention module matching the state_dict layout of the original CrossAttention."""
|
||||
def __init__(self, query_dim, context_dim, heads, dim_head, dtype=None, device=None, operations=None):
|
||||
super().__init__()
|
||||
inner_dim = dim_head * heads
|
||||
self.heads = heads
|
||||
self.to_q = operations.Linear(query_dim, inner_dim, bias=False, dtype=dtype, device=device)
|
||||
self.to_k = operations.Linear(context_dim, inner_dim, bias=False, dtype=dtype, device=device)
|
||||
self.to_v = operations.Linear(context_dim, inner_dim, bias=False, dtype=dtype, device=device)
|
||||
self.to_out = nn.Sequential(
|
||||
operations.Linear(inner_dim, query_dim, dtype=dtype, device=device),
|
||||
)
|
||||
|
||||
def forward(self, x, context):
|
||||
q = self.to_q(x)
|
||||
k = self.to_k(context)
|
||||
v = self.to_v(context)
|
||||
return self.to_out(optimized_attention(q, k, v, self.heads))
|
||||
|
||||
|
||||
class ZeroCrossAttn(nn.Module):
|
||||
def __init__(self, context_dim, query_dim, dtype=None, device=None, operations=None):
|
||||
super().__init__()
|
||||
heads = query_dim // 64
|
||||
dim_head = 64
|
||||
self.attn = _CrossAttnInner(query_dim, context_dim, heads, dim_head, dtype=dtype, device=device, operations=operations)
|
||||
self.norm1 = operations.GroupNorm(32, query_dim, dtype=dtype, device=device)
|
||||
self.norm2 = operations.GroupNorm(32, context_dim, dtype=dtype, device=device)
|
||||
|
||||
def forward(self, context, x, control_scale=1):
|
||||
b, c, h, w = x.shape
|
||||
x_in = x
|
||||
|
||||
x = self.attn(
|
||||
self.norm1(x).flatten(2).transpose(1, 2),
|
||||
self.norm2(context).flatten(2).transpose(1, 2),
|
||||
).transpose(1, 2).unflatten(2, (h, w))
|
||||
|
||||
return x_in + x * control_scale
|
||||
|
||||
|
||||
class GLVControl(nn.Module):
|
||||
"""SUPIR's Guided Latent Vector control encoder. Truncated UNet (input + middle blocks only)."""
|
||||
def __init__(
|
||||
self,
|
||||
in_channels=4,
|
||||
model_channels=320,
|
||||
num_res_blocks=2,
|
||||
attention_resolutions=(4, 2),
|
||||
channel_mult=(1, 2, 4),
|
||||
num_head_channels=64,
|
||||
transformer_depth=(1, 2, 10),
|
||||
context_dim=2048,
|
||||
adm_in_channels=2816,
|
||||
use_linear_in_transformer=True,
|
||||
use_checkpoint=False,
|
||||
dtype=None,
|
||||
device=None,
|
||||
operations=None,
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__()
|
||||
self.model_channels = model_channels
|
||||
time_embed_dim = model_channels * 4
|
||||
|
||||
self.time_embed = nn.Sequential(
|
||||
operations.Linear(model_channels, time_embed_dim, dtype=dtype, device=device),
|
||||
nn.SiLU(),
|
||||
operations.Linear(time_embed_dim, time_embed_dim, dtype=dtype, device=device),
|
||||
)
|
||||
|
||||
self.label_emb = nn.Sequential(
|
||||
nn.Sequential(
|
||||
operations.Linear(adm_in_channels, time_embed_dim, dtype=dtype, device=device),
|
||||
nn.SiLU(),
|
||||
operations.Linear(time_embed_dim, time_embed_dim, dtype=dtype, device=device),
|
||||
)
|
||||
)
|
||||
|
||||
self.input_blocks = nn.ModuleList([
|
||||
TimestepEmbedSequential(
|
||||
operations.Conv2d(in_channels, model_channels, 3, padding=1, dtype=dtype, device=device)
|
||||
)
|
||||
])
|
||||
ch = model_channels
|
||||
ds = 1
|
||||
for level, mult in enumerate(channel_mult):
|
||||
for nr in range(num_res_blocks):
|
||||
layers = [
|
||||
ResBlock(ch, time_embed_dim, 0, out_channels=mult * model_channels,
|
||||
dtype=dtype, device=device, operations=operations)
|
||||
]
|
||||
ch = mult * model_channels
|
||||
if ds in attention_resolutions:
|
||||
num_heads = ch // num_head_channels
|
||||
layers.append(
|
||||
SpatialTransformer(ch, num_heads, num_head_channels,
|
||||
depth=transformer_depth[level], context_dim=context_dim,
|
||||
use_linear=use_linear_in_transformer,
|
||||
use_checkpoint=use_checkpoint,
|
||||
dtype=dtype, device=device, operations=operations)
|
||||
)
|
||||
self.input_blocks.append(TimestepEmbedSequential(*layers))
|
||||
if level != len(channel_mult) - 1:
|
||||
self.input_blocks.append(
|
||||
TimestepEmbedSequential(
|
||||
Downsample(ch, True, out_channels=ch, dtype=dtype, device=device, operations=operations)
|
||||
)
|
||||
)
|
||||
ds *= 2
|
||||
|
||||
num_heads = ch // num_head_channels
|
||||
self.middle_block = TimestepEmbedSequential(
|
||||
ResBlock(ch, time_embed_dim, 0, dtype=dtype, device=device, operations=operations),
|
||||
SpatialTransformer(ch, num_heads, num_head_channels,
|
||||
depth=transformer_depth[-1], context_dim=context_dim,
|
||||
use_linear=use_linear_in_transformer,
|
||||
use_checkpoint=use_checkpoint,
|
||||
dtype=dtype, device=device, operations=operations),
|
||||
ResBlock(ch, time_embed_dim, 0, dtype=dtype, device=device, operations=operations),
|
||||
)
|
||||
|
||||
self.input_hint_block = TimestepEmbedSequential(
|
||||
operations.Conv2d(in_channels, model_channels, 3, padding=1, dtype=dtype, device=device)
|
||||
)
|
||||
|
||||
def forward(self, x, timesteps, xt, context=None, y=None, **kwargs):
|
||||
t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False).to(x.dtype)
|
||||
emb = self.time_embed(t_emb) + self.label_emb(y)
|
||||
|
||||
guided_hint = self.input_hint_block(x, emb, context)
|
||||
|
||||
hs = []
|
||||
h = xt
|
||||
for module in self.input_blocks:
|
||||
if guided_hint is not None:
|
||||
h = module(h, emb, context)
|
||||
h += guided_hint
|
||||
guided_hint = None
|
||||
else:
|
||||
h = module(h, emb, context)
|
||||
hs.append(h)
|
||||
h = self.middle_block(h, emb, context)
|
||||
hs.append(h)
|
||||
return hs
|
||||
|
||||
|
||||
class SUPIR(nn.Module):
|
||||
"""
|
||||
SUPIR model containing GLVControl (control encoder) and project_modules (adapters).
|
||||
State dict keys match the original SUPIR checkpoint layout:
|
||||
control_model.* -> GLVControl
|
||||
project_modules.* -> nn.ModuleList of ZeroSFT/ZeroCrossAttn
|
||||
"""
|
||||
def __init__(self, device=None, dtype=None, operations=None):
|
||||
super().__init__()
|
||||
|
||||
self.control_model = GLVControl(dtype=dtype, device=device, operations=operations)
|
||||
|
||||
project_channel_scale = 2
|
||||
cond_output_channels = [320] * 4 + [640] * 3 + [1280] * 3
|
||||
project_channels = [int(c * project_channel_scale) for c in [160] * 4 + [320] * 3 + [640] * 3]
|
||||
concat_channels = [320] * 2 + [640] * 3 + [1280] * 4 + [0]
|
||||
cross_attn_insert_idx = [6, 3]
|
||||
|
||||
self.project_modules = nn.ModuleList()
|
||||
for i in range(len(cond_output_channels)):
|
||||
self.project_modules.append(ZeroSFT(
|
||||
project_channels[i], cond_output_channels[i],
|
||||
concat_channels=concat_channels[i],
|
||||
dtype=dtype, device=device, operations=operations,
|
||||
))
|
||||
|
||||
for i in cross_attn_insert_idx:
|
||||
self.project_modules.insert(i, ZeroCrossAttn(
|
||||
cond_output_channels[i], concat_channels[i],
|
||||
dtype=dtype, device=device, operations=operations,
|
||||
))
|
||||
@@ -1,103 +0,0 @@
|
||||
import torch
|
||||
from comfy.ldm.modules.diffusionmodules.openaimodel import Upsample
|
||||
|
||||
|
||||
class SUPIRPatch:
|
||||
"""
|
||||
Holds GLVControl (control encoder) + project_modules (ZeroSFT/ZeroCrossAttn adapters).
|
||||
Runs GLVControl lazily on first patch invocation per step, applies adapters through
|
||||
middle_block_after_patch, output_block_merge_patch, and forward_timestep_embed_patch.
|
||||
"""
|
||||
SIGMA_MAX = 14.6146
|
||||
|
||||
def __init__(self, model_patch, project_modules, hint_latent, strength_start, strength_end):
|
||||
self.model_patch = model_patch # CoreModelPatcher wrapping GLVControl
|
||||
self.project_modules = project_modules # nn.ModuleList of ZeroSFT/ZeroCrossAttn
|
||||
self.hint_latent = hint_latent # encoded LQ image latent
|
||||
self.strength_start = strength_start
|
||||
self.strength_end = strength_end
|
||||
self.cached_features = None
|
||||
self.adapter_idx = 0
|
||||
self.control_idx = 0
|
||||
self.current_control_idx = 0
|
||||
self.active = True
|
||||
|
||||
def _ensure_features(self, kwargs):
|
||||
"""Run GLVControl on first call per step, cache results."""
|
||||
if self.cached_features is not None:
|
||||
return
|
||||
x = kwargs["x"]
|
||||
b = x.shape[0]
|
||||
hint = self.hint_latent.to(device=x.device, dtype=x.dtype)
|
||||
if hint.shape[0] != b:
|
||||
hint = hint.expand(b, -1, -1, -1) if hint.shape[0] == 1 else hint.repeat((b + hint.shape[0] - 1) // hint.shape[0], 1, 1, 1)[:b]
|
||||
self.cached_features = self.model_patch.model.control_model(
|
||||
hint, kwargs["timesteps"], x,
|
||||
kwargs["context"], kwargs["y"]
|
||||
)
|
||||
self.adapter_idx = len(self.project_modules) - 1
|
||||
self.control_idx = len(self.cached_features) - 1
|
||||
|
||||
def _get_control_scale(self, kwargs):
|
||||
if self.strength_start == self.strength_end:
|
||||
return self.strength_end
|
||||
sigma = kwargs["transformer_options"].get("sigmas")
|
||||
if sigma is None:
|
||||
return self.strength_end
|
||||
s = sigma[0].item() if sigma.dim() > 0 else sigma.item()
|
||||
t = min(s / self.SIGMA_MAX, 1.0)
|
||||
return t * (self.strength_start - self.strength_end) + self.strength_end
|
||||
|
||||
def middle_after(self, kwargs):
|
||||
"""middle_block_after_patch: run GLVControl lazily, apply last adapter after middle block."""
|
||||
self.cached_features = None # reset from previous step
|
||||
self.current_scale = self._get_control_scale(kwargs)
|
||||
self.active = self.current_scale > 0
|
||||
if not self.active:
|
||||
return {"h": kwargs["h"]}
|
||||
self._ensure_features(kwargs)
|
||||
h = kwargs["h"]
|
||||
h = self.project_modules[self.adapter_idx](
|
||||
self.cached_features[self.control_idx], h, control_scale=self.current_scale
|
||||
)
|
||||
self.adapter_idx -= 1
|
||||
self.control_idx -= 1
|
||||
return {"h": h}
|
||||
|
||||
def output_block(self, h, hsp, transformer_options):
|
||||
"""output_block_patch: ZeroSFT adapter fusion replaces cat([h, hsp]). Returns (h, None) to skip cat."""
|
||||
if not self.active:
|
||||
return h, hsp
|
||||
self.current_control_idx = self.control_idx
|
||||
h = self.project_modules[self.adapter_idx](
|
||||
self.cached_features[self.control_idx], hsp, h, control_scale=self.current_scale
|
||||
)
|
||||
self.adapter_idx -= 1
|
||||
self.control_idx -= 1
|
||||
return h, None
|
||||
|
||||
def pre_upsample(self, layer, x, emb, context, transformer_options, output_shape, *args, **kw):
|
||||
"""forward_timestep_embed_patch for Upsample: extra cross-attn adapter before upsample."""
|
||||
block_type, _ = transformer_options["block"]
|
||||
if block_type == "output" and self.active and self.cached_features is not None:
|
||||
x = self.project_modules[self.adapter_idx](
|
||||
self.cached_features[self.current_control_idx], x, control_scale=self.current_scale
|
||||
)
|
||||
self.adapter_idx -= 1
|
||||
return layer(x, output_shape=output_shape)
|
||||
|
||||
def to(self, device_or_dtype):
|
||||
if isinstance(device_or_dtype, torch.device):
|
||||
self.cached_features = None
|
||||
if self.hint_latent is not None:
|
||||
self.hint_latent = self.hint_latent.to(device_or_dtype)
|
||||
return self
|
||||
|
||||
def models(self):
|
||||
return [self.model_patch]
|
||||
|
||||
def register(self, model_patcher):
|
||||
"""Register all patches on a cloned model patcher."""
|
||||
model_patcher.set_model_patch(self.middle_after, "middle_block_after_patch")
|
||||
model_patcher.set_model_output_block_patch(self.output_block)
|
||||
model_patcher.set_model_patch((Upsample, self.pre_upsample), "forward_timestep_embed_patch")
|
||||
@@ -52,6 +52,7 @@ import comfy.ldm.qwen_image.model
|
||||
import comfy.ldm.kandinsky5.model
|
||||
import comfy.ldm.anima.model
|
||||
import comfy.ldm.ace.ace_step15
|
||||
import comfy.ldm.cogvideo.model
|
||||
import comfy.ldm.rt_detr.rtdetr_v4
|
||||
import comfy.ldm.ernie.model
|
||||
|
||||
@@ -80,6 +81,7 @@ class ModelType(Enum):
|
||||
IMG_TO_IMG = 9
|
||||
FLOW_COSMOS = 10
|
||||
IMG_TO_IMG_FLOW = 11
|
||||
V_PREDICTION_DDPM = 12
|
||||
|
||||
|
||||
def model_sampling(model_config, model_type):
|
||||
@@ -114,6 +116,8 @@ def model_sampling(model_config, model_type):
|
||||
s = comfy.model_sampling.ModelSamplingCosmosRFlow
|
||||
elif model_type == ModelType.IMG_TO_IMG_FLOW:
|
||||
c = comfy.model_sampling.IMG_TO_IMG_FLOW
|
||||
elif model_type == ModelType.V_PREDICTION_DDPM:
|
||||
c = comfy.model_sampling.V_PREDICTION_DDPM
|
||||
|
||||
class ModelSampling(s, c):
|
||||
pass
|
||||
@@ -1974,3 +1978,59 @@ class ErnieImage(BaseModel):
|
||||
if cross_attn is not None:
|
||||
out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn)
|
||||
return out
|
||||
|
||||
class CogVideoX(BaseModel):
|
||||
def __init__(self, model_config, model_type=ModelType.V_PREDICTION_DDPM, image_to_video=False, device=None):
|
||||
super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.cogvideo.model.CogVideoXTransformer3DModel)
|
||||
self.image_to_video = image_to_video
|
||||
|
||||
def concat_cond(self, **kwargs):
|
||||
noise = kwargs.get("noise", None)
|
||||
# Detect extra channels needed (e.g. 32 - 16 = 16 for ref latent)
|
||||
extra_channels = self.diffusion_model.in_channels - noise.shape[1]
|
||||
if extra_channels == 0:
|
||||
return None
|
||||
|
||||
image = kwargs.get("concat_latent_image", None)
|
||||
device = kwargs["device"]
|
||||
|
||||
if image is None:
|
||||
shape = list(noise.shape)
|
||||
shape[1] = extra_channels
|
||||
return torch.zeros(shape, dtype=noise.dtype, layout=noise.layout, device=noise.device)
|
||||
|
||||
latent_dim = self.latent_format.latent_channels
|
||||
image = utils.common_upscale(image.to(device), noise.shape[-1], noise.shape[-2], "bilinear", "center")
|
||||
|
||||
if noise.ndim == 5 and image.ndim == 5:
|
||||
if image.shape[-3] < noise.shape[-3]:
|
||||
image = torch.nn.functional.pad(image, (0, 0, 0, 0, 0, noise.shape[-3] - image.shape[-3]), "constant", 0)
|
||||
elif image.shape[-3] > noise.shape[-3]:
|
||||
image = image[:, :, :noise.shape[-3]]
|
||||
|
||||
for i in range(0, image.shape[1], latent_dim):
|
||||
image[:, i:i + latent_dim] = self.process_latent_in(image[:, i:i + latent_dim])
|
||||
image = utils.resize_to_batch_size(image, noise.shape[0])
|
||||
|
||||
if image.shape[1] > extra_channels:
|
||||
image = image[:, :extra_channels]
|
||||
elif image.shape[1] < extra_channels:
|
||||
repeats = extra_channels // image.shape[1]
|
||||
remainder = extra_channels % image.shape[1]
|
||||
parts = [image] * repeats
|
||||
if remainder > 0:
|
||||
parts.append(image[:, :remainder])
|
||||
image = torch.cat(parts, dim=1)
|
||||
|
||||
return image
|
||||
|
||||
def extra_conds(self, **kwargs):
|
||||
out = super().extra_conds(**kwargs)
|
||||
# OFS embedding (CogVideoX 1.5 I2V), default 2.0 as used by SparkVSR
|
||||
if self.diffusion_model.ofs_proj_dim is not None:
|
||||
ofs = kwargs.get("ofs", None)
|
||||
if ofs is None:
|
||||
noise = kwargs.get("noise", None)
|
||||
ofs = torch.full((noise.shape[0],), 2.0, device=noise.device, dtype=noise.dtype)
|
||||
out['ofs'] = comfy.conds.CONDRegular(ofs)
|
||||
return out
|
||||
|
||||
@@ -490,6 +490,54 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
|
||||
|
||||
return dit_config
|
||||
|
||||
if '{}blocks.0.norm1.linear.weight'.format(key_prefix) in state_dict_keys: # CogVideoX
|
||||
dit_config = {}
|
||||
dit_config["image_model"] = "cogvideox"
|
||||
|
||||
# Extract config from weight shapes
|
||||
norm1_weight = state_dict['{}blocks.0.norm1.linear.weight'.format(key_prefix)]
|
||||
time_embed_dim = norm1_weight.shape[1]
|
||||
dim = norm1_weight.shape[0] // 6
|
||||
|
||||
dit_config["num_attention_heads"] = dim // 64
|
||||
dit_config["attention_head_dim"] = 64
|
||||
dit_config["time_embed_dim"] = time_embed_dim
|
||||
dit_config["num_layers"] = count_blocks(state_dict_keys, '{}blocks.'.format(key_prefix) + '{}.')
|
||||
|
||||
# Detect in_channels from patch_embed
|
||||
patch_proj_key = '{}patch_embed.proj.weight'.format(key_prefix)
|
||||
if patch_proj_key in state_dict_keys:
|
||||
w = state_dict[patch_proj_key]
|
||||
if w.ndim == 4:
|
||||
# Conv2d: [out, in, kh, kw] — CogVideoX 1.0
|
||||
dit_config["in_channels"] = w.shape[1]
|
||||
dit_config["patch_size"] = w.shape[2]
|
||||
elif w.ndim == 2:
|
||||
# Linear: [out, in_channels * patch_size * patch_size * patch_size_t] — CogVideoX 1.5
|
||||
dit_config["patch_size"] = 2
|
||||
dit_config["patch_size_t"] = 2
|
||||
dit_config["in_channels"] = w.shape[1] // (2 * 2 * 2) # 256 // 8 = 32
|
||||
|
||||
text_proj_key = '{}patch_embed.text_proj.weight'.format(key_prefix)
|
||||
if text_proj_key in state_dict_keys:
|
||||
dit_config["text_embed_dim"] = state_dict[text_proj_key].shape[1]
|
||||
|
||||
# Detect OFS embedding
|
||||
ofs_key = '{}ofs_embedding_linear_1.weight'.format(key_prefix)
|
||||
if ofs_key in state_dict_keys:
|
||||
dit_config["ofs_embed_dim"] = state_dict[ofs_key].shape[1]
|
||||
|
||||
# Detect positional embedding type
|
||||
pos_key = '{}patch_embed.pos_embedding'.format(key_prefix)
|
||||
if pos_key in state_dict_keys:
|
||||
dit_config["use_learned_positional_embeddings"] = True
|
||||
dit_config["use_rotary_positional_embeddings"] = False
|
||||
else:
|
||||
dit_config["use_learned_positional_embeddings"] = False
|
||||
dit_config["use_rotary_positional_embeddings"] = True
|
||||
|
||||
return dit_config
|
||||
|
||||
if '{}head.modulation'.format(key_prefix) in state_dict_keys: # Wan 2.1
|
||||
dit_config = {}
|
||||
dit_config["image_model"] = "wan2.1"
|
||||
|
||||
@@ -506,10 +506,6 @@ class ModelPatcher:
|
||||
def set_model_noise_refiner_patch(self, patch):
|
||||
self.set_model_patch(patch, "noise_refiner")
|
||||
|
||||
def set_model_middle_block_after_patch(self, patch):
|
||||
self.set_model_patch(patch, "middle_block_after_patch")
|
||||
|
||||
|
||||
def set_model_rope_options(self, scale_x, shift_x, scale_y, shift_y, scale_t, shift_t, **kwargs):
|
||||
rope_options = self.model_options["transformer_options"].get("rope_options", {})
|
||||
rope_options["scale_x"] = scale_x
|
||||
|
||||
@@ -54,6 +54,30 @@ class V_PREDICTION(EPS):
|
||||
sigma = reshape_sigma(sigma, model_output.ndim)
|
||||
return model_input * self.sigma_data ** 2 / (sigma ** 2 + self.sigma_data ** 2) - model_output * sigma * self.sigma_data / (sigma ** 2 + self.sigma_data ** 2) ** 0.5
|
||||
|
||||
class V_PREDICTION_DDPM:
|
||||
"""CogVideoX v-prediction: model receives raw x_t (unscaled), predicts velocity v.
|
||||
x_0 = sqrt(alpha) * x_t - sqrt(1-alpha) * v
|
||||
= x_t / sqrt(sigma^2 + 1) - v * sigma / sqrt(sigma^2 + 1)
|
||||
"""
|
||||
def calculate_input(self, sigma, noise):
|
||||
return noise
|
||||
|
||||
def calculate_denoised(self, sigma, model_output, model_input):
|
||||
sigma = reshape_sigma(sigma, model_output.ndim)
|
||||
return model_input / (sigma ** 2 + 1.0) ** 0.5 - model_output * sigma / (sigma ** 2 + 1.0) ** 0.5
|
||||
|
||||
def noise_scaling(self, sigma, noise, latent_image, max_denoise=False):
|
||||
sigma = reshape_sigma(sigma, noise.ndim)
|
||||
if max_denoise:
|
||||
noise = noise * torch.sqrt(1.0 + sigma ** 2.0)
|
||||
else:
|
||||
noise = noise * sigma
|
||||
noise += latent_image
|
||||
return noise
|
||||
|
||||
def inverse_noise_scaling(self, sigma, latent):
|
||||
return latent
|
||||
|
||||
class EDM(V_PREDICTION):
|
||||
def calculate_denoised(self, sigma, model_output, model_input):
|
||||
sigma = reshape_sigma(sigma, model_output.ndim)
|
||||
|
||||
@@ -1151,7 +1151,7 @@ def mixed_precision_ops(quant_config={}, compute_dtype=torch.bfloat16, full_prec
|
||||
if param is None:
|
||||
continue
|
||||
p = fn(param)
|
||||
if (not torch.is_inference_mode_enabled()) and p.is_inference():
|
||||
if p.is_inference():
|
||||
p = p.clone()
|
||||
self.register_parameter(key, torch.nn.Parameter(p, requires_grad=False))
|
||||
for key, buf in self._buffers.items():
|
||||
|
||||
36
comfy/sd.py
36
comfy/sd.py
@@ -12,12 +12,12 @@ from .ldm.cascade.stage_c_coder import StageC_coder
|
||||
from .ldm.audio.autoencoder import AudioOobleckVAE
|
||||
import comfy.ldm.genmo.vae.model
|
||||
import comfy.ldm.lightricks.vae.causal_video_autoencoder
|
||||
import comfy.ldm.lightricks.vae.audio_vae
|
||||
import comfy.ldm.cosmos.vae
|
||||
import comfy.ldm.wan.vae
|
||||
import comfy.ldm.wan.vae2_2
|
||||
import comfy.ldm.hunyuan3d.vae
|
||||
import comfy.ldm.ace.vae.music_dcae_pipeline
|
||||
import comfy.ldm.cogvideo.vae
|
||||
import comfy.ldm.hunyuan_video.vae
|
||||
import comfy.ldm.mmaudio.vae.autoencoder
|
||||
import comfy.pixel_space_convert
|
||||
@@ -64,6 +64,7 @@ import comfy.text_encoders.ace15
|
||||
import comfy.text_encoders.longcat_image
|
||||
import comfy.text_encoders.qwen35
|
||||
import comfy.text_encoders.ernie
|
||||
import comfy.text_encoders.cogvideo
|
||||
|
||||
import comfy.model_patcher
|
||||
import comfy.lora
|
||||
@@ -652,6 +653,17 @@ class VAE:
|
||||
|
||||
self.memory_used_encode = lambda shape, dtype: (1400 * 9 * shape[-2] * shape[-1]) * model_management.dtype_size(dtype)
|
||||
self.memory_used_decode = lambda shape, dtype: (3600 * 4 * shape[-2] * shape[-1] * 16 * 16) * model_management.dtype_size(dtype)
|
||||
elif "decoder.conv_in.conv.weight" in sd and "decoder.mid_block.resnets.0.norm1.norm_layer.weight" in sd: # CogVideoX VAE
|
||||
self.upscale_ratio = (lambda a: max(0, a * 4 - 3), 8, 8)
|
||||
self.upscale_index_formula = (4, 8, 8)
|
||||
self.downscale_ratio = (lambda a: max(0, math.floor((a + 3) / 4)), 8, 8)
|
||||
self.downscale_index_formula = (4, 8, 8)
|
||||
self.latent_dim = 3
|
||||
self.latent_channels = sd["encoder.conv_out.conv.weight"].shape[0] // 2
|
||||
self.first_stage_model = comfy.ldm.cogvideo.vae.AutoencoderKLCogVideoX(latent_channels=self.latent_channels)
|
||||
self.memory_used_decode = lambda shape, dtype: (2800 * max(2, ((shape[2] - 1) * 4) + 1) * shape[3] * shape[4] * (8 * 8)) * model_management.dtype_size(dtype)
|
||||
self.memory_used_encode = lambda shape, dtype: (1400 * max(1, shape[2]) * shape[3] * shape[4]) * model_management.dtype_size(dtype)
|
||||
self.working_dtypes = [torch.bfloat16, torch.float16, torch.float32]
|
||||
elif "decoder.conv_in.conv.weight" in sd:
|
||||
ddconfig = {'double_z': True, 'z_channels': 4, 'resolution': 256, 'in_channels': 3, 'out_ch': 3, 'ch': 128, 'ch_mult': [1, 2, 4, 4], 'num_res_blocks': 2, 'attn_resolutions': [], 'dropout': 0.0}
|
||||
ddconfig["conv3d"] = True
|
||||
@@ -806,24 +818,6 @@ class VAE:
|
||||
self.downscale_index_formula = (4, 8, 8)
|
||||
self.memory_used_encode = lambda shape, dtype: (700 * (max(1, (shape[-3] ** 0.66 * 0.11)) * shape[-2] * shape[-1]) * model_management.dtype_size(dtype))
|
||||
self.memory_used_decode = lambda shape, dtype: (50 * (max(1, (shape[-3] ** 0.65 * 0.26)) * shape[-2] * shape[-1] * 32 * 32) * model_management.dtype_size(dtype))
|
||||
elif "vocoder.resblocks.0.convs1.0.weight" in sd or "vocoder.vocoder.resblocks.0.convs1.0.weight" in sd: # LTX Audio
|
||||
sd = comfy.utils.state_dict_prefix_replace(sd, {"audio_vae.": "autoencoder."})
|
||||
self.first_stage_model = comfy.ldm.lightricks.vae.audio_vae.AudioVAE(metadata=metadata)
|
||||
self.memory_used_encode = lambda shape, dtype: (shape[2] * 330) * model_management.dtype_size(dtype)
|
||||
self.memory_used_decode = lambda shape, dtype: (shape[2] * shape[3] * 87000) * model_management.dtype_size(dtype)
|
||||
self.latent_channels = self.first_stage_model.latent_channels
|
||||
self.audio_sample_rate_output = self.first_stage_model.output_sample_rate
|
||||
self.autoencoder = self.first_stage_model.autoencoder # TODO: remove hack for ltxv custom nodes
|
||||
self.output_channels = 2
|
||||
self.pad_channel_value = "replicate"
|
||||
self.upscale_ratio = 4096
|
||||
self.downscale_ratio = 4096
|
||||
self.latent_dim = 2
|
||||
self.process_output = lambda audio: audio
|
||||
self.process_input = lambda audio: audio
|
||||
self.working_dtypes = [torch.float32]
|
||||
self.disable_offload = True
|
||||
self.extra_1d_channel = 16
|
||||
else:
|
||||
logging.warning("WARNING: No VAE weights detected, VAE not initalized.")
|
||||
self.first_stage_model = None
|
||||
@@ -1208,6 +1202,7 @@ class CLIPType(Enum):
|
||||
NEWBIE = 24
|
||||
FLUX2 = 25
|
||||
LONGCAT_IMAGE = 26
|
||||
COGVIDEOX = 27
|
||||
|
||||
|
||||
|
||||
@@ -1403,6 +1398,9 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
|
||||
clip_target.clip = comfy.text_encoders.hidream.hidream_clip(**t5xxl_detect(clip_data),
|
||||
clip_l=False, clip_g=False, t5=True, llama=False, dtype_llama=None)
|
||||
clip_target.tokenizer = comfy.text_encoders.hidream.HiDreamTokenizer
|
||||
elif clip_type == CLIPType.COGVIDEOX:
|
||||
clip_target.clip = comfy.text_encoders.cogvideo.cogvideo_te(**t5xxl_detect(clip_data))
|
||||
clip_target.tokenizer = comfy.text_encoders.cogvideo.CogVideoXTokenizer
|
||||
else: #CLIPType.MOCHI
|
||||
clip_target.clip = comfy.text_encoders.genmo.mochi_te(**t5xxl_detect(clip_data))
|
||||
clip_target.tokenizer = comfy.text_encoders.genmo.MochiT5Tokenizer
|
||||
|
||||
@@ -27,6 +27,7 @@ import comfy.text_encoders.anima
|
||||
import comfy.text_encoders.ace15
|
||||
import comfy.text_encoders.longcat_image
|
||||
import comfy.text_encoders.ernie
|
||||
import comfy.text_encoders.cogvideo
|
||||
|
||||
from . import supported_models_base
|
||||
from . import latent_formats
|
||||
@@ -1781,6 +1782,74 @@ class ErnieImage(supported_models_base.BASE):
|
||||
return supported_models_base.ClipTarget(comfy.text_encoders.ernie.ErnieTokenizer, comfy.text_encoders.ernie.te(**hunyuan_detect))
|
||||
|
||||
|
||||
models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, LongCatImage, FluxSchnell, GenmoMochi, LTXV, LTXAV, HunyuanVideo15_SR_Distilled, HunyuanVideo15, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, ZImagePixelSpace, ZImage, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, WAN22_Animate, WAN21_FlowRVS, WAN21_SCAIL, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, ACEStep15, Omnigen2, QwenImage, Flux2, Kandinsky5Image, Kandinsky5, Anima, RT_DETR_v4, ErnieImage]
|
||||
class CogVideoX_T2V(supported_models_base.BASE):
|
||||
unet_config = {
|
||||
"image_model": "cogvideox",
|
||||
}
|
||||
|
||||
sampling_settings = {
|
||||
"linear_start": 0.00085,
|
||||
"linear_end": 0.012,
|
||||
"beta_schedule": "linear",
|
||||
"zsnr": True,
|
||||
}
|
||||
|
||||
unet_extra_config = {}
|
||||
latent_format = latent_formats.CogVideoX
|
||||
|
||||
supported_inference_dtypes = [torch.bfloat16, torch.float16, torch.float32]
|
||||
|
||||
vae_key_prefix = ["vae."]
|
||||
text_encoder_key_prefix = ["text_encoders."]
|
||||
|
||||
def __init__(self, unet_config):
|
||||
# 2b-class (dim=1920, heads=30) uses scale_factor=1.15258426.
|
||||
# 5b-class (dim=3072, heads=48) — incl. CogVideoX-5b, 1.5-5B, and
|
||||
# Fun-V1.5 inpainting — uses scale_factor=0.7 per vae/config.json.
|
||||
if unet_config.get("num_attention_heads", 0) >= 48:
|
||||
self.latent_format = latent_formats.CogVideoX1_5
|
||||
super().__init__(unet_config)
|
||||
|
||||
def get_model(self, state_dict, prefix="", device=None):
|
||||
# CogVideoX 1.5 (patch_size_t=2) has different training base dimensions for RoPE
|
||||
if self.unet_config.get("patch_size_t") is not None:
|
||||
self.unet_config.setdefault("sample_height", 96)
|
||||
self.unet_config.setdefault("sample_width", 170)
|
||||
self.unet_config.setdefault("sample_frames", 81)
|
||||
out = model_base.CogVideoX(self, device=device)
|
||||
return out
|
||||
|
||||
def clip_target(self, state_dict={}):
|
||||
return supported_models_base.ClipTarget(comfy.text_encoders.cogvideo.CogVideoXT5Tokenizer, comfy.text_encoders.sd3_clip.T5XXLModel)
|
||||
|
||||
class CogVideoX_I2V(CogVideoX_T2V):
|
||||
unet_config = {
|
||||
"image_model": "cogvideox",
|
||||
"in_channels": 32,
|
||||
}
|
||||
|
||||
def get_model(self, state_dict, prefix="", device=None):
|
||||
if self.unet_config.get("patch_size_t") is not None:
|
||||
self.unet_config.setdefault("sample_height", 96)
|
||||
self.unet_config.setdefault("sample_width", 170)
|
||||
self.unet_config.setdefault("sample_frames", 81)
|
||||
out = model_base.CogVideoX(self, image_to_video=True, device=device)
|
||||
return out
|
||||
|
||||
class CogVideoX_Inpaint(CogVideoX_T2V):
|
||||
unet_config = {
|
||||
"image_model": "cogvideox",
|
||||
"in_channels": 48,
|
||||
}
|
||||
|
||||
def get_model(self, state_dict, prefix="", device=None):
|
||||
if self.unet_config.get("patch_size_t") is not None:
|
||||
self.unet_config.setdefault("sample_height", 96)
|
||||
self.unet_config.setdefault("sample_width", 170)
|
||||
self.unet_config.setdefault("sample_frames", 81)
|
||||
out = model_base.CogVideoX(self, image_to_video=True, device=device)
|
||||
return out
|
||||
|
||||
models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, LongCatImage, FluxSchnell, GenmoMochi, LTXV, LTXAV, HunyuanVideo15_SR_Distilled, HunyuanVideo15, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, ZImagePixelSpace, ZImage, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, WAN22_Animate, WAN21_FlowRVS, WAN21_SCAIL, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, ACEStep15, Omnigen2, QwenImage, Flux2, Kandinsky5Image, Kandinsky5, Anima, RT_DETR_v4, ErnieImage, CogVideoX_Inpaint, CogVideoX_I2V, CogVideoX_T2V]
|
||||
|
||||
models += [SVD_img2vid]
|
||||
|
||||
48
comfy/text_encoders/cogvideo.py
Normal file
48
comfy/text_encoders/cogvideo.py
Normal file
@@ -0,0 +1,48 @@
|
||||
import comfy.text_encoders.sd3_clip
|
||||
from comfy import sd1_clip
|
||||
|
||||
|
||||
class CogVideoXT5Tokenizer(comfy.text_encoders.sd3_clip.T5XXLTokenizer):
|
||||
"""Inner T5 tokenizer for CogVideoX.
|
||||
|
||||
CogVideoX was trained with T5 embeddings padded to 226 tokens (not 77 like SD3).
|
||||
Used both directly by supported_models.CogVideoX_T2V.clip_target (paired with
|
||||
the raw T5XXLModel) and by the CogVideoXTokenizer outer wrapper below.
|
||||
"""
|
||||
def __init__(self, embedding_directory=None, tokenizer_data={}):
|
||||
super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, min_length=226)
|
||||
|
||||
|
||||
class CogVideoXTokenizer(sd1_clip.SD1Tokenizer):
|
||||
"""Outer tokenizer wrapper for CLIPLoader (type="cogvideox")."""
|
||||
def __init__(self, embedding_directory=None, tokenizer_data={}):
|
||||
super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data,
|
||||
clip_name="t5xxl", tokenizer=CogVideoXT5Tokenizer)
|
||||
|
||||
|
||||
class CogVideoXT5XXL(sd1_clip.SD1ClipModel):
|
||||
"""Outer T5XXL model wrapper for CLIPLoader (type="cogvideox").
|
||||
|
||||
Wraps the raw T5XXL model in the SD1ClipModel interface so that CLIP.__init__
|
||||
(which reads self.dtypes) works correctly. The inner model is the standard
|
||||
sd3_clip.T5XXLModel (no attention_mask change needed for CogVideoX).
|
||||
"""
|
||||
def __init__(self, device="cpu", dtype=None, model_options={}):
|
||||
super().__init__(device=device, dtype=dtype, name="t5xxl",
|
||||
clip_model=comfy.text_encoders.sd3_clip.T5XXLModel,
|
||||
model_options=model_options)
|
||||
|
||||
|
||||
def cogvideo_te(dtype_t5=None, t5_quantization_metadata=None):
|
||||
"""Factory that returns a CogVideoXT5XXL class configured with the detected
|
||||
T5 dtype and optional quantization metadata, for use in load_text_encoder_state_dicts.
|
||||
"""
|
||||
class CogVideoXTEModel_(CogVideoXT5XXL):
|
||||
def __init__(self, device="cpu", dtype=None, model_options={}):
|
||||
if t5_quantization_metadata is not None:
|
||||
model_options = model_options.copy()
|
||||
model_options["t5xxl_quantization_metadata"] = t5_quantization_metadata
|
||||
if dtype_t5 is not None:
|
||||
dtype = dtype_t5
|
||||
super().__init__(device=device, dtype=dtype, model_options=model_options)
|
||||
return CogVideoXTEModel_
|
||||
@@ -35,4 +35,4 @@ def te(dtype_llama=None, llama_quantization_metadata=None):
|
||||
model_options = model_options.copy()
|
||||
model_options["quantization_metadata"] = llama_quantization_metadata
|
||||
super().__init__(device=device, dtype=dtype, model_options=model_options)
|
||||
return ErnieTEModel_
|
||||
return ErnieTEModel
|
||||
|
||||
@@ -158,17 +158,10 @@ RECOMMENDED_PRESETS_SEEDREAM_4 = [
|
||||
("Custom", None, None),
|
||||
]
|
||||
|
||||
# Seedance 2.0 reference video pixel count limits per model and output resolution.
|
||||
# Seedance 2.0 reference video pixel count limits per model.
|
||||
SEEDANCE2_REF_VIDEO_PIXEL_LIMITS = {
|
||||
"dreamina-seedance-2-0-260128": {
|
||||
"480p": {"min": 409_600, "max": 927_408},
|
||||
"720p": {"min": 409_600, "max": 927_408},
|
||||
"1080p": {"min": 409_600, "max": 2_073_600},
|
||||
},
|
||||
"dreamina-seedance-2-0-fast-260128": {
|
||||
"480p": {"min": 409_600, "max": 927_408},
|
||||
"720p": {"min": 409_600, "max": 927_408},
|
||||
},
|
||||
"dreamina-seedance-2-0-260128": {"min": 409_600, "max": 927_408},
|
||||
"dreamina-seedance-2-0-fast-260128": {"min": 409_600, "max": 927_408},
|
||||
}
|
||||
|
||||
# The time in this dictionary are given for 10 seconds duration.
|
||||
|
||||
@@ -35,7 +35,6 @@ from comfy_api_nodes.util import (
|
||||
get_number_of_images,
|
||||
image_tensor_pair_to_batch,
|
||||
poll_op,
|
||||
resize_video_to_pixel_budget,
|
||||
sync_op,
|
||||
upload_audio_to_comfyapi,
|
||||
upload_image_to_comfyapi,
|
||||
@@ -70,12 +69,9 @@ DEPRECATED_MODELS = {"seedance-1-0-lite-t2v-250428", "seedance-1-0-lite-i2v-2504
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _validate_ref_video_pixels(video: Input.Video, model_id: str, resolution: str, index: int) -> None:
|
||||
"""Validate reference video pixel count against Seedance 2.0 model limits for the selected resolution."""
|
||||
model_limits = SEEDANCE2_REF_VIDEO_PIXEL_LIMITS.get(model_id)
|
||||
if not model_limits:
|
||||
return
|
||||
limits = model_limits.get(resolution)
|
||||
def _validate_ref_video_pixels(video: Input.Video, model_id: str, index: int) -> None:
|
||||
"""Validate reference video pixel count against Seedance 2.0 model limits."""
|
||||
limits = SEEDANCE2_REF_VIDEO_PIXEL_LIMITS.get(model_id)
|
||||
if not limits:
|
||||
return
|
||||
try:
|
||||
@@ -1070,7 +1066,7 @@ PRICE_BADGE_VIDEO = IO.PriceBadge(
|
||||
)
|
||||
|
||||
|
||||
def _seedance2_text_inputs(resolutions: list[str]):
|
||||
def _seedance2_text_inputs():
|
||||
return [
|
||||
IO.String.Input(
|
||||
"prompt",
|
||||
@@ -1080,7 +1076,7 @@ def _seedance2_text_inputs(resolutions: list[str]):
|
||||
),
|
||||
IO.Combo.Input(
|
||||
"resolution",
|
||||
options=resolutions,
|
||||
options=["480p", "720p"],
|
||||
tooltip="Resolution of the output video.",
|
||||
),
|
||||
IO.Combo.Input(
|
||||
@@ -1118,8 +1114,8 @@ class ByteDance2TextToVideoNode(IO.ComfyNode):
|
||||
IO.DynamicCombo.Input(
|
||||
"model",
|
||||
options=[
|
||||
IO.DynamicCombo.Option("Seedance 2.0", _seedance2_text_inputs(["480p", "720p", "1080p"])),
|
||||
IO.DynamicCombo.Option("Seedance 2.0 Fast", _seedance2_text_inputs(["480p", "720p"])),
|
||||
IO.DynamicCombo.Option("Seedance 2.0", _seedance2_text_inputs()),
|
||||
IO.DynamicCombo.Option("Seedance 2.0 Fast", _seedance2_text_inputs()),
|
||||
],
|
||||
tooltip="Seedance 2.0 for maximum quality; Seedance 2.0 Fast for speed optimization.",
|
||||
),
|
||||
@@ -1156,14 +1152,11 @@ class ByteDance2TextToVideoNode(IO.ComfyNode):
|
||||
(
|
||||
$rate480 := 10044;
|
||||
$rate720 := 21600;
|
||||
$rate1080 := 48800;
|
||||
$m := widgets.model;
|
||||
$pricePer1K := $contains($m, "fast") ? 0.008008 : 0.01001;
|
||||
$res := $lookup(widgets, "model.resolution");
|
||||
$dur := $lookup(widgets, "model.duration");
|
||||
$rate := $res = "1080p" ? $rate1080 :
|
||||
$res = "720p" ? $rate720 :
|
||||
$rate480;
|
||||
$rate := $res = "720p" ? $rate720 : $rate480;
|
||||
$cost := $dur * $rate * $pricePer1K / 1000;
|
||||
{"type": "usd", "usd": $cost, "format": {"approximate": true}}
|
||||
)
|
||||
@@ -1202,7 +1195,6 @@ class ByteDance2TextToVideoNode(IO.ComfyNode):
|
||||
status_extractor=lambda r: r.status,
|
||||
price_extractor=_seedance2_price_extractor(model_id, has_video_input=False),
|
||||
poll_interval=9,
|
||||
max_poll_attempts=180,
|
||||
)
|
||||
return IO.NodeOutput(await download_url_to_video_output(response.content.video_url))
|
||||
|
||||
@@ -1220,8 +1212,8 @@ class ByteDance2FirstLastFrameNode(IO.ComfyNode):
|
||||
IO.DynamicCombo.Input(
|
||||
"model",
|
||||
options=[
|
||||
IO.DynamicCombo.Option("Seedance 2.0", _seedance2_text_inputs(["480p", "720p", "1080p"])),
|
||||
IO.DynamicCombo.Option("Seedance 2.0 Fast", _seedance2_text_inputs(["480p", "720p"])),
|
||||
IO.DynamicCombo.Option("Seedance 2.0", _seedance2_text_inputs()),
|
||||
IO.DynamicCombo.Option("Seedance 2.0 Fast", _seedance2_text_inputs()),
|
||||
],
|
||||
tooltip="Seedance 2.0 for maximum quality; Seedance 2.0 Fast for speed optimization.",
|
||||
),
|
||||
@@ -1267,14 +1259,11 @@ class ByteDance2FirstLastFrameNode(IO.ComfyNode):
|
||||
(
|
||||
$rate480 := 10044;
|
||||
$rate720 := 21600;
|
||||
$rate1080 := 48800;
|
||||
$m := widgets.model;
|
||||
$pricePer1K := $contains($m, "fast") ? 0.008008 : 0.01001;
|
||||
$res := $lookup(widgets, "model.resolution");
|
||||
$dur := $lookup(widgets, "model.duration");
|
||||
$rate := $res = "1080p" ? $rate1080 :
|
||||
$res = "720p" ? $rate720 :
|
||||
$rate480;
|
||||
$rate := $res = "720p" ? $rate720 : $rate480;
|
||||
$cost := $dur * $rate * $pricePer1K / 1000;
|
||||
{"type": "usd", "usd": $cost, "format": {"approximate": true}}
|
||||
)
|
||||
@@ -1335,14 +1324,13 @@ class ByteDance2FirstLastFrameNode(IO.ComfyNode):
|
||||
status_extractor=lambda r: r.status,
|
||||
price_extractor=_seedance2_price_extractor(model_id, has_video_input=False),
|
||||
poll_interval=9,
|
||||
max_poll_attempts=180,
|
||||
)
|
||||
return IO.NodeOutput(await download_url_to_video_output(response.content.video_url))
|
||||
|
||||
|
||||
def _seedance2_reference_inputs(resolutions: list[str]):
|
||||
def _seedance2_reference_inputs():
|
||||
return [
|
||||
*_seedance2_text_inputs(resolutions),
|
||||
*_seedance2_text_inputs(),
|
||||
IO.Autogrow.Input(
|
||||
"reference_images",
|
||||
template=IO.Autogrow.TemplateNames(
|
||||
@@ -1377,14 +1365,6 @@ def _seedance2_reference_inputs(resolutions: list[str]):
|
||||
min=0,
|
||||
),
|
||||
),
|
||||
IO.Boolean.Input(
|
||||
"auto_downscale",
|
||||
default=False,
|
||||
advanced=True,
|
||||
optional=True,
|
||||
tooltip="Automatically downscale reference videos that exceed the model's pixel budget "
|
||||
"for the selected resolution. Aspect ratio is preserved; videos already within limits are untouched.",
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
@@ -1402,8 +1382,8 @@ class ByteDance2ReferenceNode(IO.ComfyNode):
|
||||
IO.DynamicCombo.Input(
|
||||
"model",
|
||||
options=[
|
||||
IO.DynamicCombo.Option("Seedance 2.0", _seedance2_reference_inputs(["480p", "720p", "1080p"])),
|
||||
IO.DynamicCombo.Option("Seedance 2.0 Fast", _seedance2_reference_inputs(["480p", "720p"])),
|
||||
IO.DynamicCombo.Option("Seedance 2.0", _seedance2_reference_inputs()),
|
||||
IO.DynamicCombo.Option("Seedance 2.0 Fast", _seedance2_reference_inputs()),
|
||||
],
|
||||
tooltip="Seedance 2.0 for maximum quality; Seedance 2.0 Fast for speed optimization.",
|
||||
),
|
||||
@@ -1443,16 +1423,13 @@ class ByteDance2ReferenceNode(IO.ComfyNode):
|
||||
(
|
||||
$rate480 := 10044;
|
||||
$rate720 := 21600;
|
||||
$rate1080 := 48800;
|
||||
$m := widgets.model;
|
||||
$hasVideo := $lookup(inputGroups, "model.reference_videos") > 0;
|
||||
$noVideoPricePer1K := $contains($m, "fast") ? 0.008008 : 0.01001;
|
||||
$videoPricePer1K := $contains($m, "fast") ? 0.004719 : 0.006149;
|
||||
$res := $lookup(widgets, "model.resolution");
|
||||
$dur := $lookup(widgets, "model.duration");
|
||||
$rate := $res = "1080p" ? $rate1080 :
|
||||
$res = "720p" ? $rate720 :
|
||||
$rate480;
|
||||
$rate := $res = "720p" ? $rate720 : $rate480;
|
||||
$noVideoCost := $dur * $rate * $noVideoPricePer1K / 1000;
|
||||
$minVideoFactor := $ceil($dur * 5 / 3);
|
||||
$minVideoCost := $minVideoFactor * $rate * $videoPricePer1K / 1000;
|
||||
@@ -1492,23 +1469,10 @@ class ByteDance2ReferenceNode(IO.ComfyNode):
|
||||
|
||||
model_id = SEEDANCE_MODELS[model["model"]]
|
||||
has_video_input = len(reference_videos) > 0
|
||||
|
||||
if model.get("auto_downscale") and reference_videos:
|
||||
max_px = (
|
||||
SEEDANCE2_REF_VIDEO_PIXEL_LIMITS.get(model_id, {})
|
||||
.get(model["resolution"], {})
|
||||
.get("max")
|
||||
)
|
||||
if max_px:
|
||||
for key in reference_videos:
|
||||
reference_videos[key] = resize_video_to_pixel_budget(
|
||||
reference_videos[key], max_px
|
||||
)
|
||||
|
||||
total_video_duration = 0.0
|
||||
for i, key in enumerate(reference_videos, 1):
|
||||
video = reference_videos[key]
|
||||
_validate_ref_video_pixels(video, model_id, model["resolution"], i)
|
||||
_validate_ref_video_pixels(video, model_id, i)
|
||||
try:
|
||||
dur = video.get_duration()
|
||||
if dur < 1.8:
|
||||
@@ -1595,7 +1559,6 @@ class ByteDance2ReferenceNode(IO.ComfyNode):
|
||||
status_extractor=lambda r: r.status,
|
||||
price_extractor=_seedance2_price_extractor(model_id, has_video_input=has_video_input),
|
||||
poll_interval=9,
|
||||
max_poll_attempts=180,
|
||||
)
|
||||
return IO.NodeOutput(await download_url_to_video_output(response.content.video_url))
|
||||
|
||||
|
||||
@@ -221,17 +221,14 @@ class TencentTextToModelNode(IO.ComfyNode):
|
||||
response_model=To3DProTaskResultResponse,
|
||||
status_extractor=lambda r: r.Status,
|
||||
)
|
||||
obj_file_response = get_file_from_response(result.ResultFile3Ds, "obj", raise_if_not_found=False)
|
||||
obj_result = None
|
||||
if obj_file_response:
|
||||
obj_result = await download_and_extract_obj_zip(obj_file_response.Url)
|
||||
obj_result = await download_and_extract_obj_zip(get_file_from_response(result.ResultFile3Ds, "obj").Url)
|
||||
return IO.NodeOutput(
|
||||
f"{task_id}.glb",
|
||||
await download_url_to_file_3d(
|
||||
get_file_from_response(result.ResultFile3Ds, "glb").Url, "glb", task_id=task_id
|
||||
),
|
||||
obj_result.obj if obj_result else None,
|
||||
obj_result.texture if obj_result else None,
|
||||
obj_result.obj,
|
||||
obj_result.texture,
|
||||
)
|
||||
|
||||
|
||||
@@ -381,9 +378,7 @@ class TencentImageToModelNode(IO.ComfyNode):
|
||||
response_model=To3DProTaskResultResponse,
|
||||
status_extractor=lambda r: r.Status,
|
||||
)
|
||||
obj_file_response = get_file_from_response(result.ResultFile3Ds, "obj", raise_if_not_found=False)
|
||||
if obj_file_response:
|
||||
obj_result = await download_and_extract_obj_zip(obj_file_response.Url)
|
||||
obj_result = await download_and_extract_obj_zip(get_file_from_response(result.ResultFile3Ds, "obj").Url)
|
||||
return IO.NodeOutput(
|
||||
f"{task_id}.glb",
|
||||
await download_url_to_file_3d(
|
||||
@@ -395,17 +390,6 @@ class TencentImageToModelNode(IO.ComfyNode):
|
||||
obj_result.normal if obj_result.normal is not None else torch.zeros(1, 1, 1, 3),
|
||||
obj_result.roughness if obj_result.roughness is not None else torch.zeros(1, 1, 1, 3),
|
||||
)
|
||||
return IO.NodeOutput(
|
||||
f"{task_id}.glb",
|
||||
await download_url_to_file_3d(
|
||||
get_file_from_response(result.ResultFile3Ds, "glb").Url, "glb", task_id=task_id
|
||||
),
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
)
|
||||
|
||||
|
||||
class TencentModelTo3DUVNode(IO.ComfyNode):
|
||||
|
||||
@@ -17,12 +17,46 @@ from comfy_api_nodes.util import (
|
||||
)
|
||||
from comfy_extras.nodes_images import SVG
|
||||
|
||||
_ARROW_MODELS = ["arrow-1.1", "arrow-1.1-max", "arrow-preview"]
|
||||
|
||||
|
||||
def _arrow_sampling_inputs():
|
||||
"""Shared sampling inputs for all Arrow model variants."""
|
||||
return [
|
||||
class QuiverTextToSVGNode(IO.ComfyNode):
|
||||
@classmethod
|
||||
def define_schema(cls):
|
||||
return IO.Schema(
|
||||
node_id="QuiverTextToSVGNode",
|
||||
display_name="Quiver Text to SVG",
|
||||
category="api node/image/Quiver",
|
||||
description="Generate an SVG from a text prompt using Quiver AI.",
|
||||
inputs=[
|
||||
IO.String.Input(
|
||||
"prompt",
|
||||
multiline=True,
|
||||
default="",
|
||||
tooltip="Text description of the desired SVG output.",
|
||||
),
|
||||
IO.String.Input(
|
||||
"instructions",
|
||||
multiline=True,
|
||||
default="",
|
||||
tooltip="Additional style or formatting guidance.",
|
||||
optional=True,
|
||||
),
|
||||
IO.Autogrow.Input(
|
||||
"reference_images",
|
||||
template=IO.Autogrow.TemplatePrefix(
|
||||
IO.Image.Input("image"),
|
||||
prefix="ref_",
|
||||
min=0,
|
||||
max=4,
|
||||
),
|
||||
tooltip="Up to 4 reference images to guide the generation.",
|
||||
optional=True,
|
||||
),
|
||||
IO.DynamicCombo.Input(
|
||||
"model",
|
||||
options=[
|
||||
IO.DynamicCombo.Option(
|
||||
"arrow-preview",
|
||||
[
|
||||
IO.Float.Input(
|
||||
"temperature",
|
||||
default=1.0,
|
||||
@@ -53,46 +87,9 @@ def _arrow_sampling_inputs():
|
||||
tooltip="Token presence penalty.",
|
||||
advanced=True,
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
class QuiverTextToSVGNode(IO.ComfyNode):
|
||||
@classmethod
|
||||
def define_schema(cls):
|
||||
return IO.Schema(
|
||||
node_id="QuiverTextToSVGNode",
|
||||
display_name="Quiver Text to SVG",
|
||||
category="api node/image/Quiver",
|
||||
description="Generate an SVG from a text prompt using Quiver AI.",
|
||||
inputs=[
|
||||
IO.String.Input(
|
||||
"prompt",
|
||||
multiline=True,
|
||||
default="",
|
||||
tooltip="Text description of the desired SVG output.",
|
||||
],
|
||||
),
|
||||
IO.String.Input(
|
||||
"instructions",
|
||||
multiline=True,
|
||||
default="",
|
||||
tooltip="Additional style or formatting guidance.",
|
||||
optional=True,
|
||||
advanced=True,
|
||||
),
|
||||
IO.Autogrow.Input(
|
||||
"reference_images",
|
||||
template=IO.Autogrow.TemplatePrefix(
|
||||
IO.Image.Input("image"),
|
||||
prefix="ref_",
|
||||
min=0,
|
||||
max=4,
|
||||
),
|
||||
tooltip="Up to 4 reference images to guide the generation.",
|
||||
optional=True,
|
||||
),
|
||||
IO.DynamicCombo.Input(
|
||||
"model",
|
||||
options=[IO.DynamicCombo.Option(m, _arrow_sampling_inputs()) for m in _ARROW_MODELS],
|
||||
],
|
||||
tooltip="Model to use for SVG generation.",
|
||||
),
|
||||
IO.Int.Input(
|
||||
@@ -115,16 +112,7 @@ class QuiverTextToSVGNode(IO.ComfyNode):
|
||||
],
|
||||
is_api_node=True,
|
||||
price_badge=IO.PriceBadge(
|
||||
depends_on=IO.PriceBadgeDepends(widgets=["model"]),
|
||||
expr="""
|
||||
(
|
||||
$contains(widgets.model, "max")
|
||||
? {"type":"usd","usd":0.3575}
|
||||
: $contains(widgets.model, "preview")
|
||||
? {"type":"usd","usd":0.429}
|
||||
: {"type":"usd","usd":0.286}
|
||||
)
|
||||
""",
|
||||
expr="""{"type":"usd","usd":0.429}""",
|
||||
),
|
||||
)
|
||||
|
||||
@@ -188,13 +176,12 @@ class QuiverImageToSVGNode(IO.ComfyNode):
|
||||
"auto_crop",
|
||||
default=False,
|
||||
tooltip="Automatically crop to the dominant subject.",
|
||||
advanced=True,
|
||||
),
|
||||
IO.DynamicCombo.Input(
|
||||
"model",
|
||||
options=[
|
||||
IO.DynamicCombo.Option(
|
||||
m,
|
||||
"arrow-preview",
|
||||
[
|
||||
IO.Int.Input(
|
||||
"target_size",
|
||||
@@ -202,12 +189,39 @@ class QuiverImageToSVGNode(IO.ComfyNode):
|
||||
min=128,
|
||||
max=4096,
|
||||
tooltip="Square resize target in pixels.",
|
||||
),
|
||||
IO.Float.Input(
|
||||
"temperature",
|
||||
default=1.0,
|
||||
min=0.0,
|
||||
max=2.0,
|
||||
step=0.1,
|
||||
display_mode=IO.NumberDisplay.slider,
|
||||
tooltip="Randomness control. Higher values increase randomness.",
|
||||
advanced=True,
|
||||
),
|
||||
IO.Float.Input(
|
||||
"top_p",
|
||||
default=1.0,
|
||||
min=0.05,
|
||||
max=1.0,
|
||||
step=0.05,
|
||||
display_mode=IO.NumberDisplay.slider,
|
||||
tooltip="Nucleus sampling parameter.",
|
||||
advanced=True,
|
||||
),
|
||||
IO.Float.Input(
|
||||
"presence_penalty",
|
||||
default=0.0,
|
||||
min=-2.0,
|
||||
max=2.0,
|
||||
step=0.1,
|
||||
display_mode=IO.NumberDisplay.slider,
|
||||
tooltip="Token presence penalty.",
|
||||
advanced=True,
|
||||
),
|
||||
*_arrow_sampling_inputs(),
|
||||
],
|
||||
)
|
||||
for m in _ARROW_MODELS
|
||||
),
|
||||
],
|
||||
tooltip="Model to use for SVG vectorization.",
|
||||
),
|
||||
@@ -231,16 +245,7 @@ class QuiverImageToSVGNode(IO.ComfyNode):
|
||||
],
|
||||
is_api_node=True,
|
||||
price_badge=IO.PriceBadge(
|
||||
depends_on=IO.PriceBadgeDepends(widgets=["model"]),
|
||||
expr="""
|
||||
(
|
||||
$contains(widgets.model, "max")
|
||||
? {"type":"usd","usd":0.3575}
|
||||
: $contains(widgets.model, "preview")
|
||||
? {"type":"usd","usd":0.429}
|
||||
: {"type":"usd","usd":0.286}
|
||||
)
|
||||
""",
|
||||
expr="""{"type":"usd","usd":0.429}""",
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
@@ -401,7 +401,7 @@ class StabilityUpscaleConservativeNode(IO.ComfyNode):
|
||||
],
|
||||
is_api_node=True,
|
||||
price_badge=IO.PriceBadge(
|
||||
expr="""{"type":"usd","usd":0.4}""",
|
||||
expr="""{"type":"usd","usd":0.25}""",
|
||||
),
|
||||
)
|
||||
|
||||
@@ -510,7 +510,7 @@ class StabilityUpscaleCreativeNode(IO.ComfyNode):
|
||||
],
|
||||
is_api_node=True,
|
||||
price_badge=IO.PriceBadge(
|
||||
expr="""{"type":"usd","usd":0.6}""",
|
||||
expr="""{"type":"usd","usd":0.25}""",
|
||||
),
|
||||
)
|
||||
|
||||
@@ -593,7 +593,7 @@ class StabilityUpscaleFastNode(IO.ComfyNode):
|
||||
],
|
||||
is_api_node=True,
|
||||
price_badge=IO.PriceBadge(
|
||||
expr="""{"type":"usd","usd":0.02}""",
|
||||
expr="""{"type":"usd","usd":0.01}""",
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
@@ -24,9 +24,8 @@ from comfy_api_nodes.util import (
|
||||
AVERAGE_DURATION_VIDEO_GEN = 32
|
||||
MODELS_MAP = {
|
||||
"veo-2.0-generate-001": "veo-2.0-generate-001",
|
||||
"veo-3.1-generate": "veo-3.1-generate-001",
|
||||
"veo-3.1-fast-generate": "veo-3.1-fast-generate-001",
|
||||
"veo-3.1-lite": "veo-3.1-lite-generate-001",
|
||||
"veo-3.1-generate": "veo-3.1-generate-preview",
|
||||
"veo-3.1-fast-generate": "veo-3.1-fast-generate-preview",
|
||||
"veo-3.0-generate-001": "veo-3.0-generate-001",
|
||||
"veo-3.0-fast-generate-001": "veo-3.0-fast-generate-001",
|
||||
}
|
||||
@@ -248,8 +247,17 @@ class VeoVideoGenerationNode(IO.ComfyNode):
|
||||
raise Exception("Video generation completed but no video was returned")
|
||||
|
||||
|
||||
class Veo3VideoGenerationNode(IO.ComfyNode):
|
||||
"""Generates videos from text prompts using Google's Veo 3 API."""
|
||||
class Veo3VideoGenerationNode(VeoVideoGenerationNode):
|
||||
"""
|
||||
Generates videos from text prompts using Google's Veo 3 API.
|
||||
|
||||
Supported models:
|
||||
- veo-3.0-generate-001
|
||||
- veo-3.0-fast-generate-001
|
||||
|
||||
This node extends the base Veo node with Veo 3 specific features including
|
||||
audio generation and fixed 8-second duration.
|
||||
"""
|
||||
|
||||
@classmethod
|
||||
def define_schema(cls):
|
||||
@@ -271,13 +279,6 @@ class Veo3VideoGenerationNode(IO.ComfyNode):
|
||||
default="16:9",
|
||||
tooltip="Aspect ratio of the output video",
|
||||
),
|
||||
IO.Combo.Input(
|
||||
"resolution",
|
||||
options=["720p", "1080p", "4k"],
|
||||
default="720p",
|
||||
tooltip="Output video resolution. 4K is not available for veo-3.1-lite and veo-3.0 models.",
|
||||
optional=True,
|
||||
),
|
||||
IO.String.Input(
|
||||
"negative_prompt",
|
||||
multiline=True,
|
||||
@@ -288,11 +289,11 @@ class Veo3VideoGenerationNode(IO.ComfyNode):
|
||||
IO.Int.Input(
|
||||
"duration_seconds",
|
||||
default=8,
|
||||
min=4,
|
||||
min=8,
|
||||
max=8,
|
||||
step=2,
|
||||
step=1,
|
||||
display_mode=IO.NumberDisplay.number,
|
||||
tooltip="Duration of the output video in seconds",
|
||||
tooltip="Duration of the output video in seconds (Veo 3 only supports 8 seconds)",
|
||||
optional=True,
|
||||
),
|
||||
IO.Boolean.Input(
|
||||
@@ -331,10 +332,10 @@ class Veo3VideoGenerationNode(IO.ComfyNode):
|
||||
options=[
|
||||
"veo-3.1-generate",
|
||||
"veo-3.1-fast-generate",
|
||||
"veo-3.1-lite",
|
||||
"veo-3.0-generate-001",
|
||||
"veo-3.0-fast-generate-001",
|
||||
],
|
||||
default="veo-3.0-generate-001",
|
||||
tooltip="Veo 3 model to use for video generation",
|
||||
optional=True,
|
||||
),
|
||||
@@ -355,111 +356,21 @@ class Veo3VideoGenerationNode(IO.ComfyNode):
|
||||
],
|
||||
is_api_node=True,
|
||||
price_badge=IO.PriceBadge(
|
||||
depends_on=IO.PriceBadgeDepends(widgets=["model", "generate_audio", "resolution", "duration_seconds"]),
|
||||
depends_on=IO.PriceBadgeDepends(widgets=["model", "generate_audio"]),
|
||||
expr="""
|
||||
(
|
||||
$m := widgets.model;
|
||||
$r := widgets.resolution;
|
||||
$a := widgets.generate_audio;
|
||||
$seconds := widgets.duration_seconds;
|
||||
$pps :=
|
||||
$contains($m, "lite")
|
||||
? ($r = "1080p" ? ($a ? 0.08 : 0.05) : ($a ? 0.05 : 0.03))
|
||||
: $contains($m, "3.1-fast")
|
||||
? ($r = "4k" ? ($a ? 0.30 : 0.25) : $r = "1080p" ? ($a ? 0.12 : 0.10) : ($a ? 0.10 : 0.08))
|
||||
: $contains($m, "3.1-generate")
|
||||
? ($r = "4k" ? ($a ? 0.60 : 0.40) : ($a ? 0.40 : 0.20))
|
||||
: $contains($m, "3.0-fast")
|
||||
? ($a ? 0.15 : 0.10)
|
||||
: ($a ? 0.40 : 0.20);
|
||||
{"type":"usd","usd": $pps * $seconds}
|
||||
($contains($m,"veo-3.0-fast-generate-001") or $contains($m,"veo-3.1-fast-generate"))
|
||||
? {"type":"usd","usd": ($a ? 1.2 : 0.8)}
|
||||
: ($contains($m,"veo-3.0-generate-001") or $contains($m,"veo-3.1-generate"))
|
||||
? {"type":"usd","usd": ($a ? 3.2 : 1.6)}
|
||||
: {"type":"range_usd","min_usd":0.8,"max_usd":3.2}
|
||||
)
|
||||
""",
|
||||
),
|
||||
)
|
||||
|
||||
@classmethod
|
||||
async def execute(
|
||||
cls,
|
||||
prompt,
|
||||
aspect_ratio="16:9",
|
||||
resolution="720p",
|
||||
negative_prompt="",
|
||||
duration_seconds=8,
|
||||
enhance_prompt=True,
|
||||
person_generation="ALLOW",
|
||||
seed=0,
|
||||
image=None,
|
||||
model="veo-3.0-generate-001",
|
||||
generate_audio=False,
|
||||
):
|
||||
if "lite" in model and resolution == "4k":
|
||||
raise Exception("4K resolution is not supported by the veo-3.1-lite model.")
|
||||
|
||||
model = MODELS_MAP[model]
|
||||
|
||||
instances = [{"prompt": prompt}]
|
||||
if image is not None:
|
||||
image_base64 = tensor_to_base64_string(image)
|
||||
if image_base64:
|
||||
instances[0]["image"] = {"bytesBase64Encoded": image_base64, "mimeType": "image/png"}
|
||||
|
||||
parameters = {
|
||||
"aspectRatio": aspect_ratio,
|
||||
"personGeneration": person_generation,
|
||||
"durationSeconds": duration_seconds,
|
||||
"enhancePrompt": True,
|
||||
"generateAudio": generate_audio,
|
||||
}
|
||||
if negative_prompt:
|
||||
parameters["negativePrompt"] = negative_prompt
|
||||
if seed > 0:
|
||||
parameters["seed"] = seed
|
||||
if "veo-3.1" in model:
|
||||
parameters["resolution"] = resolution
|
||||
|
||||
initial_response = await sync_op(
|
||||
cls,
|
||||
ApiEndpoint(path=f"/proxy/veo/{model}/generate", method="POST"),
|
||||
response_model=VeoGenVidResponse,
|
||||
data=VeoGenVidRequest(
|
||||
instances=instances,
|
||||
parameters=parameters,
|
||||
),
|
||||
)
|
||||
|
||||
poll_response = await poll_op(
|
||||
cls,
|
||||
ApiEndpoint(path=f"/proxy/veo/{model}/poll", method="POST"),
|
||||
response_model=VeoGenVidPollResponse,
|
||||
status_extractor=lambda r: "completed" if r.done else "pending",
|
||||
data=VeoGenVidPollRequest(operationName=initial_response.name),
|
||||
poll_interval=9.0,
|
||||
estimated_duration=AVERAGE_DURATION_VIDEO_GEN,
|
||||
)
|
||||
|
||||
if poll_response.error:
|
||||
raise Exception(f"Veo API error: {poll_response.error.message} (code: {poll_response.error.code})")
|
||||
|
||||
response = poll_response.response
|
||||
filtered_count = response.raiMediaFilteredCount
|
||||
if filtered_count:
|
||||
reasons = response.raiMediaFilteredReasons or []
|
||||
reason_part = f": {reasons[0]}" if reasons else ""
|
||||
raise Exception(
|
||||
f"Content blocked by Google's Responsible AI filters{reason_part} "
|
||||
f"({filtered_count} video{'s' if filtered_count != 1 else ''} filtered)."
|
||||
)
|
||||
|
||||
if response.videos:
|
||||
video = response.videos[0]
|
||||
if video.bytesBase64Encoded:
|
||||
return IO.NodeOutput(InputImpl.VideoFromFile(BytesIO(base64.b64decode(video.bytesBase64Encoded))))
|
||||
if video.gcsUri:
|
||||
return IO.NodeOutput(await download_url_to_video_output(video.gcsUri))
|
||||
raise Exception("Video returned but no data or URL was provided")
|
||||
raise Exception("Video generation completed but no video was returned")
|
||||
|
||||
|
||||
class Veo3FirstLastFrameNode(IO.ComfyNode):
|
||||
|
||||
@@ -483,7 +394,7 @@ class Veo3FirstLastFrameNode(IO.ComfyNode):
|
||||
default="",
|
||||
tooltip="Negative text prompt to guide what to avoid in the video",
|
||||
),
|
||||
IO.Combo.Input("resolution", options=["720p", "1080p", "4k"]),
|
||||
IO.Combo.Input("resolution", options=["720p", "1080p"]),
|
||||
IO.Combo.Input(
|
||||
"aspect_ratio",
|
||||
options=["16:9", "9:16"],
|
||||
@@ -513,7 +424,8 @@ class Veo3FirstLastFrameNode(IO.ComfyNode):
|
||||
IO.Image.Input("last_frame", tooltip="End frame"),
|
||||
IO.Combo.Input(
|
||||
"model",
|
||||
options=["veo-3.1-generate", "veo-3.1-fast-generate", "veo-3.1-lite"],
|
||||
options=["veo-3.1-generate", "veo-3.1-fast-generate"],
|
||||
default="veo-3.1-fast-generate",
|
||||
),
|
||||
IO.Boolean.Input(
|
||||
"generate_audio",
|
||||
@@ -531,20 +443,26 @@ class Veo3FirstLastFrameNode(IO.ComfyNode):
|
||||
],
|
||||
is_api_node=True,
|
||||
price_badge=IO.PriceBadge(
|
||||
depends_on=IO.PriceBadgeDepends(widgets=["model", "generate_audio", "duration", "resolution"]),
|
||||
depends_on=IO.PriceBadgeDepends(widgets=["model", "generate_audio", "duration"]),
|
||||
expr="""
|
||||
(
|
||||
$prices := {
|
||||
"veo-3.1-fast-generate": { "audio": 0.15, "no_audio": 0.10 },
|
||||
"veo-3.1-generate": { "audio": 0.40, "no_audio": 0.20 }
|
||||
};
|
||||
$m := widgets.model;
|
||||
$r := widgets.resolution;
|
||||
$ga := widgets.generate_audio;
|
||||
$ga := (widgets.generate_audio = "true");
|
||||
$seconds := widgets.duration;
|
||||
$pps :=
|
||||
$contains($m, "lite")
|
||||
? ($r = "1080p" ? ($ga ? 0.08 : 0.05) : ($ga ? 0.05 : 0.03))
|
||||
: $contains($m, "fast")
|
||||
? ($r = "4k" ? ($ga ? 0.30 : 0.25) : $r = "1080p" ? ($ga ? 0.12 : 0.10) : ($ga ? 0.10 : 0.08))
|
||||
: ($r = "4k" ? ($ga ? 0.60 : 0.40) : ($ga ? 0.40 : 0.20));
|
||||
{"type":"usd","usd": $pps * $seconds}
|
||||
$modelKey :=
|
||||
$contains($m, "veo-3.1-fast-generate") ? "veo-3.1-fast-generate" :
|
||||
$contains($m, "veo-3.1-generate") ? "veo-3.1-generate" :
|
||||
"";
|
||||
$audioKey := $ga ? "audio" : "no_audio";
|
||||
$modelPrices := $lookup($prices, $modelKey);
|
||||
$pps := $lookup($modelPrices, $audioKey);
|
||||
($pps != null)
|
||||
? {"type":"usd","usd": $pps * $seconds}
|
||||
: {"type":"range_usd","min_usd": 0.4, "max_usd": 3.2}
|
||||
)
|
||||
""",
|
||||
),
|
||||
@@ -564,9 +482,6 @@ class Veo3FirstLastFrameNode(IO.ComfyNode):
|
||||
model: str,
|
||||
generate_audio: bool,
|
||||
):
|
||||
if "lite" in model and resolution == "4k":
|
||||
raise Exception("4K resolution is not supported by the veo-3.1-lite model.")
|
||||
|
||||
model = MODELS_MAP[model]
|
||||
initial_response = await sync_op(
|
||||
cls,
|
||||
@@ -604,7 +519,7 @@ class Veo3FirstLastFrameNode(IO.ComfyNode):
|
||||
data=VeoGenVidPollRequest(
|
||||
operationName=initial_response.name,
|
||||
),
|
||||
poll_interval=9.0,
|
||||
poll_interval=5.0,
|
||||
estimated_duration=AVERAGE_DURATION_VIDEO_GEN,
|
||||
)
|
||||
|
||||
|
||||
@@ -19,7 +19,6 @@ from .conversions import (
|
||||
image_tensor_pair_to_batch,
|
||||
pil_to_bytesio,
|
||||
resize_mask_to_image,
|
||||
resize_video_to_pixel_budget,
|
||||
tensor_to_base64_string,
|
||||
tensor_to_bytesio,
|
||||
tensor_to_pil,
|
||||
@@ -91,7 +90,6 @@ __all__ = [
|
||||
"image_tensor_pair_to_batch",
|
||||
"pil_to_bytesio",
|
||||
"resize_mask_to_image",
|
||||
"resize_video_to_pixel_budget",
|
||||
"tensor_to_base64_string",
|
||||
"tensor_to_bytesio",
|
||||
"tensor_to_pil",
|
||||
|
||||
@@ -19,8 +19,6 @@ from comfy import utils
|
||||
from comfy_api.latest import IO
|
||||
from server import PromptServer
|
||||
|
||||
from comfy.deploy_environment import get_deploy_environment
|
||||
|
||||
from . import request_logger
|
||||
from ._helpers import (
|
||||
default_base_url,
|
||||
@@ -619,7 +617,6 @@ async def _request_base(cfg: _RequestConfig, expect_binary: bool):
|
||||
payload_headers = {"Accept": "*/*"} if expect_binary else {"Accept": "application/json"}
|
||||
if not parsed_url.scheme and not parsed_url.netloc: # is URL relative?
|
||||
payload_headers.update(get_auth_header(cfg.node_cls))
|
||||
payload_headers["X-Comfy-Env"] = get_deploy_environment()
|
||||
if cfg.endpoint.headers:
|
||||
payload_headers.update(cfg.endpoint.headers)
|
||||
|
||||
|
||||
@@ -129,35 +129,19 @@ def pil_to_bytesio(img: Image.Image, mime_type: str = "image/png") -> BytesIO:
|
||||
return img_byte_arr
|
||||
|
||||
|
||||
def _compute_downscale_dims(src_w: int, src_h: int, total_pixels: int) -> tuple[int, int] | None:
|
||||
"""Return downscaled (w, h) with even dims fitting ``total_pixels``, or None if already fits.
|
||||
|
||||
Source aspect ratio is preserved; output may drift by a fraction of a percent because both dimensions
|
||||
are rounded down to even values (many codecs require divisible-by-2).
|
||||
"""
|
||||
pixels = src_w * src_h
|
||||
if pixels <= total_pixels:
|
||||
return None
|
||||
scale = math.sqrt(total_pixels / pixels)
|
||||
new_w = max(2, int(src_w * scale))
|
||||
new_h = max(2, int(src_h * scale))
|
||||
new_w -= new_w % 2
|
||||
new_h -= new_h % 2
|
||||
return new_w, new_h
|
||||
|
||||
|
||||
def downscale_image_tensor(image: torch.Tensor, total_pixels: int = 1536 * 1024) -> torch.Tensor:
|
||||
"""Downscale input image tensor to roughly the specified total pixels.
|
||||
|
||||
Output dimensions are rounded down to even values so that the result is guaranteed to fit within ``total_pixels``
|
||||
and is compatible with codecs that require even dimensions (e.g. yuv420p).
|
||||
"""
|
||||
"""Downscale input image tensor to roughly the specified total pixels."""
|
||||
samples = image.movedim(-1, 1)
|
||||
dims = _compute_downscale_dims(samples.shape[3], samples.shape[2], int(total_pixels))
|
||||
if dims is None:
|
||||
total = int(total_pixels)
|
||||
scale_by = math.sqrt(total / (samples.shape[3] * samples.shape[2]))
|
||||
if scale_by >= 1:
|
||||
return image
|
||||
new_w, new_h = dims
|
||||
return common_upscale(samples, new_w, new_h, "lanczos", "disabled").movedim(1, -1)
|
||||
width = round(samples.shape[3] * scale_by)
|
||||
height = round(samples.shape[2] * scale_by)
|
||||
|
||||
s = common_upscale(samples, width, height, "lanczos", "disabled")
|
||||
s = s.movedim(1, -1)
|
||||
return s
|
||||
|
||||
|
||||
def downscale_image_tensor_by_max_side(image: torch.Tensor, *, max_side: int) -> torch.Tensor:
|
||||
@@ -415,72 +399,6 @@ def trim_video(video: Input.Video, duration_sec: float) -> Input.Video:
|
||||
raise RuntimeError(f"Failed to trim video: {str(e)}") from e
|
||||
|
||||
|
||||
def resize_video_to_pixel_budget(video: Input.Video, total_pixels: int) -> Input.Video:
|
||||
"""Downscale a video to fit within ``total_pixels`` (w * h), preserving aspect ratio.
|
||||
|
||||
Returns the original video object untouched when it already fits. Preserves frame rate, duration, and audio.
|
||||
Aspect ratio is preserved up to a fraction of a percent (even-dim rounding).
|
||||
"""
|
||||
src_w, src_h = video.get_dimensions()
|
||||
scale_dims = _compute_downscale_dims(src_w, src_h, total_pixels)
|
||||
if scale_dims is None:
|
||||
return video
|
||||
return _apply_video_scale(video, scale_dims)
|
||||
|
||||
|
||||
def _apply_video_scale(video: Input.Video, scale_dims: tuple[int, int]) -> Input.Video:
|
||||
"""Re-encode ``video`` scaled to ``scale_dims`` with a single decode/encode pass."""
|
||||
out_w, out_h = scale_dims
|
||||
output_buffer = BytesIO()
|
||||
input_container = None
|
||||
output_container = None
|
||||
|
||||
try:
|
||||
input_source = video.get_stream_source()
|
||||
input_container = av.open(input_source, mode="r")
|
||||
output_container = av.open(output_buffer, mode="w", format="mp4")
|
||||
|
||||
video_stream = output_container.add_stream("h264", rate=video.get_frame_rate())
|
||||
video_stream.width = out_w
|
||||
video_stream.height = out_h
|
||||
video_stream.pix_fmt = "yuv420p"
|
||||
|
||||
audio_stream = None
|
||||
for stream in input_container.streams:
|
||||
if isinstance(stream, av.AudioStream):
|
||||
audio_stream = output_container.add_stream("aac", rate=stream.sample_rate)
|
||||
audio_stream.sample_rate = stream.sample_rate
|
||||
audio_stream.layout = stream.layout
|
||||
break
|
||||
|
||||
for frame in input_container.decode(video=0):
|
||||
frame = frame.reformat(width=out_w, height=out_h, format="yuv420p")
|
||||
for packet in video_stream.encode(frame):
|
||||
output_container.mux(packet)
|
||||
for packet in video_stream.encode():
|
||||
output_container.mux(packet)
|
||||
|
||||
if audio_stream is not None:
|
||||
input_container.seek(0)
|
||||
for audio_frame in input_container.decode(audio=0):
|
||||
for packet in audio_stream.encode(audio_frame):
|
||||
output_container.mux(packet)
|
||||
for packet in audio_stream.encode():
|
||||
output_container.mux(packet)
|
||||
|
||||
output_container.close()
|
||||
input_container.close()
|
||||
output_buffer.seek(0)
|
||||
return InputImpl.VideoFromFile(output_buffer)
|
||||
|
||||
except Exception as e:
|
||||
if input_container is not None:
|
||||
input_container.close()
|
||||
if output_container is not None:
|
||||
output_container.close()
|
||||
raise RuntimeError(f"Failed to resize video: {str(e)}") from e
|
||||
|
||||
|
||||
def _f32_pcm(wav: torch.Tensor) -> torch.Tensor:
|
||||
"""Convert audio to float 32 bits PCM format. Copy-paste from nodes_audio.py file."""
|
||||
if wav.dtype.is_floating_point:
|
||||
|
||||
@@ -3,136 +3,136 @@ from typing_extensions import override
|
||||
|
||||
import comfy.model_management
|
||||
import node_helpers
|
||||
from comfy_api.latest import ComfyExtension, IO
|
||||
from comfy_api.latest import ComfyExtension, io
|
||||
|
||||
|
||||
class TextEncodeAceStepAudio(IO.ComfyNode):
|
||||
class TextEncodeAceStepAudio(io.ComfyNode):
|
||||
@classmethod
|
||||
def define_schema(cls):
|
||||
return IO.Schema(
|
||||
return io.Schema(
|
||||
node_id="TextEncodeAceStepAudio",
|
||||
category="conditioning",
|
||||
inputs=[
|
||||
IO.Clip.Input("clip"),
|
||||
IO.String.Input("tags", multiline=True, dynamic_prompts=True),
|
||||
IO.String.Input("lyrics", multiline=True, dynamic_prompts=True),
|
||||
IO.Float.Input("lyrics_strength", default=1.0, min=0.0, max=10.0, step=0.01),
|
||||
io.Clip.Input("clip"),
|
||||
io.String.Input("tags", multiline=True, dynamic_prompts=True),
|
||||
io.String.Input("lyrics", multiline=True, dynamic_prompts=True),
|
||||
io.Float.Input("lyrics_strength", default=1.0, min=0.0, max=10.0, step=0.01),
|
||||
],
|
||||
outputs=[IO.Conditioning.Output()],
|
||||
outputs=[io.Conditioning.Output()],
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def execute(cls, clip, tags, lyrics, lyrics_strength) -> IO.NodeOutput:
|
||||
def execute(cls, clip, tags, lyrics, lyrics_strength) -> io.NodeOutput:
|
||||
tokens = clip.tokenize(tags, lyrics=lyrics)
|
||||
conditioning = clip.encode_from_tokens_scheduled(tokens)
|
||||
conditioning = node_helpers.conditioning_set_values(conditioning, {"lyrics_strength": lyrics_strength})
|
||||
return IO.NodeOutput(conditioning)
|
||||
return io.NodeOutput(conditioning)
|
||||
|
||||
class TextEncodeAceStepAudio15(IO.ComfyNode):
|
||||
class TextEncodeAceStepAudio15(io.ComfyNode):
|
||||
@classmethod
|
||||
def define_schema(cls):
|
||||
return IO.Schema(
|
||||
return io.Schema(
|
||||
node_id="TextEncodeAceStepAudio1.5",
|
||||
category="conditioning",
|
||||
inputs=[
|
||||
IO.Clip.Input("clip"),
|
||||
IO.String.Input("tags", multiline=True, dynamic_prompts=True),
|
||||
IO.String.Input("lyrics", multiline=True, dynamic_prompts=True),
|
||||
IO.Int.Input("seed", default=0, min=0, max=0xffffffffffffffff, control_after_generate=True),
|
||||
IO.Int.Input("bpm", default=120, min=10, max=300),
|
||||
IO.Float.Input("duration", default=120.0, min=0.0, max=2000.0, step=0.1),
|
||||
IO.Combo.Input("timesignature", options=['2', '3', '4', '6']),
|
||||
IO.Combo.Input("language", options=["en", "ja", "zh", "es", "de", "fr", "pt", "ru", "it", "nl", "pl", "tr", "vi", "cs", "fa", "id", "ko", "uk", "hu", "ar", "sv", "ro", "el"]),
|
||||
IO.Combo.Input("keyscale", options=[f"{root} {quality}" for quality in ["major", "minor"] for root in ["C", "C#", "Db", "D", "D#", "Eb", "E", "F", "F#", "Gb", "G", "G#", "Ab", "A", "A#", "Bb", "B"]]),
|
||||
IO.Boolean.Input("generate_audio_codes", default=True, tooltip="Enable the LLM that generates audio codes. This can be slow but will increase the quality of the generated audio. Turn this off if you are giving the model an audio reference.", advanced=True),
|
||||
IO.Float.Input("cfg_scale", default=2.0, min=0.0, max=100.0, step=0.1, advanced=True),
|
||||
IO.Float.Input("temperature", default=0.85, min=0.0, max=2.0, step=0.01, advanced=True),
|
||||
IO.Float.Input("top_p", default=0.9, min=0.0, max=2000.0, step=0.01, advanced=True),
|
||||
IO.Int.Input("top_k", default=0, min=0, max=100, advanced=True),
|
||||
IO.Float.Input("min_p", default=0.000, min=0.0, max=1.0, step=0.001, advanced=True),
|
||||
io.Clip.Input("clip"),
|
||||
io.String.Input("tags", multiline=True, dynamic_prompts=True),
|
||||
io.String.Input("lyrics", multiline=True, dynamic_prompts=True),
|
||||
io.Int.Input("seed", default=0, min=0, max=0xffffffffffffffff, control_after_generate=True),
|
||||
io.Int.Input("bpm", default=120, min=10, max=300),
|
||||
io.Float.Input("duration", default=120.0, min=0.0, max=2000.0, step=0.1),
|
||||
io.Combo.Input("timesignature", options=['2', '3', '4', '6']),
|
||||
io.Combo.Input("language", options=["en", "ja", "zh", "es", "de", "fr", "pt", "ru", "it", "nl", "pl", "tr", "vi", "cs", "fa", "id", "ko", "uk", "hu", "ar", "sv", "ro", "el"]),
|
||||
io.Combo.Input("keyscale", options=[f"{root} {quality}" for quality in ["major", "minor"] for root in ["C", "C#", "Db", "D", "D#", "Eb", "E", "F", "F#", "Gb", "G", "G#", "Ab", "A", "A#", "Bb", "B"]]),
|
||||
io.Boolean.Input("generate_audio_codes", default=True, tooltip="Enable the LLM that generates audio codes. This can be slow but will increase the quality of the generated audio. Turn this off if you are giving the model an audio reference.", advanced=True),
|
||||
io.Float.Input("cfg_scale", default=2.0, min=0.0, max=100.0, step=0.1, advanced=True),
|
||||
io.Float.Input("temperature", default=0.85, min=0.0, max=2.0, step=0.01, advanced=True),
|
||||
io.Float.Input("top_p", default=0.9, min=0.0, max=2000.0, step=0.01, advanced=True),
|
||||
io.Int.Input("top_k", default=0, min=0, max=100, advanced=True),
|
||||
io.Float.Input("min_p", default=0.000, min=0.0, max=1.0, step=0.001, advanced=True),
|
||||
],
|
||||
outputs=[IO.Conditioning.Output()],
|
||||
outputs=[io.Conditioning.Output()],
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def execute(cls, clip, tags, lyrics, seed, bpm, duration, timesignature, language, keyscale, generate_audio_codes, cfg_scale, temperature, top_p, top_k, min_p) -> IO.NodeOutput:
|
||||
def execute(cls, clip, tags, lyrics, seed, bpm, duration, timesignature, language, keyscale, generate_audio_codes, cfg_scale, temperature, top_p, top_k, min_p) -> io.NodeOutput:
|
||||
tokens = clip.tokenize(tags, lyrics=lyrics, bpm=bpm, duration=duration, timesignature=int(timesignature), language=language, keyscale=keyscale, seed=seed, generate_audio_codes=generate_audio_codes, cfg_scale=cfg_scale, temperature=temperature, top_p=top_p, top_k=top_k, min_p=min_p)
|
||||
conditioning = clip.encode_from_tokens_scheduled(tokens)
|
||||
return IO.NodeOutput(conditioning)
|
||||
return io.NodeOutput(conditioning)
|
||||
|
||||
|
||||
class EmptyAceStepLatentAudio(IO.ComfyNode):
|
||||
class EmptyAceStepLatentAudio(io.ComfyNode):
|
||||
@classmethod
|
||||
def define_schema(cls):
|
||||
return IO.Schema(
|
||||
return io.Schema(
|
||||
node_id="EmptyAceStepLatentAudio",
|
||||
display_name="Empty Ace Step 1.0 Latent Audio",
|
||||
category="latent/audio",
|
||||
inputs=[
|
||||
IO.Float.Input("seconds", default=120.0, min=1.0, max=1000.0, step=0.1),
|
||||
IO.Int.Input(
|
||||
io.Float.Input("seconds", default=120.0, min=1.0, max=1000.0, step=0.1),
|
||||
io.Int.Input(
|
||||
"batch_size", default=1, min=1, max=4096, tooltip="The number of latent images in the batch."
|
||||
),
|
||||
],
|
||||
outputs=[IO.Latent.Output()],
|
||||
outputs=[io.Latent.Output()],
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def execute(cls, seconds, batch_size) -> IO.NodeOutput:
|
||||
def execute(cls, seconds, batch_size) -> io.NodeOutput:
|
||||
length = int(seconds * 44100 / 512 / 8)
|
||||
latent = torch.zeros([batch_size, 8, 16, length], device=comfy.model_management.intermediate_device(), dtype=comfy.model_management.intermediate_dtype())
|
||||
return IO.NodeOutput({"samples": latent, "type": "audio"})
|
||||
return io.NodeOutput({"samples": latent, "type": "audio"})
|
||||
|
||||
|
||||
class EmptyAceStep15LatentAudio(IO.ComfyNode):
|
||||
class EmptyAceStep15LatentAudio(io.ComfyNode):
|
||||
@classmethod
|
||||
def define_schema(cls):
|
||||
return IO.Schema(
|
||||
return io.Schema(
|
||||
node_id="EmptyAceStep1.5LatentAudio",
|
||||
display_name="Empty Ace Step 1.5 Latent Audio",
|
||||
category="latent/audio",
|
||||
inputs=[
|
||||
IO.Float.Input("seconds", default=120.0, min=1.0, max=1000.0, step=0.01),
|
||||
IO.Int.Input(
|
||||
io.Float.Input("seconds", default=120.0, min=1.0, max=1000.0, step=0.01),
|
||||
io.Int.Input(
|
||||
"batch_size", default=1, min=1, max=4096, tooltip="The number of latent images in the batch."
|
||||
),
|
||||
],
|
||||
outputs=[IO.Latent.Output()],
|
||||
outputs=[io.Latent.Output()],
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def execute(cls, seconds, batch_size) -> IO.NodeOutput:
|
||||
def execute(cls, seconds, batch_size) -> io.NodeOutput:
|
||||
length = round((seconds * 48000 / 1920))
|
||||
latent = torch.zeros([batch_size, 64, length], device=comfy.model_management.intermediate_device(), dtype=comfy.model_management.intermediate_dtype())
|
||||
return IO.NodeOutput({"samples": latent, "type": "audio"})
|
||||
return io.NodeOutput({"samples": latent, "type": "audio"})
|
||||
|
||||
class ReferenceAudio(IO.ComfyNode):
|
||||
class ReferenceAudio(io.ComfyNode):
|
||||
@classmethod
|
||||
def define_schema(cls):
|
||||
return IO.Schema(
|
||||
return io.Schema(
|
||||
node_id="ReferenceTimbreAudio",
|
||||
display_name="Reference Audio",
|
||||
category="advanced/conditioning/audio",
|
||||
is_experimental=True,
|
||||
description="This node sets the reference audio for ace step 1.5",
|
||||
inputs=[
|
||||
IO.Conditioning.Input("conditioning"),
|
||||
IO.Latent.Input("latent", optional=True),
|
||||
io.Conditioning.Input("conditioning"),
|
||||
io.Latent.Input("latent", optional=True),
|
||||
],
|
||||
outputs=[
|
||||
IO.Conditioning.Output(),
|
||||
io.Conditioning.Output(),
|
||||
]
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def execute(cls, conditioning, latent=None) -> IO.NodeOutput:
|
||||
def execute(cls, conditioning, latent=None) -> io.NodeOutput:
|
||||
if latent is not None:
|
||||
conditioning = node_helpers.conditioning_set_values(conditioning, {"reference_audio_timbre_latents": [latent["samples"]]}, append=True)
|
||||
return IO.NodeOutput(conditioning)
|
||||
return io.NodeOutput(conditioning)
|
||||
|
||||
class AceExtension(ComfyExtension):
|
||||
@override
|
||||
async def get_node_list(self) -> list[type[IO.ComfyNode]]:
|
||||
async def get_node_list(self) -> list[type[io.ComfyNode]]:
|
||||
return [
|
||||
TextEncodeAceStepAudio,
|
||||
EmptyAceStepLatentAudio,
|
||||
|
||||
@@ -104,7 +104,7 @@ def vae_decode_audio(vae, samples, tile=None, overlap=None):
|
||||
std = torch.std(audio, dim=[1, 2], keepdim=True) * 5.0
|
||||
std[std < 1.0] = 1.0
|
||||
audio /= std
|
||||
vae_sample_rate = getattr(vae, "audio_sample_rate_output", getattr(vae, "audio_sample_rate", 44100))
|
||||
vae_sample_rate = getattr(vae, "audio_sample_rate", 44100)
|
||||
return {"waveform": audio, "sample_rate": vae_sample_rate if "sample_rate" not in samples else samples["sample_rate"]}
|
||||
|
||||
|
||||
|
||||
@@ -3,8 +3,9 @@ import comfy.utils
|
||||
import comfy.model_management
|
||||
import torch
|
||||
|
||||
from comfy.ldm.lightricks.vae.audio_vae import AudioVAE
|
||||
from comfy_api.latest import ComfyExtension, io
|
||||
from comfy_extras.nodes_audio import VAEEncodeAudio
|
||||
|
||||
|
||||
class LTXVAudioVAELoader(io.ComfyNode):
|
||||
@classmethod
|
||||
@@ -27,14 +28,10 @@ class LTXVAudioVAELoader(io.ComfyNode):
|
||||
def execute(cls, ckpt_name: str) -> io.NodeOutput:
|
||||
ckpt_path = folder_paths.get_full_path_or_raise("checkpoints", ckpt_name)
|
||||
sd, metadata = comfy.utils.load_torch_file(ckpt_path, return_metadata=True)
|
||||
sd = comfy.utils.state_dict_prefix_replace(sd, {"audio_vae.": "autoencoder.", "vocoder.": "vocoder."}, filter_keys=True)
|
||||
vae = comfy.sd.VAE(sd=sd, metadata=metadata)
|
||||
vae.throw_exception_if_invalid()
|
||||
|
||||
return io.NodeOutput(vae)
|
||||
return io.NodeOutput(AudioVAE(sd, metadata))
|
||||
|
||||
|
||||
class LTXVAudioVAEEncode(VAEEncodeAudio):
|
||||
class LTXVAudioVAEEncode(io.ComfyNode):
|
||||
@classmethod
|
||||
def define_schema(cls) -> io.Schema:
|
||||
return io.Schema(
|
||||
@@ -53,8 +50,15 @@ class LTXVAudioVAEEncode(VAEEncodeAudio):
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def execute(cls, audio, audio_vae) -> io.NodeOutput:
|
||||
return super().execute(audio_vae, audio)
|
||||
def execute(cls, audio, audio_vae: AudioVAE) -> io.NodeOutput:
|
||||
audio_latents = audio_vae.encode(audio)
|
||||
return io.NodeOutput(
|
||||
{
|
||||
"samples": audio_latents,
|
||||
"sample_rate": int(audio_vae.sample_rate),
|
||||
"type": "audio",
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
class LTXVAudioVAEDecode(io.ComfyNode):
|
||||
@@ -76,12 +80,12 @@ class LTXVAudioVAEDecode(io.ComfyNode):
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def execute(cls, samples, audio_vae) -> io.NodeOutput:
|
||||
def execute(cls, samples, audio_vae: AudioVAE) -> io.NodeOutput:
|
||||
audio_latent = samples["samples"]
|
||||
if audio_latent.is_nested:
|
||||
audio_latent = audio_latent.unbind()[-1]
|
||||
audio = audio_vae.decode(audio_latent).movedim(-1, 1).to(audio_latent.device)
|
||||
output_audio_sample_rate = audio_vae.first_stage_model.output_sample_rate
|
||||
audio = audio_vae.decode(audio_latent).to(audio_latent.device)
|
||||
output_audio_sample_rate = audio_vae.output_sample_rate
|
||||
return io.NodeOutput(
|
||||
{
|
||||
"waveform": audio,
|
||||
@@ -139,17 +143,17 @@ class LTXVEmptyLatentAudio(io.ComfyNode):
|
||||
frames_number: int,
|
||||
frame_rate: int,
|
||||
batch_size: int,
|
||||
audio_vae,
|
||||
audio_vae: AudioVAE,
|
||||
) -> io.NodeOutput:
|
||||
"""Generate empty audio latents matching the reference pipeline structure."""
|
||||
|
||||
assert audio_vae is not None, "Audio VAE model is required"
|
||||
|
||||
z_channels = audio_vae.latent_channels
|
||||
audio_freq = audio_vae.first_stage_model.latent_frequency_bins
|
||||
sampling_rate = int(audio_vae.first_stage_model.sample_rate)
|
||||
audio_freq = audio_vae.latent_frequency_bins
|
||||
sampling_rate = int(audio_vae.sample_rate)
|
||||
|
||||
num_audio_latents = audio_vae.first_stage_model.num_of_latents_from_frames(frames_number, frame_rate)
|
||||
num_audio_latents = audio_vae.num_of_latents_from_frames(frames_number, frame_rate)
|
||||
|
||||
audio_latents = torch.zeros(
|
||||
(batch_size, z_channels, num_audio_latents, audio_freq),
|
||||
|
||||
@@ -7,10 +7,7 @@ import comfy.model_management
|
||||
import comfy.ldm.common_dit
|
||||
import comfy.latent_formats
|
||||
import comfy.ldm.lumina.controlnet
|
||||
import comfy.ldm.supir.supir_modules
|
||||
from comfy.ldm.wan.model_multitalk import WanMultiTalkAttentionBlock, MultiTalkAudioProjModel
|
||||
from comfy_api.latest import io
|
||||
from comfy.ldm.supir.supir_patch import SUPIRPatch
|
||||
|
||||
|
||||
class BlockWiseControlBlock(torch.nn.Module):
|
||||
@@ -269,27 +266,6 @@ class ModelPatchLoader:
|
||||
out_dim=sd["audio_proj.norm.weight"].shape[0],
|
||||
device=comfy.model_management.unet_offload_device(),
|
||||
operations=comfy.ops.manual_cast)
|
||||
elif 'model.control_model.input_hint_block.0.weight' in sd or 'control_model.input_hint_block.0.weight' in sd:
|
||||
prefix_replace = {}
|
||||
if 'model.control_model.input_hint_block.0.weight' in sd:
|
||||
prefix_replace["model.control_model."] = "control_model."
|
||||
prefix_replace["model.diffusion_model.project_modules."] = "project_modules."
|
||||
else:
|
||||
prefix_replace["control_model."] = "control_model."
|
||||
prefix_replace["project_modules."] = "project_modules."
|
||||
|
||||
# Extract denoise_encoder weights before filter_keys discards them
|
||||
de_prefix = "first_stage_model.denoise_encoder."
|
||||
denoise_encoder_sd = {}
|
||||
for k in list(sd.keys()):
|
||||
if k.startswith(de_prefix):
|
||||
denoise_encoder_sd[k[len(de_prefix):]] = sd.pop(k)
|
||||
|
||||
sd = comfy.utils.state_dict_prefix_replace(sd, prefix_replace, filter_keys=True)
|
||||
sd.pop("control_model.mask_LQ", None)
|
||||
model = comfy.ldm.supir.supir_modules.SUPIR(device=comfy.model_management.unet_offload_device(), dtype=dtype, operations=comfy.ops.manual_cast)
|
||||
if denoise_encoder_sd:
|
||||
model.denoise_encoder_sd = denoise_encoder_sd
|
||||
|
||||
model_patcher = comfy.model_patcher.CoreModelPatcher(model, load_device=comfy.model_management.get_torch_device(), offload_device=comfy.model_management.unet_offload_device())
|
||||
model.load_state_dict(sd, assign=model_patcher.is_dynamic())
|
||||
@@ -589,89 +565,9 @@ class MultiTalkModelPatch(torch.nn.Module):
|
||||
)
|
||||
|
||||
|
||||
class SUPIRApply(io.ComfyNode):
|
||||
@classmethod
|
||||
def define_schema(cls) -> io.Schema:
|
||||
return io.Schema(
|
||||
node_id="SUPIRApply",
|
||||
category="model_patches/supir",
|
||||
is_experimental=True,
|
||||
inputs=[
|
||||
io.Model.Input("model"),
|
||||
io.ModelPatch.Input("model_patch"),
|
||||
io.Vae.Input("vae"),
|
||||
io.Image.Input("image"),
|
||||
io.Float.Input("strength_start", default=1.0, min=0.0, max=10.0, step=0.01,
|
||||
tooltip="Control strength at the start of sampling (high sigma)."),
|
||||
io.Float.Input("strength_end", default=1.0, min=0.0, max=10.0, step=0.01,
|
||||
tooltip="Control strength at the end of sampling (low sigma). Linearly interpolated from start."),
|
||||
io.Float.Input("restore_cfg", default=4.0, min=0.0, max=20.0, step=0.1, advanced=True,
|
||||
tooltip="Pulls denoised output toward the input latent. Higher = stronger fidelity to input. 0 to disable."),
|
||||
io.Float.Input("restore_cfg_s_tmin", default=0.05, min=0.0, max=1.0, step=0.01, advanced=True,
|
||||
tooltip="Sigma threshold below which restore_cfg is disabled."),
|
||||
],
|
||||
outputs=[io.Model.Output()],
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def _encode_with_denoise_encoder(cls, vae, model_patch, image):
|
||||
"""Encode using denoise_encoder weights from SUPIR checkpoint if available."""
|
||||
denoise_sd = getattr(model_patch.model, 'denoise_encoder_sd', None)
|
||||
if not denoise_sd:
|
||||
return vae.encode(image)
|
||||
|
||||
# Clone VAE patcher, apply denoise_encoder weights to clone, encode
|
||||
orig_patcher = vae.patcher
|
||||
vae.patcher = orig_patcher.clone()
|
||||
patches = {f"encoder.{k}": (v,) for k, v in denoise_sd.items()}
|
||||
vae.patcher.add_patches(patches, strength_patch=1.0, strength_model=0.0)
|
||||
try:
|
||||
return vae.encode(image)
|
||||
finally:
|
||||
vae.patcher = orig_patcher
|
||||
|
||||
@classmethod
|
||||
def execute(cls, *, model: io.Model.Type, model_patch: io.ModelPatch.Type, vae: io.Vae.Type, image: io.Image.Type,
|
||||
strength_start: float, strength_end: float, restore_cfg: float, restore_cfg_s_tmin: float) -> io.NodeOutput:
|
||||
model_patched = model.clone()
|
||||
hint_latent = model.get_model_object("latent_format").process_in(
|
||||
cls._encode_with_denoise_encoder(vae, model_patch, image[:, :, :, :3]))
|
||||
patch = SUPIRPatch(model_patch, model_patch.model.project_modules, hint_latent, strength_start, strength_end)
|
||||
patch.register(model_patched)
|
||||
|
||||
if restore_cfg > 0.0:
|
||||
# Round-trip to match original pipeline: decode hint, re-encode with regular VAE
|
||||
latent_format = model.get_model_object("latent_format")
|
||||
decoded = vae.decode(latent_format.process_out(hint_latent))
|
||||
x_center = latent_format.process_in(vae.encode(decoded[:, :, :, :3]))
|
||||
sigma_max = 14.6146
|
||||
|
||||
def restore_cfg_function(args):
|
||||
denoised = args["denoised"]
|
||||
sigma = args["sigma"]
|
||||
if sigma.dim() > 0:
|
||||
s = sigma[0].item()
|
||||
else:
|
||||
s = sigma.item()
|
||||
if s > restore_cfg_s_tmin:
|
||||
ref = x_center.to(device=denoised.device, dtype=denoised.dtype)
|
||||
b = denoised.shape[0]
|
||||
if ref.shape[0] != b:
|
||||
ref = ref.expand(b, -1, -1, -1) if ref.shape[0] == 1 else ref.repeat((b + ref.shape[0] - 1) // ref.shape[0], 1, 1, 1)[:b]
|
||||
sigma_val = sigma.view(-1, 1, 1, 1) if sigma.dim() > 0 else sigma
|
||||
d_center = denoised - ref
|
||||
denoised = denoised - d_center * ((sigma_val / sigma_max) ** restore_cfg)
|
||||
return denoised
|
||||
|
||||
model_patched.set_model_sampler_post_cfg_function(restore_cfg_function)
|
||||
|
||||
return io.NodeOutput(model_patched)
|
||||
|
||||
|
||||
NODE_CLASS_MAPPINGS = {
|
||||
"ModelPatchLoader": ModelPatchLoader,
|
||||
"QwenImageDiffsynthControlnet": QwenImageDiffsynthControlnet,
|
||||
"ZImageFunControlnet": ZImageFunControlnet,
|
||||
"USOStyleReference": USOStyleReference,
|
||||
"SUPIRApply": SUPIRApply,
|
||||
}
|
||||
|
||||
@@ -6,7 +6,6 @@ from PIL import Image
|
||||
import math
|
||||
from enum import Enum
|
||||
from typing import TypedDict, Literal
|
||||
import kornia
|
||||
|
||||
import comfy.utils
|
||||
import comfy.model_management
|
||||
@@ -661,228 +660,6 @@ class BatchImagesMasksLatentsNode(io.ComfyNode):
|
||||
return io.NodeOutput(batched)
|
||||
|
||||
|
||||
class ColorTransfer(io.ComfyNode):
|
||||
@classmethod
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="ColorTransfer",
|
||||
category="image/postprocessing",
|
||||
description="Match the colors of one image to another using various algorithms.",
|
||||
search_aliases=["color match", "color grading", "color correction", "match colors", "color transform", "mkl", "reinhard", "histogram"],
|
||||
inputs=[
|
||||
io.Image.Input("image_target", tooltip="Image(s) to apply the color transform to."),
|
||||
io.Image.Input("image_ref", optional=True, tooltip="Reference image(s) to match colors to. If not provided, processing is skipped"),
|
||||
io.Combo.Input("method", options=['reinhard_lab', 'mkl_lab', 'histogram'],),
|
||||
io.DynamicCombo.Input("source_stats",
|
||||
tooltip="per_frame: each frame matched to image_ref individually. uniform: pool stats across all source frames as baseline, match to image_ref. target_frame: use one chosen frame as the baseline for the transform to image_ref, applied uniformly to all frames (preserves relative differences)",
|
||||
options=[
|
||||
io.DynamicCombo.Option("per_frame", []),
|
||||
io.DynamicCombo.Option("uniform", []),
|
||||
io.DynamicCombo.Option("target_frame", [
|
||||
io.Int.Input("target_index", default=0, min=0, max=10000,
|
||||
tooltip="Frame index used as the source baseline for computing the transform to image_ref"),
|
||||
]),
|
||||
]),
|
||||
io.Float.Input("strength", default=1.0, min=0.0, max=10.0, step=0.01),
|
||||
],
|
||||
outputs=[
|
||||
io.Image.Output(display_name="image"),
|
||||
],
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def _to_lab(images, i, device):
|
||||
return kornia.color.rgb_to_lab(
|
||||
images[i:i+1].to(device, dtype=torch.float32).permute(0, 3, 1, 2))
|
||||
|
||||
@staticmethod
|
||||
def _pool_stats(images, device, is_reinhard, eps):
|
||||
"""Two-pass pooled mean + std/cov across all frames."""
|
||||
N, C = images.shape[0], images.shape[3]
|
||||
HW = images.shape[1] * images.shape[2]
|
||||
mean = torch.zeros(C, 1, device=device, dtype=torch.float32)
|
||||
for i in range(N):
|
||||
mean += ColorTransfer._to_lab(images, i, device).view(C, -1).mean(dim=-1, keepdim=True)
|
||||
mean /= N
|
||||
acc = torch.zeros(C, 1 if is_reinhard else C, device=device, dtype=torch.float32)
|
||||
for i in range(N):
|
||||
centered = ColorTransfer._to_lab(images, i, device).view(C, -1) - mean
|
||||
if is_reinhard:
|
||||
acc += (centered * centered).mean(dim=-1, keepdim=True)
|
||||
else:
|
||||
acc += centered @ centered.T / HW
|
||||
if is_reinhard:
|
||||
return mean, torch.sqrt(acc / N).clamp_min_(eps)
|
||||
return mean, acc / N
|
||||
|
||||
@staticmethod
|
||||
def _frame_stats(lab_flat, hw, is_reinhard, eps):
|
||||
"""Per-frame mean + std/cov."""
|
||||
mean = lab_flat.mean(dim=-1, keepdim=True)
|
||||
if is_reinhard:
|
||||
return mean, lab_flat.std(dim=-1, keepdim=True, unbiased=False).clamp_min_(eps)
|
||||
centered = lab_flat - mean
|
||||
return mean, centered @ centered.T / hw
|
||||
|
||||
@staticmethod
|
||||
def _mkl_matrix(cov_s, cov_r, eps):
|
||||
"""Compute MKL 3x3 transform matrix from source and ref covariances."""
|
||||
eig_val_s, eig_vec_s = torch.linalg.eigh(cov_s)
|
||||
sqrt_val_s = torch.sqrt(eig_val_s.clamp_min(0)).clamp_min_(eps)
|
||||
|
||||
scaled_V = eig_vec_s * sqrt_val_s.unsqueeze(0)
|
||||
mid = scaled_V.T @ cov_r @ scaled_V
|
||||
eig_val_m, eig_vec_m = torch.linalg.eigh(mid)
|
||||
sqrt_m = torch.sqrt(eig_val_m.clamp_min(0))
|
||||
|
||||
inv_sqrt_s = 1.0 / sqrt_val_s
|
||||
inv_scaled_V = eig_vec_s * inv_sqrt_s.unsqueeze(0)
|
||||
M_half = (eig_vec_m * sqrt_m.unsqueeze(0)) @ eig_vec_m.T
|
||||
return inv_scaled_V @ M_half @ inv_scaled_V.T
|
||||
|
||||
@staticmethod
|
||||
def _histogram_lut(src, ref, bins=256):
|
||||
"""Build per-channel LUT from source and ref histograms. src/ref: (C, HW) in [0,1]."""
|
||||
s_bins = (src * (bins - 1)).long().clamp(0, bins - 1)
|
||||
r_bins = (ref * (bins - 1)).long().clamp(0, bins - 1)
|
||||
s_hist = torch.zeros(src.shape[0], bins, device=src.device, dtype=src.dtype)
|
||||
r_hist = torch.zeros(src.shape[0], bins, device=src.device, dtype=src.dtype)
|
||||
ones_s = torch.ones_like(src)
|
||||
ones_r = torch.ones_like(ref)
|
||||
s_hist.scatter_add_(1, s_bins, ones_s)
|
||||
r_hist.scatter_add_(1, r_bins, ones_r)
|
||||
s_cdf = s_hist.cumsum(1)
|
||||
s_cdf = s_cdf / s_cdf[:, -1:]
|
||||
r_cdf = r_hist.cumsum(1)
|
||||
r_cdf = r_cdf / r_cdf[:, -1:]
|
||||
return torch.searchsorted(r_cdf, s_cdf).clamp_max_(bins - 1).float() / (bins - 1)
|
||||
|
||||
@classmethod
|
||||
def _pooled_cdf(cls, images, device, num_bins=256):
|
||||
"""Build pooled CDF across all frames, one frame at a time."""
|
||||
C = images.shape[3]
|
||||
hist = torch.zeros(C, num_bins, device=device, dtype=torch.float32)
|
||||
for i in range(images.shape[0]):
|
||||
frame = images[i].to(device, dtype=torch.float32).permute(2, 0, 1).reshape(C, -1)
|
||||
bins = (frame * (num_bins - 1)).long().clamp(0, num_bins - 1)
|
||||
hist.scatter_add_(1, bins, torch.ones_like(frame))
|
||||
cdf = hist.cumsum(1)
|
||||
return cdf / cdf[:, -1:]
|
||||
|
||||
@classmethod
|
||||
def _build_histogram_transform(cls, image_target, image_ref, device, stats_mode, target_index, B):
|
||||
"""Build per-frame or uniform LUT transform for histogram mode."""
|
||||
if stats_mode == 'per_frame':
|
||||
return None # LUT computed per-frame in the apply loop
|
||||
|
||||
r_cdf = cls._pooled_cdf(image_ref, device)
|
||||
if stats_mode == 'target_frame':
|
||||
ti = min(target_index, B - 1)
|
||||
s_cdf = cls._pooled_cdf(image_target[ti:ti+1], device)
|
||||
else:
|
||||
s_cdf = cls._pooled_cdf(image_target, device)
|
||||
return torch.searchsorted(r_cdf, s_cdf).clamp_max_(255).float() / 255.0
|
||||
|
||||
@classmethod
|
||||
def _build_lab_transform(cls, image_target, image_ref, device, stats_mode, target_index, is_reinhard):
|
||||
"""Build transform parameters for Lab-based methods. Returns a transform function."""
|
||||
eps = 1e-6
|
||||
B, H, W, C = image_target.shape
|
||||
B_ref = image_ref.shape[0]
|
||||
single_ref = B_ref == 1
|
||||
HW = H * W
|
||||
HW_ref = image_ref.shape[1] * image_ref.shape[2]
|
||||
|
||||
# Precompute ref stats
|
||||
if single_ref or stats_mode in ('uniform', 'target_frame'):
|
||||
ref_mean, ref_sc = cls._pool_stats(image_ref, device, is_reinhard, eps)
|
||||
|
||||
# Uniform/target_frame: precompute single affine transform
|
||||
if stats_mode in ('uniform', 'target_frame'):
|
||||
if stats_mode == 'target_frame':
|
||||
ti = min(target_index, B - 1)
|
||||
s_lab = cls._to_lab(image_target, ti, device).view(C, -1)
|
||||
s_mean, s_sc = cls._frame_stats(s_lab, HW, is_reinhard, eps)
|
||||
else:
|
||||
s_mean, s_sc = cls._pool_stats(image_target, device, is_reinhard, eps)
|
||||
|
||||
if is_reinhard:
|
||||
scale = ref_sc / s_sc
|
||||
offset = ref_mean - scale * s_mean
|
||||
return lambda src_flat, **_: src_flat * scale + offset
|
||||
T = cls._mkl_matrix(s_sc, ref_sc, eps)
|
||||
offset = ref_mean - T @ s_mean
|
||||
return lambda src_flat, **_: T @ src_flat + offset
|
||||
|
||||
# per_frame
|
||||
def per_frame_transform(src_flat, frame_idx):
|
||||
s_mean, s_sc = cls._frame_stats(src_flat, HW, is_reinhard, eps)
|
||||
|
||||
if single_ref:
|
||||
r_mean, r_sc = ref_mean, ref_sc
|
||||
else:
|
||||
ri = min(frame_idx, B_ref - 1)
|
||||
r_mean, r_sc = cls._frame_stats(cls._to_lab(image_ref, ri, device).view(C, -1), HW_ref, is_reinhard, eps)
|
||||
|
||||
centered = src_flat - s_mean
|
||||
if is_reinhard:
|
||||
return centered * (r_sc / s_sc) + r_mean
|
||||
T = cls._mkl_matrix(centered @ centered.T / HW, r_sc, eps)
|
||||
return T @ centered + r_mean
|
||||
|
||||
return per_frame_transform
|
||||
|
||||
@classmethod
|
||||
def execute(cls, image_target, image_ref, method, source_stats, strength=1.0) -> io.NodeOutput:
|
||||
stats_mode = source_stats["source_stats"]
|
||||
target_index = source_stats.get("target_index", 0)
|
||||
|
||||
if strength == 0 or image_ref is None:
|
||||
return io.NodeOutput(image_target)
|
||||
|
||||
device = comfy.model_management.get_torch_device()
|
||||
intermediate_device = comfy.model_management.intermediate_device()
|
||||
intermediate_dtype = comfy.model_management.intermediate_dtype()
|
||||
|
||||
B, H, W, C = image_target.shape
|
||||
B_ref = image_ref.shape[0]
|
||||
pbar = comfy.utils.ProgressBar(B)
|
||||
out = torch.empty(B, H, W, C, device=intermediate_device, dtype=intermediate_dtype)
|
||||
|
||||
if method == 'histogram':
|
||||
uniform_lut = cls._build_histogram_transform(
|
||||
image_target, image_ref, device, stats_mode, target_index, B)
|
||||
|
||||
for i in range(B):
|
||||
src = image_target[i].to(device, dtype=torch.float32).permute(2, 0, 1)
|
||||
src_flat = src.reshape(C, -1)
|
||||
if uniform_lut is not None:
|
||||
lut = uniform_lut
|
||||
else:
|
||||
ri = min(i, B_ref - 1)
|
||||
ref = image_ref[ri].to(device, dtype=torch.float32).permute(2, 0, 1).reshape(C, -1)
|
||||
lut = cls._histogram_lut(src_flat, ref)
|
||||
bin_idx = (src_flat * 255).long().clamp(0, 255)
|
||||
matched = lut.gather(1, bin_idx).view(C, H, W)
|
||||
result = matched if strength == 1.0 else torch.lerp(src, matched, strength)
|
||||
out[i] = result.permute(1, 2, 0).clamp_(0, 1).to(device=intermediate_device, dtype=intermediate_dtype)
|
||||
pbar.update(1)
|
||||
else:
|
||||
transform = cls._build_lab_transform(image_target, image_ref, device, stats_mode, target_index, is_reinhard=method == "reinhard_lab")
|
||||
|
||||
for i in range(B):
|
||||
src_frame = cls._to_lab(image_target, i, device)
|
||||
corrected = transform(src_frame.view(C, -1), frame_idx=i)
|
||||
if strength == 1.0:
|
||||
result = kornia.color.lab_to_rgb(corrected.view(1, C, H, W))
|
||||
else:
|
||||
result = kornia.color.lab_to_rgb(torch.lerp(src_frame, corrected.view(1, C, H, W), strength))
|
||||
out[i] = result.squeeze(0).permute(1, 2, 0).clamp_(0, 1).to(device=intermediate_device, dtype=intermediate_dtype)
|
||||
pbar.update(1)
|
||||
|
||||
return io.NodeOutput(out)
|
||||
|
||||
|
||||
class PostProcessingExtension(ComfyExtension):
|
||||
@override
|
||||
async def get_node_list(self) -> list[type[io.ComfyNode]]:
|
||||
@@ -896,7 +673,6 @@ class PostProcessingExtension(ComfyExtension):
|
||||
BatchImagesNode,
|
||||
BatchMasksNode,
|
||||
BatchLatentsNode,
|
||||
ColorTransfer,
|
||||
# BatchImagesMasksLatentsNode,
|
||||
]
|
||||
|
||||
|
||||
@@ -11,7 +11,7 @@ class PreviewAny():
|
||||
"required": {"source": (IO.ANY, {})},
|
||||
}
|
||||
|
||||
RETURN_TYPES = (IO.STRING,)
|
||||
RETURN_TYPES = ()
|
||||
FUNCTION = "main"
|
||||
OUTPUT_NODE = True
|
||||
|
||||
@@ -33,7 +33,7 @@ class PreviewAny():
|
||||
except Exception:
|
||||
value = 'source exists, but could not be serialized.'
|
||||
|
||||
return {"ui": {"text": (value,)}, "result": (value,)}
|
||||
return {"ui": {"text": (value,)}}
|
||||
|
||||
NODE_CLASS_MAPPINGS = {
|
||||
"PreviewAny": PreviewAny,
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
import re
|
||||
import json
|
||||
from typing_extensions import override
|
||||
|
||||
from comfy_api.latest import ComfyExtension, io
|
||||
@@ -376,39 +375,6 @@ class RegexReplace(io.ComfyNode):
|
||||
return io.NodeOutput(result)
|
||||
|
||||
|
||||
class JsonExtractString(io.ComfyNode):
|
||||
@classmethod
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="JsonExtractString",
|
||||
display_name="Extract String from JSON",
|
||||
category="utils/string",
|
||||
search_aliases=["json", "extract json", "parse json", "json value", "read json"],
|
||||
inputs=[
|
||||
io.String.Input("json_string", multiline=True),
|
||||
io.String.Input("key", multiline=False),
|
||||
],
|
||||
outputs=[
|
||||
io.String.Output(),
|
||||
]
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def execute(cls, json_string, key):
|
||||
try:
|
||||
data = json.loads(json_string)
|
||||
if isinstance(data, dict) and key in data:
|
||||
value = data[key]
|
||||
if value is None:
|
||||
return io.NodeOutput("")
|
||||
|
||||
return io.NodeOutput(str(value))
|
||||
|
||||
return io.NodeOutput("")
|
||||
|
||||
except (json.JSONDecodeError, TypeError):
|
||||
return io.NodeOutput("")
|
||||
|
||||
class StringExtension(ComfyExtension):
|
||||
@override
|
||||
async def get_node_list(self) -> list[type[io.ComfyNode]]:
|
||||
@@ -424,7 +390,6 @@ class StringExtension(ComfyExtension):
|
||||
RegexMatch,
|
||||
RegexExtract,
|
||||
RegexReplace,
|
||||
JsonExtractString,
|
||||
]
|
||||
|
||||
async def comfy_entrypoint() -> StringExtension:
|
||||
|
||||
@@ -35,7 +35,6 @@ class TextGenerate(io.ComfyNode):
|
||||
io.Int.Input("max_length", default=256, min=1, max=2048),
|
||||
io.DynamicCombo.Input("sampling_mode", options=sampling_options, display_name="Sampling Mode"),
|
||||
io.Boolean.Input("thinking", optional=True, default=False, tooltip="Operate in thinking mode if the model supports it."),
|
||||
io.Boolean.Input("use_default_template", optional=True, default=True, tooltip="Use the built in system prompt/template if the model has one.", advanced=True),
|
||||
],
|
||||
outputs=[
|
||||
io.String.Output(display_name="generated_text"),
|
||||
@@ -43,9 +42,9 @@ class TextGenerate(io.ComfyNode):
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def execute(cls, clip, prompt, max_length, sampling_mode, image=None, thinking=False, use_default_template=True) -> io.NodeOutput:
|
||||
def execute(cls, clip, prompt, max_length, sampling_mode, image=None, thinking=False) -> io.NodeOutput:
|
||||
|
||||
tokens = clip.tokenize(prompt, image=image, skip_template=not use_default_template, min_length=1, thinking=thinking)
|
||||
tokens = clip.tokenize(prompt, image=image, skip_template=False, min_length=1, thinking=thinking)
|
||||
|
||||
# Get sampling parameters from dynamic combo
|
||||
do_sample = sampling_mode.get("sampling_mode") == "on"
|
||||
@@ -161,12 +160,12 @@ class TextGenerateLTX2Prompt(TextGenerate):
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def execute(cls, clip, prompt, max_length, sampling_mode, image=None, thinking=False, use_default_template=True) -> io.NodeOutput:
|
||||
def execute(cls, clip, prompt, max_length, sampling_mode, image=None, thinking=False) -> io.NodeOutput:
|
||||
if image is None:
|
||||
formatted_prompt = f"<start_of_turn>system\n{LTX2_T2V_SYSTEM_PROMPT.strip()}<end_of_turn>\n<start_of_turn>user\nUser Raw Input Prompt: {prompt}.<end_of_turn>\n<start_of_turn>model\n"
|
||||
else:
|
||||
formatted_prompt = f"<start_of_turn>system\n{LTX2_I2V_SYSTEM_PROMPT.strip()}<end_of_turn>\n<start_of_turn>user\n\n<image_soft_token>\n\nUser Raw Input Prompt: {prompt}.<end_of_turn>\n<start_of_turn>model\n"
|
||||
return super().execute(clip, formatted_prompt, max_length, sampling_mode, image, thinking, use_default_template)
|
||||
return super().execute(clip, formatted_prompt, max_length, sampling_mode, image, thinking)
|
||||
|
||||
|
||||
class TextgenExtension(ComfyExtension):
|
||||
|
||||
420
comfy_extras/nodes_void.py
Normal file
420
comfy_extras/nodes_void.py
Normal file
@@ -0,0 +1,420 @@
|
||||
import logging
|
||||
|
||||
import torch
|
||||
|
||||
import comfy
|
||||
import comfy.model_management
|
||||
import comfy.samplers
|
||||
import comfy.utils
|
||||
import node_helpers
|
||||
import nodes
|
||||
from comfy.utils import model_trange as trange
|
||||
from comfy_api.latest import ComfyExtension, io
|
||||
from typing_extensions import override
|
||||
|
||||
TEMPORAL_COMPRESSION = 4
|
||||
PATCH_SIZE_T = 2
|
||||
|
||||
|
||||
def _valid_void_length(length: int) -> int:
|
||||
"""Round ``length`` down to a value that produces an even latent_t.
|
||||
|
||||
VOID / CogVideoX-Fun-V1.5 uses patch_size_t=2, so the VAE-encoded latent
|
||||
must have an even temporal dimension. If latent_t is odd, the transformer
|
||||
pad_to_patch_size circular-wraps an extra latent frame onto the end; after
|
||||
the post-transformer crop the last real latent frame has been influenced
|
||||
by the wrapped phantom frame, producing visible jitter and "disappearing"
|
||||
subjects near the end of the decoded video. Rounding down fixes this.
|
||||
"""
|
||||
latent_t = ((length - 1) // TEMPORAL_COMPRESSION) + 1
|
||||
if latent_t % PATCH_SIZE_T == 0:
|
||||
return length
|
||||
# Round latent_t down to the nearest multiple of PATCH_SIZE_T, then invert
|
||||
# the ((length - 1) // TEMPORAL_COMPRESSION) + 1 formula. Floor at 1 frame
|
||||
# so we never return a non-positive length.
|
||||
target_latent_t = max(PATCH_SIZE_T, (latent_t // PATCH_SIZE_T) * PATCH_SIZE_T)
|
||||
return (target_latent_t - 1) * TEMPORAL_COMPRESSION + 1
|
||||
|
||||
|
||||
class VOIDQuadmaskPreprocess(io.ComfyNode):
|
||||
"""Preprocess a quadmask video for VOID inpainting.
|
||||
|
||||
Quantizes mask values to four semantic levels, inverts, and normalizes:
|
||||
0 -> primary object to remove
|
||||
63 -> overlap of primary + affected
|
||||
127 -> affected region (interactions)
|
||||
255 -> background (keep)
|
||||
|
||||
After inversion and normalization, the output mask has values in [0, 1]
|
||||
with four discrete levels: 1.0 (remove), ~0.75, ~0.50, 0.0 (keep).
|
||||
"""
|
||||
|
||||
@classmethod
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="VOIDQuadmaskPreprocess",
|
||||
category="mask/video",
|
||||
inputs=[
|
||||
io.Mask.Input("mask"),
|
||||
io.Int.Input("dilate_width", default=0, min=0, max=50, step=1,
|
||||
tooltip="Dilation radius for the primary mask region (0 = no dilation)"),
|
||||
],
|
||||
outputs=[
|
||||
io.Mask.Output(display_name="quadmask"),
|
||||
],
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def execute(cls, mask, dilate_width=0) -> io.NodeOutput:
|
||||
m = mask.clone()
|
||||
|
||||
if m.max() <= 1.0:
|
||||
m = m * 255.0
|
||||
|
||||
if dilate_width > 0 and m.ndim >= 3:
|
||||
binary = (m < 128).float()
|
||||
kernel_size = dilate_width * 2 + 1
|
||||
if binary.ndim == 3:
|
||||
binary = binary.unsqueeze(1)
|
||||
dilated = torch.nn.functional.max_pool2d(
|
||||
binary, kernel_size=kernel_size, stride=1, padding=dilate_width
|
||||
)
|
||||
if dilated.ndim == 4:
|
||||
dilated = dilated.squeeze(1)
|
||||
m = torch.where(dilated > 0.5, torch.zeros_like(m), m)
|
||||
|
||||
m = torch.where(m <= 31, torch.zeros_like(m), m)
|
||||
m = torch.where((m > 31) & (m <= 95), torch.full_like(m, 63), m)
|
||||
m = torch.where((m > 95) & (m <= 191), torch.full_like(m, 127), m)
|
||||
m = torch.where(m > 191, torch.full_like(m, 255), m)
|
||||
|
||||
m = (255.0 - m) / 255.0
|
||||
|
||||
return io.NodeOutput(m)
|
||||
|
||||
|
||||
class VOIDInpaintConditioning(io.ComfyNode):
|
||||
"""Build VOID inpainting conditioning for CogVideoX.
|
||||
|
||||
Encodes the processed quadmask and masked source video through the VAE,
|
||||
producing a 32-channel concat conditioning (16ch mask + 16ch masked video)
|
||||
that gets concatenated with the 16ch noise latent by the model.
|
||||
"""
|
||||
|
||||
@classmethod
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="VOIDInpaintConditioning",
|
||||
category="conditioning/video_models",
|
||||
inputs=[
|
||||
io.Conditioning.Input("positive"),
|
||||
io.Conditioning.Input("negative"),
|
||||
io.Vae.Input("vae"),
|
||||
io.Image.Input("video", tooltip="Source video frames [T, H, W, 3]"),
|
||||
io.Mask.Input("quadmask", tooltip="Preprocessed quadmask from VOIDQuadmaskPreprocess [T, H, W]"),
|
||||
io.Int.Input("width", default=672, min=16, max=nodes.MAX_RESOLUTION, step=8),
|
||||
io.Int.Input("height", default=384, min=16, max=nodes.MAX_RESOLUTION, step=8),
|
||||
io.Int.Input("length", default=45, min=1, max=nodes.MAX_RESOLUTION, step=1,
|
||||
tooltip="Number of pixel frames to process. For CogVideoX-Fun-V1.5 "
|
||||
"(patch_size_t=2), latent_t must be even — lengths that "
|
||||
"produce odd latent_t are rounded down (e.g. 49 → 45)."),
|
||||
io.Int.Input("batch_size", default=1, min=1, max=64),
|
||||
],
|
||||
outputs=[
|
||||
io.Conditioning.Output(display_name="positive"),
|
||||
io.Conditioning.Output(display_name="negative"),
|
||||
io.Latent.Output(display_name="latent"),
|
||||
],
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def execute(cls, positive, negative, vae, video, quadmask,
|
||||
width, height, length, batch_size) -> io.NodeOutput:
|
||||
|
||||
adjusted_length = _valid_void_length(length)
|
||||
if adjusted_length != length:
|
||||
logging.warning(
|
||||
"VOIDInpaintConditioning: rounding length %d down to %d so that "
|
||||
"latent_t is even (required by CogVideoX-Fun-V1.5 patch_size_t=2). "
|
||||
"Using odd latent_t causes the last frame to be corrupted by "
|
||||
"circular padding.", length, adjusted_length,
|
||||
)
|
||||
length = adjusted_length
|
||||
|
||||
latent_t = ((length - 1) // TEMPORAL_COMPRESSION) + 1
|
||||
latent_h = height // 8
|
||||
latent_w = width // 8
|
||||
|
||||
vid = video[:length]
|
||||
vid = comfy.utils.common_upscale(
|
||||
vid.movedim(-1, 1), width, height, "bilinear", "center"
|
||||
).movedim(1, -1)
|
||||
|
||||
qm = quadmask[:length]
|
||||
if qm.ndim == 3:
|
||||
qm = qm.unsqueeze(-1)
|
||||
qm = comfy.utils.common_upscale(
|
||||
qm.movedim(-1, 1), width, height, "bilinear", "center"
|
||||
).movedim(1, -1)
|
||||
if qm.ndim == 4 and qm.shape[-1] == 1:
|
||||
qm = qm.squeeze(-1)
|
||||
|
||||
mask_condition = qm
|
||||
if mask_condition.ndim == 3:
|
||||
mask_condition_3ch = mask_condition.unsqueeze(-1).expand(-1, -1, -1, 3)
|
||||
else:
|
||||
mask_condition_3ch = mask_condition
|
||||
|
||||
inverted_mask_3ch = 1.0 - mask_condition_3ch
|
||||
masked_video = vid[:, :, :, :3] * (1.0 - mask_condition_3ch)
|
||||
|
||||
mask_latents = vae.encode(inverted_mask_3ch)
|
||||
masked_video_latents = vae.encode(masked_video)
|
||||
|
||||
def _match_temporal(lat, target_t):
|
||||
if lat.shape[2] > target_t:
|
||||
return lat[:, :, :target_t]
|
||||
elif lat.shape[2] < target_t:
|
||||
pad = target_t - lat.shape[2]
|
||||
return torch.cat([lat, lat[:, :, -1:].repeat(1, 1, pad, 1, 1)], dim=2)
|
||||
return lat
|
||||
|
||||
mask_latents = _match_temporal(mask_latents, latent_t)
|
||||
masked_video_latents = _match_temporal(masked_video_latents, latent_t)
|
||||
|
||||
inpaint_latents = torch.cat([mask_latents, masked_video_latents], dim=1)
|
||||
|
||||
# No explicit scaling needed here: the model's CogVideoX.concat_cond()
|
||||
# applies process_latent_in (×latent_format.scale_factor) to each 16-ch
|
||||
# block of the stored conditioning. For 5b-class checkpoints (incl. the
|
||||
# VOID/CogVideoX-Fun-V1.5 inpainting model) that scale_factor is auto-
|
||||
# selected as 0.7 in supported_models.CogVideoX_T2V, which matches the
|
||||
# diffusers vae/config.json scaling_factor VOID was trained with.
|
||||
|
||||
positive = node_helpers.conditioning_set_values(
|
||||
positive, {"concat_latent_image": inpaint_latents}
|
||||
)
|
||||
negative = node_helpers.conditioning_set_values(
|
||||
negative, {"concat_latent_image": inpaint_latents}
|
||||
)
|
||||
|
||||
noise_latent = torch.zeros(
|
||||
[batch_size, 16, latent_t, latent_h, latent_w],
|
||||
device=comfy.model_management.intermediate_device()
|
||||
)
|
||||
|
||||
return io.NodeOutput(positive, negative, {"samples": noise_latent})
|
||||
|
||||
|
||||
class VOIDWarpedNoise(io.ComfyNode):
|
||||
"""Generate optical-flow warped noise for VOID Pass 2 refinement.
|
||||
|
||||
Takes the Pass 1 output video and produces temporally-correlated noise
|
||||
by warping Gaussian noise along optical flow vectors. This noise is used
|
||||
as the initial latent for Pass 2, resulting in better temporal consistency.
|
||||
|
||||
Requires: pip install rp (auto-installs Go-with-the-Flow dependencies)
|
||||
"""
|
||||
|
||||
@classmethod
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="VOIDWarpedNoise",
|
||||
category="latent/video",
|
||||
inputs=[
|
||||
io.Image.Input("video", tooltip="Pass 1 output video frames [T, H, W, 3]"),
|
||||
io.Int.Input("width", default=672, min=16, max=nodes.MAX_RESOLUTION, step=8),
|
||||
io.Int.Input("height", default=384, min=16, max=nodes.MAX_RESOLUTION, step=8),
|
||||
io.Int.Input("length", default=45, min=1, max=nodes.MAX_RESOLUTION, step=1,
|
||||
tooltip="Number of pixel frames. Rounded down to make latent_t "
|
||||
"even (patch_size_t=2 requirement), e.g. 49 → 45."),
|
||||
io.Int.Input("batch_size", default=1, min=1, max=64),
|
||||
],
|
||||
outputs=[
|
||||
io.Latent.Output(display_name="warped_noise"),
|
||||
],
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def execute(cls, video, width, height, length, batch_size) -> io.NodeOutput:
|
||||
try:
|
||||
import rp
|
||||
rp.r._pip_import_autoyes = True
|
||||
rp.git_import('CommonSource')
|
||||
import rp.git.CommonSource.noise_warp as nw
|
||||
except ImportError:
|
||||
raise RuntimeError(
|
||||
"VOIDWarpedNoise requires the 'rp' package. Install with: pip install rp"
|
||||
)
|
||||
|
||||
adjusted_length = _valid_void_length(length)
|
||||
if adjusted_length != length:
|
||||
logging.warning(
|
||||
"VOIDWarpedNoise: rounding length %d down to %d so that "
|
||||
"latent_t is even (required by CogVideoX-Fun-V1.5 patch_size_t=2).",
|
||||
length, adjusted_length,
|
||||
)
|
||||
length = adjusted_length
|
||||
|
||||
latent_t = ((length - 1) // TEMPORAL_COMPRESSION) + 1
|
||||
latent_h = height // 8
|
||||
latent_w = width // 8
|
||||
|
||||
# rp.get_noise_from_video expects uint8 numpy frames; everything
|
||||
# downstream of the warp stays on torch.
|
||||
vid_uint8 = (video[:length].clamp(0, 1) * 255).to(torch.uint8).cpu().numpy()
|
||||
|
||||
frames = [vid_uint8[i] for i in range(vid_uint8.shape[0])]
|
||||
frames = rp.resize_images_to_hold(frames, height=height, width=width)
|
||||
frames = rp.crop_images(frames, height=height, width=width, origin='center')
|
||||
frames = rp.as_numpy_array(frames)
|
||||
|
||||
FRAME = 2**-1
|
||||
FLOW = 2**3
|
||||
LATENT_SCALE = 8
|
||||
|
||||
warp_output = nw.get_noise_from_video(
|
||||
frames,
|
||||
remove_background=False,
|
||||
visualize=False,
|
||||
save_files=False,
|
||||
noise_channels=16,
|
||||
output_folder=None,
|
||||
resize_frames=FRAME,
|
||||
resize_flow=FLOW,
|
||||
downscale_factor=round(FRAME * FLOW) * LATENT_SCALE,
|
||||
)
|
||||
|
||||
# (T, H, W, C) → torch on intermediate device for torchified resize.
|
||||
warped = torch.from_numpy(warp_output.numpy_noises).float()
|
||||
device = comfy.model_management.intermediate_device()
|
||||
warped = warped.to(device)
|
||||
|
||||
if warped.shape[0] != latent_t:
|
||||
indices = torch.linspace(0, warped.shape[0] - 1, latent_t,
|
||||
device=device).long()
|
||||
warped = warped[indices]
|
||||
|
||||
if warped.shape[1] != latent_h or warped.shape[2] != latent_w:
|
||||
# (T, H, W, C) → (T, C, H, W) → bilinear resize → back
|
||||
warped = warped.permute(0, 3, 1, 2)
|
||||
warped = torch.nn.functional.interpolate(
|
||||
warped, size=(latent_h, latent_w),
|
||||
mode="bilinear", align_corners=False,
|
||||
)
|
||||
warped = warped.permute(0, 2, 3, 1)
|
||||
|
||||
# (T, H, W, C) → (B, C, T, H, W)
|
||||
warped_tensor = warped.permute(3, 0, 1, 2).unsqueeze(0)
|
||||
if batch_size > 1:
|
||||
warped_tensor = warped_tensor.repeat(batch_size, 1, 1, 1, 1)
|
||||
|
||||
return io.NodeOutput({"samples": warped_tensor})
|
||||
|
||||
|
||||
class Noise_FromLatent:
|
||||
"""Wraps a pre-computed LATENT tensor as a NOISE source."""
|
||||
def __init__(self, latent_dict):
|
||||
self.seed = 0
|
||||
self._samples = latent_dict["samples"]
|
||||
|
||||
def generate_noise(self, input_latent):
|
||||
return self._samples.clone().cpu()
|
||||
|
||||
|
||||
class VOIDWarpedNoiseSource(io.ComfyNode):
|
||||
"""Convert a LATENT (e.g. from VOIDWarpedNoise) into a NOISE source
|
||||
for use with SamplerCustomAdvanced."""
|
||||
|
||||
@classmethod
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="VOIDWarpedNoiseSource",
|
||||
category="sampling/custom_sampling/noise",
|
||||
inputs=[
|
||||
io.Latent.Input("warped_noise",
|
||||
tooltip="Warped noise latent from VOIDWarpedNoise"),
|
||||
],
|
||||
outputs=[io.Noise.Output()],
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def execute(cls, warped_noise) -> io.NodeOutput:
|
||||
return io.NodeOutput(Noise_FromLatent(warped_noise))
|
||||
|
||||
|
||||
class VOID_DDIM(comfy.samplers.Sampler):
|
||||
"""DDIM sampler for VOID inpainting models.
|
||||
|
||||
VOID was trained with the diffusers CogVideoXDDIMScheduler which operates in
|
||||
alpha-space (input std ≈ 1). The standard KSampler applies noise_scaling that
|
||||
multiplies by sqrt(1+sigma^2) ≈ 4500x, which is incompatible with VOID's
|
||||
training. This sampler skips noise_scaling and implements the DDIM update rule
|
||||
directly using sigma-to-alpha conversion.
|
||||
"""
|
||||
|
||||
def sample(self, model_wrap, sigmas, extra_args, callback, noise, latent_image=None, denoise_mask=None, disable_pbar=False):
|
||||
x = noise.to(torch.float32)
|
||||
model_options = extra_args.get("model_options", {})
|
||||
seed = extra_args.get("seed", None)
|
||||
s_in = x.new_ones([x.shape[0]])
|
||||
|
||||
for i in trange(len(sigmas) - 1, disable=disable_pbar):
|
||||
sigma = sigmas[i]
|
||||
sigma_next = sigmas[i + 1]
|
||||
|
||||
denoised = model_wrap(x, sigma * s_in, model_options=model_options, seed=seed)
|
||||
|
||||
if callback is not None:
|
||||
callback(i, denoised, x, len(sigmas) - 1)
|
||||
|
||||
if sigma_next == 0:
|
||||
x = denoised
|
||||
else:
|
||||
alpha_t = 1.0 / (1.0 + sigma ** 2)
|
||||
alpha_prev = 1.0 / (1.0 + sigma_next ** 2)
|
||||
|
||||
pred_eps = (x - (alpha_t ** 0.5) * denoised) / (1.0 - alpha_t) ** 0.5
|
||||
x = (alpha_prev ** 0.5) * denoised + (1.0 - alpha_prev) ** 0.5 * pred_eps
|
||||
|
||||
return x
|
||||
|
||||
|
||||
class VOIDSampler(io.ComfyNode):
|
||||
"""VOID DDIM sampler for use with SamplerCustom / SamplerCustomAdvanced.
|
||||
|
||||
Required for VOID inpainting models. Implements the same DDIM loop that VOID
|
||||
was trained with (diffusers CogVideoXDDIMScheduler), without the noise_scaling
|
||||
that the standard KSampler applies. Use with RandomNoise or VOIDWarpedNoiseSource.
|
||||
"""
|
||||
|
||||
@classmethod
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="VOIDSampler",
|
||||
category="sampling/custom_sampling/samplers",
|
||||
inputs=[],
|
||||
outputs=[io.Sampler.Output()],
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def execute(cls) -> io.NodeOutput:
|
||||
return io.NodeOutput(VOID_DDIM())
|
||||
|
||||
get_sampler = execute
|
||||
|
||||
|
||||
class VOIDExtension(ComfyExtension):
|
||||
@override
|
||||
async def get_node_list(self) -> list[type[io.ComfyNode]]:
|
||||
return [
|
||||
VOIDQuadmaskPreprocess,
|
||||
VOIDInpaintConditioning,
|
||||
VOIDWarpedNoise,
|
||||
VOIDWarpedNoiseSource,
|
||||
VOIDSampler,
|
||||
]
|
||||
|
||||
|
||||
async def comfy_entrypoint() -> VOIDExtension:
|
||||
return VOIDExtension()
|
||||
@@ -1,3 +1,3 @@
|
||||
# This file is automatically generated by the build process when version is
|
||||
# updated in pyproject.toml.
|
||||
__version__ = "0.19.3"
|
||||
__version__ = "0.19.0"
|
||||
|
||||
7
nodes.py
7
nodes.py
@@ -977,7 +977,7 @@ class CLIPLoader:
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
return {"required": { "clip_name": (folder_paths.get_filename_list("text_encoders"), ),
|
||||
"type": (["stable_diffusion", "stable_cascade", "sd3", "stable_audio", "mochi", "ltxv", "pixart", "cosmos", "lumina2", "wan", "hidream", "chroma", "ace", "omnigen2", "qwen_image", "hunyuan_image", "flux2", "ovis", "longcat_image"], ),
|
||||
"type": (["stable_diffusion", "stable_cascade", "sd3", "stable_audio", "mochi", "ltxv", "pixart", "cosmos", "lumina2", "wan", "hidream", "chroma", "ace", "omnigen2", "qwen_image", "hunyuan_image", "flux2", "ovis", "longcat_image", "cogvideox"], ),
|
||||
},
|
||||
"optional": {
|
||||
"device": (["default", "cpu"], {"advanced": True}),
|
||||
@@ -987,7 +987,7 @@ class CLIPLoader:
|
||||
|
||||
CATEGORY = "advanced/loaders"
|
||||
|
||||
DESCRIPTION = "[Recipes]\n\nstable_diffusion: clip-l\nstable_cascade: clip-g\nsd3: t5 xxl/ clip-g / clip-l\nstable_audio: t5 base\nmochi: t5 xxl\ncosmos: old t5 xxl\nlumina2: gemma 2 2B\nwan: umt5 xxl\n hidream: llama-3.1 (Recommend) or t5\nomnigen2: qwen vl 2.5 3B"
|
||||
DESCRIPTION = "[Recipes]\n\nstable_diffusion: clip-l\nstable_cascade: clip-g\nsd3: t5 xxl/ clip-g / clip-l\nstable_audio: t5 base\nmochi: t5 xxl\ncogvideox: t5 xxl (226-token padding)\ncosmos: old t5 xxl\nlumina2: gemma 2 2B\nwan: umt5 xxl\n hidream: llama-3.1 (Recommend) or t5\nomnigen2: qwen vl 2.5 3B"
|
||||
|
||||
def load_clip(self, clip_name, type="stable_diffusion", device="default"):
|
||||
clip_type = getattr(comfy.sd.CLIPType, type.upper(), comfy.sd.CLIPType.STABLE_DIFFUSION)
|
||||
@@ -2457,7 +2457,8 @@ async def init_builtin_extra_nodes():
|
||||
"nodes_number_convert.py",
|
||||
"nodes_painter.py",
|
||||
"nodes_curve.py",
|
||||
"nodes_rtdetr.py"
|
||||
"nodes_rtdetr.py",
|
||||
"nodes_void.py",
|
||||
]
|
||||
|
||||
import_failed = []
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[project]
|
||||
name = "ComfyUI"
|
||||
version = "0.19.3"
|
||||
version = "0.19.0"
|
||||
readme = "README.md"
|
||||
license = { file = "LICENSE" }
|
||||
requires-python = ">=3.10"
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
comfyui-frontend-package==1.42.14
|
||||
comfyui-workflow-templates==0.9.57
|
||||
comfyui-frontend-package==1.42.10
|
||||
comfyui-workflow-templates==0.9.50
|
||||
comfyui-embedded-docs==0.4.3
|
||||
torch
|
||||
torchsde
|
||||
@@ -19,7 +19,7 @@ scipy
|
||||
tqdm
|
||||
psutil
|
||||
alembic
|
||||
SQLAlchemy>=2.0
|
||||
SQLAlchemy
|
||||
filelock
|
||||
av>=14.2.0
|
||||
comfy-kitchen>=0.2.8
|
||||
|
||||
Reference in New Issue
Block a user