add support for headless

Merge branch 'master' into pysssss/angle-glsl
Merge remote-tracking branch 'origin/master' into pysssss/angle-glsl
2026-04-15 12:11:43 +00:00 · 2026-04-08 02:36:28 -07:00 · 2026-04-07 10:16:31 +01:00 · 2026-03-27 09:05:42 -07:00 · 2026-03-26 07:37:19 -07:00 · 2026-03-24 12:10:03 -07:00
33 changed files with 322 additions and 3126 deletions
--- a/.ci/windows_intel_base_files/run_intel_gpu.bat
+++ b/.ci/windows_intel_base_files/run_intel_gpu.bat
@@ -1,2 +0,0 @@
-.\python_embeded\python.exe -s ComfyUI\main.py --windows-standalone-build
-pause
--- a/QUANTIZATION.md
+++ b/QUANTIZATION.md
@@ -139,9 +139,9 @@ Example:
  "_quantization_metadata": {
    "format_version": "1.0",
    "layers": {
-      "model.layers.0.mlp.up_proj": {"format": "float8_e4m3fn"},
-      "model.layers.0.mlp.down_proj": {"format": "float8_e4m3fn"},
-      "model.layers.1.mlp.up_proj": {"format": "float8_e4m3fn"}
+      "model.layers.0.mlp.up_proj": "float8_e4m3fn",
+      "model.layers.0.mlp.down_proj": "float8_e4m3fn",
+      "model.layers.1.mlp.up_proj": "float8_e4m3fn"
    }
  }
 }
@@ -165,4 +165,4 @@ Activation quantization (e.g., for FP8 Tensor Core operations) requires `input_s
 3. **Compute scales**: Derive `input_scale` from collected statistics
 4. **Store in checkpoint**: Save `input_scale` parameters alongside weights

-The calibration dataset should be representative of your target use case. For diffusion models, this typically means a diverse set of prompts and generation parameters.
+The calibration dataset should be representative of your target use case. For diffusion models, this typically means a diverse set of prompts and generation parameters.
--- a/blueprints/Brightness
+++ b/blueprints/Brightness
@@ -182,7 +182,7 @@
              ]
            },
            "widgets_values": [
-              0
+              50
            ]
          },
          {
--- a/blueprints/Glow.json
+++ b/blueprints/Glow.json
@@ -316,7 +316,7 @@
              "step": 1
            },
            "widgets_values": [
-              0
+              30
            ]
          },
          {
--- a/comfy/latent_formats.py
+++ b/comfy/latent_formats.py
@@ -783,10 +783,3 @@ class ZImagePixelSpace(ChromaRadiance):
    No VAE encoding/decoding — the model operates directly on RGB pixels.
    """
    pass
-
-class CogVideoX(LatentFormat):
-    latent_channels = 16
-    latent_dimensions = 3
-
-    def __init__(self):
-        self.scale_factor = 1.15258426
--- a/comfy/ldm/cogvideo/__init__.py
+++ b/comfy/ldm/cogvideo/__init__.py
--- a/comfy/ldm/cogvideo/model.py
+++ b/comfy/ldm/cogvideo/model.py
@@ -1,573 +0,0 @@
-# CogVideoX 3D Transformer - ported to ComfyUI native ops
-# Architecture reference: diffusers CogVideoXTransformer3DModel
-# Style reference: comfy/ldm/wan/model.py
-
-import math
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-from comfy.ldm.modules.attention import optimized_attention
-import comfy.patcher_extension
-import comfy.ldm.common_dit
-
-
-def _get_1d_rotary_pos_embed(dim, pos, theta=10000.0):
-    """Returns (cos, sin) each with shape [seq_len, dim].
-
-    Frequencies are computed at dim//2 resolution then repeat_interleaved
-    to full dim, matching CogVideoX's interleaved (real, imag) pair format.
-    """
-    freqs = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float32, device=pos.device) / dim))
-    angles = torch.outer(pos.float(), freqs.float())
-    cos = angles.cos().repeat_interleave(2, dim=-1).float()
-    sin = angles.sin().repeat_interleave(2, dim=-1).float()
-    return (cos, sin)
-
-
-def apply_rotary_emb(x, freqs_cos_sin):
-    """Apply CogVideoX rotary embedding to query or key tensor.
-
-    x: [B, heads, seq_len, head_dim]
-    freqs_cos_sin: (cos, sin) each [seq_len, head_dim//2]
-
-    Uses interleaved pair rotation (same as diffusers CogVideoX/Flux).
-    head_dim is reshaped to (-1, 2) pairs, rotated, then flattened back.
-    """
-    cos, sin = freqs_cos_sin
-    cos = cos[None, None, :, :].to(x.device)
-    sin = sin[None, None, :, :].to(x.device)
-
-    # Interleaved pairs: [B, H, S, D] -> [B, H, S, D//2, 2] -> (real, imag)
-    x_real, x_imag = x.reshape(*x.shape[:-1], -1, 2).unbind(-1)
-    x_rotated = torch.stack([-x_imag, x_real], dim=-1).flatten(3)
-
-    return (x.float() * cos + x_rotated.float() * sin).to(x.dtype)
-
-
-def get_timestep_embedding(timesteps, dim, flip_sin_to_cos=True, downscale_freq_shift=0, scale=1, max_period=10000):
-    half = dim // 2
-    freqs = torch.exp(-math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32, device=timesteps.device) / half)
-    args = timesteps[:, None].float() * freqs[None] * scale
-    embedding = torch.cat([torch.sin(args), torch.cos(args)], dim=-1)
-    if flip_sin_to_cos:
-        embedding = torch.cat([embedding[:, half:], embedding[:, :half]], dim=-1)
-    if dim % 2:
-        embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
-    return embedding
-
-
-def get_3d_sincos_pos_embed(embed_dim, spatial_size, temporal_size, spatial_interpolation_scale=1.0, temporal_interpolation_scale=1.0, device=None):
-    if isinstance(spatial_size, int):
-        spatial_size = (spatial_size, spatial_size)
-
-    grid_w = torch.arange(spatial_size[0], dtype=torch.float32, device=device) / spatial_interpolation_scale
-    grid_h = torch.arange(spatial_size[1], dtype=torch.float32, device=device) / spatial_interpolation_scale
-    grid_t = torch.arange(temporal_size, dtype=torch.float32, device=device) / temporal_interpolation_scale
-
-    grid_t, grid_h, grid_w = torch.meshgrid(grid_t, grid_h, grid_w, indexing="ij")
-
-    embed_dim_spatial = 2 * (embed_dim // 3)
-    embed_dim_temporal = embed_dim // 3
-
-    pos_embed_spatial = _get_2d_sincos_pos_embed(embed_dim_spatial, grid_h, grid_w, device=device)
-    pos_embed_temporal = _get_1d_sincos_pos_embed(embed_dim_temporal, grid_t[:, 0, 0], device=device)
-
-    T, H, W = grid_t.shape
-    pos_embed_temporal = pos_embed_temporal.unsqueeze(1).unsqueeze(1).expand(-1, H, W, -1)
-    pos_embed = torch.cat([pos_embed_temporal, pos_embed_spatial], dim=-1)
-
-    return pos_embed
-
-
-def _get_2d_sincos_pos_embed(embed_dim, grid_h, grid_w, device=None):
-    T, H, W = grid_h.shape
-    half_dim = embed_dim // 2
-    pos_h = _get_1d_sincos_pos_embed(half_dim, grid_h.reshape(-1), device=device).reshape(T, H, W, half_dim)
-    pos_w = _get_1d_sincos_pos_embed(half_dim, grid_w.reshape(-1), device=device).reshape(T, H, W, half_dim)
-    return torch.cat([pos_h, pos_w], dim=-1)
-
-
-def _get_1d_sincos_pos_embed(embed_dim, pos, device=None):
-    half = embed_dim // 2
-    freqs = torch.exp(-math.log(10000.0) * torch.arange(start=0, end=half, dtype=torch.float32, device=device) / half)
-    args = pos.float().reshape(-1)[:, None] * freqs[None]
-    embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
-    if embed_dim % 2:
-        embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
-    return embedding
-
-
-
-class CogVideoXPatchEmbed(nn.Module):
-    def __init__(self, patch_size=2, patch_size_t=None, in_channels=16, dim=1920,
-                 text_dim=4096, bias=True, sample_width=90, sample_height=60,
-                 sample_frames=49, temporal_compression_ratio=4,
-                 max_text_seq_length=226, spatial_interpolation_scale=1.875,
-                 temporal_interpolation_scale=1.0, use_positional_embeddings=True,
-                 use_learned_positional_embeddings=True,
-                 device=None, dtype=None, operations=None):
-        super().__init__()
-        self.patch_size = patch_size
-        self.patch_size_t = patch_size_t
-        self.dim = dim
-        self.sample_height = sample_height
-        self.sample_width = sample_width
-        self.sample_frames = sample_frames
-        self.temporal_compression_ratio = temporal_compression_ratio
-        self.max_text_seq_length = max_text_seq_length
-        self.spatial_interpolation_scale = spatial_interpolation_scale
-        self.temporal_interpolation_scale = temporal_interpolation_scale
-        self.use_positional_embeddings = use_positional_embeddings
-        self.use_learned_positional_embeddings = use_learned_positional_embeddings
-
-        if patch_size_t is None:
-            self.proj = operations.Conv2d(in_channels, dim, kernel_size=patch_size, stride=patch_size, bias=bias, device=device, dtype=dtype)
-        else:
-            self.proj = operations.Linear(in_channels * patch_size * patch_size * patch_size_t, dim, device=device, dtype=dtype)
-
-        self.text_proj = operations.Linear(text_dim, dim, device=device, dtype=dtype)
-
-        if use_positional_embeddings or use_learned_positional_embeddings:
-            persistent = use_learned_positional_embeddings
-            pos_embedding = self._get_positional_embeddings(sample_height, sample_width, sample_frames)
-            self.register_buffer("pos_embedding", pos_embedding, persistent=persistent)
-
-    def _get_positional_embeddings(self, sample_height, sample_width, sample_frames, device=None):
-        post_patch_height = sample_height // self.patch_size
-        post_patch_width = sample_width // self.patch_size
-        post_time_compression_frames = (sample_frames - 1) // self.temporal_compression_ratio + 1
-        if self.patch_size_t is not None:
-            post_time_compression_frames = post_time_compression_frames // self.patch_size_t
-        num_patches = post_patch_height * post_patch_width * post_time_compression_frames
-
-        pos_embedding = get_3d_sincos_pos_embed(
-            self.dim,
-            (post_patch_width, post_patch_height),
-            post_time_compression_frames,
-            self.spatial_interpolation_scale,
-            self.temporal_interpolation_scale,
-            device=device,
-        )
-        pos_embedding = pos_embedding.reshape(-1, self.dim)
-        joint_pos_embedding = pos_embedding.new_zeros(
-            1, self.max_text_seq_length + num_patches, self.dim, requires_grad=False
-        )
-        joint_pos_embedding.data[:, self.max_text_seq_length:].copy_(pos_embedding)
-        return joint_pos_embedding
-
-    def forward(self, text_embeds, image_embeds):
-        input_dtype = text_embeds.dtype
-        text_embeds = self.text_proj(text_embeds.to(self.text_proj.weight.dtype)).to(input_dtype)
-        batch_size, num_frames, channels, height, width = image_embeds.shape
-
-        proj_dtype = self.proj.weight.dtype
-        if self.patch_size_t is None:
-            image_embeds = image_embeds.reshape(-1, channels, height, width)
-            image_embeds = self.proj(image_embeds.to(proj_dtype)).to(input_dtype)
-            image_embeds = image_embeds.view(batch_size, num_frames, *image_embeds.shape[1:])
-            image_embeds = image_embeds.flatten(3).transpose(2, 3)
-            image_embeds = image_embeds.flatten(1, 2)
-        else:
-            p = self.patch_size
-            p_t = self.patch_size_t
-            image_embeds = image_embeds.permute(0, 1, 3, 4, 2)
-            image_embeds = image_embeds.reshape(
-                batch_size, num_frames // p_t, p_t, height // p, p, width // p, p, channels
-            )
-            image_embeds = image_embeds.permute(0, 1, 3, 5, 7, 2, 4, 6).flatten(4, 7).flatten(1, 3)
-            image_embeds = self.proj(image_embeds.to(proj_dtype)).to(input_dtype)
-
-        embeds = torch.cat([text_embeds, image_embeds], dim=1).contiguous()
-
-        if self.use_positional_embeddings or self.use_learned_positional_embeddings:
-            text_seq_length = text_embeds.shape[1]
-            num_image_patches = image_embeds.shape[1]
-
-            if self.use_learned_positional_embeddings:
-                image_pos = self.pos_embedding[
-                    :, self.max_text_seq_length:self.max_text_seq_length + num_image_patches
-                ].to(device=embeds.device, dtype=embeds.dtype)
-            else:
-                image_pos = get_3d_sincos_pos_embed(
-                    self.dim,
-                    (width // self.patch_size, height // self.patch_size),
-                    num_image_patches // ((height // self.patch_size) * (width // self.patch_size)),
-                    self.spatial_interpolation_scale,
-                    self.temporal_interpolation_scale,
-                    device=embeds.device,
-                ).reshape(1, num_image_patches, self.dim).to(dtype=embeds.dtype)
-
-            # Build joint: zeros for text + sincos for image
-            joint_pos = torch.zeros(1, text_seq_length + num_image_patches, self.dim, device=embeds.device, dtype=embeds.dtype)
-            joint_pos[:, text_seq_length:] = image_pos
-            embeds = embeds + joint_pos
-
-        return embeds
-
-
-class CogVideoXLayerNormZero(nn.Module):
-    def __init__(self, time_dim, dim, elementwise_affine=True, eps=1e-5, bias=True,
-                 device=None, dtype=None, operations=None):
-        super().__init__()
-        self.silu = nn.SiLU()
-        self.linear = operations.Linear(time_dim, 6 * dim, bias=bias, device=device, dtype=dtype)
-        self.norm = operations.LayerNorm(dim, eps=eps, elementwise_affine=elementwise_affine, device=device, dtype=dtype)
-
-    def forward(self, hidden_states, encoder_hidden_states, temb):
-        shift, scale, gate, enc_shift, enc_scale, enc_gate = self.linear(self.silu(temb)).chunk(6, dim=1)
-        hidden_states = self.norm(hidden_states) * (1 + scale)[:, None, :] + shift[:, None, :]
-        encoder_hidden_states = self.norm(encoder_hidden_states) * (1 + enc_scale)[:, None, :] + enc_shift[:, None, :]
-        return hidden_states, encoder_hidden_states, gate[:, None, :], enc_gate[:, None, :]
-
-
-class CogVideoXAdaLayerNorm(nn.Module):
-    def __init__(self, time_dim, dim, elementwise_affine=True, eps=1e-5,
-                 device=None, dtype=None, operations=None):
-        super().__init__()
-        self.silu = nn.SiLU()
-        self.linear = operations.Linear(time_dim, 2 * dim, device=device, dtype=dtype)
-        self.norm = operations.LayerNorm(dim, eps=eps, elementwise_affine=elementwise_affine, device=device, dtype=dtype)
-
-    def forward(self, x, temb):
-        temb = self.linear(self.silu(temb))
-        shift, scale = temb.chunk(2, dim=1)
-        x = self.norm(x) * (1 + scale)[:, None, :] + shift[:, None, :]
-        return x
-
-
-class CogVideoXBlock(nn.Module):
-    def __init__(self, dim, num_heads, head_dim, time_dim,
-                 eps=1e-5, ff_inner_dim=None, ff_bias=True,
-                 device=None, dtype=None, operations=None):
-        super().__init__()
-        self.dim = dim
-        self.num_heads = num_heads
-        self.head_dim = head_dim
-
-        self.norm1 = CogVideoXLayerNormZero(time_dim, dim, eps=eps, device=device, dtype=dtype, operations=operations)
-
-        # Self-attention (joint text + latent)
-        self.q = operations.Linear(dim, dim, bias=True, device=device, dtype=dtype)
-        self.k = operations.Linear(dim, dim, bias=True, device=device, dtype=dtype)
-        self.v = operations.Linear(dim, dim, bias=True, device=device, dtype=dtype)
-        self.norm_q = operations.LayerNorm(head_dim, eps=1e-6, elementwise_affine=True, device=device, dtype=dtype)
-        self.norm_k = operations.LayerNorm(head_dim, eps=1e-6, elementwise_affine=True, device=device, dtype=dtype)
-        self.attn_out = operations.Linear(dim, dim, bias=True, device=device, dtype=dtype)
-
-        self.norm2 = CogVideoXLayerNormZero(time_dim, dim, eps=eps, device=device, dtype=dtype, operations=operations)
-
-        # Feed-forward (GELU approximate)
-        inner_dim = ff_inner_dim or dim * 4
-        self.ff_proj = operations.Linear(dim, inner_dim, bias=ff_bias, device=device, dtype=dtype)
-        self.ff_out = operations.Linear(inner_dim, dim, bias=ff_bias, device=device, dtype=dtype)
-
-    def forward(self, hidden_states, encoder_hidden_states, temb, image_rotary_emb=None, transformer_options=None):
-        if transformer_options is None:
-            transformer_options = {}
-        text_seq_length = encoder_hidden_states.size(1)
-
-        # Norm & modulate
-        norm_hidden, norm_encoder, gate_msa, enc_gate_msa = self.norm1(hidden_states, encoder_hidden_states, temb)
-
-        # Joint self-attention
-        qkv_input = torch.cat([norm_encoder, norm_hidden], dim=1)
-        b, s, _ = qkv_input.shape
-        n, d = self.num_heads, self.head_dim
-
-        q = self.q(qkv_input).view(b, s, n, d)
-        k = self.k(qkv_input).view(b, s, n, d)
-        v = self.v(qkv_input)
-
-        q = self.norm_q(q).view(b, s, n, d)
-        k = self.norm_k(k).view(b, s, n, d)
-
-        # Apply rotary embeddings to image tokens only (diffusers format: [B, heads, seq, head_dim])
-        if image_rotary_emb is not None:
-            q_img = q[:, text_seq_length:].transpose(1, 2)  # [B, heads, img_seq, head_dim]
-            k_img = k[:, text_seq_length:].transpose(1, 2)
-            q_img = apply_rotary_emb(q_img, image_rotary_emb)
-            k_img = apply_rotary_emb(k_img, image_rotary_emb)
-            q = torch.cat([q[:, :text_seq_length], q_img.transpose(1, 2)], dim=1)
-            k = torch.cat([k[:, :text_seq_length], k_img.transpose(1, 2)], dim=1)
-
-        attn_out = optimized_attention(
-            q.reshape(b, s, n * d),
-            k.reshape(b, s, n * d),
-            v,
-            heads=self.num_heads,
-            transformer_options=transformer_options,
-        )
-
-        attn_out = self.attn_out(attn_out)
-
-        attn_encoder, attn_hidden = attn_out.split([text_seq_length, s - text_seq_length], dim=1)
-
-        hidden_states = hidden_states + gate_msa * attn_hidden
-        encoder_hidden_states = encoder_hidden_states + enc_gate_msa * attn_encoder
-
-        # Norm & modulate for FF
-        norm_hidden, norm_encoder, gate_ff, enc_gate_ff = self.norm2(hidden_states, encoder_hidden_states, temb)
-
-        # Feed-forward (GELU on concatenated text + latent)
-        ff_input = torch.cat([norm_encoder, norm_hidden], dim=1)
-        ff_output = self.ff_out(F.gelu(self.ff_proj(ff_input), approximate="tanh"))
-
-        hidden_states = hidden_states + gate_ff * ff_output[:, text_seq_length:]
-        encoder_hidden_states = encoder_hidden_states + enc_gate_ff * ff_output[:, :text_seq_length]
-
-        return hidden_states, encoder_hidden_states
-
-
-class CogVideoXTransformer3DModel(nn.Module):
-    def __init__(self,
-                 num_attention_heads=30,
-                 attention_head_dim=64,
-                 in_channels=16,
-                 out_channels=16,
-                 flip_sin_to_cos=True,
-                 freq_shift=0,
-                 time_embed_dim=512,
-                 ofs_embed_dim=None,
-                 text_embed_dim=4096,
-                 num_layers=30,
-                 dropout=0.0,
-                 attention_bias=True,
-                 sample_width=90,
-                 sample_height=60,
-                 sample_frames=49,
-                 patch_size=2,
-                 patch_size_t=None,
-                 temporal_compression_ratio=4,
-                 max_text_seq_length=226,
-                 spatial_interpolation_scale=1.875,
-                 temporal_interpolation_scale=1.0,
-                 use_rotary_positional_embeddings=False,
-                 use_learned_positional_embeddings=False,
-                 patch_bias=True,
-                 image_model=None,
-                 device=None,
-                 dtype=None,
-                 operations=None,
-                 ):
-        super().__init__()
-        self.dtype = dtype
-        dim = num_attention_heads * attention_head_dim
-        self.dim = dim
-        self.num_attention_heads = num_attention_heads
-        self.attention_head_dim = attention_head_dim
-        self.in_channels = in_channels
-        self.out_channels = out_channels
-        self.patch_size = patch_size
-        self.patch_size_t = patch_size_t
-        self.max_text_seq_length = max_text_seq_length
-        self.use_rotary_positional_embeddings = use_rotary_positional_embeddings
-
-        # 1. Patch embedding
-        self.patch_embed = CogVideoXPatchEmbed(
-            patch_size=patch_size,
-            patch_size_t=patch_size_t,
-            in_channels=in_channels,
-            dim=dim,
-            text_dim=text_embed_dim,
-            bias=patch_bias,
-            sample_width=sample_width,
-            sample_height=sample_height,
-            sample_frames=sample_frames,
-            temporal_compression_ratio=temporal_compression_ratio,
-            max_text_seq_length=max_text_seq_length,
-            spatial_interpolation_scale=spatial_interpolation_scale,
-            temporal_interpolation_scale=temporal_interpolation_scale,
-            use_positional_embeddings=not use_rotary_positional_embeddings,
-            use_learned_positional_embeddings=use_learned_positional_embeddings,
-            device=device, dtype=torch.float32, operations=operations,
-        )
-
-        # 2. Time embedding
-        self.time_proj_dim = dim
-        self.time_proj_flip = flip_sin_to_cos
-        self.time_proj_shift = freq_shift
-        self.time_embedding_linear_1 = operations.Linear(dim, time_embed_dim, device=device, dtype=dtype)
-        self.time_embedding_act = nn.SiLU()
-        self.time_embedding_linear_2 = operations.Linear(time_embed_dim, time_embed_dim, device=device, dtype=dtype)
-
-        # Optional OFS embedding (CogVideoX 1.5 I2V)
-        self.ofs_proj_dim = ofs_embed_dim
-        if ofs_embed_dim:
-            self.ofs_embedding_linear_1 = operations.Linear(ofs_embed_dim, ofs_embed_dim, device=device, dtype=dtype)
-            self.ofs_embedding_act = nn.SiLU()
-            self.ofs_embedding_linear_2 = operations.Linear(ofs_embed_dim, ofs_embed_dim, device=device, dtype=dtype)
-        else:
-            self.ofs_embedding_linear_1 = None
-
-        # 3. Transformer blocks
-        self.blocks = nn.ModuleList([
-            CogVideoXBlock(
-                dim=dim,
-                num_heads=num_attention_heads,
-                head_dim=attention_head_dim,
-                time_dim=time_embed_dim,
-                eps=1e-5,
-                device=device, dtype=dtype, operations=operations,
-            )
-            for _ in range(num_layers)
-        ])
-
-        self.norm_final = operations.LayerNorm(dim, eps=1e-5, elementwise_affine=True, device=device, dtype=dtype)
-
-        # 4. Output
-        self.norm_out = CogVideoXAdaLayerNorm(
-            time_dim=time_embed_dim, dim=dim, eps=1e-5,
-            device=device, dtype=dtype, operations=operations,
-        )
-
-        if patch_size_t is None:
-            output_dim = patch_size * patch_size * out_channels
-        else:
-            output_dim = patch_size * patch_size * patch_size_t * out_channels
-
-        self.proj_out = operations.Linear(dim, output_dim, device=device, dtype=dtype)
-
-        self.spatial_interpolation_scale = spatial_interpolation_scale
-        self.temporal_interpolation_scale = temporal_interpolation_scale
-        self.temporal_compression_ratio = temporal_compression_ratio
-
-    def forward(self, x, timestep, context, ofs=None, transformer_options=None, **kwargs):
-        if transformer_options is None:
-            transformer_options = {}
-        return comfy.patcher_extension.WrapperExecutor.new_class_executor(
-            self._forward,
-            self,
-            comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, transformer_options)
-        ).execute(x, timestep, context, ofs, transformer_options, **kwargs)
-
-    def _forward(self, x, timestep, context, ofs=None, transformer_options=None, **kwargs):
-        if transformer_options is None:
-            transformer_options = {}
-        # ComfyUI passes [B, C, T, H, W]
-        batch_size, channels, t, h, w = x.shape
-
-        # Pad to patch size (temporal + spatial), same pattern as WAN
-        p_t = self.patch_size_t if self.patch_size_t is not None else 1
-        x = comfy.ldm.common_dit.pad_to_patch_size(x, (p_t, self.patch_size, self.patch_size))
-
-        # CogVideoX expects [B, T, C, H, W]
-        x = x.permute(0, 2, 1, 3, 4)
-        batch_size, num_frames, channels, height, width = x.shape
-
-        # Time embedding
-        t_emb = get_timestep_embedding(timestep, self.time_proj_dim, self.time_proj_flip, self.time_proj_shift)
-        t_emb = t_emb.to(dtype=x.dtype)
-        emb = self.time_embedding_linear_2(self.time_embedding_act(self.time_embedding_linear_1(t_emb)))
-
-        if self.ofs_embedding_linear_1 is not None and ofs is not None:
-            ofs_emb = get_timestep_embedding(ofs, self.ofs_proj_dim, self.time_proj_flip, self.time_proj_shift)
-            ofs_emb = ofs_emb.to(dtype=x.dtype)
-            ofs_emb = self.ofs_embedding_linear_2(self.ofs_embedding_act(self.ofs_embedding_linear_1(ofs_emb)))
-            emb = emb + ofs_emb
-
-        # Patch embedding
-        hidden_states = self.patch_embed(context, x)
-
-        text_seq_length = context.shape[1]
-        encoder_hidden_states = hidden_states[:, :text_seq_length]
-        hidden_states = hidden_states[:, text_seq_length:]
-
-        # Rotary embeddings (if used)
-        image_rotary_emb = None
-        if self.use_rotary_positional_embeddings:
-            post_patch_height = height // self.patch_size
-            post_patch_width = width // self.patch_size
-            if self.patch_size_t is None:
-                post_time = num_frames
-            else:
-                post_time = num_frames // self.patch_size_t
-            image_rotary_emb = self._get_rotary_emb(post_patch_height, post_patch_width, post_time, device=x.device)
-
-        # Transformer blocks
-        for i, block in enumerate(self.blocks):
-            hidden_states, encoder_hidden_states = block(
-                hidden_states=hidden_states,
-                encoder_hidden_states=encoder_hidden_states,
-                temb=emb,
-                image_rotary_emb=image_rotary_emb,
-                transformer_options=transformer_options,
-            )
-
-        hidden_states = self.norm_final(hidden_states)
-
-        # Output projection
-        hidden_states = self.norm_out(hidden_states, temb=emb)
-        hidden_states = self.proj_out(hidden_states)
-
-        # Unpatchify
-        p = self.patch_size
-        p_t = self.patch_size_t
-
-        if p_t is None:
-            output = hidden_states.reshape(batch_size, num_frames, height // p, width // p, -1, p, p)
-            output = output.permute(0, 1, 4, 2, 5, 3, 6).flatten(5, 6).flatten(3, 4)
-        else:
-            output = hidden_states.reshape(
-                batch_size, (num_frames + p_t - 1) // p_t, height // p, width // p, -1, p_t, p, p
-            )
-            output = output.permute(0, 1, 5, 4, 2, 6, 3, 7).flatten(6, 7).flatten(4, 5).flatten(1, 2)
-
-        # Back to ComfyUI format [B, C, T, H, W] and crop padding
-        output = output.permute(0, 2, 1, 3, 4)[:, :, :t, :h, :w]
-        return output
-
-    def _get_rotary_emb(self, h, w, t, device):
-        """Compute CogVideoX 3D rotary positional embeddings.
-
-        For CogVideoX 1.5 (patch_size_t != None): uses "slice" mode — grid positions
-        are integer arange computed at max_size, then sliced to actual size.
-        For CogVideoX 1.0 (patch_size_t == None): uses "linspace" mode with crop coords
-        scaled by spatial_interpolation_scale.
-        """
-        d = self.attention_head_dim
-        dim_t = d // 4
-        dim_h = d // 8 * 3
-        dim_w = d // 8 * 3
-
-        if self.patch_size_t is not None:
-            # CogVideoX 1.5: "slice" mode — positions are simple integer indices
-            # Compute at max(sample_size, actual_size) then slice to actual
-            base_h = self.patch_embed.sample_height // self.patch_size
-            base_w = self.patch_embed.sample_width // self.patch_size
-            max_h = max(base_h, h)
-            max_w = max(base_w, w)
-
-            grid_h = torch.arange(max_h, device=device, dtype=torch.float32)
-            grid_w = torch.arange(max_w, device=device, dtype=torch.float32)
-            grid_t = torch.arange(t, device=device, dtype=torch.float32)
-        else:
-            # CogVideoX 1.0: "linspace" mode with interpolation scale
-            grid_h = torch.linspace(0, h - 1, h, device=device, dtype=torch.float32) * self.spatial_interpolation_scale
-            grid_w = torch.linspace(0, w - 1, w, device=device, dtype=torch.float32) * self.spatial_interpolation_scale
-            grid_t = torch.arange(t, device=device, dtype=torch.float32)
-
-        freqs_t = _get_1d_rotary_pos_embed(dim_t, grid_t)
-        freqs_h = _get_1d_rotary_pos_embed(dim_h, grid_h)
-        freqs_w = _get_1d_rotary_pos_embed(dim_w, grid_w)
-
-        t_cos, t_sin = freqs_t
-        h_cos, h_sin = freqs_h
-        w_cos, w_sin = freqs_w
-
-        # Slice to actual size (for "slice" mode where grids may be larger)
-        t_cos, t_sin = t_cos[:t], t_sin[:t]
-        h_cos, h_sin = h_cos[:h], h_sin[:h]
-        w_cos, w_sin = w_cos[:w], w_sin[:w]
-
-        # Broadcast and concatenate into [T*H*W, head_dim]
-        t_cos = t_cos[:, None, None, :].expand(-1, h, w, -1)
-        t_sin = t_sin[:, None, None, :].expand(-1, h, w, -1)
-        h_cos = h_cos[None, :, None, :].expand(t, -1, w, -1)
-        h_sin = h_sin[None, :, None, :].expand(t, -1, w, -1)
-        w_cos = w_cos[None, None, :, :].expand(t, h, -1, -1)
-        w_sin = w_sin[None, None, :, :].expand(t, h, -1, -1)
-
-        cos = torch.cat([t_cos, h_cos, w_cos], dim=-1).reshape(t * h * w, -1)
-        sin = torch.cat([t_sin, h_sin, w_sin], dim=-1).reshape(t * h * w, -1)
-        return (cos, sin)
--- a/comfy/ldm/cogvideo/vae.py
+++ b/comfy/ldm/cogvideo/vae.py
@@ -1,566 +0,0 @@
-# CogVideoX VAE - ported to ComfyUI native ops
-# Architecture reference: diffusers AutoencoderKLCogVideoX
-# Style reference: comfy/ldm/wan/vae.py
-
-import numpy as np
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-import comfy.ops
-ops = comfy.ops.disable_weight_init
-
-
-class CausalConv3d(nn.Module):
-    """Causal 3D convolution with temporal padding.
-
-    Uses comfy.ops.Conv3d with autopad='causal_zero' fast path: when input has
-    a single temporal frame and no cache, the 3D conv weight is sliced to act
-    as a 2D conv, avoiding computation on zero-padded temporal dimensions.
-    """
-    def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, pad_mode="constant"):
-        super().__init__()
-        if isinstance(kernel_size, int):
-            kernel_size = (kernel_size,) * 3
-
-        time_kernel, height_kernel, width_kernel = kernel_size
-        self.time_kernel_size = time_kernel
-        self.pad_mode = pad_mode
-
-        height_pad = (height_kernel - 1) // 2
-        width_pad = (width_kernel - 1) // 2
-        self.time_causal_padding = (width_pad, width_pad, height_pad, height_pad, time_kernel - 1, 0)
-
-        stride = stride if isinstance(stride, tuple) else (stride, 1, 1)
-        dilation = (dilation, 1, 1)
-        self.conv = ops.Conv3d(
-            in_channels, out_channels, kernel_size,
-            stride=stride, dilation=dilation,
-            padding=(0, height_pad, width_pad),
-        )
-
-    def forward(self, x, conv_cache=None):
-        if self.pad_mode == "replicate":
-            x = F.pad(x, self.time_causal_padding, mode="replicate")
-            conv_cache = None
-        else:
-            kernel_t = self.time_kernel_size
-            if kernel_t > 1:
-                if conv_cache is None and x.shape[2] == 1:
-                    # Fast path: single frame, no cache. All temporal padding
-                    # frames are copies of the input (replicate-style), so the
-                    # 3D conv reduces to a 2D conv with summed temporal kernel.
-                    w = comfy.ops.cast_to_input(self.conv.weight, x)
-                    b = comfy.ops.cast_to_input(self.conv.bias, x) if self.conv.bias is not None else None
-                    w2d = w.sum(dim=2, keepdim=True)
-                    out = F.conv3d(x, w2d, b,
-                                   self.conv.stride, self.conv.padding,
-                                   self.conv.dilation, self.conv.groups)
-                    return out, None
-                cached = [conv_cache] if conv_cache is not None else [x[:, :, :1]] * (kernel_t - 1)
-                x = torch.cat(cached + [x], dim=2)
-            conv_cache = x[:, :, -self.time_kernel_size + 1:].clone() if self.time_kernel_size > 1 else None
-
-        out = self.conv(x)
-        return out, conv_cache
-
-
-def _interpolate_zq(zq, target_size):
-    """Interpolate latent z to target (T, H, W), matching CogVideoX's first-frame-special handling."""
-    t = target_size[0]
-    if t > 1 and t % 2 == 1:
-        z_first = F.interpolate(zq[:, :, :1], size=(1, target_size[1], target_size[2]))
-        z_rest = F.interpolate(zq[:, :, 1:], size=(t - 1, target_size[1], target_size[2]))
-        return torch.cat([z_first, z_rest], dim=2)
-    return F.interpolate(zq, size=target_size)
-
-
-class SpatialNorm3D(nn.Module):
-    """Spatially conditioned normalization."""
-    def __init__(self, f_channels, zq_channels, groups=32):
-        super().__init__()
-        self.norm_layer = ops.GroupNorm(num_channels=f_channels, num_groups=groups, eps=1e-6, affine=True)
-        self.conv_y = CausalConv3d(zq_channels, f_channels, kernel_size=1, stride=1)
-        self.conv_b = CausalConv3d(zq_channels, f_channels, kernel_size=1, stride=1)
-
-    def forward(self, f, zq, conv_cache=None):
-        new_cache = {}
-        conv_cache = conv_cache or {}
-
-        if zq.shape[-3:] != f.shape[-3:]:
-            zq = _interpolate_zq(zq, f.shape[-3:])
-
-        conv_y, new_cache["conv_y"] = self.conv_y(zq, conv_cache=conv_cache.get("conv_y"))
-        conv_b, new_cache["conv_b"] = self.conv_b(zq, conv_cache=conv_cache.get("conv_b"))
-
-        return self.norm_layer(f) * conv_y + conv_b, new_cache
-
-
-class ResnetBlock3D(nn.Module):
-    """3D ResNet block with optional spatial norm."""
-    def __init__(self, in_channels, out_channels=None, temb_channels=512, groups=32,
-                 eps=1e-6, act_fn="silu", spatial_norm_dim=None, pad_mode="first"):
-        super().__init__()
-        out_channels = out_channels or in_channels
-        self.in_channels = in_channels
-        self.out_channels = out_channels
-        self.spatial_norm_dim = spatial_norm_dim
-
-        if act_fn == "silu":
-            self.nonlinearity = nn.SiLU()
-        elif act_fn == "swish":
-            self.nonlinearity = nn.SiLU()
-        else:
-            self.nonlinearity = nn.SiLU()
-
-        if spatial_norm_dim is None:
-            self.norm1 = ops.GroupNorm(num_channels=in_channels, num_groups=groups, eps=eps)
-            self.norm2 = ops.GroupNorm(num_channels=out_channels, num_groups=groups, eps=eps)
-        else:
-            self.norm1 = SpatialNorm3D(in_channels, spatial_norm_dim, groups=groups)
-            self.norm2 = SpatialNorm3D(out_channels, spatial_norm_dim, groups=groups)
-
-        self.conv1 = CausalConv3d(in_channels, out_channels, kernel_size=3, pad_mode=pad_mode)
-
-        if temb_channels > 0:
-            self.temb_proj = ops.Linear(temb_channels, out_channels)
-
-        self.conv2 = CausalConv3d(out_channels, out_channels, kernel_size=3, pad_mode=pad_mode)
-
-        if in_channels != out_channels:
-            self.conv_shortcut = ops.Conv3d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
-        else:
-            self.conv_shortcut = None
-
-    def forward(self, x, temb=None, zq=None, conv_cache=None):
-        new_cache = {}
-        conv_cache = conv_cache or {}
-        residual = x
-
-        if zq is not None:
-            x, new_cache["norm1"] = self.norm1(x, zq, conv_cache=conv_cache.get("norm1"))
-        else:
-            x = self.norm1(x)
-
-        x = self.nonlinearity(x)
-        x, new_cache["conv1"] = self.conv1(x, conv_cache=conv_cache.get("conv1"))
-
-        if temb is not None and hasattr(self, "temb_proj"):
-            x = x + self.temb_proj(self.nonlinearity(temb))[:, :, None, None, None]
-
-        if zq is not None:
-            x, new_cache["norm2"] = self.norm2(x, zq, conv_cache=conv_cache.get("norm2"))
-        else:
-            x = self.norm2(x)
-
-        x = self.nonlinearity(x)
-        x, new_cache["conv2"] = self.conv2(x, conv_cache=conv_cache.get("conv2"))
-
-        if self.conv_shortcut is not None:
-            residual = self.conv_shortcut(residual)
-
-        return x + residual, new_cache
-
-
-class Downsample3D(nn.Module):
-    """3D downsampling with optional temporal compression."""
-    def __init__(self, in_channels, out_channels, kernel_size=3, stride=2, padding=0, compress_time=False):
-        super().__init__()
-        self.conv = ops.Conv2d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding)
-        self.compress_time = compress_time
-
-    def forward(self, x):
-        if self.compress_time:
-            b, c, t, h, w = x.shape
-            x = x.permute(0, 3, 4, 1, 2).reshape(b * h * w, c, t)
-            if t % 2 == 1:
-                x_first, x_rest = x[..., 0], x[..., 1:]
-                if x_rest.shape[-1] > 0:
-                    x_rest = F.avg_pool1d(x_rest, kernel_size=2, stride=2)
-                x = torch.cat([x_first[..., None], x_rest], dim=-1)
-                x = x.reshape(b, h, w, c, x.shape[-1]).permute(0, 3, 4, 1, 2)
-            else:
-                x = F.avg_pool1d(x, kernel_size=2, stride=2)
-                x = x.reshape(b, h, w, c, x.shape[-1]).permute(0, 3, 4, 1, 2)
-
-        pad = (0, 1, 0, 1)
-        x = F.pad(x, pad, mode="constant", value=0)
-        b, c, t, h, w = x.shape
-        x = x.permute(0, 2, 1, 3, 4).reshape(b * t, c, h, w)
-        x = self.conv(x)
-        x = x.reshape(b, t, x.shape[1], x.shape[2], x.shape[3]).permute(0, 2, 1, 3, 4)
-        return x
-
-
-class Upsample3D(nn.Module):
-    """3D upsampling with optional temporal decompression."""
-    def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, padding=1, compress_time=False):
-        super().__init__()
-        self.conv = ops.Conv2d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding)
-        self.compress_time = compress_time
-
-    def forward(self, x):
-        if self.compress_time:
-            if x.shape[2] > 1 and x.shape[2] % 2 == 1:
-                x_first, x_rest = x[:, :, 0], x[:, :, 1:]
-                x_first = F.interpolate(x_first, scale_factor=2.0)
-                x_rest = F.interpolate(x_rest, scale_factor=2.0)
-                x = torch.cat([x_first[:, :, None, :, :], x_rest], dim=2)
-            elif x.shape[2] > 1:
-                x = F.interpolate(x, scale_factor=2.0)
-            else:
-                x = x.squeeze(2)
-                x = F.interpolate(x, scale_factor=2.0)
-                x = x[:, :, None, :, :]
-        else:
-            b, c, t, h, w = x.shape
-            x = x.permute(0, 2, 1, 3, 4).reshape(b * t, c, h, w)
-            x = F.interpolate(x, scale_factor=2.0)
-            x = x.reshape(b, t, c, *x.shape[2:]).permute(0, 2, 1, 3, 4)
-
-        b, c, t, h, w = x.shape
-        x = x.permute(0, 2, 1, 3, 4).reshape(b * t, c, h, w)
-        x = self.conv(x)
-        x = x.reshape(b, t, *x.shape[1:]).permute(0, 2, 1, 3, 4)
-        return x
-
-
-class DownBlock3D(nn.Module):
-    def __init__(self, in_channels, out_channels, temb_channels=0, num_layers=1,
-                 eps=1e-6, act_fn="silu", groups=32, add_downsample=True,
-                 compress_time=False, pad_mode="first"):
-        super().__init__()
-        self.resnets = nn.ModuleList([
-            ResnetBlock3D(
-                in_channels=in_channels if i == 0 else out_channels,
-                out_channels=out_channels,
-                temb_channels=temb_channels,
-                groups=groups, eps=eps, act_fn=act_fn, pad_mode=pad_mode,
-            )
-            for i in range(num_layers)
-        ])
-        self.downsamplers = nn.ModuleList([Downsample3D(out_channels, out_channels, compress_time=compress_time)]) if add_downsample else None
-
-    def forward(self, x, temb=None, zq=None, conv_cache=None):
-        new_cache = {}
-        conv_cache = conv_cache or {}
-        for i, resnet in enumerate(self.resnets):
-            x, new_cache[f"resnet_{i}"] = resnet(x, temb, zq, conv_cache=conv_cache.get(f"resnet_{i}"))
-        if self.downsamplers is not None:
-            for ds in self.downsamplers:
-                x = ds(x)
-        return x, new_cache
-
-
-class MidBlock3D(nn.Module):
-    def __init__(self, in_channels, temb_channels=0, num_layers=1,
-                 eps=1e-6, act_fn="silu", groups=32, spatial_norm_dim=None, pad_mode="first"):
-        super().__init__()
-        self.resnets = nn.ModuleList([
-            ResnetBlock3D(
-                in_channels=in_channels, out_channels=in_channels,
-                temb_channels=temb_channels, groups=groups, eps=eps,
-                act_fn=act_fn, spatial_norm_dim=spatial_norm_dim, pad_mode=pad_mode,
-            )
-            for _ in range(num_layers)
-        ])
-
-    def forward(self, x, temb=None, zq=None, conv_cache=None):
-        new_cache = {}
-        conv_cache = conv_cache or {}
-        for i, resnet in enumerate(self.resnets):
-            x, new_cache[f"resnet_{i}"] = resnet(x, temb, zq, conv_cache=conv_cache.get(f"resnet_{i}"))
-        return x, new_cache
-
-
-class UpBlock3D(nn.Module):
-    def __init__(self, in_channels, out_channels, temb_channels=0, num_layers=1,
-                 eps=1e-6, act_fn="silu", groups=32, spatial_norm_dim=16,
-                 add_upsample=True, compress_time=False, pad_mode="first"):
-        super().__init__()
-        self.resnets = nn.ModuleList([
-            ResnetBlock3D(
-                in_channels=in_channels if i == 0 else out_channels,
-                out_channels=out_channels,
-                temb_channels=temb_channels, groups=groups, eps=eps,
-                act_fn=act_fn, spatial_norm_dim=spatial_norm_dim, pad_mode=pad_mode,
-            )
-            for i in range(num_layers)
-        ])
-        self.upsamplers = nn.ModuleList([Upsample3D(out_channels, out_channels, compress_time=compress_time)]) if add_upsample else None
-
-    def forward(self, x, temb=None, zq=None, conv_cache=None):
-        new_cache = {}
-        conv_cache = conv_cache or {}
-        for i, resnet in enumerate(self.resnets):
-            x, new_cache[f"resnet_{i}"] = resnet(x, temb, zq, conv_cache=conv_cache.get(f"resnet_{i}"))
-        if self.upsamplers is not None:
-            for us in self.upsamplers:
-                x = us(x)
-        return x, new_cache
-
-
-class Encoder3D(nn.Module):
-    def __init__(self, in_channels=3, out_channels=16,
-                 block_out_channels=(128, 256, 256, 512),
-                 layers_per_block=3, act_fn="silu",
-                 eps=1e-6, groups=32, pad_mode="first",
-                 temporal_compression_ratio=4):
-        super().__init__()
-        temporal_compress_level = int(np.log2(temporal_compression_ratio))
-
-        self.conv_in = CausalConv3d(in_channels, block_out_channels[0], kernel_size=3, pad_mode=pad_mode)
-
-        self.down_blocks = nn.ModuleList()
-        output_channel = block_out_channels[0]
-        for i in range(len(block_out_channels)):
-            input_channel = output_channel
-            output_channel = block_out_channels[i]
-            is_final = i == len(block_out_channels) - 1
-            compress_time = i < temporal_compress_level
-
-            self.down_blocks.append(DownBlock3D(
-                in_channels=input_channel, out_channels=output_channel,
-                temb_channels=0, num_layers=layers_per_block,
-                eps=eps, act_fn=act_fn, groups=groups,
-                add_downsample=not is_final, compress_time=compress_time,
-            ))
-
-        self.mid_block = MidBlock3D(
-            in_channels=block_out_channels[-1], temb_channels=0,
-            num_layers=2, eps=eps, act_fn=act_fn, groups=groups, pad_mode=pad_mode,
-        )
-
-        self.norm_out = ops.GroupNorm(groups, block_out_channels[-1], eps=1e-6)
-        self.conv_act = nn.SiLU()
-        self.conv_out = CausalConv3d(block_out_channels[-1], 2 * out_channels, kernel_size=3, pad_mode=pad_mode)
-
-    def forward(self, x, conv_cache=None):
-        new_cache = {}
-        conv_cache = conv_cache or {}
-
-        x, new_cache["conv_in"] = self.conv_in(x, conv_cache=conv_cache.get("conv_in"))
-
-        for i, block in enumerate(self.down_blocks):
-            key = f"down_block_{i}"
-            x, new_cache[key] = block(x, None, None, conv_cache.get(key))
-
-        x, new_cache["mid_block"] = self.mid_block(x, None, None, conv_cache=conv_cache.get("mid_block"))
-
-        x = self.norm_out(x)
-        x = self.conv_act(x)
-        x, new_cache["conv_out"] = self.conv_out(x, conv_cache=conv_cache.get("conv_out"))
-
-        return x, new_cache
-
-
-class Decoder3D(nn.Module):
-    def __init__(self, in_channels=16, out_channels=3,
-                 block_out_channels=(128, 256, 256, 512),
-                 layers_per_block=3, act_fn="silu",
-                 eps=1e-6, groups=32, pad_mode="first",
-                 temporal_compression_ratio=4):
-        super().__init__()
-        reversed_channels = list(reversed(block_out_channels))
-        temporal_compress_level = int(np.log2(temporal_compression_ratio))
-
-        self.conv_in = CausalConv3d(in_channels, reversed_channels[0], kernel_size=3, pad_mode=pad_mode)
-
-        self.mid_block = MidBlock3D(
-            in_channels=reversed_channels[0], temb_channels=0,
-            num_layers=2, eps=eps, act_fn=act_fn, groups=groups,
-            spatial_norm_dim=in_channels, pad_mode=pad_mode,
-        )
-
-        self.up_blocks = nn.ModuleList()
-        output_channel = reversed_channels[0]
-        for i in range(len(block_out_channels)):
-            prev_channel = output_channel
-            output_channel = reversed_channels[i]
-            is_final = i == len(block_out_channels) - 1
-            compress_time = i < temporal_compress_level
-
-            self.up_blocks.append(UpBlock3D(
-                in_channels=prev_channel, out_channels=output_channel,
-                temb_channels=0, num_layers=layers_per_block + 1,
-                eps=eps, act_fn=act_fn, groups=groups,
-                spatial_norm_dim=in_channels,
-                add_upsample=not is_final, compress_time=compress_time,
-            ))
-
-        self.norm_out = SpatialNorm3D(reversed_channels[-1], in_channels, groups=groups)
-        self.conv_act = nn.SiLU()
-        self.conv_out = CausalConv3d(reversed_channels[-1], out_channels, kernel_size=3, pad_mode=pad_mode)
-
-    def forward(self, sample, conv_cache=None):
-        new_cache = {}
-        conv_cache = conv_cache or {}
-
-        x, new_cache["conv_in"] = self.conv_in(sample, conv_cache=conv_cache.get("conv_in"))
-
-        x, new_cache["mid_block"] = self.mid_block(x, None, sample, conv_cache=conv_cache.get("mid_block"))
-
-        for i, block in enumerate(self.up_blocks):
-            key = f"up_block_{i}"
-            x, new_cache[key] = block(x, None, sample, conv_cache=conv_cache.get(key))
-
-        x, new_cache["norm_out"] = self.norm_out(x, sample, conv_cache=conv_cache.get("norm_out"))
-        x = self.conv_act(x)
-        x, new_cache["conv_out"] = self.conv_out(x, conv_cache=conv_cache.get("conv_out"))
-
-        return x, new_cache
-
-
-
-class AutoencoderKLCogVideoX(nn.Module):
-    """CogVideoX VAE. Spatial tiling/slicing handled by ComfyUI's VAE wrapper.
-
-    Uses rolling temporal decode: conv_in + mid_block + temporal up_blocks run
-    on the full (low-res) tensor, then the expensive spatial-only up_blocks +
-    norm_out + conv_out are processed in small temporal chunks with conv_cache
-    carrying causal state between chunks. This keeps peak VRAM proportional to
-    chunk_size rather than total frame count.
-    """
-
-    def __init__(self,
-                 in_channels=3, out_channels=3,
-                 block_out_channels=(128, 256, 256, 512),
-                 latent_channels=16, layers_per_block=3,
-                 act_fn="silu", eps=1e-6, groups=32,
-                 temporal_compression_ratio=4,
-                 ):
-        super().__init__()
-        self.latent_channels = latent_channels
-        self.temporal_compression_ratio = temporal_compression_ratio
-
-        self.encoder = Encoder3D(
-            in_channels=in_channels, out_channels=latent_channels,
-            block_out_channels=block_out_channels, layers_per_block=layers_per_block,
-            act_fn=act_fn, eps=eps, groups=groups,
-            temporal_compression_ratio=temporal_compression_ratio,
-        )
-        self.decoder = Decoder3D(
-            in_channels=latent_channels, out_channels=out_channels,
-            block_out_channels=block_out_channels, layers_per_block=layers_per_block,
-            act_fn=act_fn, eps=eps, groups=groups,
-            temporal_compression_ratio=temporal_compression_ratio,
-        )
-
-        self.num_latent_frames_batch_size = 2
-        self.num_sample_frames_batch_size = 8
-
-    def encode(self, x):
-        t = x.shape[2]
-        frame_batch = self.num_sample_frames_batch_size
-        remainder = t % frame_batch
-        conv_cache = None
-        enc = []
-
-        # Process remainder frames first so only the first chunk can have an
-        # odd temporal dimension — where Downsample3D's first-frame-special
-        # handling in temporal compression is actually correct.
-        if remainder > 0:
-            chunk, conv_cache = self.encoder(x[:, :, :remainder], conv_cache=conv_cache)
-            enc.append(chunk.to(x.device))
-
-        for start in range(remainder, t, frame_batch):
-            chunk, conv_cache = self.encoder(x[:, :, start:start + frame_batch], conv_cache=conv_cache)
-            enc.append(chunk.to(x.device))
-
-        enc = torch.cat(enc, dim=2)
-        mean, _ = enc.chunk(2, dim=1)
-        return mean
-
-    def decode(self, z):
-        return self._decode_rolling(z)
-
-    def _decode_batched(self, z):
-        """Original batched decode - processes 2 latent frames through full decoder."""
-        t = z.shape[2]
-        frame_batch = self.num_latent_frames_batch_size
-        num_batches = max(t // frame_batch, 1)
-        conv_cache = None
-        dec = []
-        for i in range(num_batches):
-            remaining = t % frame_batch
-            start = frame_batch * i + (0 if i == 0 else remaining)
-            end = frame_batch * (i + 1) + remaining
-            chunk, conv_cache = self.decoder(z[:, :, start:end], conv_cache=conv_cache)
-            dec.append(chunk.cpu())
-        return torch.cat(dec, dim=2).to(z.device)
-
-    def _decode_rolling(self, z):
-        """Rolling decode - processes low-res layers on full tensor, then rolls
-        through expensive high-res layers in temporal chunks."""
-        decoder = self.decoder
-        device = z.device
-
-        # Determine which up_blocks have temporal upsample vs spatial-only.
-        # Temporal up_blocks are cheap (low res), spatial-only are expensive.
-        temporal_compress_level = int(np.log2(self.temporal_compression_ratio))
-        split_at = temporal_compress_level  # first N up_blocks do temporal upsample
-
-        # Phase 1: conv_in + mid_block + temporal up_blocks on full tensor (low/medium res)
-        x, _ = decoder.conv_in(z)
-        x, _ = decoder.mid_block(x, None, z)
-
-        for i in range(split_at):
-            x, _ = decoder.up_blocks[i](x, None, z)
-
-        # Phase 2: remaining spatial-only up_blocks + norm_out + conv_out in temporal chunks
-        remaining_blocks = list(range(split_at, len(decoder.up_blocks)))
-        chunk_size = 4  # pixel frames per chunk through high-res layers
-        t_expanded = x.shape[2]
-
-        if t_expanded <= chunk_size or len(remaining_blocks) == 0:
-            # Small enough to process in one go
-            for i in remaining_blocks:
-                x, _ = decoder.up_blocks[i](x, None, z)
-            x, _ = decoder.norm_out(x, z)
-            x = decoder.conv_act(x)
-            x, _ = decoder.conv_out(x)
-            return x
-
-        # Expand z temporally once to match Phase 2's time dimension.
-        # z stays at latent spatial resolution so this is small (~16 MB vs ~1.3 GB
-        # for the old approach of pre-interpolating to every pixel resolution).
-        z_time_expanded = _interpolate_zq(z, (t_expanded, z.shape[3], z.shape[4]))
-
-        # Process in temporal chunks, interpolating spatially per-chunk to avoid
-        # allocating full [B, C, t_expanded, H, W] tensors at each resolution.
-        dec_out = []
-        conv_caches = {}
-
-        for chunk_start in range(0, t_expanded, chunk_size):
-            chunk_end = min(chunk_start + chunk_size, t_expanded)
-            x_chunk = x[:, :, chunk_start:chunk_end]
-            z_t_chunk = z_time_expanded[:, :, chunk_start:chunk_end]
-            z_spatial_cache = {}
-
-            for i in remaining_blocks:
-                block = decoder.up_blocks[i]
-                cache_key = f"up_block_{i}"
-                hw_key = (x_chunk.shape[3], x_chunk.shape[4])
-                if hw_key not in z_spatial_cache:
-                    if z_t_chunk.shape[3] == hw_key[0] and z_t_chunk.shape[4] == hw_key[1]:
-                        z_spatial_cache[hw_key] = z_t_chunk
-                    else:
-                        z_spatial_cache[hw_key] = F.interpolate(z_t_chunk, size=(z_t_chunk.shape[2], hw_key[0], hw_key[1]))
-                x_chunk, new_cache = block(x_chunk, None, z_spatial_cache[hw_key], conv_cache=conv_caches.get(cache_key))
-                conv_caches[cache_key] = new_cache
-
-            hw_key = (x_chunk.shape[3], x_chunk.shape[4])
-            if hw_key not in z_spatial_cache:
-                z_spatial_cache[hw_key] = F.interpolate(z_t_chunk, size=(z_t_chunk.shape[2], hw_key[0], hw_key[1]))
-            x_chunk, new_cache = decoder.norm_out(x_chunk, z_spatial_cache[hw_key], conv_cache=conv_caches.get("norm_out"))
-            conv_caches["norm_out"] = new_cache
-            x_chunk = decoder.conv_act(x_chunk)
-            x_chunk, new_cache = decoder.conv_out(x_chunk, conv_cache=conv_caches.get("conv_out"))
-            conv_caches["conv_out"] = new_cache
-
-            dec_out.append(x_chunk.cpu())
-            del z_spatial_cache
-
-        del x, z_time_expanded
-        return torch.cat(dec_out, dim=2).to(device)
--- a/comfy/ldm/ernie/model.py
+++ b/comfy/ldm/ernie/model.py
@@ -1,303 +0,0 @@
-import math
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-from comfy.ldm.modules.attention import optimized_attention
-import comfy.model_management
-
-def rope(pos: torch.Tensor, dim: int, theta: int) -> torch.Tensor:
-    assert dim % 2 == 0
-    if not comfy.model_management.supports_fp64(pos.device):
-        device = torch.device("cpu")
-    else:
-        device = pos.device
-
-    scale = torch.arange(0, dim, 2, dtype=torch.float64, device=device) / dim
-    omega = 1.0 / (theta**scale)
-    out = torch.einsum("...n,d->...nd", pos, omega)
-    out = torch.stack([torch.cos(out), torch.sin(out)], dim=0)
-    return out.to(dtype=torch.float32, device=pos.device)
-
-def apply_rotary_emb(x_in: torch.Tensor, freqs_cis: torch.Tensor) -> torch.Tensor:
-    rot_dim = freqs_cis.shape[-1]
-    x, x_pass = x_in[..., :rot_dim], x_in[..., rot_dim:]
-    cos_ = freqs_cis[0]
-    sin_ = freqs_cis[1]
-    x1, x2 = x.chunk(2, dim=-1)
-    x_rotated = torch.cat((-x2, x1), dim=-1)
-    return torch.cat((x * cos_ + x_rotated * sin_, x_pass), dim=-1)
-
-class ErnieImageEmbedND3(nn.Module):
-    def __init__(self, dim: int, theta: int, axes_dim: tuple):
-        super().__init__()
-        self.dim = dim
-        self.theta = theta
-        self.axes_dim = list(axes_dim)
-
-    def forward(self, ids: torch.Tensor) -> torch.Tensor:
-        emb = torch.cat([rope(ids[..., i], self.axes_dim[i], self.theta) for i in range(3)], dim=-1)
-        emb = emb.unsqueeze(3)  # [2, B, S, 1, head_dim//2]
-        return torch.stack([emb, emb], dim=-1).reshape(*emb.shape[:-1], -1)  # [B, S, 1, head_dim]
-
-class ErnieImagePatchEmbedDynamic(nn.Module):
-    def __init__(self, in_channels: int, embed_dim: int, patch_size: int, operations, device=None, dtype=None):
-        super().__init__()
-        self.patch_size = patch_size
-        self.proj = operations.Conv2d(in_channels, embed_dim, kernel_size=patch_size, stride=patch_size, bias=True, device=device, dtype=dtype)
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        x = self.proj(x)
-        batch_size, dim, height, width = x.shape
-        return x.reshape(batch_size, dim, height * width).transpose(1, 2).contiguous()
-
-class Timesteps(nn.Module):
-    def __init__(self, num_channels: int, flip_sin_to_cos: bool = False):
-        super().__init__()
-        self.num_channels = num_channels
-        self.flip_sin_to_cos = flip_sin_to_cos
-
-    def forward(self, timesteps: torch.Tensor) -> torch.Tensor:
-        half_dim = self.num_channels // 2
-        exponent = -math.log(10000) * torch.arange(half_dim, dtype=torch.float32, device=timesteps.device) / half_dim
-        emb = torch.exp(exponent)
-        emb = timesteps[:, None].float() * emb[None, :]
-        if self.flip_sin_to_cos:
-            emb = torch.cat([torch.cos(emb), torch.sin(emb)], dim=-1)
-        else:
-            emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=-1)
-        return emb
-
-class TimestepEmbedding(nn.Module):
-    def __init__(self, in_channels: int, time_embed_dim: int, operations, device=None, dtype=None):
-        super().__init__()
-        Linear = operations.Linear
-        self.linear_1 = Linear(in_channels, time_embed_dim, bias=True, device=device, dtype=dtype)
-        self.act = nn.SiLU()
-        self.linear_2 = Linear(time_embed_dim, time_embed_dim, bias=True, device=device, dtype=dtype)
-
-    def forward(self, sample: torch.Tensor) -> torch.Tensor:
-        sample = self.linear_1(sample)
-        sample = self.act(sample)
-        sample = self.linear_2(sample)
-        return sample
-
-class ErnieImageAttention(nn.Module):
-    def __init__(self, query_dim: int, heads: int, dim_head: int, eps: float = 1e-6, operations=None, device=None, dtype=None):
-        super().__init__()
-        self.heads = heads
-        self.head_dim = dim_head
-        self.inner_dim = heads * dim_head
-
-        Linear = operations.Linear
-        RMSNorm = operations.RMSNorm
-
-        self.to_q = Linear(query_dim, self.inner_dim, bias=False, device=device, dtype=dtype)
-        self.to_k = Linear(query_dim, self.inner_dim, bias=False, device=device, dtype=dtype)
-        self.to_v = Linear(query_dim, self.inner_dim, bias=False, device=device, dtype=dtype)
-
-        self.norm_q = RMSNorm(dim_head, eps=eps, elementwise_affine=True, device=device, dtype=dtype)
-        self.norm_k = RMSNorm(dim_head, eps=eps, elementwise_affine=True, device=device, dtype=dtype)
-
-        self.to_out = nn.ModuleList([Linear(self.inner_dim, query_dim, bias=False, device=device, dtype=dtype)])
-
-    def forward(self, x: torch.Tensor, attention_mask: torch.Tensor = None, image_rotary_emb: torch.Tensor = None) -> torch.Tensor:
-        B, S, _ = x.shape
-
-        q_flat = self.to_q(x)
-        k_flat = self.to_k(x)
-        v_flat = self.to_v(x)
-
-        query = q_flat.view(B, S, self.heads, self.head_dim)
-        key = k_flat.view(B, S, self.heads, self.head_dim)
-
-        query = self.norm_q(query)
-        key = self.norm_k(key)
-
-        if image_rotary_emb is not None:
-            query = apply_rotary_emb(query, image_rotary_emb)
-            key = apply_rotary_emb(key, image_rotary_emb)
-
-        query, key = query.to(x.dtype), key.to(x.dtype)
-
-        q_flat = query.reshape(B, S, -1)
-        k_flat = key.reshape(B, S, -1)
-
-        hidden_states = optimized_attention(q_flat, k_flat, v_flat, self.heads, mask=attention_mask)
-
-        return self.to_out[0](hidden_states)
-
-class ErnieImageFeedForward(nn.Module):
-    def __init__(self, hidden_size: int, ffn_hidden_size: int, operations, device=None, dtype=None):
-        super().__init__()
-        Linear = operations.Linear
-        self.gate_proj = Linear(hidden_size, ffn_hidden_size, bias=False, device=device, dtype=dtype)
-        self.up_proj = Linear(hidden_size, ffn_hidden_size, bias=False, device=device, dtype=dtype)
-        self.linear_fc2 = Linear(ffn_hidden_size, hidden_size, bias=False, device=device, dtype=dtype)
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        return self.linear_fc2(self.up_proj(x) * F.gelu(self.gate_proj(x)))
-
-class ErnieImageSharedAdaLNBlock(nn.Module):
-    def __init__(self, hidden_size: int, num_heads: int, ffn_hidden_size: int, eps: float = 1e-6, operations=None, device=None, dtype=None):
-        super().__init__()
-        RMSNorm = operations.RMSNorm
-
-        self.adaLN_sa_ln = RMSNorm(hidden_size, eps=eps, device=device, dtype=dtype)
-        self.self_attention = ErnieImageAttention(
-            query_dim=hidden_size,
-            dim_head=hidden_size // num_heads,
-            heads=num_heads,
-            eps=eps,
-            operations=operations,
-            device=device,
-            dtype=dtype
-        )
-        self.adaLN_mlp_ln = RMSNorm(hidden_size, eps=eps, device=device, dtype=dtype)
-        self.mlp = ErnieImageFeedForward(hidden_size, ffn_hidden_size, operations=operations, device=device, dtype=dtype)
-
-    def forward(self, x, rotary_pos_emb, temb, attention_mask=None):
-        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = temb
-
-        residual = x
-        x_norm = self.adaLN_sa_ln(x)
-        x_norm = (x_norm.float() * (1 + scale_msa.float()) + shift_msa.float()).to(x.dtype)
-
-        attn_out = self.self_attention(x_norm, attention_mask=attention_mask, image_rotary_emb=rotary_pos_emb)
-        x = residual + (gate_msa.float() * attn_out.float()).to(x.dtype)
-
-        residual = x
-        x_norm = self.adaLN_mlp_ln(x)
-        x_norm = (x_norm.float() * (1 + scale_mlp.float()) + shift_mlp.float()).to(x.dtype)
-
-        return residual + (gate_mlp.float() * self.mlp(x_norm).float()).to(x.dtype)
-
-class ErnieImageAdaLNContinuous(nn.Module):
-    def __init__(self, hidden_size: int, eps: float = 1e-6, operations=None, device=None, dtype=None):
-        super().__init__()
-        LayerNorm = operations.LayerNorm
-        Linear = operations.Linear
-        self.norm = LayerNorm(hidden_size, elementwise_affine=False, eps=eps, device=device, dtype=dtype)
-        self.linear = Linear(hidden_size, hidden_size * 2, device=device, dtype=dtype)
-
-    def forward(self, x: torch.Tensor, conditioning: torch.Tensor) -> torch.Tensor:
-        scale, shift = self.linear(conditioning).chunk(2, dim=-1)
-        x = self.norm(x)
-        x = x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
-        return x
-
-class ErnieImageModel(nn.Module):
-    def __init__(
-        self,
-        hidden_size: int = 4096,
-        num_attention_heads: int = 32,
-        num_layers: int = 36,
-        ffn_hidden_size: int = 12288,
-        in_channels: int = 128,
-        out_channels: int = 128,
-        patch_size: int = 1,
-        text_in_dim: int = 3072,
-        rope_theta: int = 256,
-        rope_axes_dim: tuple = (32, 48, 48),
-        eps: float = 1e-6,
-        qk_layernorm: bool = True,
-        device=None,
-        dtype=None,
-        operations=None,
-        **kwargs
-    ):
-        super().__init__()
-        self.dtype = dtype
-        self.hidden_size = hidden_size
-        self.num_heads = num_attention_heads
-        self.head_dim = hidden_size // num_attention_heads
-        self.patch_size = patch_size
-        self.out_channels = out_channels
-
-        Linear = operations.Linear
-
-        self.x_embedder = ErnieImagePatchEmbedDynamic(in_channels, hidden_size, patch_size, operations, device, dtype)
-        self.text_proj = Linear(text_in_dim, hidden_size, bias=False, device=device, dtype=dtype) if text_in_dim != hidden_size else None
-
-        self.time_proj = Timesteps(hidden_size, flip_sin_to_cos=False)
-        self.time_embedding = TimestepEmbedding(hidden_size, hidden_size, operations, device, dtype)
-
-        self.pos_embed = ErnieImageEmbedND3(dim=self.head_dim, theta=rope_theta, axes_dim=rope_axes_dim)
-
-        self.adaLN_modulation = nn.Sequential(
-            nn.SiLU(),
-            Linear(hidden_size, 6 * hidden_size, device=device, dtype=dtype)
-        )
-
-        self.layers = nn.ModuleList([
-            ErnieImageSharedAdaLNBlock(hidden_size, num_attention_heads, ffn_hidden_size, eps, operations, device, dtype)
-            for _ in range(num_layers)
-        ])
-
-        self.final_norm = ErnieImageAdaLNContinuous(hidden_size, eps, operations, device, dtype)
-        self.final_linear = Linear(hidden_size, patch_size * patch_size * out_channels, device=device, dtype=dtype)
-
-    def forward(self, x, timesteps, context, **kwargs):
-        device, dtype = x.device, x.dtype
-        B, C, H, W = x.shape
-        p, Hp, Wp = self.patch_size, H // self.patch_size, W // self.patch_size
-        N_img = Hp * Wp
-
-        img_bsh = self.x_embedder(x)
-
-        text_bth = context
-        if self.text_proj is not None and text_bth.numel() > 0:
-            text_bth = self.text_proj(text_bth)
-        Tmax = text_bth.shape[1]
-
-        hidden_states = torch.cat([img_bsh, text_bth], dim=1)
-
-        text_ids = torch.zeros((B, Tmax, 3), device=device, dtype=torch.float32)
-        text_ids[:, :, 0] = torch.linspace(0, Tmax - 1, steps=Tmax, device=x.device, dtype=torch.float32)
-        index = float(Tmax)
-
-        transformer_options = kwargs.get("transformer_options", {})
-        rope_options = transformer_options.get("rope_options", None)
-
-        h_len, w_len = float(Hp), float(Wp)
-        h_offset, w_offset = 0.0, 0.0
-
-        if rope_options is not None:
-            h_len = (h_len - 1.0) * rope_options.get("scale_y", 1.0) + 1.0
-            w_len = (w_len - 1.0) * rope_options.get("scale_x", 1.0) + 1.0
-            index += rope_options.get("shift_t", 0.0)
-            h_offset += rope_options.get("shift_y", 0.0)
-            w_offset += rope_options.get("shift_x", 0.0)
-
-        image_ids = torch.zeros((Hp, Wp, 3), device=device, dtype=torch.float32)
-        image_ids[:, :, 0] = image_ids[:, :, 1] + index
-        image_ids[:, :, 1] = image_ids[:, :, 1] + torch.linspace(h_offset, h_len - 1 + h_offset, steps=Hp, device=device, dtype=torch.float32).unsqueeze(1)
-        image_ids[:, :, 2] = image_ids[:, :, 2] + torch.linspace(w_offset, w_len - 1 + w_offset, steps=Wp, device=device, dtype=torch.float32).unsqueeze(0)
-
-        image_ids = image_ids.view(1, N_img, 3).expand(B, -1, -1)
-
-        rotary_pos_emb = self.pos_embed(torch.cat([image_ids, text_ids], dim=1)).to(x.dtype)
-        del image_ids, text_ids
-
-        sample = self.time_proj(timesteps).to(dtype)
-        c = self.time_embedding(sample)
-
-        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = [
-            t.unsqueeze(1).contiguous() for t in self.adaLN_modulation(c).chunk(6, dim=-1)
-        ]
-
-        temb = [shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp]
-        for layer in self.layers:
-            hidden_states = layer(hidden_states, rotary_pos_emb, temb)
-
-        hidden_states = self.final_norm(hidden_states, c).type_as(hidden_states)
-
-        patches = self.final_linear(hidden_states)[:, :N_img, :]
-        output = (
-            patches.view(B, Hp, Wp, p, p, self.out_channels)
-            .permute(0, 5, 1, 3, 2, 4)
-            .contiguous()
-            .view(B, self.out_channels, H, W)
-        )
-
-        return output
--- a/comfy/ldm/flux/math.py
+++ b/comfy/ldm/flux/math.py
@@ -16,7 +16,7 @@ def attention(q: Tensor, k: Tensor, v: Tensor, pe: Tensor, mask=None, transforme

 def rope(pos: Tensor, dim: int, theta: int) -> Tensor:
    assert dim % 2 == 0
-    if not comfy.model_management.supports_fp64(pos.device):
+    if comfy.model_management.is_device_mps(pos.device) or comfy.model_management.is_intel_xpu() or comfy.model_management.is_directml_enabled():
        device = torch.device("cpu")
    else:
        device = pos.device
--- a/comfy/ldm/modules/sdpose.py
+++ b/comfy/ldm/modules/sdpose.py
@@ -90,7 +90,7 @@ class HeatmapHead(torch.nn.Module):
                origin_max = np.max(hm[k])
                dr = np.zeros((H + 2 * border, W + 2 * border), dtype=np.float32)
                dr[border:-border, border:-border] = hm[k].copy()
-                dr = gaussian_filter(dr, sigma=2.0, truncate=2.5)
+                dr = gaussian_filter(dr, sigma=2.0)
                hm[k] = dr[border:-border, border:-border].copy()
                cur_max = np.max(hm[k])
                if cur_max > 0:
--- a/comfy/model_base.py
+++ b/comfy/model_base.py
@@ -52,9 +52,7 @@ import comfy.ldm.qwen_image.model
 import comfy.ldm.kandinsky5.model
 import comfy.ldm.anima.model
 import comfy.ldm.ace.ace_step15
-import comfy.ldm.cogvideo.model
 import comfy.ldm.rt_detr.rtdetr_v4
-import comfy.ldm.ernie.model

 import comfy.model_management
 import comfy.patcher_extension
@@ -81,7 +79,6 @@ class ModelType(Enum):
    IMG_TO_IMG = 9
    FLOW_COSMOS = 10
    IMG_TO_IMG_FLOW = 11
-    V_PREDICTION_DDPM = 12


 def model_sampling(model_config, model_type):
@@ -116,8 +113,6 @@ def model_sampling(model_config, model_type):
        s = comfy.model_sampling.ModelSamplingCosmosRFlow
    elif model_type == ModelType.IMG_TO_IMG_FLOW:
        c = comfy.model_sampling.IMG_TO_IMG_FLOW
-    elif model_type == ModelType.V_PREDICTION_DDPM:
-        c = comfy.model_sampling.V_PREDICTION_DDPM

    class ModelSampling(s, c):
        pass
@@ -1967,70 +1962,3 @@ class Kandinsky5Image(Kandinsky5):
 class RT_DETR_v4(BaseModel):
    def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
        super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.rt_detr.rtdetr_v4.RTv4)
-
-class ErnieImage(BaseModel):
-    def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
-        super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.ernie.model.ErnieImageModel)
-
-    def extra_conds(self, **kwargs):
-        out = super().extra_conds(**kwargs)
-        cross_attn = kwargs.get("cross_attn", None)
-        if cross_attn is not None:
-            out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn)
-        return out
-
-class CogVideoX(BaseModel):
-    def __init__(self, model_config, model_type=ModelType.V_PREDICTION_DDPM, image_to_video=False, device=None):
-        super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.cogvideo.model.CogVideoXTransformer3DModel)
-        self.image_to_video = image_to_video
-
-    def concat_cond(self, **kwargs):
-        noise = kwargs.get("noise", None)
-        # Detect extra channels needed (e.g. 32 - 16 = 16 for ref latent)
-        extra_channels = self.diffusion_model.in_channels - noise.shape[1]
-        if extra_channels == 0:
-            return None
-
-        image = kwargs.get("concat_latent_image", None)
-        device = kwargs["device"]
-
-        if image is None:
-            shape = list(noise.shape)
-            shape[1] = extra_channels
-            return torch.zeros(shape, dtype=noise.dtype, layout=noise.layout, device=noise.device)
-
-        latent_dim = self.latent_format.latent_channels
-        image = utils.common_upscale(image.to(device), noise.shape[-1], noise.shape[-2], "bilinear", "center")
-
-        if noise.ndim == 5 and image.ndim == 5:
-            if image.shape[-3] < noise.shape[-3]:
-                image = torch.nn.functional.pad(image, (0, 0, 0, 0, 0, noise.shape[-3] - image.shape[-3]), "constant", 0)
-            elif image.shape[-3] > noise.shape[-3]:
-                image = image[:, :, :noise.shape[-3]]
-
-        for i in range(0, image.shape[1], latent_dim):
-            image[:, i:i + latent_dim] = self.process_latent_in(image[:, i:i + latent_dim])
-        image = utils.resize_to_batch_size(image, noise.shape[0])
-
-        if image.shape[1] > extra_channels:
-            image = image[:, :extra_channels]
-        elif image.shape[1] < extra_channels:
-            repeats = extra_channels // image.shape[1]
-            remainder = extra_channels % image.shape[1]
-            parts = [image] * repeats
-            if remainder > 0:
-                parts.append(image[:, :remainder])
-            image = torch.cat(parts, dim=1)
-
-        return image
-
-    def extra_conds(self, **kwargs):
-        out = super().extra_conds(**kwargs)
-        # OFS embedding (CogVideoX 1.5 I2V), default 2.0 as used by SparkVSR
-        if self.diffusion_model.ofs_proj_dim is not None:
-            ofs = kwargs.get("ofs", None)
-            if ofs is None:
-                noise = kwargs.get("noise", None)
-                ofs = torch.full((noise.shape[0],), 2.0, device=noise.device, dtype=noise.dtype)
-            out['ofs'] = comfy.conds.CONDRegular(ofs)
-        return out
--- a/comfy/model_detection.py
+++ b/comfy/model_detection.py
@@ -490,54 +490,6 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):

        return dit_config

-    if '{}blocks.0.norm1.linear.weight'.format(key_prefix) in state_dict_keys:  # CogVideoX
-        dit_config = {}
-        dit_config["image_model"] = "cogvideox"
-
-        # Extract config from weight shapes
-        norm1_weight = state_dict['{}blocks.0.norm1.linear.weight'.format(key_prefix)]
-        time_embed_dim = norm1_weight.shape[1]
-        dim = norm1_weight.shape[0] // 6
-
-        dit_config["num_attention_heads"] = dim // 64
-        dit_config["attention_head_dim"] = 64
-        dit_config["time_embed_dim"] = time_embed_dim
-        dit_config["num_layers"] = count_blocks(state_dict_keys, '{}blocks.'.format(key_prefix) + '{}.')
-
-        # Detect in_channels from patch_embed
-        patch_proj_key = '{}patch_embed.proj.weight'.format(key_prefix)
-        if patch_proj_key in state_dict_keys:
-            w = state_dict[patch_proj_key]
-            if w.ndim == 4:
-                # Conv2d: [out, in, kh, kw] — CogVideoX 1.0
-                dit_config["in_channels"] = w.shape[1]
-                dit_config["patch_size"] = w.shape[2]
-            elif w.ndim == 2:
-                # Linear: [out, in_channels * patch_size * patch_size * patch_size_t] — CogVideoX 1.5
-                dit_config["patch_size"] = 2
-                dit_config["patch_size_t"] = 2
-                dit_config["in_channels"] = w.shape[1] // (2 * 2 * 2)  # 256 // 8 = 32
-
-        text_proj_key = '{}patch_embed.text_proj.weight'.format(key_prefix)
-        if text_proj_key in state_dict_keys:
-            dit_config["text_embed_dim"] = state_dict[text_proj_key].shape[1]
-
-        # Detect OFS embedding
-        ofs_key = '{}ofs_embedding_linear_1.weight'.format(key_prefix)
-        if ofs_key in state_dict_keys:
-            dit_config["ofs_embed_dim"] = state_dict[ofs_key].shape[1]
-
-        # Detect positional embedding type
-        pos_key = '{}patch_embed.pos_embedding'.format(key_prefix)
-        if pos_key in state_dict_keys:
-            dit_config["use_learned_positional_embeddings"] = True
-            dit_config["use_rotary_positional_embeddings"] = False
-        else:
-            dit_config["use_learned_positional_embeddings"] = False
-            dit_config["use_rotary_positional_embeddings"] = True
-
-        return dit_config
-
    if '{}head.modulation'.format(key_prefix) in state_dict_keys:  # Wan 2.1
        dit_config = {}
        dit_config["image_model"] = "wan2.1"
@@ -761,11 +713,6 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
        dit_config["enc_h"] = state_dict['{}encoder.pan_blocks.1.cv4.conv.weight'.format(key_prefix)].shape[0]
        return dit_config

-    if '{}layers.0.mlp.linear_fc2.weight'.format(key_prefix) in state_dict_keys: # Ernie Image
-        dit_config = {}
-        dit_config["image_model"] = "ernie"
-        return dit_config
-
    if '{}input_blocks.0.0.weight'.format(key_prefix) not in state_dict_keys:
        return None

--- a/comfy/model_management.py
+++ b/comfy/model_management.py
@@ -1732,21 +1732,6 @@ def supports_mxfp8_compute(device=None):

    return True

-def supports_fp64(device=None):
-    if is_device_mps(device):
-        return False
-
-    if is_intel_xpu():
-        return False
-
-    if is_directml_enabled():
-        return False
-
-    if is_ixuca():
-        return False
-
-    return True
-
 def extended_fp16_support():
    # TODO: check why some models work with fp16 on newer torch versions but not on older
    if torch_version_numeric < (2, 7):
--- a/comfy/model_sampling.py
+++ b/comfy/model_sampling.py
@@ -54,30 +54,6 @@ class V_PREDICTION(EPS):
        sigma = reshape_sigma(sigma, model_output.ndim)
        return model_input * self.sigma_data ** 2 / (sigma ** 2 + self.sigma_data ** 2) - model_output * sigma * self.sigma_data / (sigma ** 2 + self.sigma_data ** 2) ** 0.5

-class V_PREDICTION_DDPM:
-    """CogVideoX v-prediction: model receives raw x_t (unscaled), predicts velocity v.
-    x_0 = sqrt(alpha) * x_t - sqrt(1-alpha) * v
-        = x_t / sqrt(sigma^2 + 1) - v * sigma / sqrt(sigma^2 + 1)
-    """
-    def calculate_input(self, sigma, noise):
-        return noise
-
-    def calculate_denoised(self, sigma, model_output, model_input):
-        sigma = reshape_sigma(sigma, model_output.ndim)
-        return model_input / (sigma ** 2 + 1.0) ** 0.5 - model_output * sigma / (sigma ** 2 + 1.0) ** 0.5
-
-    def noise_scaling(self, sigma, noise, latent_image, max_denoise=False):
-        sigma = reshape_sigma(sigma, noise.ndim)
-        if max_denoise:
-            noise = noise * torch.sqrt(1.0 + sigma ** 2.0)
-        else:
-            noise = noise * sigma
-        noise += latent_image
-        return noise
-
-    def inverse_noise_scaling(self, sigma, latent):
-        return latent
-
 class EDM(V_PREDICTION):
    def calculate_denoised(self, sigma, model_output, model_input):
        sigma = reshape_sigma(sigma, model_output.ndim)
--- a/comfy/sd.py
+++ b/comfy/sd.py
@@ -17,7 +17,6 @@ import comfy.ldm.wan.vae
 import comfy.ldm.wan.vae2_2
 import comfy.ldm.hunyuan3d.vae
 import comfy.ldm.ace.vae.music_dcae_pipeline
-import comfy.ldm.cogvideo.vae
 import comfy.ldm.hunyuan_video.vae
 import comfy.ldm.mmaudio.vae.autoencoder
 import comfy.pixel_space_convert
@@ -63,7 +62,6 @@ import comfy.text_encoders.anima
 import comfy.text_encoders.ace15
 import comfy.text_encoders.longcat_image
 import comfy.text_encoders.qwen35
-import comfy.text_encoders.ernie

 import comfy.model_patcher
 import comfy.lora
@@ -652,17 +650,6 @@ class VAE:

                self.memory_used_encode = lambda shape, dtype: (1400 * 9 * shape[-2] * shape[-1]) * model_management.dtype_size(dtype)
                self.memory_used_decode = lambda shape, dtype: (3600 * 4 * shape[-2] * shape[-1] * 16 * 16) * model_management.dtype_size(dtype)
-            elif "decoder.conv_in.conv.weight" in sd and "decoder.mid_block.resnets.0.norm1.norm_layer.weight" in sd:  # CogVideoX VAE
-                self.upscale_ratio = (lambda a: max(0, a * 4 - 3), 8, 8)
-                self.upscale_index_formula = (4, 8, 8)
-                self.downscale_ratio = (lambda a: max(0, math.floor((a + 3) / 4)), 8, 8)
-                self.downscale_index_formula = (4, 8, 8)
-                self.latent_dim = 3
-                self.latent_channels = sd["encoder.conv_out.conv.weight"].shape[0] // 2
-                self.first_stage_model = comfy.ldm.cogvideo.vae.AutoencoderKLCogVideoX(latent_channels=self.latent_channels)
-                self.memory_used_decode = lambda shape, dtype: (2800 * max(2, ((shape[2] - 1) * 4) + 1) * shape[3] * shape[4] * (8 * 8)) * model_management.dtype_size(dtype)
-                self.memory_used_encode = lambda shape, dtype: (1400 * max(1, shape[2]) * shape[3] * shape[4]) * model_management.dtype_size(dtype)
-                self.working_dtypes = [torch.bfloat16, torch.float16, torch.float32]
            elif "decoder.conv_in.conv.weight" in sd:
                ddconfig = {'double_z': True, 'z_channels': 4, 'resolution': 256, 'in_channels': 3, 'out_ch': 3, 'ch': 128, 'ch_mult': [1, 2, 4, 4], 'num_res_blocks': 2, 'attn_resolutions': [], 'dropout': 0.0}
                ddconfig["conv3d"] = True
@@ -1248,7 +1235,6 @@ class TEModel(Enum):
    QWEN35_4B = 25
    QWEN35_9B = 26
    QWEN35_27B = 27
-    MINISTRAL_3_3B = 28


 def detect_te_model(sd):
@@ -1315,8 +1301,6 @@ def detect_te_model(sd):
                return TEModel.MISTRAL3_24B
            else:
                return TEModel.MISTRAL3_24B_PRUNED_FLUX2
-        if weight.shape[0] == 3072:
-            return TEModel.MINISTRAL_3_3B

        return TEModel.LLAMA3_8
    return None
@@ -1474,10 +1458,6 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
        elif te_model == TEModel.QWEN3_06B:
            clip_target.clip = comfy.text_encoders.anima.te(**llama_detect(clip_data))
            clip_target.tokenizer = comfy.text_encoders.anima.AnimaTokenizer
-        elif te_model == TEModel.MINISTRAL_3_3B:
-            clip_target.clip = comfy.text_encoders.ernie.te(**llama_detect(clip_data))
-            clip_target.tokenizer = comfy.text_encoders.ernie.ErnieTokenizer
-            tokenizer_data["tekken_model"] = clip_data[0].get("tekken_model", None)
        else:
            # clip_l
            if clip_type == CLIPType.SD3:
--- a/comfy/supported_models.py
+++ b/comfy/supported_models.py
@@ -26,8 +26,6 @@ import comfy.text_encoders.z_image
 import comfy.text_encoders.anima
 import comfy.text_encoders.ace15
 import comfy.text_encoders.longcat_image
-import comfy.text_encoders.ernie
-import comfy.text_encoders.cogvideo

 from . import supported_models_base
 from . import latent_formats
@@ -1751,83 +1749,6 @@ class RT_DETR_v4(supported_models_base.BASE):
    def clip_target(self, state_dict={}):
        return None

-
-class ErnieImage(supported_models_base.BASE):
-    unet_config = {
-        "image_model": "ernie",
-    }
-
-    sampling_settings = {
-        "multiplier": 1000.0,
-        "shift": 3.0,
-    }
-
-    memory_usage_factor = 10.0
-
-    unet_extra_config = {}
-    latent_format = latent_formats.Flux2
-
-    supported_inference_dtypes = [torch.bfloat16, torch.float32]
-
-    vae_key_prefix = ["vae."]
-    text_encoder_key_prefix = ["text_encoders."]
-
-    def get_model(self, state_dict, prefix="", device=None):
-        out = model_base.ErnieImage(self, device=device)
-        return out
-
-    def clip_target(self, state_dict={}):
-        pref = self.text_encoder_key_prefix[0]
-        hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}ministral3_3b.transformer.".format(pref))
-        return supported_models_base.ClipTarget(comfy.text_encoders.ernie.ErnieTokenizer, comfy.text_encoders.ernie.te(**hunyuan_detect))
-
-
-class CogVideoX_T2V(supported_models_base.BASE):
-    unet_config = {
-        "image_model": "cogvideox",
-    }
-
-    sampling_settings = {
-        "linear_start": 0.00085,
-        "linear_end": 0.012,
-        "beta_schedule": "linear",
-        "zsnr": True,
-    }
-
-    unet_extra_config = {}
-    latent_format = latent_formats.CogVideoX
-
-    supported_inference_dtypes = [torch.bfloat16, torch.float16, torch.float32]
-
-    vae_key_prefix = ["vae."]
-    text_encoder_key_prefix = ["text_encoders."]
-
-    def get_model(self, state_dict, prefix="", device=None):
-        # CogVideoX 1.5 (patch_size_t=2) has different training base dimensions for RoPE
-        if self.unet_config.get("patch_size_t") is not None:
-            self.unet_config.setdefault("sample_height", 96)
-            self.unet_config.setdefault("sample_width", 170)
-            self.unet_config.setdefault("sample_frames", 81)
-        out = model_base.CogVideoX(self, device=device)
-        return out
-
-    def clip_target(self, state_dict={}):
-        return supported_models_base.ClipTarget(comfy.text_encoders.cogvideo.CogVideoXT5Tokenizer, comfy.text_encoders.sd3_clip.T5XXLModel)
-
-class CogVideoX_I2V(CogVideoX_T2V):
-    unet_config = {
-        "image_model": "cogvideox",
-        "in_channels": 32,
-    }
-
-    def get_model(self, state_dict, prefix="", device=None):
-        if self.unet_config.get("patch_size_t") is not None:
-            self.unet_config.setdefault("sample_height", 96)
-            self.unet_config.setdefault("sample_width", 170)
-            self.unet_config.setdefault("sample_frames", 81)
-        out = model_base.CogVideoX(self, image_to_video=True, device=device)
-        return out
-
-models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, LongCatImage, FluxSchnell, GenmoMochi, LTXV, LTXAV, HunyuanVideo15_SR_Distilled, HunyuanVideo15, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, ZImagePixelSpace, ZImage, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, WAN22_Animate, WAN21_FlowRVS, WAN21_SCAIL, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, ACEStep15, Omnigen2, QwenImage, Flux2, Kandinsky5Image, Kandinsky5, Anima, RT_DETR_v4, ErnieImage, CogVideoX_I2V, CogVideoX_T2V]
+models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, LongCatImage, FluxSchnell, GenmoMochi, LTXV, LTXAV, HunyuanVideo15_SR_Distilled, HunyuanVideo15, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, ZImagePixelSpace, ZImage, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, WAN22_Animate, WAN21_FlowRVS, WAN21_SCAIL, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, ACEStep15, Omnigen2, QwenImage, Flux2, Kandinsky5Image, Kandinsky5, Anima, RT_DETR_v4]

 models += [SVD_img2vid]
--- a/comfy/text_encoders/cogvideo.py
+++ b/comfy/text_encoders/cogvideo.py
@@ -1,6 +0,0 @@
-import comfy.text_encoders.sd3_clip
-
-
-class CogVideoXT5Tokenizer(comfy.text_encoders.sd3_clip.T5XXLTokenizer):
-    def __init__(self, embedding_directory=None, tokenizer_data={}):
-        super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, min_length=226)
--- a/comfy/text_encoders/ernie.py
+++ b/comfy/text_encoders/ernie.py
@@ -1,38 +0,0 @@
-from .flux import Mistral3Tokenizer
-from comfy import sd1_clip
-import comfy.text_encoders.llama
-
-class Ministral3_3BTokenizer(Mistral3Tokenizer):
-    def __init__(self, embedding_directory=None, embedding_size=5120, embedding_key='ministral3_3b', tokenizer_data={}):
-        return super().__init__(embedding_directory=embedding_directory, embedding_size=embedding_size, embedding_key=embedding_key, tokenizer_data=tokenizer_data)
-
-class ErnieTokenizer(sd1_clip.SD1Tokenizer):
-    def __init__(self, embedding_directory=None, tokenizer_data={}):
-        super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, name="ministral3_3b", tokenizer=Mistral3Tokenizer)
-
-    def tokenize_with_weights(self, text, return_word_ids=False, llama_template=None, **kwargs):
-        tokens = super().tokenize_with_weights(text, return_word_ids=return_word_ids, disable_weights=True, **kwargs)
-        return tokens
-
-
-class Ministral3_3BModel(sd1_clip.SDClipModel):
-    def __init__(self, device="cpu", layer="hidden", layer_idx=-2, dtype=None, attention_mask=True, model_options={}):
-        textmodel_json_config = {}
-        super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype, special_tokens={"start": 1, "pad": 0}, layer_norm_hidden_state=False, model_class=comfy.text_encoders.llama.Ministral3_3B, enable_attention_masks=attention_mask, return_attention_masks=attention_mask, model_options=model_options)
-
-
-class ErnieTEModel(sd1_clip.SD1ClipModel):
-    def __init__(self, device="cpu", dtype=None, model_options={}, name="ministral3_3b", clip_model=Ministral3_3BModel):
-        super().__init__(device=device, dtype=dtype, name=name, clip_model=clip_model, model_options=model_options)
-
-
-def te(dtype_llama=None, llama_quantization_metadata=None):
-    class ErnieTEModel_(ErnieTEModel):
-        def __init__(self, device="cpu", dtype=None, model_options={}):
-            if dtype_llama is not None:
-                dtype = dtype_llama
-            if llama_quantization_metadata is not None:
-                model_options = model_options.copy()
-                model_options["quantization_metadata"] = llama_quantization_metadata
-            super().__init__(device=device, dtype=dtype, model_options=model_options)
-    return ErnieTEModel
--- a/comfy/text_encoders/flux.py
+++ b/comfy/text_encoders/flux.py
@@ -116,9 +116,9 @@ class MistralTokenizerClass:
        return LlamaTokenizerFast(**kwargs)

 class Mistral3Tokenizer(sd1_clip.SDTokenizer):
-    def __init__(self, embedding_directory=None, embedding_size=5120, embedding_key='mistral3_24b', tokenizer_data={}):
+    def __init__(self, embedding_directory=None, tokenizer_data={}):
        self.tekken_data = tokenizer_data.get("tekken_model", None)
-        super().__init__("", pad_with_end=False, embedding_directory=embedding_directory, embedding_size=embedding_size, embedding_key=embedding_key, tokenizer_class=MistralTokenizerClass, has_end_token=False, pad_to_max_length=False, pad_token=11, start_token=1, max_length=99999999, min_length=1, pad_left=True, disable_weights=True, tokenizer_args=load_mistral_tokenizer(self.tekken_data), tokenizer_data=tokenizer_data)
+        super().__init__("", pad_with_end=False, embedding_directory=embedding_directory, embedding_size=5120, embedding_key='mistral3_24b', tokenizer_class=MistralTokenizerClass, has_end_token=False, pad_to_max_length=False, pad_token=11, start_token=1, max_length=99999999, min_length=1, pad_left=True, tokenizer_args=load_mistral_tokenizer(self.tekken_data), tokenizer_data=tokenizer_data)

    def state_dict(self):
        return {"tekken_model": self.tekken_data}
--- a/comfy/text_encoders/llama.py
+++ b/comfy/text_encoders/llama.py
@@ -60,30 +60,6 @@ class Mistral3Small24BConfig:
    final_norm: bool = True
    lm_head: bool = False

-@dataclass
-class Ministral3_3BConfig:
-    vocab_size: int = 131072
-    hidden_size: int = 3072
-    intermediate_size: int = 9216
-    num_hidden_layers: int = 26
-    num_attention_heads: int = 32
-    num_key_value_heads: int = 8
-    max_position_embeddings: int = 262144
-    rms_norm_eps: float = 1e-5
-    rope_theta: float = 1000000.0
-    transformer_type: str = "llama"
-    head_dim = 128
-    rms_norm_add = False
-    mlp_activation = "silu"
-    qkv_bias = False
-    rope_dims = None
-    q_norm = None
-    k_norm = None
-    rope_scale = None
-    final_norm: bool = True
-    lm_head: bool = False
-    stop_tokens = [2]
-
 @dataclass
 class Qwen25_3BConfig:
    vocab_size: int = 151936
@@ -970,15 +946,6 @@ class Mistral3Small24B(BaseLlama, torch.nn.Module):
        self.model = Llama2_(config, device=device, dtype=dtype, ops=operations)
        self.dtype = dtype

-class Ministral3_3B(BaseLlama, BaseQwen3, BaseGenerate, torch.nn.Module):
-    def __init__(self, config_dict, dtype, device, operations):
-        super().__init__()
-        config = Ministral3_3BConfig(**config_dict)
-        self.num_layers = config.num_hidden_layers
-
-        self.model = Llama2_(config, device=device, dtype=dtype, ops=operations)
-        self.dtype = dtype
-
 class Qwen25_3B(BaseLlama, torch.nn.Module):
    def __init__(self, config_dict, dtype, device, operations):
        super().__init__()
--- a/comfy_api_nodes/apis/bytedance.py
+++ b/comfy_api_nodes/apis/bytedance.py
@@ -52,26 +52,6 @@ class TaskImageContent(BaseModel):
    role: Literal["first_frame", "last_frame", "reference_image"] | None = Field(None)


-class TaskVideoContentUrl(BaseModel):
-    url: str = Field(...)
-
-
-class TaskVideoContent(BaseModel):
-    type: str = Field("video_url")
-    video_url: TaskVideoContentUrl = Field(...)
-    role: str = Field("reference_video")
-
-
-class TaskAudioContentUrl(BaseModel):
-    url: str = Field(...)
-
-
-class TaskAudioContent(BaseModel):
-    type: str = Field("audio_url")
-    audio_url: TaskAudioContentUrl = Field(...)
-    role: str = Field("reference_audio")
-
-
 class Text2VideoTaskCreationRequest(BaseModel):
    model: str = Field(...)
    content: list[TaskTextContent] = Field(..., min_length=1)
@@ -84,17 +64,6 @@ class Image2VideoTaskCreationRequest(BaseModel):
    generate_audio: bool | None = Field(...)


-class Seedance2TaskCreationRequest(BaseModel):
-    model: str = Field(...)
-    content: list[TaskTextContent | TaskImageContent | TaskVideoContent | TaskAudioContent] = Field(..., min_length=1)
-    generate_audio: bool | None = Field(None)
-    resolution: str | None = Field(None)
-    ratio: str | None = Field(None)
-    duration: int | None = Field(None, ge=4, le=15)
-    seed: int | None = Field(None, ge=0, le=2147483647)
-    watermark: bool | None = Field(None)
-
-
 class TaskCreationResponse(BaseModel):
    id: str = Field(...)

@@ -108,27 +77,12 @@ class TaskStatusResult(BaseModel):
    video_url: str = Field(...)


-class TaskStatusUsage(BaseModel):
-    completion_tokens: int = Field(0)
-    total_tokens: int = Field(0)
-
-
 class TaskStatusResponse(BaseModel):
    id: str = Field(...)
    model: str = Field(...)
    status: Literal["queued", "running", "cancelled", "succeeded", "failed"] = Field(...)
    error: TaskStatusError | None = Field(None)
    content: TaskStatusResult | None = Field(None)
-    usage: TaskStatusUsage | None = Field(None)
-
-
-# Dollars per 1K tokens, keyed by (model_id, has_video_input).
-SEEDANCE2_PRICE_PER_1K_TOKENS = {
-    ("dreamina-seedance-2-0-260128", False): 0.007,
-    ("dreamina-seedance-2-0-260128", True): 0.0043,
-    ("dreamina-seedance-2-0-fast-260128", False): 0.0056,
-    ("dreamina-seedance-2-0-fast-260128", True): 0.0033,
-}


 RECOMMENDED_PRESETS = [
@@ -158,12 +112,6 @@ RECOMMENDED_PRESETS_SEEDREAM_4 = [
    ("Custom", None, None),
 ]

-# Seedance 2.0 reference video pixel count limits per model.
-SEEDANCE2_REF_VIDEO_PIXEL_LIMITS = {
-    "dreamina-seedance-2-0-260128": {"min": 409_600, "max": 927_408},
-    "dreamina-seedance-2-0-fast-260128": {"min": 409_600, "max": 927_408},
-}
-
 # The time in this dictionary are given for 10 seconds duration.
 VIDEO_TASKS_EXECUTION_TIME = {
    "seedance-1-0-lite-t2v-250428": {
--- a/comfy_api_nodes/nodes_bytedance.py
+++ b/comfy_api_nodes/nodes_bytedance.py
@@ -8,23 +8,16 @@ from comfy_api.latest import IO, ComfyExtension, Input
 from comfy_api_nodes.apis.bytedance import (
    RECOMMENDED_PRESETS,
    RECOMMENDED_PRESETS_SEEDREAM_4,
-    SEEDANCE2_PRICE_PER_1K_TOKENS,
-    SEEDANCE2_REF_VIDEO_PIXEL_LIMITS,
    VIDEO_TASKS_EXECUTION_TIME,
    Image2VideoTaskCreationRequest,
    ImageTaskCreationResponse,
-    Seedance2TaskCreationRequest,
    Seedream4Options,
    Seedream4TaskCreationRequest,
-    TaskAudioContent,
-    TaskAudioContentUrl,
    TaskCreationResponse,
    TaskImageContent,
    TaskImageContentUrl,
    TaskStatusResponse,
    TaskTextContent,
-    TaskVideoContent,
-    TaskVideoContentUrl,
    Text2ImageTaskCreationRequest,
    Text2VideoTaskCreationRequest,
 )
@@ -36,10 +29,7 @@ from comfy_api_nodes.util import (
    image_tensor_pair_to_batch,
    poll_op,
    sync_op,
-    upload_audio_to_comfyapi,
-    upload_image_to_comfyapi,
    upload_images_to_comfyapi,
-    upload_video_to_comfyapi,
    validate_image_aspect_ratio,
    validate_image_dimensions,
    validate_string,
@@ -56,56 +46,12 @@ SEEDREAM_MODELS = {
 # Long-running tasks endpoints(e.g., video)
 BYTEPLUS_TASK_ENDPOINT = "/proxy/byteplus/api/v3/contents/generations/tasks"
 BYTEPLUS_TASK_STATUS_ENDPOINT = "/proxy/byteplus/api/v3/contents/generations/tasks"  # + /{task_id}
-BYTEPLUS_SEEDANCE2_TASK_STATUS_ENDPOINT = "/proxy/byteplus-seedance2/api/v3/contents/generations/tasks"  # + /{task_id}
-
-SEEDANCE_MODELS = {
-    "Seedance 2.0": "dreamina-seedance-2-0-260128",
-    "Seedance 2.0 Fast": "dreamina-seedance-2-0-fast-260128",
-}

 DEPRECATED_MODELS = {"seedance-1-0-lite-t2v-250428", "seedance-1-0-lite-i2v-250428"}

-
 logger = logging.getLogger(__name__)


-def _validate_ref_video_pixels(video: Input.Video, model_id: str, index: int) -> None:
-    """Validate reference video pixel count against Seedance 2.0 model limits."""
-    limits = SEEDANCE2_REF_VIDEO_PIXEL_LIMITS.get(model_id)
-    if not limits:
-        return
-    try:
-        w, h = video.get_dimensions()
-    except Exception:
-        return
-    pixels = w * h
-    min_px = limits.get("min")
-    max_px = limits.get("max")
-    if min_px and pixels < min_px:
-        raise ValueError(
-            f"Reference video {index} is too small: {w}x{h} = {pixels:,}px. " f"Minimum is {min_px:,}px for this model."
-        )
-    if max_px and pixels > max_px:
-        raise ValueError(
-            f"Reference video {index} is too large: {w}x{h} = {pixels:,}px. "
-            f"Maximum is {max_px:,}px for this model. Try downscaling the video."
-        )
-
-
-def _seedance2_price_extractor(model_id: str, has_video_input: bool):
-    """Returns a price_extractor closure for Seedance 2.0 poll_op."""
-    rate = SEEDANCE2_PRICE_PER_1K_TOKENS.get((model_id, has_video_input))
-    if rate is None:
-        return None
-
-    def extractor(response: TaskStatusResponse) -> float | None:
-        if response.usage is None:
-            return None
-        return response.usage.total_tokens * 1.43 * rate / 1_000.0
-
-    return extractor
-
-
 def get_image_url_from_response(response: ImageTaskCreationResponse) -> str:
    if response.error:
        error_msg = f"ByteDance request failed. Code: {response.error['code']}, message: {response.error['message']}"
@@ -389,7 +335,8 @@ class ByteDanceSeedreamNode(IO.ComfyNode):
        mp_provided = out_num_pixels / 1_000_000.0
        if ("seedream-4-5" in model or "seedream-5-0" in model) and out_num_pixels < 3686400:
            raise ValueError(
-                f"Minimum image resolution for the selected model is 3.68MP, " f"but {mp_provided:.2f}MP provided."
+                f"Minimum image resolution for the selected model is 3.68MP, "
+                f"but {mp_provided:.2f}MP provided."
            )
        if "seedream-4-0" in model and out_num_pixels < 921600:
            raise ValueError(
@@ -1005,6 +952,33 @@ class ByteDanceImageReferenceNode(IO.ComfyNode):
        )


+async def process_video_task(
+    cls: type[IO.ComfyNode],
+    payload: Text2VideoTaskCreationRequest | Image2VideoTaskCreationRequest,
+    estimated_duration: int | None,
+) -> IO.NodeOutput:
+    if payload.model in DEPRECATED_MODELS:
+        logger.warning(
+            "Model '%s' is deprecated and will be deactivated on May 13, 2026. "
+            "Please switch to a newer model. Recommended: seedance-1-0-pro-fast-251015.",
+            payload.model,
+        )
+    initial_response = await sync_op(
+        cls,
+        ApiEndpoint(path=BYTEPLUS_TASK_ENDPOINT, method="POST"),
+        data=payload,
+        response_model=TaskCreationResponse,
+    )
+    response = await poll_op(
+        cls,
+        ApiEndpoint(path=f"{BYTEPLUS_TASK_STATUS_ENDPOINT}/{initial_response.id}"),
+        status_extractor=lambda r: r.status,
+        estimated_duration=estimated_duration,
+        response_model=TaskStatusResponse,
+    )
+    return IO.NodeOutput(await download_url_to_video_output(response.content.video_url))
+
+
 def raise_if_text_params(prompt: str, text_params: list[str]) -> None:
    for i in text_params:
        if f"--{i} " in prompt:
@@ -1066,530 +1040,6 @@ PRICE_BADGE_VIDEO = IO.PriceBadge(
 )


-def _seedance2_text_inputs():
-    return [
-        IO.String.Input(
-            "prompt",
-            multiline=True,
-            default="",
-            tooltip="Text prompt for video generation.",
-        ),
-        IO.Combo.Input(
-            "resolution",
-            options=["480p", "720p"],
-            tooltip="Resolution of the output video.",
-        ),
-        IO.Combo.Input(
-            "ratio",
-            options=["16:9", "4:3", "1:1", "3:4", "9:16", "21:9", "adaptive"],
-            tooltip="Aspect ratio of the output video.",
-        ),
-        IO.Int.Input(
-            "duration",
-            default=7,
-            min=4,
-            max=15,
-            step=1,
-            tooltip="Duration of the output video in seconds (4-15).",
-            display_mode=IO.NumberDisplay.slider,
-        ),
-        IO.Boolean.Input(
-            "generate_audio",
-            default=True,
-            tooltip="Enable audio generation for the output video.",
-        ),
-    ]
-
-
-class ByteDance2TextToVideoNode(IO.ComfyNode):
-
-    @classmethod
-    def define_schema(cls):
-        return IO.Schema(
-            node_id="ByteDance2TextToVideoNode",
-            display_name="ByteDance Seedance 2.0 Text to Video",
-            category="api node/video/ByteDance",
-            description="Generate video using Seedance 2.0 models based on a text prompt.",
-            inputs=[
-                IO.DynamicCombo.Input(
-                    "model",
-                    options=[
-                        IO.DynamicCombo.Option("Seedance 2.0", _seedance2_text_inputs()),
-                        IO.DynamicCombo.Option("Seedance 2.0 Fast", _seedance2_text_inputs()),
-                    ],
-                    tooltip="Seedance 2.0 for maximum quality; Seedance 2.0 Fast for speed optimization.",
-                ),
-                IO.Int.Input(
-                    "seed",
-                    default=0,
-                    min=0,
-                    max=2147483647,
-                    step=1,
-                    display_mode=IO.NumberDisplay.number,
-                    control_after_generate=True,
-                    tooltip="Seed controls whether the node should re-run; "
-                    "results are non-deterministic regardless of seed.",
-                ),
-                IO.Boolean.Input(
-                    "watermark",
-                    default=False,
-                    tooltip="Whether to add a watermark to the video.",
-                    advanced=True,
-                ),
-            ],
-            outputs=[
-                IO.Video.Output(),
-            ],
-            hidden=[
-                IO.Hidden.auth_token_comfy_org,
-                IO.Hidden.api_key_comfy_org,
-                IO.Hidden.unique_id,
-            ],
-            is_api_node=True,
-            price_badge=IO.PriceBadge(
-                depends_on=IO.PriceBadgeDepends(widgets=["model", "model.resolution", "model.duration"]),
-                expr="""
-                (
-                  $rate480 := 10044;
-                  $rate720 := 21600;
-                  $m := widgets.model;
-                  $pricePer1K := $contains($m, "fast") ? 0.008008 : 0.01001;
-                  $res := $lookup(widgets, "model.resolution");
-                  $dur := $lookup(widgets, "model.duration");
-                  $rate := $res = "720p" ? $rate720 : $rate480;
-                  $cost := $dur * $rate * $pricePer1K / 1000;
-                  {"type": "usd", "usd": $cost, "format": {"approximate": true}}
-                )
-                """,
-            ),
-        )
-
-    @classmethod
-    async def execute(
-        cls,
-        model: dict,
-        seed: int,
-        watermark: bool,
-    ) -> IO.NodeOutput:
-        validate_string(model["prompt"], strip_whitespace=True, min_length=1)
-        model_id = SEEDANCE_MODELS[model["model"]]
-        initial_response = await sync_op(
-            cls,
-            ApiEndpoint(path=BYTEPLUS_TASK_ENDPOINT, method="POST"),
-            data=Seedance2TaskCreationRequest(
-                model=model_id,
-                content=[TaskTextContent(text=model["prompt"])],
-                generate_audio=model["generate_audio"],
-                resolution=model["resolution"],
-                ratio=model["ratio"],
-                duration=model["duration"],
-                seed=seed,
-                watermark=watermark,
-            ),
-            response_model=TaskCreationResponse,
-        )
-        response = await poll_op(
-            cls,
-            ApiEndpoint(path=f"{BYTEPLUS_SEEDANCE2_TASK_STATUS_ENDPOINT}/{initial_response.id}"),
-            response_model=TaskStatusResponse,
-            status_extractor=lambda r: r.status,
-            price_extractor=_seedance2_price_extractor(model_id, has_video_input=False),
-            poll_interval=9,
-        )
-        return IO.NodeOutput(await download_url_to_video_output(response.content.video_url))
-
-
-class ByteDance2FirstLastFrameNode(IO.ComfyNode):
-
-    @classmethod
-    def define_schema(cls):
-        return IO.Schema(
-            node_id="ByteDance2FirstLastFrameNode",
-            display_name="ByteDance Seedance 2.0 First-Last-Frame to Video",
-            category="api node/video/ByteDance",
-            description="Generate video using Seedance 2.0 from a first frame image and optional last frame image.",
-            inputs=[
-                IO.DynamicCombo.Input(
-                    "model",
-                    options=[
-                        IO.DynamicCombo.Option("Seedance 2.0", _seedance2_text_inputs()),
-                        IO.DynamicCombo.Option("Seedance 2.0 Fast", _seedance2_text_inputs()),
-                    ],
-                    tooltip="Seedance 2.0 for maximum quality; Seedance 2.0 Fast for speed optimization.",
-                ),
-                IO.Image.Input(
-                    "first_frame",
-                    tooltip="First frame image for the video.",
-                ),
-                IO.Image.Input(
-                    "last_frame",
-                    tooltip="Last frame image for the video.",
-                    optional=True,
-                ),
-                IO.Int.Input(
-                    "seed",
-                    default=0,
-                    min=0,
-                    max=2147483647,
-                    step=1,
-                    display_mode=IO.NumberDisplay.number,
-                    control_after_generate=True,
-                    tooltip="Seed controls whether the node should re-run; "
-                    "results are non-deterministic regardless of seed.",
-                ),
-                IO.Boolean.Input(
-                    "watermark",
-                    default=False,
-                    tooltip="Whether to add a watermark to the video.",
-                    advanced=True,
-                ),
-            ],
-            outputs=[
-                IO.Video.Output(),
-            ],
-            hidden=[
-                IO.Hidden.auth_token_comfy_org,
-                IO.Hidden.api_key_comfy_org,
-                IO.Hidden.unique_id,
-            ],
-            is_api_node=True,
-            price_badge=IO.PriceBadge(
-                depends_on=IO.PriceBadgeDepends(widgets=["model", "model.resolution", "model.duration"]),
-                expr="""
-                (
-                  $rate480 := 10044;
-                  $rate720 := 21600;
-                  $m := widgets.model;
-                  $pricePer1K := $contains($m, "fast") ? 0.008008 : 0.01001;
-                  $res := $lookup(widgets, "model.resolution");
-                  $dur := $lookup(widgets, "model.duration");
-                  $rate := $res = "720p" ? $rate720 : $rate480;
-                  $cost := $dur * $rate * $pricePer1K / 1000;
-                  {"type": "usd", "usd": $cost, "format": {"approximate": true}}
-                )
-                """,
-            ),
-        )
-
-    @classmethod
-    async def execute(
-        cls,
-        model: dict,
-        first_frame: Input.Image,
-        seed: int,
-        watermark: bool,
-        last_frame: Input.Image | None = None,
-    ) -> IO.NodeOutput:
-        validate_string(model["prompt"], strip_whitespace=True, min_length=1)
-        model_id = SEEDANCE_MODELS[model["model"]]
-
-        content: list[TaskTextContent | TaskImageContent] = [
-            TaskTextContent(text=model["prompt"]),
-            TaskImageContent(
-                image_url=TaskImageContentUrl(
-                    url=await upload_image_to_comfyapi(cls, first_frame, wait_label="Uploading first frame.")
-                ),
-                role="first_frame",
-            ),
-        ]
-        if last_frame is not None:
-            content.append(
-                TaskImageContent(
-                    image_url=TaskImageContentUrl(
-                        url=await upload_image_to_comfyapi(cls, last_frame, wait_label="Uploading last frame.")
-                    ),
-                    role="last_frame",
-                ),
-            )
-
-        initial_response = await sync_op(
-            cls,
-            ApiEndpoint(path=BYTEPLUS_TASK_ENDPOINT, method="POST"),
-            data=Seedance2TaskCreationRequest(
-                model=model_id,
-                content=content,
-                generate_audio=model["generate_audio"],
-                resolution=model["resolution"],
-                ratio=model["ratio"],
-                duration=model["duration"],
-                seed=seed,
-                watermark=watermark,
-            ),
-            response_model=TaskCreationResponse,
-        )
-        response = await poll_op(
-            cls,
-            ApiEndpoint(path=f"{BYTEPLUS_SEEDANCE2_TASK_STATUS_ENDPOINT}/{initial_response.id}"),
-            response_model=TaskStatusResponse,
-            status_extractor=lambda r: r.status,
-            price_extractor=_seedance2_price_extractor(model_id, has_video_input=False),
-            poll_interval=9,
-        )
-        return IO.NodeOutput(await download_url_to_video_output(response.content.video_url))
-
-
-def _seedance2_reference_inputs():
-    return [
-        *_seedance2_text_inputs(),
-        IO.Autogrow.Input(
-            "reference_images",
-            template=IO.Autogrow.TemplateNames(
-                IO.Image.Input("reference_image"),
-                names=[
-                    "image_1",
-                    "image_2",
-                    "image_3",
-                    "image_4",
-                    "image_5",
-                    "image_6",
-                    "image_7",
-                    "image_8",
-                    "image_9",
-                ],
-                min=0,
-            ),
-        ),
-        IO.Autogrow.Input(
-            "reference_videos",
-            template=IO.Autogrow.TemplateNames(
-                IO.Video.Input("reference_video"),
-                names=["video_1", "video_2", "video_3"],
-                min=0,
-            ),
-        ),
-        IO.Autogrow.Input(
-            "reference_audios",
-            template=IO.Autogrow.TemplateNames(
-                IO.Audio.Input("reference_audio"),
-                names=["audio_1", "audio_2", "audio_3"],
-                min=0,
-            ),
-        ),
-    ]
-
-
-class ByteDance2ReferenceNode(IO.ComfyNode):
-
-    @classmethod
-    def define_schema(cls):
-        return IO.Schema(
-            node_id="ByteDance2ReferenceNode",
-            display_name="ByteDance Seedance 2.0 Reference to Video",
-            category="api node/video/ByteDance",
-            description="Generate, edit, or extend video using Seedance 2.0 with reference images, "
-            "videos, and audio. Supports multimodal reference, video editing, and video extension.",
-            inputs=[
-                IO.DynamicCombo.Input(
-                    "model",
-                    options=[
-                        IO.DynamicCombo.Option("Seedance 2.0", _seedance2_reference_inputs()),
-                        IO.DynamicCombo.Option("Seedance 2.0 Fast", _seedance2_reference_inputs()),
-                    ],
-                    tooltip="Seedance 2.0 for maximum quality; Seedance 2.0 Fast for speed optimization.",
-                ),
-                IO.Int.Input(
-                    "seed",
-                    default=0,
-                    min=0,
-                    max=2147483647,
-                    step=1,
-                    display_mode=IO.NumberDisplay.number,
-                    control_after_generate=True,
-                    tooltip="Seed controls whether the node should re-run; "
-                    "results are non-deterministic regardless of seed.",
-                ),
-                IO.Boolean.Input(
-                    "watermark",
-                    default=False,
-                    tooltip="Whether to add a watermark to the video.",
-                    advanced=True,
-                ),
-            ],
-            outputs=[
-                IO.Video.Output(),
-            ],
-            hidden=[
-                IO.Hidden.auth_token_comfy_org,
-                IO.Hidden.api_key_comfy_org,
-                IO.Hidden.unique_id,
-            ],
-            is_api_node=True,
-            price_badge=IO.PriceBadge(
-                depends_on=IO.PriceBadgeDepends(
-                    widgets=["model", "model.resolution", "model.duration"],
-                    input_groups=["model.reference_videos"],
-                ),
-                expr="""
-                (
-                  $rate480 := 10044;
-                  $rate720 := 21600;
-                  $m := widgets.model;
-                  $hasVideo := $lookup(inputGroups, "model.reference_videos") > 0;
-                  $noVideoPricePer1K := $contains($m, "fast") ? 0.008008 : 0.01001;
-                  $videoPricePer1K := $contains($m, "fast") ? 0.004719 : 0.006149;
-                  $res := $lookup(widgets, "model.resolution");
-                  $dur := $lookup(widgets, "model.duration");
-                  $rate := $res = "720p" ? $rate720 : $rate480;
-                  $noVideoCost := $dur * $rate * $noVideoPricePer1K / 1000;
-                  $minVideoFactor := $ceil($dur * 5 / 3);
-                  $minVideoCost := $minVideoFactor * $rate * $videoPricePer1K / 1000;
-                  $maxVideoCost := (15 + $dur) * $rate * $videoPricePer1K / 1000;
-                  $hasVideo
-                    ? {
-                        "type": "range_usd",
-                        "min_usd": $minVideoCost,
-                        "max_usd": $maxVideoCost,
-                        "format": {"approximate": true}
-                      }
-                    : {
-                        "type": "usd",
-                        "usd": $noVideoCost,
-                        "format": {"approximate": true}
-                      }
-                )
-                """,
-            ),
-        )
-
-    @classmethod
-    async def execute(
-        cls,
-        model: dict,
-        seed: int,
-        watermark: bool,
-    ) -> IO.NodeOutput:
-        validate_string(model["prompt"], strip_whitespace=True, min_length=1)
-
-        reference_images = model.get("reference_images", {})
-        reference_videos = model.get("reference_videos", {})
-        reference_audios = model.get("reference_audios", {})
-
-        if not reference_images and not reference_videos:
-            raise ValueError("At least one reference image or video is required.")
-
-        model_id = SEEDANCE_MODELS[model["model"]]
-        has_video_input = len(reference_videos) > 0
-        total_video_duration = 0.0
-        for i, key in enumerate(reference_videos, 1):
-            video = reference_videos[key]
-            _validate_ref_video_pixels(video, model_id, i)
-            try:
-                dur = video.get_duration()
-                if dur < 1.8:
-                    raise ValueError(f"Reference video {i} is too short: {dur:.1f}s. Minimum duration is 1.8 seconds.")
-                total_video_duration += dur
-            except ValueError:
-                raise
-            except Exception:
-                pass
-        if total_video_duration > 15.1:
-            raise ValueError(f"Total reference video duration is {total_video_duration:.1f}s. Maximum is 15.1 seconds.")
-
-        total_audio_duration = 0.0
-        for i, key in enumerate(reference_audios, 1):
-            audio = reference_audios[key]
-            dur = int(audio["waveform"].shape[-1]) / int(audio["sample_rate"])
-            if dur < 1.8:
-                raise ValueError(f"Reference audio {i} is too short: {dur:.1f}s. Minimum duration is 1.8 seconds.")
-            total_audio_duration += dur
-        if total_audio_duration > 15.1:
-            raise ValueError(f"Total reference audio duration is {total_audio_duration:.1f}s. Maximum is 15.1 seconds.")
-
-        content: list[TaskTextContent | TaskImageContent | TaskVideoContent | TaskAudioContent] = [
-            TaskTextContent(text=model["prompt"]),
-        ]
-        for i, key in enumerate(reference_images, 1):
-            content.append(
-                TaskImageContent(
-                    image_url=TaskImageContentUrl(
-                        url=await upload_image_to_comfyapi(
-                            cls,
-                            image=reference_images[key],
-                            wait_label=f"Uploading image {i}",
-                        ),
-                    ),
-                    role="reference_image",
-                ),
-            )
-        for i, key in enumerate(reference_videos, 1):
-            content.append(
-                TaskVideoContent(
-                    video_url=TaskVideoContentUrl(
-                        url=await upload_video_to_comfyapi(
-                            cls,
-                            reference_videos[key],
-                            wait_label=f"Uploading video {i}",
-                        ),
-                    ),
-                ),
-            )
-        for key in reference_audios:
-            content.append(
-                TaskAudioContent(
-                    audio_url=TaskAudioContentUrl(
-                        url=await upload_audio_to_comfyapi(
-                            cls,
-                            reference_audios[key],
-                            container_format="mp3",
-                            codec_name="libmp3lame",
-                            mime_type="audio/mpeg",
-                        ),
-                    ),
-                ),
-            )
-        initial_response = await sync_op(
-            cls,
-            ApiEndpoint(path=BYTEPLUS_TASK_ENDPOINT, method="POST"),
-            data=Seedance2TaskCreationRequest(
-                model=model_id,
-                content=content,
-                generate_audio=model["generate_audio"],
-                resolution=model["resolution"],
-                ratio=model["ratio"],
-                duration=model["duration"],
-                seed=seed,
-                watermark=watermark,
-            ),
-            response_model=TaskCreationResponse,
-        )
-        response = await poll_op(
-            cls,
-            ApiEndpoint(path=f"{BYTEPLUS_SEEDANCE2_TASK_STATUS_ENDPOINT}/{initial_response.id}"),
-            response_model=TaskStatusResponse,
-            status_extractor=lambda r: r.status,
-            price_extractor=_seedance2_price_extractor(model_id, has_video_input=has_video_input),
-            poll_interval=9,
-        )
-        return IO.NodeOutput(await download_url_to_video_output(response.content.video_url))
-
-
-async def process_video_task(
-    cls: type[IO.ComfyNode],
-    payload: Text2VideoTaskCreationRequest | Image2VideoTaskCreationRequest,
-    estimated_duration: int | None,
-) -> IO.NodeOutput:
-    if payload.model in DEPRECATED_MODELS:
-        logger.warning(
-            "Model '%s' is deprecated and will be deactivated on May 13, 2026. "
-            "Please switch to a newer model. Recommended: seedance-1-0-pro-fast-251015.",
-            payload.model,
-        )
-    initial_response = await sync_op(
-        cls,
-        ApiEndpoint(path=BYTEPLUS_TASK_ENDPOINT, method="POST"),
-        data=payload,
-        response_model=TaskCreationResponse,
-    )
-    response = await poll_op(
-        cls,
-        ApiEndpoint(path=f"{BYTEPLUS_TASK_STATUS_ENDPOINT}/{initial_response.id}"),
-        status_extractor=lambda r: r.status,
-        estimated_duration=estimated_duration,
-        response_model=TaskStatusResponse,
-    )
-    return IO.NodeOutput(await download_url_to_video_output(response.content.video_url))
-
-
 class ByteDanceExtension(ComfyExtension):
    @override
    async def get_node_list(self) -> list[type[IO.ComfyNode]]:
@@ -1600,9 +1050,6 @@ class ByteDanceExtension(ComfyExtension):
            ByteDanceImageToVideoNode,
            ByteDanceFirstLastFrameNode,
            ByteDanceImageReferenceNode,
-            ByteDance2TextToVideoNode,
-            ByteDance2FirstLastFrameNode,
-            ByteDance2ReferenceNode,
        ]


--- a/comfy_api_nodes/nodes_grok.py
+++ b/comfy_api_nodes/nodes_grok.py
@@ -558,7 +558,7 @@ class GrokVideoReferenceNode(IO.ComfyNode):
                (
                  $res := $lookup(widgets, "model.resolution");
                  $dur := $lookup(widgets, "model.duration");
-                  $refs := $lookup(inputGroups, "model.reference_images");
+                  $refs := inputGroups["model.reference_images"];
                  $rate := $res = "720p" ? 0.07 : 0.05;
                  $price := ($rate * $dur + 0.002 * $refs) * 1.43;
                  {"type":"usd","usd": $price}
--- a/comfy_api_nodes/nodes_sonilo.py
+++ b/comfy_api_nodes/nodes_sonilo.py
@@ -1,287 +0,0 @@
-import base64
-import json
-import logging
-import time
-from urllib.parse import urljoin
-
-import aiohttp
-from typing_extensions import override
-
-from comfy_api.latest import IO, ComfyExtension, Input
-from comfy_api_nodes.util import (
-    ApiEndpoint,
-    audio_bytes_to_audio_input,
-    upload_video_to_comfyapi,
-    validate_string,
-)
-from comfy_api_nodes.util._helpers import (
-    default_base_url,
-    get_auth_header,
-    get_node_id,
-    is_processing_interrupted,
-)
-from comfy_api_nodes.util.common_exceptions import ProcessingInterrupted
-from server import PromptServer
-
-logger = logging.getLogger(__name__)
-
-
-class SoniloVideoToMusic(IO.ComfyNode):
-    """Generate music from video using Sonilo's AI model."""
-
-    @classmethod
-    def define_schema(cls) -> IO.Schema:
-        return IO.Schema(
-            node_id="SoniloVideoToMusic",
-            display_name="Sonilo Video to Music",
-            category="api node/audio/Sonilo",
-            description="Generate music from video content using Sonilo's AI model. "
-            "Analyzes the video and creates matching music.",
-            inputs=[
-                IO.Video.Input(
-                    "video",
-                    tooltip="Input video to generate music from. Maximum duration: 6 minutes.",
-                ),
-                IO.String.Input(
-                    "prompt",
-                    default="",
-                    multiline=True,
-                    tooltip="Optional text prompt to guide music generation. "
-                    "Leave empty for best quality - the model will fully analyze the video content.",
-                ),
-                IO.Int.Input(
-                    "seed",
-                    default=0,
-                    min=0,
-                    max=0xFFFFFFFFFFFFFFFF,
-                    control_after_generate=True,
-                    tooltip="Seed for reproducibility. Currently ignored by the Sonilo "
-                    "service but kept for graph consistency.",
-                ),
-            ],
-            outputs=[IO.Audio.Output()],
-            hidden=[
-                IO.Hidden.auth_token_comfy_org,
-                IO.Hidden.api_key_comfy_org,
-                IO.Hidden.unique_id,
-            ],
-            is_api_node=True,
-            price_badge=IO.PriceBadge(
-                expr='{"type":"usd","usd":0.009,"format":{"suffix":"/second"}}',
-            ),
-        )
-
-    @classmethod
-    async def execute(
-        cls,
-        video: Input.Video,
-        prompt: str = "",
-        seed: int = 0,
-    ) -> IO.NodeOutput:
-        video_url = await upload_video_to_comfyapi(cls, video, max_duration=360)
-        form = aiohttp.FormData()
-        form.add_field("video_url", video_url)
-        if prompt.strip():
-            form.add_field("prompt", prompt.strip())
-        audio_bytes = await _stream_sonilo_music(
-            cls,
-            ApiEndpoint(path="/proxy/sonilo/v2m/generate", method="POST"),
-            form,
-        )
-        return IO.NodeOutput(audio_bytes_to_audio_input(audio_bytes))
-
-
-class SoniloTextToMusic(IO.ComfyNode):
-    """Generate music from a text prompt using Sonilo's AI model."""
-
-    @classmethod
-    def define_schema(cls) -> IO.Schema:
-        return IO.Schema(
-            node_id="SoniloTextToMusic",
-            display_name="Sonilo Text to Music",
-            category="api node/audio/Sonilo",
-            description="Generate music from a text prompt using Sonilo's AI model. "
-            "Leave duration at 0 to let the model infer it from the prompt.",
-            inputs=[
-                IO.String.Input(
-                    "prompt",
-                    default="",
-                    multiline=True,
-                    tooltip="Text prompt describing the music to generate.",
-                ),
-                IO.Int.Input(
-                    "duration",
-                    default=0,
-                    min=0,
-                    max=360,
-                    tooltip="Target duration in seconds. Set to 0 to let the model "
-                    "infer the duration from the prompt. Maximum: 6 minutes.",
-                ),
-                IO.Int.Input(
-                    "seed",
-                    default=0,
-                    min=0,
-                    max=0xFFFFFFFFFFFFFFFF,
-                    control_after_generate=True,
-                    tooltip="Seed for reproducibility. Currently ignored by the Sonilo "
-                    "service but kept for graph consistency.",
-                ),
-            ],
-            outputs=[IO.Audio.Output()],
-            hidden=[
-                IO.Hidden.auth_token_comfy_org,
-                IO.Hidden.api_key_comfy_org,
-                IO.Hidden.unique_id,
-            ],
-            is_api_node=True,
-            price_badge=IO.PriceBadge(
-                depends_on=IO.PriceBadgeDepends(widgets=["duration"]),
-                expr="""
-                (
-                  widgets.duration > 0
-                    ? {"type":"usd","usd": 0.005 * widgets.duration}
-                    : {"type":"usd","usd": 0.005, "format":{"suffix":"/second"}}
-                )
-                """,
-            ),
-        )
-
-    @classmethod
-    async def execute(
-        cls,
-        prompt: str,
-        duration: int = 0,
-        seed: int = 0,
-    ) -> IO.NodeOutput:
-        validate_string(prompt, strip_whitespace=True, min_length=1)
-        form = aiohttp.FormData()
-        form.add_field("prompt", prompt)
-        if duration > 0:
-            form.add_field("duration", str(duration))
-        audio_bytes = await _stream_sonilo_music(
-            cls,
-            ApiEndpoint(path="/proxy/sonilo/t2m/generate", method="POST"),
-            form,
-        )
-        return IO.NodeOutput(audio_bytes_to_audio_input(audio_bytes))
-
-
-async def _stream_sonilo_music(
-    cls: type[IO.ComfyNode],
-    endpoint: ApiEndpoint,
-    form: aiohttp.FormData,
-) -> bytes:
-    """POST ``form`` to Sonilo, read the NDJSON stream, and return the first stream's audio bytes."""
-    url = urljoin(default_base_url().rstrip("/") + "/", endpoint.path.lstrip("/"))
-
-    headers: dict[str, str] = {}
-    headers.update(get_auth_header(cls))
-    headers.update(endpoint.headers)
-
-    node_id = get_node_id(cls)
-    start_ts = time.monotonic()
-    last_chunk_status_ts = 0.0
-    audio_streams: dict[int, list[bytes]] = {}
-    title: str | None = None
-
-    timeout = aiohttp.ClientTimeout(total=1200.0, sock_read=300.0)
-    async with aiohttp.ClientSession(timeout=timeout) as session:
-        PromptServer.instance.send_progress_text("Status: Queued", node_id)
-        async with session.post(url, data=form, headers=headers) as resp:
-            if resp.status >= 400:
-                msg = await _extract_error_message(resp)
-                raise Exception(f"Sonilo API error ({resp.status}): {msg}")
-
-            while True:
-                if is_processing_interrupted():
-                    raise ProcessingInterrupted("Task cancelled")
-
-                raw_line = await resp.content.readline()
-                if not raw_line:
-                    break
-
-                line = raw_line.decode("utf-8").strip()
-                if not line:
-                    continue
-
-                try:
-                    evt = json.loads(line)
-                except json.JSONDecodeError:
-                    logger.warning("Sonilo: skipping malformed NDJSON line")
-                    continue
-
-                evt_type = evt.get("type")
-                if evt_type == "error":
-                    code = evt.get("code", "UNKNOWN")
-                    message = evt.get("message", "Unknown error")
-                    raise Exception(f"Sonilo generation error ({code}): {message}")
-                if evt_type == "duration":
-                    duration_sec = evt.get("duration_sec")
-                    if duration_sec is not None:
-                        PromptServer.instance.send_progress_text(
-                            f"Status: Generating\nVideo duration: {duration_sec:.1f}s",
-                            node_id,
-                        )
-                elif evt_type in ("titles", "title"):
-                    # v2m sends a "titles" list, t2m sends a scalar "title"
-                    if evt_type == "titles":
-                        titles = evt.get("titles", [])
-                        if titles:
-                            title = titles[0]
-                    else:
-                        title = evt.get("title") or title
-                    if title:
-                        PromptServer.instance.send_progress_text(
-                            f"Status: Generating\nTitle: {title}",
-                            node_id,
-                        )
-                elif evt_type == "audio_chunk":
-                    stream_idx = evt.get("stream_index", 0)
-                    chunk_data = base64.b64decode(evt["data"])
-
-                    if stream_idx not in audio_streams:
-                        audio_streams[stream_idx] = []
-                    audio_streams[stream_idx].append(chunk_data)
-
-                    now = time.monotonic()
-                    if now - last_chunk_status_ts >= 1.0:
-                        total_chunks = sum(len(chunks) for chunks in audio_streams.values())
-                        elapsed = int(now - start_ts)
-                        status_lines = ["Status: Receiving audio"]
-                        if title:
-                            status_lines.append(f"Title: {title}")
-                        status_lines.append(f"Chunks received: {total_chunks}")
-                        status_lines.append(f"Time elapsed: {elapsed}s")
-                        PromptServer.instance.send_progress_text("\n".join(status_lines), node_id)
-                        last_chunk_status_ts = now
-                elif evt_type == "complete":
-                    break
-
-    if not audio_streams:
-        raise Exception("Sonilo API returned no audio data.")
-
-    PromptServer.instance.send_progress_text("Status: Completed", node_id)
-    selected_stream = 0 if 0 in audio_streams else min(audio_streams)
-    return b"".join(audio_streams[selected_stream])
-
-
-async def _extract_error_message(resp: aiohttp.ClientResponse) -> str:
-    """Extract a human-readable error message from an HTTP error response."""
-    try:
-        error_body = await resp.json()
-        detail = error_body.get("detail", {})
-        if isinstance(detail, dict):
-            return detail.get("message", str(detail))
-        return str(detail)
-    except Exception:
-        return await resp.text()
-
-
-class SoniloExtension(ComfyExtension):
-    @override
-    async def get_node_list(self) -> list[type[IO.ComfyNode]]:
-        return [SoniloVideoToMusic, SoniloTextToMusic]
-
-
-async def comfy_entrypoint() -> SoniloExtension:
-    return SoniloExtension()
--- a/comfy_extras/nodes_glsl.py
+++ b/comfy_extras/nodes_glsl.py
@@ -1,85 +1,68 @@
 import os
 import sys
 import re
+import ctypes
 import logging
-import ctypes.util
-import importlib.util
 from typing import TypedDict

 import numpy as np
 import torch

 import nodes
+import comfy_angle
 from comfy_api.latest import ComfyExtension, io, ui
 from typing_extensions import override
-from utils.install_util import get_missing_requirements_message

 logger = logging.getLogger(__name__)


-def _check_opengl_availability():
-    """Early check for OpenGL availability. Raises RuntimeError if unlikely to work."""
-    logger.debug("_check_opengl_availability: starting")
-    missing = []
+def _preload_angle():
+    egl_path = comfy_angle.get_egl_path()
+    gles_path = comfy_angle.get_glesv2_path()

-    # Check Python packages (using find_spec to avoid importing)
-    logger.debug("_check_opengl_availability: checking for glfw package")
-    if importlib.util.find_spec("glfw") is None:
-        missing.append("glfw")
+    if sys.platform == "win32":
+        angle_dir = comfy_angle.get_lib_dir()
+        os.add_dll_directory(angle_dir)
+        os.environ["PATH"] = angle_dir + os.pathsep + os.environ.get("PATH", "")

-    logger.debug("_check_opengl_availability: checking for OpenGL package")
-    if importlib.util.find_spec("OpenGL") is None:
-        missing.append("PyOpenGL")
-
-    if missing:
-        raise RuntimeError(
-            f"OpenGL dependencies not available.\n{get_missing_requirements_message()}\n"
-        )
-
-    # On Linux without display, check if headless backends are available
-    logger.debug(f"_check_opengl_availability: platform={sys.platform}")
-    if sys.platform.startswith("linux"):
-        has_display = os.environ.get("DISPLAY") or os.environ.get("WAYLAND_DISPLAY")
-        logger.debug(f"_check_opengl_availability: has_display={bool(has_display)}")
-        if not has_display:
-            # Check for EGL or OSMesa libraries
-            logger.debug("_check_opengl_availability: checking for EGL library")
-            has_egl = ctypes.util.find_library("EGL")
-            logger.debug("_check_opengl_availability: checking for OSMesa library")
-            has_osmesa = ctypes.util.find_library("OSMesa")
-
-            # Error disabled for CI as it fails this check
-            # if not has_egl and not has_osmesa:
-            #     raise RuntimeError(
-            #         "GLSL Shader node: No display and no headless backend (EGL/OSMesa) found.\n"
-            #         "See error below for installation instructions."
-            #     )
-            logger.debug(f"Headless mode: EGL={'yes' if has_egl else 'no'}, OSMesa={'yes' if has_osmesa else 'no'}")
-
-    logger.debug("_check_opengl_availability: completed")
+    mode = 0 if sys.platform == "win32" else ctypes.RTLD_GLOBAL
+    ctypes.CDLL(str(egl_path), mode=mode)
+    ctypes.CDLL(str(gles_path), mode=mode)


-# Run early check at import time
-logger.debug("nodes_glsl: running _check_opengl_availability at import time")
-_check_opengl_availability()
-
-# OpenGL modules - initialized lazily when context is created
-gl = None
-glfw = None
-EGL = None
+# Pre-load ANGLE *before* any PyOpenGL import so that the EGL platform
+# plugin picks up ANGLE's libEGL / libGLESv2 instead of system libs.
+_preload_angle()
+os.environ.setdefault("PYOPENGL_PLATFORM", "egl")


-def _import_opengl():
-    """Import OpenGL module. Called after context is created."""
-    global gl
-    if gl is None:
-        logger.debug("_import_opengl: importing OpenGL.GL")
-        import OpenGL.GL as _gl
-        gl = _gl
-        logger.debug("_import_opengl: import completed")
-    return gl
+import OpenGL
+OpenGL.USE_ACCELERATE = False


+def _patch_find_library():
+    """PyOpenGL's EGL platform looks up 'EGL' and 'GLESv2' by short name via
+    ctypes.util.find_library, but ANGLE ships them as 'libEGL'/'libGLESv2'.
+    Patch find_library (non-Linux only) to return the ANGLE paths, as str
+    like find_library itself, so PyOpenGL loads the libraries we pre-loaded."""
+    if sys.platform == "linux":
+        return
+    import ctypes.util
+    _orig = ctypes.util.find_library
+    def _patched(name):
+        if name == 'EGL':
+            return str(comfy_angle.get_egl_path())
+        if name == 'GLESv2':
+            return str(comfy_angle.get_glesv2_path())
+        return _orig(name)
+    ctypes.util.find_library = _patched
+
+
+_patch_find_library()
+
+from OpenGL import EGL
+from OpenGL import GLES3 as gl
+
 class SizeModeInput(TypedDict):
    size_mode: str
    width: int
@@ -102,7 +85,7 @@ MAX_OUTPUTS = 4     # fragColor0-3 (MRT)
 # (-1,-1)---(3,-1)
 #
 # v_texCoord is computed from clip space: * 0.5 + 0.5 maps (-1,1) -> (0,1)
-VERTEX_SHADER = """#version 330 core
+VERTEX_SHADER = """#version 300 es
 out vec2 v_texCoord;
 void main() {
    vec2 verts[3] = vec2[](vec2(-1, -1), vec2(3, -1), vec2(-1, 3));
@@ -126,14 +109,99 @@ void main() {
 """


-def _convert_es_to_desktop(source: str) -> str:
-    """Convert GLSL ES (WebGL) shader source to desktop GLSL 330 core."""
-    # Remove any existing #version directive
-    source = re.sub(r"#version\s+\d+(\s+es)?\s*\n?", "", source, flags=re.IGNORECASE)
-    # Remove precision qualifiers (not needed in desktop GLSL)
-    source = re.sub(r"precision\s+(lowp|mediump|highp)\s+\w+\s*;\s*\n?", "", source)
-    # Prepend desktop GLSL version
-    return "#version 330 core\n" + source
+
+def _egl_attribs(*values):
+    """Pack ``values`` into a ctypes int32 array with EGL_NONE appended as the terminator."""
+    terminated = (*values, EGL.EGL_NONE)
+    return (ctypes.c_int32 * len(terminated))(*terminated)
+
+
+# EGL platform extension constants
+EGL_PLATFORM_ANGLE_ANGLE = 0x3202
+EGL_PLATFORM_ANGLE_TYPE_ANGLE = 0x3203
+EGL_PLATFORM_ANGLE_TYPE_VULKAN_ANGLE = 0x3450
+EGL_MESA_PLATFORM_SURFACELESS = 0x31DD
+
+
+_eglGetPlatformDisplayEXT = None
+
+def _get_egl_platform_display_ext(platform, native_display, attribs):
+    """Call the eglGetPlatformDisplayEXT extension via ctypes; returns an EGLDisplay, or None if the entry point or display is missing."""
+    global _eglGetPlatformDisplayEXT
+    if _eglGetPlatformDisplayEXT is None:
+        from OpenGL import platform as _plat
+        egl_lib = _plat.PLATFORM.EGL
+        _get_proc = egl_lib.eglGetProcAddress
+        _get_proc.restype = ctypes.c_void_p
+        _get_proc.argtypes = [ctypes.c_char_p]
+        ptr = _get_proc(b"eglGetPlatformDisplayEXT")
+        if not ptr:
+            return None
+        func_type = ctypes.CFUNCTYPE(ctypes.c_void_p, ctypes.c_uint32, ctypes.c_void_p, ctypes.c_void_p)
+        _eglGetPlatformDisplayEXT = func_type(ptr)
+    # The resolved entry point is cached in the module global; a failed lookup is retried on the next call.
+    raw = _eglGetPlatformDisplayEXT(platform, native_display, attribs)
+    if not raw:
+        return None
+    return ctypes.cast(raw, EGL.EGLDisplay)
+
+
+def _get_egl_display():
+    """Return (display, major, minor) for the first EGL display that initializes:
+    the default one, then the headless platforms. Raises RuntimeError if none do."""
+    failures = []
+    # Try the default display first (works when X11/Wayland is available)
+    display = EGL.eglGetDisplay(EGL.EGL_DEFAULT_DISPLAY)
+    if display:
+        major, minor = ctypes.c_int32(0), ctypes.c_int32(0)
+        try:
+            if EGL.eglInitialize(display, ctypes.byref(major), ctypes.byref(minor)):
+                return display, major.value, minor.value
+            failures.append("default: eglInitialize returned false")
+        except Exception as e:
+            failures.append(f"default: {e}")
+    else:
+        failures.append("default: eglGetDisplay returned no display")
+    logger.info("Default EGL display unavailable, trying headless fallbacks")
+
+    # Headless fallback strategies, tried in order:
+    headless_strategies = [
+        ("surfaceless", EGL_MESA_PLATFORM_SURFACELESS, None, None),
+        ("ANGLE Vulkan", EGL_PLATFORM_ANGLE_ANGLE, None,
+         _egl_attribs(EGL_PLATFORM_ANGLE_TYPE_ANGLE, EGL_PLATFORM_ANGLE_TYPE_VULKAN_ANGLE)),
+    ]
+
+    for name, platform, native_display, attribs in headless_strategies:
+        display = _get_egl_platform_display_ext(platform, native_display, attribs)
+        if not display:
+            failures.append(f"{name}: eglGetPlatformDisplayEXT returned no display")
+            continue
+        major, minor = ctypes.c_int32(0), ctypes.c_int32(0)
+        try:
+            if EGL.eglInitialize(display, ctypes.byref(major), ctypes.byref(minor)):
+                logger.info(f"Using EGL {name} platform (headless)")
+                return display, major.value, minor.value
+            failures.append(f"{name}: eglInitialize returned false")
+        except Exception as e:
+            failures.append(f"{name}: {e}")
+
+    details = "\n".join(f"  - {f}" for f in failures)
+    raise RuntimeError(
+        "Failed to initialize EGL display.\n"
+        "No display server and no headless EGL platform available.\n"
+        f"Tried:\n{details}\n"
+        "Ensure GPU drivers are installed or set DISPLAY for a virtual framebuffer."
+    )
+
+
+def _gl_str(name):
+    """Return glGetString(name) decoded as text, or "Unknown" when the result is falsy."""
+    raw = gl.glGetString(name)
+    if not raw:
+        return "Unknown"
+    if not isinstance(raw, bytes):
+        raw = ctypes.string_at(raw)
+    return raw.decode(errors="replace")


 def _detect_output_count(source: str) -> int:
@@ -159,163 +227,8 @@ def _detect_pass_count(source: str) -> int:
    return 1


-def _init_glfw():
-    """Initialize GLFW. Returns (window, glfw_module). Raises RuntimeError on failure."""
-    logger.debug("_init_glfw: starting")
-    # On macOS, glfw.init() must be called from main thread or it hangs forever
-    if sys.platform == "darwin":
-        logger.debug("_init_glfw: skipping on macOS")
-        raise RuntimeError("GLFW backend not supported on macOS")
-
-    logger.debug("_init_glfw: importing glfw module")
-    import glfw as _glfw
-
-    logger.debug("_init_glfw: calling glfw.init()")
-    if not _glfw.init():
-        raise RuntimeError("glfw.init() failed")
-
-    try:
-        logger.debug("_init_glfw: setting window hints")
-        _glfw.window_hint(_glfw.VISIBLE, _glfw.FALSE)
-        _glfw.window_hint(_glfw.CONTEXT_VERSION_MAJOR, 3)
-        _glfw.window_hint(_glfw.CONTEXT_VERSION_MINOR, 3)
-        _glfw.window_hint(_glfw.OPENGL_PROFILE, _glfw.OPENGL_CORE_PROFILE)
-
-        logger.debug("_init_glfw: calling create_window()")
-        window = _glfw.create_window(64, 64, "ComfyUI GLSL", None, None)
-        if not window:
-            raise RuntimeError("glfw.create_window() failed")
-
-        logger.debug("_init_glfw: calling make_context_current()")
-        _glfw.make_context_current(window)
-        logger.debug("_init_glfw: completed successfully")
-        return window, _glfw
-    except Exception:
-        logger.debug("_init_glfw: failed, terminating glfw")
-        _glfw.terminate()
-        raise
-
-
-def _init_egl():
-    """Initialize EGL for headless rendering. Returns (display, context, surface, EGL_module). Raises RuntimeError on failure."""
-    logger.debug("_init_egl: starting")
-    from OpenGL import EGL as _EGL
-    from OpenGL.EGL import (
-        eglGetDisplay, eglInitialize, eglChooseConfig, eglCreateContext,
-        eglMakeCurrent, eglCreatePbufferSurface, eglBindAPI,
-        eglTerminate, eglDestroyContext, eglDestroySurface,
-        EGL_DEFAULT_DISPLAY, EGL_NO_CONTEXT, EGL_NONE,
-        EGL_SURFACE_TYPE, EGL_PBUFFER_BIT, EGL_RENDERABLE_TYPE, EGL_OPENGL_BIT,
-        EGL_RED_SIZE, EGL_GREEN_SIZE, EGL_BLUE_SIZE, EGL_ALPHA_SIZE, EGL_DEPTH_SIZE,
-        EGL_WIDTH, EGL_HEIGHT, EGL_OPENGL_API,
-    )
-    logger.debug("_init_egl: imports completed")
-
-    display = None
-    context = None
-    surface = None
-
-    try:
-        logger.debug("_init_egl: calling eglGetDisplay()")
-        display = eglGetDisplay(EGL_DEFAULT_DISPLAY)
-        if display == _EGL.EGL_NO_DISPLAY:
-            raise RuntimeError("eglGetDisplay() failed")
-
-        logger.debug("_init_egl: calling eglInitialize()")
-        major, minor = _EGL.EGLint(), _EGL.EGLint()
-        if not eglInitialize(display, major, minor):
-            display = None  # Not initialized, don't terminate
-            raise RuntimeError("eglInitialize() failed")
-        logger.debug(f"_init_egl: EGL version {major.value}.{minor.value}")
-
-        config_attribs = [
-            EGL_SURFACE_TYPE, EGL_PBUFFER_BIT,
-            EGL_RENDERABLE_TYPE, EGL_OPENGL_BIT,
-            EGL_RED_SIZE, 8, EGL_GREEN_SIZE, 8, EGL_BLUE_SIZE, 8, EGL_ALPHA_SIZE, 8,
-            EGL_DEPTH_SIZE, 0, EGL_NONE
-        ]
-        configs = (_EGL.EGLConfig * 1)()
-        num_configs = _EGL.EGLint()
-        if not eglChooseConfig(display, config_attribs, configs, 1, num_configs) or num_configs.value == 0:
-            raise RuntimeError("eglChooseConfig() failed")
-        config = configs[0]
-        logger.debug(f"_init_egl: config chosen, num_configs={num_configs.value}")
-
-        if not eglBindAPI(EGL_OPENGL_API):
-            raise RuntimeError("eglBindAPI() failed")
-
-        logger.debug("_init_egl: calling eglCreateContext()")
-        context_attribs = [
-            _EGL.EGL_CONTEXT_MAJOR_VERSION, 3,
-            _EGL.EGL_CONTEXT_MINOR_VERSION, 3,
-            _EGL.EGL_CONTEXT_OPENGL_PROFILE_MASK, _EGL.EGL_CONTEXT_OPENGL_CORE_PROFILE_BIT,
-            EGL_NONE
-        ]
-        context = eglCreateContext(display, config, EGL_NO_CONTEXT, context_attribs)
-        if context == EGL_NO_CONTEXT:
-            raise RuntimeError("eglCreateContext() failed")
-
-        logger.debug("_init_egl: calling eglCreatePbufferSurface()")
-        pbuffer_attribs = [EGL_WIDTH, 64, EGL_HEIGHT, 64, EGL_NONE]
-        surface = eglCreatePbufferSurface(display, config, pbuffer_attribs)
-        if surface == _EGL.EGL_NO_SURFACE:
-            raise RuntimeError("eglCreatePbufferSurface() failed")
-
-        logger.debug("_init_egl: calling eglMakeCurrent()")
-        if not eglMakeCurrent(display, surface, surface, context):
-            raise RuntimeError("eglMakeCurrent() failed")
-
-        logger.debug("_init_egl: completed successfully")
-        return display, context, surface, _EGL
-
-    except Exception:
-        logger.debug("_init_egl: failed, cleaning up")
-        # Clean up any resources on failure
-        if surface is not None:
-            eglDestroySurface(display, surface)
-        if context is not None:
-            eglDestroyContext(display, context)
-        if display is not None:
-            eglTerminate(display)
-        raise
-
-
-def _init_osmesa():
-    """Initialize OSMesa for software rendering. Returns (context, buffer). Raises RuntimeError on failure."""
-    import ctypes
-
-    logger.debug("_init_osmesa: starting")
-    os.environ["PYOPENGL_PLATFORM"] = "osmesa"
-
-    logger.debug("_init_osmesa: importing OpenGL.osmesa")
-    from OpenGL import GL as _gl
-    from OpenGL.osmesa import (
-        OSMesaCreateContextExt, OSMesaMakeCurrent, OSMesaDestroyContext,
-        OSMESA_RGBA,
-    )
-    logger.debug("_init_osmesa: imports completed")
-
-    ctx = OSMesaCreateContextExt(OSMESA_RGBA, 24, 0, 0, None)
-    if not ctx:
-        raise RuntimeError("OSMesaCreateContextExt() failed")
-
-    width, height = 64, 64
-    buffer = (ctypes.c_ubyte * (width * height * 4))()
-
-    logger.debug("_init_osmesa: calling OSMesaMakeCurrent()")
-    if not OSMesaMakeCurrent(ctx, buffer, _gl.GL_UNSIGNED_BYTE, width, height):
-        OSMesaDestroyContext(ctx)
-        raise RuntimeError("OSMesaMakeCurrent() failed")
-
-    logger.debug("_init_osmesa: completed successfully")
-    return ctx, buffer
-
-
 class GLContext:
-    """Manages OpenGL context and resources for shader execution.
-
-    Tries backends in order: GLFW (desktop) → EGL (headless GPU) → OSMesa (software).
-    """
+    """Manages an OpenGL ES 3.0 context via EGL/ANGLE (singleton)."""

    _instance = None
    _initialized = False
@@ -327,131 +240,105 @@ class GLContext:

    def __init__(self):
        if GLContext._initialized:
-            logger.debug("GLContext.__init__: already initialized, skipping")
            return

-        logger.debug("GLContext.__init__: starting initialization")
-
-        global glfw, EGL
-
        import time
        start = time.perf_counter()

-        self._backend = None
-        self._window = None
-        self._egl_display = None
-        self._egl_context = None
-        self._egl_surface = None
-        self._osmesa_ctx = None
-        self._osmesa_buffer = None
+        self._display = None
+        self._surface = None
+        self._context = None
        self._vao = None

-        # Try backends in order: GLFW → EGL → OSMesa
-        errors = []
-
-        logger.debug("GLContext.__init__: trying GLFW backend")
        try:
-            self._window, glfw = _init_glfw()
-            self._backend = "glfw"
-            logger.debug("GLContext.__init__: GLFW backend succeeded")
-        except Exception as e:
-            logger.debug(f"GLContext.__init__: GLFW backend failed: {e}")
-            errors.append(("GLFW", e))
+            self._display, self._egl_major, self._egl_minor = _get_egl_display()

-        if self._backend is None:
-            logger.debug("GLContext.__init__: trying EGL backend")
-            try:
-                self._egl_display, self._egl_context, self._egl_surface, EGL = _init_egl()
-                self._backend = "egl"
-                logger.debug("GLContext.__init__: EGL backend succeeded")
-            except Exception as e:
-                logger.debug(f"GLContext.__init__: EGL backend failed: {e}")
-                errors.append(("EGL", e))
+            if not EGL.eglBindAPI(EGL.EGL_OPENGL_ES_API):
+                raise RuntimeError("eglBindAPI(EGL_OPENGL_ES_API) failed")

-        if self._backend is None:
-            logger.debug("GLContext.__init__: trying OSMesa backend")
-            try:
-                self._osmesa_ctx, self._osmesa_buffer = _init_osmesa()
-                self._backend = "osmesa"
-                logger.debug("GLContext.__init__: OSMesa backend succeeded")
-            except Exception as e:
-                logger.debug(f"GLContext.__init__: OSMesa backend failed: {e}")
-                errors.append(("OSMesa", e))
+            config = EGL.EGLConfig()
+            n_configs = ctypes.c_int32(0)
+            if not EGL.eglChooseConfig(
+                self._display,
+                _egl_attribs(
+                    EGL.EGL_RENDERABLE_TYPE, EGL.EGL_OPENGL_ES3_BIT,
+                    EGL.EGL_SURFACE_TYPE, EGL.EGL_PBUFFER_BIT,
+                    EGL.EGL_RED_SIZE, 8, EGL.EGL_GREEN_SIZE, 8,
+                    EGL.EGL_BLUE_SIZE, 8, EGL.EGL_ALPHA_SIZE, 8,
+                ),
+                ctypes.byref(config), 1, ctypes.byref(n_configs),
+            ) or n_configs.value == 0:
+                raise RuntimeError("eglChooseConfig() failed")

-        if self._backend is None:
-            if sys.platform == "win32":
-                platform_help = (
-                    "Windows: Ensure GPU drivers are installed and display is available.\n"
-                    "         CPU-only/headless mode is not supported on Windows."
-                )
-            elif sys.platform == "darwin":
-                platform_help = (
-                    "macOS: GLFW is not supported.\n"
-                    "  Install OSMesa via Homebrew: brew install mesa\n"
-                    "  Then: pip install PyOpenGL PyOpenGL-accelerate"
-                )
-            else:
-                platform_help = (
-                    "Linux: Install one of these backends:\n"
-                    "  Desktop:           sudo apt install libgl1-mesa-glx libglfw3\n"
-                    "  Headless with GPU: sudo apt install libegl1-mesa libgl1-mesa-dri\n"
-                    "  Headless (CPU):    sudo apt install libosmesa6"
-                )
-
-            error_details = "\n".join(f"  {name}: {err}" for name, err in errors)
-            raise RuntimeError(
-                f"Failed to create OpenGL context.\n\n"
-                f"Backend errors:\n{error_details}\n\n"
-                f"{platform_help}"
+            self._surface = EGL.eglCreatePbufferSurface(
+                self._display, config,
+                _egl_attribs(EGL.EGL_WIDTH, 64, EGL.EGL_HEIGHT, 64),
            )
+            if not self._surface:
+                raise RuntimeError("eglCreatePbufferSurface() failed")

-        # Now import OpenGL.GL (after context is current)
-        logger.debug("GLContext.__init__: importing OpenGL.GL")
-        _import_opengl()
+            self._context = EGL.eglCreateContext(
+                self._display, config, EGL.EGL_NO_CONTEXT,
+                _egl_attribs(EGL.EGL_CONTEXT_CLIENT_VERSION, 3),
+            )
+            if not self._context:
+                raise RuntimeError("eglCreateContext() failed")

-        # Create VAO (required for core profile, but OSMesa may use compat profile)
-        logger.debug("GLContext.__init__: creating VAO")
-        try:
-            vao = gl.glGenVertexArrays(1)
-            gl.glBindVertexArray(vao)
-            self._vao = vao  # Only store after successful bind
-            logger.debug("GLContext.__init__: VAO created successfully")
-        except Exception as e:
-            logger.debug(f"GLContext.__init__: VAO creation failed (may be expected for OSMesa): {e}")
-            # OSMesa with older Mesa may not support VAOs
-            # Clean up if we created but couldn't bind
-            if vao:
-                try:
-                    gl.glDeleteVertexArrays(1, [vao])
-                except Exception:
-                    pass
+            if not EGL.eglMakeCurrent(self._display, self._surface, self._surface, self._context):
+                raise RuntimeError("eglMakeCurrent() failed")
+
+            self._vao = gl.glGenVertexArrays(1)
+            gl.glBindVertexArray(self._vao)
+
+        except Exception:
+            self._cleanup()
+            raise

        elapsed = (time.perf_counter() - start) * 1000

-        # Log device info
-        renderer = gl.glGetString(gl.GL_RENDERER)
-        vendor = gl.glGetString(gl.GL_VENDOR)
-        version = gl.glGetString(gl.GL_VERSION)
-        renderer = renderer.decode() if renderer else "Unknown"
-        vendor = vendor.decode() if vendor else "Unknown"
-        version = version.decode() if version else "Unknown"
+        renderer = _gl_str(gl.GL_RENDERER)
+        vendor = _gl_str(gl.GL_VENDOR)
+        version = _gl_str(gl.GL_VERSION)

        GLContext._initialized = True
-        logger.info(f"GLSL context initialized in {elapsed:.1f}ms ({self._backend}) - {renderer} ({vendor}), GL {version}")
+        logger.info(f"GLSL context initialized in {elapsed:.1f}ms - EGL {self._egl_major}.{self._egl_minor}, {renderer} ({vendor}), GL {version}")

    def make_current(self):
-        if self._backend == "glfw":
-            glfw.make_context_current(self._window)
-        elif self._backend == "egl":
-            from OpenGL.EGL import eglMakeCurrent
-            eglMakeCurrent(self._egl_display, self._egl_surface, self._egl_surface, self._egl_context)
-        elif self._backend == "osmesa":
-            from OpenGL.osmesa import OSMesaMakeCurrent
-            OSMesaMakeCurrent(self._osmesa_ctx, self._osmesa_buffer, gl.GL_UNSIGNED_BYTE, 64, 64)
-
+        if not EGL.eglMakeCurrent(self._display, self._surface, self._surface, self._context):
+            err = EGL.eglGetError()
+            raise RuntimeError(f"eglMakeCurrent() failed (EGL error: 0x{err:04X})")
        if self._vao is not None:
            gl.glBindVertexArray(self._vao)

+    def _cleanup(self):
+        """Best-effort teardown; does nothing without a display. Each step's errors are swallowed so later steps still run."""
+        display = self._display
+        if not display:
+            return
+
+        def _delete_vao():
+            if self._vao is not None:
+                gl.glDeleteVertexArrays(1, [self._vao])
+                self._vao = None
+
+        def _unbind():
+            EGL.eglMakeCurrent(display, EGL.EGL_NO_SURFACE, EGL.EGL_NO_SURFACE, EGL.EGL_NO_CONTEXT)
+
+        def _destroy_context():
+            if self._context:
+                EGL.eglDestroyContext(display, self._context)
+
+        def _destroy_surface():
+            if self._surface:
+                EGL.eglDestroySurface(display, self._surface)
+
+        for step in (_delete_vao, _unbind, _destroy_context, _destroy_surface, lambda: EGL.eglTerminate(display)):
+            try:
+                step()
+            except Exception:
+                pass
+        self._display = None
+

 def _compile_shader(source: str, shader_type: int) -> int:
    """Compile a shader and return its ID."""
@@ -459,8 +346,10 @@ def _compile_shader(source: str, shader_type: int) -> int:
    gl.glShaderSource(shader, source)
    gl.glCompileShader(shader)

-    if gl.glGetShaderiv(shader, gl.GL_COMPILE_STATUS) != gl.GL_TRUE:
-        error = gl.glGetShaderInfoLog(shader).decode()
+    if not gl.glGetShaderiv(shader, gl.GL_COMPILE_STATUS):
+        error = gl.glGetShaderInfoLog(shader)
+        if isinstance(error, bytes):
+            error = error.decode(errors="replace")
        gl.glDeleteShader(shader)
        raise RuntimeError(f"Shader compilation failed:\n{error}")

@@ -484,8 +373,10 @@ def _create_program(vertex_source: str, fragment_source: str) -> int:
    gl.glDeleteShader(vertex_shader)
    gl.glDeleteShader(fragment_shader)

-    if gl.glGetProgramiv(program, gl.GL_LINK_STATUS) != gl.GL_TRUE:
-        error = gl.glGetProgramInfoLog(program).decode()
+    if not gl.glGetProgramiv(program, gl.GL_LINK_STATUS):
+        error = gl.glGetProgramInfoLog(program)
+        if isinstance(error, bytes):
+            error = error.decode(errors="replace")
        gl.glDeleteProgram(program)
        raise RuntimeError(f"Program linking failed:\n{error}")

@@ -530,9 +421,6 @@ def _render_shader_batch(
    ctx = GLContext()
    ctx.make_current()

-    # Convert from GLSL ES to desktop GLSL 330
-    fragment_source = _convert_es_to_desktop(fragment_code)
-
    # Detect how many outputs the shader actually uses
    num_outputs = _detect_output_count(fragment_code)

@@ -558,9 +446,9 @@ def _render_shader_batch(
    try:
        # Compile shaders (once for all batches)
        try:
-            program = _create_program(VERTEX_SHADER, fragment_source)
+            program = _create_program(VERTEX_SHADER, fragment_code)
        except RuntimeError:
-            logger.error(f"Fragment shader:\n{fragment_source}")
+            logger.error(f"Fragment shader:\n{fragment_code}")
            raise

        gl.glUseProgram(program)
@@ -723,13 +611,13 @@ def _render_shader_batch(
                    gl.glDrawArrays(gl.GL_TRIANGLES, 0, 3)

            # Read back outputs for this batch
-            # (glGetTexImage is synchronous, implicitly waits for rendering)
+            gl.glBindFramebuffer(gl.GL_FRAMEBUFFER, fbo)
            batch_outputs = []
-            for tex in output_textures:
-                gl.glBindTexture(gl.GL_TEXTURE_2D, tex)
-                data = gl.glGetTexImage(gl.GL_TEXTURE_2D, 0, gl.GL_RGBA, gl.GL_FLOAT)
-                img = np.frombuffer(data, dtype=np.float32).reshape(height, width, 4)
-                batch_outputs.append(img[::-1, :, :].copy())
+            for i in range(num_outputs):
+                gl.glReadBuffer(gl.GL_COLOR_ATTACHMENT0 + i)
+                buf = np.empty((height, width, 4), dtype=np.float32)
+                gl.glReadPixels(0, 0, width, height, gl.GL_RGBA, gl.GL_FLOAT, buf)
+                batch_outputs.append(buf[::-1, :, :].copy())

            # Pad with black images for unused outputs
            black_img = np.zeros((height, width, 4), dtype=np.float32)
@@ -750,18 +638,18 @@ def _render_shader_batch(
        gl.glBindFramebuffer(gl.GL_FRAMEBUFFER, 0)
        gl.glUseProgram(0)

-        for tex in input_textures:
-            gl.glDeleteTextures(int(tex))
-        for tex in curve_textures:
-            gl.glDeleteTextures(int(tex))
-        for tex in output_textures:
-            gl.glDeleteTextures(int(tex))
-        for tex in ping_pong_textures:
-            gl.glDeleteTextures(int(tex))
+        if input_textures:
+            gl.glDeleteTextures(len(input_textures), input_textures)
+        if curve_textures:
+            gl.glDeleteTextures(len(curve_textures), curve_textures)
+        if output_textures:
+            gl.glDeleteTextures(len(output_textures), output_textures)
+        if ping_pong_textures:
+            gl.glDeleteTextures(len(ping_pong_textures), ping_pong_textures)
        if fbo is not None:
            gl.glDeleteFramebuffers(1, [fbo])
-        for pp_fbo in ping_pong_fbos:
-            gl.glDeleteFramebuffers(1, [pp_fbo])
+        if ping_pong_fbos:
+            gl.glDeleteFramebuffers(len(ping_pong_fbos), ping_pong_fbos)
        if program is not None:
            gl.glDeleteProgram(program)

--- a/comfy_extras/nodes_rtdetr.py
+++ b/comfy_extras/nodes_rtdetr.py
@@ -32,12 +32,10 @@ class RTDETR_detect(io.ComfyNode):
    def execute(cls, model, image, threshold, class_name, max_detections) -> io.NodeOutput:
        B, H, W, C = image.shape

+        image_in = comfy.utils.common_upscale(image.movedim(-1, 1), 640, 640, "bilinear", crop="disabled")
+
        comfy.model_management.load_model_gpu(model)
-        results = []
-        for i in range(0, B, 32):
-            batch = image[i:i + 32]
-            image_in = comfy.utils.common_upscale(batch.movedim(-1, 1), 640, 640, "bilinear", crop="disabled")
-            results.extend(model.model.diffusion_model(image_in, (W, H)))
+        results = model.model.diffusion_model(image_in, (W, H))  # list of B dicts

        all_bbox_dicts = []

--- a/comfy_extras/nodes_sdpose.py
+++ b/comfy_extras/nodes_sdpose.py
@@ -1,6 +1,5 @@
 import torch
 import comfy.utils
-import comfy.model_management
 import numpy as np
 import math
 import colorsys
@@ -411,9 +410,7 @@ class SDPoseDrawKeypoints(io.ComfyNode):
            pose_outputs.append(canvas)

        pose_outputs_np = np.stack(pose_outputs) if len(pose_outputs) > 1 else np.expand_dims(pose_outputs[0], 0)
-        final_pose_output = torch.from_numpy(pose_outputs_np).to(
-            device=comfy.model_management.intermediate_device(),
-            dtype=comfy.model_management.intermediate_dtype()) / 255.0
+        final_pose_output = torch.from_numpy(pose_outputs_np).float() / 255.0
        return io.NodeOutput(final_pose_output)

 class SDPoseKeypointExtractor(io.ComfyNode):
@@ -462,27 +459,6 @@ class SDPoseKeypointExtractor(io.ComfyNode):
        model_h = int(head.heatmap_size[0]) * 4   # e.g. 192 * 4 = 768
        model_w = int(head.heatmap_size[1]) * 4   # e.g. 256 * 4 = 1024

-        def _resize_to_model(imgs):
-            """Aspect-preserving resize + zero-pad BHWC images to (model_h, model_w). Returns (resized_bhwc, scale, pad_top, pad_left)."""
-            h, w = imgs.shape[-3], imgs.shape[-2]
-            scale = min(model_h / h, model_w / w)
-            sh, sw = int(round(h * scale)), int(round(w * scale))
-            pt, pl = (model_h - sh) // 2, (model_w - sw) // 2
-            chw = imgs.permute(0, 3, 1, 2).float()
-            scaled = comfy.utils.common_upscale(chw, sw, sh, upscale_method="bilinear", crop="disabled")
-            padded = torch.zeros(scaled.shape[0], scaled.shape[1], model_h, model_w, dtype=scaled.dtype, device=scaled.device)
-            padded[:, :, pt:pt + sh, pl:pl + sw] = scaled
-            return padded.permute(0, 2, 3, 1), scale, pt, pl
-
-        def _remap_keypoints(kp, scale, pad_top, pad_left, offset_x=0, offset_y=0):
-            """Remap keypoints from model space back to original image space."""
-            kp = kp.copy() if isinstance(kp, np.ndarray) else np.array(kp, dtype=np.float32)
-            invalid = kp[..., 0] < 0
-            kp[..., 0] = (kp[..., 0] - pad_left) / scale + offset_x
-            kp[..., 1] = (kp[..., 1] - pad_top)  / scale + offset_y
-            kp[invalid] = -1
-            return kp
-
        def _run_on_latent(latent_batch):
            """Run one forward pass and return (keypoints_list, scores_list) for the batch."""
            nonlocal captured_feat
@@ -528,19 +504,36 @@ class SDPoseKeypointExtractor(io.ComfyNode):
                        if x2 <= x1 or y2 <= y1:
                            continue

+                        crop_h_px, crop_w_px = y2 - y1, x2 - x1
                        crop = img[:, y1:y2, x1:x2, :]  # (1, crop_h, crop_w, C)
-                        crop_resized, scale, pad_top, pad_left = _resize_to_model(crop)
+
+                        # scale to fit inside (model_h, model_w) while preserving aspect ratio, then pad to exact model size.
+                        scale = min(model_h / crop_h_px, model_w / crop_w_px)
+                        scaled_h, scaled_w = int(round(crop_h_px * scale)), int(round(crop_w_px * scale))
+                        pad_top, pad_left  = (model_h - scaled_h) // 2, (model_w - scaled_w) // 2
+
+                        crop_chw = crop.permute(0, 3, 1, 2).float()  # BHWC → BCHW
+                        scaled = comfy.utils.common_upscale(crop_chw, scaled_w, scaled_h, upscale_method="bilinear", crop="disabled")
+                        padded = torch.zeros(1, scaled.shape[1], model_h, model_w, dtype=scaled.dtype, device=scaled.device)
+                        padded[:, :, pad_top:pad_top + scaled_h, pad_left:pad_left + scaled_w] = scaled
+                        crop_resized = padded.permute(0, 2, 3, 1)  # BCHW → BHWC

                        latent_crop = vae.encode(crop_resized)
                        kp_batch, sc_batch = _run_on_latent(latent_crop)
-                        kp = _remap_keypoints(kp_batch[0], scale, pad_top, pad_left, x1, y1)
+                        kp, sc = kp_batch[0], sc_batch[0]  # (K, 2), coords in model pixel space
+
+                        # remove padding offset, undo scale, offset to full-image coordinates.
+                        kp = kp.copy() if isinstance(kp, np.ndarray) else np.array(kp, dtype=np.float32)
+                        kp[..., 0] = (kp[..., 0] - pad_left) / scale + x1
+                        kp[..., 1] = (kp[..., 1] - pad_top)  / scale + y1
+
                        img_keypoints.append(kp)
-                        img_scores.append(sc_batch[0])
+                        img_scores.append(sc)
                else:
-                    img_resized, scale, pad_top, pad_left = _resize_to_model(img)
-                    latent_img = vae.encode(img_resized)
+                    # No bboxes for this image – run on the full image
+                    latent_img = vae.encode(img)
                    kp_batch, sc_batch = _run_on_latent(latent_img)
-                    img_keypoints.append(_remap_keypoints(kp_batch[0], scale, pad_top, pad_left))
+                    img_keypoints.append(kp_batch[0])
                    img_scores.append(sc_batch[0])

                all_keypoints.append(img_keypoints)
@@ -548,16 +541,19 @@ class SDPoseKeypointExtractor(io.ComfyNode):
                pbar.update(1)

        else: # full-image mode, batched
-            for batch_start in tqdm(range(0, total_images, batch_size), desc="Extracting keypoints"):
-                batch_resized, scale, pad_top, pad_left = _resize_to_model(image[batch_start:batch_start + batch_size])
-                latent_batch = vae.encode(batch_resized)
+            tqdm_pbar = tqdm(total=total_images, desc="Extracting keypoints")
+            for batch_start in range(0, total_images, batch_size):
+                batch_end = min(batch_start + batch_size, total_images)
+                latent_batch = vae.encode(image[batch_start:batch_end])
+
                kp_batch, sc_batch = _run_on_latent(latent_batch)

                for kp, sc in zip(kp_batch, sc_batch):
-                    all_keypoints.append([_remap_keypoints(kp, scale, pad_top, pad_left)])
+                    all_keypoints.append([kp])
                    all_scores.append([sc])
+                    tqdm_pbar.update(1)

-                pbar.update(len(kp_batch))
+                pbar.update(batch_end - batch_start)

        openpose_frames = _to_openpose_frames(all_keypoints, all_scores, height, width)
        return io.NodeOutput(openpose_frames)
--- a/comfy_extras/nodes_upscale_model.py
+++ b/comfy_extras/nodes_upscale_model.py
@@ -6,7 +6,6 @@ import comfy.utils
 import folder_paths
 from typing_extensions import override
 from comfy_api.latest import ComfyExtension, io
-import comfy.model_management

 try:
    from spandrel_extra_arches import EXTRA_REGISTRY
@@ -79,15 +78,13 @@ class ImageUpscaleWithModel(io.ComfyNode):
        tile = 512
        overlap = 32

-        output_device = comfy.model_management.intermediate_device()
-
        oom = True
        try:
            while oom:
                try:
                    steps = in_img.shape[0] * comfy.utils.get_tiled_scale_steps(in_img.shape[3], in_img.shape[2], tile_x=tile, tile_y=tile, overlap=overlap)
                    pbar = comfy.utils.ProgressBar(steps)
-                    s = comfy.utils.tiled_scale(in_img, lambda a: upscale_model(a.float()), tile_x=tile, tile_y=tile, overlap=overlap, upscale_amount=upscale_model.scale, pbar=pbar, output_device=output_device)
+                    s = comfy.utils.tiled_scale(in_img, lambda a: upscale_model(a), tile_x=tile, tile_y=tile, overlap=overlap, upscale_amount=upscale_model.scale, pbar=pbar)
                    oom = False
                except Exception as e:
                    model_management.raise_non_oom(e)
@@ -97,7 +94,7 @@ class ImageUpscaleWithModel(io.ComfyNode):
        finally:
            upscale_model.to("cpu")

-        s = torch.clamp(s.movedim(-3,-1), min=0, max=1.0).to(comfy.model_management.intermediate_dtype())
+        s = torch.clamp(s.movedim(-3,-1), min=0, max=1.0)
        return io.NodeOutput(s)

    upscale = execute  # TODO: remove
--- a/comfyui_version.py
+++ b/comfyui_version.py
@@ -1,3 +1,3 @@
 # This file is automatically generated by the build process when version is
 # updated in pyproject.toml.
-__version__ = "0.19.0"
+__version__ = "0.18.1"
--- a/nodes.py
+++ b/nodes.py
@@ -2457,7 +2457,7 @@ async def init_builtin_extra_nodes():
        "nodes_number_convert.py",
        "nodes_painter.py",
        "nodes_curve.py",
-        "nodes_rtdetr.py",
+        "nodes_rtdetr.py"
    ]

    import_failed = []
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "ComfyUI"
-version = "0.19.0"
+version = "0.18.1"
 readme = "README.md"
 license = { file = "LICENSE" }
 requires-python = ">=3.10"
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
-comfyui-frontend-package==1.42.10
-comfyui-workflow-templates==0.9.50
+comfyui-frontend-package==1.42.8
+comfyui-workflow-templates==0.9.44
 comfyui-embedded-docs==0.4.3
 torch
 torchsde
@@ -33,5 +33,5 @@ kornia>=0.7.1
 spandrel
 pydantic~=2.0
 pydantic-settings~=2.0
-PyOpenGL
-glfw
+PyOpenGL>=3.1.8
+comfy-angle
Author	SHA1	Message	Date
pythongosssss	fd5acc96a4	add support for headless	2026-04-08 02:36:28 -07:00
pythongosssss	ee600a3cce	Merge branch 'master' into pysssss/angle-glsl	2026-04-07 10:16:31 +01:00
pythongosssss	8114516ee6	Merge remote-tracking branch 'origin/master' into pysssss/angle-glsl # Conflicts: # comfy_extras/nodes_glsl.py	2026-03-27 09:05:42 -07:00
pythongosssss	3eb624ce6c	add comfy-angle req	2026-03-26 07:37:19 -07:00
pythongosssss	54ff5464bd	fix for mac	2026-03-24 12:10:03 -07:00
pythongosssss	333ff2e8a0	Merge remote-tracking branch 'origin/master' into pysssss/angle-glsl	2026-03-24 11:02:39 -07:00
pythongosssss	c821d8ee2a	add library override for windows, forcing EGL/GLESv2 -> libEGL/libGLESv2	2026-03-23 06:50:29 -07:00
pythongosssss	27b6f8a927	fix	2026-03-23 04:48:27 -07:00
pythongosssss	9ad848bd59	unused	2026-03-19 08:08:21 -07:00
pythongosssss	efe6439ad0	update to use comfy_angle lib	2026-03-19 08:08:07 -07:00
pythongosssss	8d76bb94fd	angle	2026-03-17 02:51:47 -07:00