Compare commits

...

10 Commits

Author SHA1 Message Date
Terry Jia
4fe06a265c Make Load3D model_file optional by adding "none" choice 2026-04-12 21:07:35 -04:00
comfyanonymous
31283d2892 Implement Ernie Image model. (#13369) 2026-04-11 22:29:31 -04:00
comfyanonymous
55ebd287ee Add a supports_fp64 function. (#13368) 2026-04-11 21:06:36 -04:00
comfyanonymous
a2840e7552 Make ImageUpscaleWithModel node work with intermediate device and dtype. (#13357) 2026-04-10 21:48:26 -04:00
Jukka Seppänen
a134423890 SDPose: resize input always (#13349) 2026-04-10 11:26:55 -10:00
Daxiong (Lin)
b920bdd77d chore: update workflow templates to v0.9.45 (#13353) 2026-04-10 15:50:40 -04:00
Alexander Piskun
5410ed34f5 fix(api-nodes): fix GrokVideoReferenceNode price badge (#13354) 2026-04-10 08:01:15 -10:00
Terry Jia
e6be419a30 should use 0 as defalut for brightness (#13345) 2026-04-09 21:58:05 -04:00
comfyanonymous
3d4aca8084 Bump comfyui-frontend-package version to 1.42.10 (#13346) 2026-04-09 21:56:49 -04:00
comfyanonymous
2d861fb146 Basic intel standalone package .bat (#13333) 2026-04-08 21:39:29 -04:00
20 changed files with 510 additions and 50 deletions

View File

@@ -0,0 +1,2 @@
.\python_embeded\python.exe -s ComfyUI\main.py --windows-standalone-build
pause

View File

@@ -182,7 +182,7 @@
]
},
"widgets_values": [
50
0
]
},
{

View File

@@ -316,7 +316,7 @@
"step": 1
},
"widgets_values": [
30
0
]
},
{

303
comfy/ldm/ernie/model.py Normal file
View File

@@ -0,0 +1,303 @@
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from comfy.ldm.modules.attention import optimized_attention
import comfy.model_management
def rope(pos: torch.Tensor, dim: int, theta: int) -> torch.Tensor:
assert dim % 2 == 0
if not comfy.model_management.supports_fp64(pos.device):
device = torch.device("cpu")
else:
device = pos.device
scale = torch.arange(0, dim, 2, dtype=torch.float64, device=device) / dim
omega = 1.0 / (theta**scale)
out = torch.einsum("...n,d->...nd", pos, omega)
out = torch.stack([torch.cos(out), torch.sin(out)], dim=0)
return out.to(dtype=torch.float32, device=pos.device)
def apply_rotary_emb(x_in: torch.Tensor, freqs_cis: torch.Tensor) -> torch.Tensor:
rot_dim = freqs_cis.shape[-1]
x, x_pass = x_in[..., :rot_dim], x_in[..., rot_dim:]
cos_ = freqs_cis[0]
sin_ = freqs_cis[1]
x1, x2 = x.chunk(2, dim=-1)
x_rotated = torch.cat((-x2, x1), dim=-1)
return torch.cat((x * cos_ + x_rotated * sin_, x_pass), dim=-1)
class ErnieImageEmbedND3(nn.Module):
def __init__(self, dim: int, theta: int, axes_dim: tuple):
super().__init__()
self.dim = dim
self.theta = theta
self.axes_dim = list(axes_dim)
def forward(self, ids: torch.Tensor) -> torch.Tensor:
emb = torch.cat([rope(ids[..., i], self.axes_dim[i], self.theta) for i in range(3)], dim=-1)
emb = emb.unsqueeze(3) # [2, B, S, 1, head_dim//2]
return torch.stack([emb, emb], dim=-1).reshape(*emb.shape[:-1], -1) # [B, S, 1, head_dim]
class ErnieImagePatchEmbedDynamic(nn.Module):
def __init__(self, in_channels: int, embed_dim: int, patch_size: int, operations, device=None, dtype=None):
super().__init__()
self.patch_size = patch_size
self.proj = operations.Conv2d(in_channels, embed_dim, kernel_size=patch_size, stride=patch_size, bias=True, device=device, dtype=dtype)
def forward(self, x: torch.Tensor) -> torch.Tensor:
x = self.proj(x)
batch_size, dim, height, width = x.shape
return x.reshape(batch_size, dim, height * width).transpose(1, 2).contiguous()
class Timesteps(nn.Module):
def __init__(self, num_channels: int, flip_sin_to_cos: bool = False):
super().__init__()
self.num_channels = num_channels
self.flip_sin_to_cos = flip_sin_to_cos
def forward(self, timesteps: torch.Tensor) -> torch.Tensor:
half_dim = self.num_channels // 2
exponent = -math.log(10000) * torch.arange(half_dim, dtype=torch.float32, device=timesteps.device) / half_dim
emb = torch.exp(exponent)
emb = timesteps[:, None].float() * emb[None, :]
if self.flip_sin_to_cos:
emb = torch.cat([torch.cos(emb), torch.sin(emb)], dim=-1)
else:
emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=-1)
return emb
class TimestepEmbedding(nn.Module):
def __init__(self, in_channels: int, time_embed_dim: int, operations, device=None, dtype=None):
super().__init__()
Linear = operations.Linear
self.linear_1 = Linear(in_channels, time_embed_dim, bias=True, device=device, dtype=dtype)
self.act = nn.SiLU()
self.linear_2 = Linear(time_embed_dim, time_embed_dim, bias=True, device=device, dtype=dtype)
def forward(self, sample: torch.Tensor) -> torch.Tensor:
sample = self.linear_1(sample)
sample = self.act(sample)
sample = self.linear_2(sample)
return sample
class ErnieImageAttention(nn.Module):
def __init__(self, query_dim: int, heads: int, dim_head: int, eps: float = 1e-6, operations=None, device=None, dtype=None):
super().__init__()
self.heads = heads
self.head_dim = dim_head
self.inner_dim = heads * dim_head
Linear = operations.Linear
RMSNorm = operations.RMSNorm
self.to_q = Linear(query_dim, self.inner_dim, bias=False, device=device, dtype=dtype)
self.to_k = Linear(query_dim, self.inner_dim, bias=False, device=device, dtype=dtype)
self.to_v = Linear(query_dim, self.inner_dim, bias=False, device=device, dtype=dtype)
self.norm_q = RMSNorm(dim_head, eps=eps, elementwise_affine=True, device=device, dtype=dtype)
self.norm_k = RMSNorm(dim_head, eps=eps, elementwise_affine=True, device=device, dtype=dtype)
self.to_out = nn.ModuleList([Linear(self.inner_dim, query_dim, bias=False, device=device, dtype=dtype)])
def forward(self, x: torch.Tensor, attention_mask: torch.Tensor = None, image_rotary_emb: torch.Tensor = None) -> torch.Tensor:
B, S, _ = x.shape
q_flat = self.to_q(x)
k_flat = self.to_k(x)
v_flat = self.to_v(x)
query = q_flat.view(B, S, self.heads, self.head_dim)
key = k_flat.view(B, S, self.heads, self.head_dim)
query = self.norm_q(query)
key = self.norm_k(key)
if image_rotary_emb is not None:
query = apply_rotary_emb(query, image_rotary_emb)
key = apply_rotary_emb(key, image_rotary_emb)
query, key = query.to(x.dtype), key.to(x.dtype)
q_flat = query.reshape(B, S, -1)
k_flat = key.reshape(B, S, -1)
hidden_states = optimized_attention(q_flat, k_flat, v_flat, self.heads, mask=attention_mask)
return self.to_out[0](hidden_states)
class ErnieImageFeedForward(nn.Module):
def __init__(self, hidden_size: int, ffn_hidden_size: int, operations, device=None, dtype=None):
super().__init__()
Linear = operations.Linear
self.gate_proj = Linear(hidden_size, ffn_hidden_size, bias=False, device=device, dtype=dtype)
self.up_proj = Linear(hidden_size, ffn_hidden_size, bias=False, device=device, dtype=dtype)
self.linear_fc2 = Linear(ffn_hidden_size, hidden_size, bias=False, device=device, dtype=dtype)
def forward(self, x: torch.Tensor) -> torch.Tensor:
return self.linear_fc2(self.up_proj(x) * F.gelu(self.gate_proj(x)))
class ErnieImageSharedAdaLNBlock(nn.Module):
def __init__(self, hidden_size: int, num_heads: int, ffn_hidden_size: int, eps: float = 1e-6, operations=None, device=None, dtype=None):
super().__init__()
RMSNorm = operations.RMSNorm
self.adaLN_sa_ln = RMSNorm(hidden_size, eps=eps, device=device, dtype=dtype)
self.self_attention = ErnieImageAttention(
query_dim=hidden_size,
dim_head=hidden_size // num_heads,
heads=num_heads,
eps=eps,
operations=operations,
device=device,
dtype=dtype
)
self.adaLN_mlp_ln = RMSNorm(hidden_size, eps=eps, device=device, dtype=dtype)
self.mlp = ErnieImageFeedForward(hidden_size, ffn_hidden_size, operations=operations, device=device, dtype=dtype)
def forward(self, x, rotary_pos_emb, temb, attention_mask=None):
shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = temb
residual = x
x_norm = self.adaLN_sa_ln(x)
x_norm = (x_norm.float() * (1 + scale_msa.float()) + shift_msa.float()).to(x.dtype)
attn_out = self.self_attention(x_norm, attention_mask=attention_mask, image_rotary_emb=rotary_pos_emb)
x = residual + (gate_msa.float() * attn_out.float()).to(x.dtype)
residual = x
x_norm = self.adaLN_mlp_ln(x)
x_norm = (x_norm.float() * (1 + scale_mlp.float()) + shift_mlp.float()).to(x.dtype)
return residual + (gate_mlp.float() * self.mlp(x_norm).float()).to(x.dtype)
class ErnieImageAdaLNContinuous(nn.Module):
def __init__(self, hidden_size: int, eps: float = 1e-6, operations=None, device=None, dtype=None):
super().__init__()
LayerNorm = operations.LayerNorm
Linear = operations.Linear
self.norm = LayerNorm(hidden_size, elementwise_affine=False, eps=eps, device=device, dtype=dtype)
self.linear = Linear(hidden_size, hidden_size * 2, device=device, dtype=dtype)
def forward(self, x: torch.Tensor, conditioning: torch.Tensor) -> torch.Tensor:
scale, shift = self.linear(conditioning).chunk(2, dim=-1)
x = self.norm(x)
x = x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
return x
class ErnieImageModel(nn.Module):
def __init__(
self,
hidden_size: int = 4096,
num_attention_heads: int = 32,
num_layers: int = 36,
ffn_hidden_size: int = 12288,
in_channels: int = 128,
out_channels: int = 128,
patch_size: int = 1,
text_in_dim: int = 3072,
rope_theta: int = 256,
rope_axes_dim: tuple = (32, 48, 48),
eps: float = 1e-6,
qk_layernorm: bool = True,
device=None,
dtype=None,
operations=None,
**kwargs
):
super().__init__()
self.dtype = dtype
self.hidden_size = hidden_size
self.num_heads = num_attention_heads
self.head_dim = hidden_size // num_attention_heads
self.patch_size = patch_size
self.out_channels = out_channels
Linear = operations.Linear
self.x_embedder = ErnieImagePatchEmbedDynamic(in_channels, hidden_size, patch_size, operations, device, dtype)
self.text_proj = Linear(text_in_dim, hidden_size, bias=False, device=device, dtype=dtype) if text_in_dim != hidden_size else None
self.time_proj = Timesteps(hidden_size, flip_sin_to_cos=False)
self.time_embedding = TimestepEmbedding(hidden_size, hidden_size, operations, device, dtype)
self.pos_embed = ErnieImageEmbedND3(dim=self.head_dim, theta=rope_theta, axes_dim=rope_axes_dim)
self.adaLN_modulation = nn.Sequential(
nn.SiLU(),
Linear(hidden_size, 6 * hidden_size, device=device, dtype=dtype)
)
self.layers = nn.ModuleList([
ErnieImageSharedAdaLNBlock(hidden_size, num_attention_heads, ffn_hidden_size, eps, operations, device, dtype)
for _ in range(num_layers)
])
self.final_norm = ErnieImageAdaLNContinuous(hidden_size, eps, operations, device, dtype)
self.final_linear = Linear(hidden_size, patch_size * patch_size * out_channels, device=device, dtype=dtype)
def forward(self, x, timesteps, context, **kwargs):
device, dtype = x.device, x.dtype
B, C, H, W = x.shape
p, Hp, Wp = self.patch_size, H // self.patch_size, W // self.patch_size
N_img = Hp * Wp
img_bsh = self.x_embedder(x)
text_bth = context
if self.text_proj is not None and text_bth.numel() > 0:
text_bth = self.text_proj(text_bth)
Tmax = text_bth.shape[1]
hidden_states = torch.cat([img_bsh, text_bth], dim=1)
text_ids = torch.zeros((B, Tmax, 3), device=device, dtype=torch.float32)
text_ids[:, :, 0] = torch.linspace(0, Tmax - 1, steps=Tmax, device=x.device, dtype=torch.float32)
index = float(Tmax)
transformer_options = kwargs.get("transformer_options", {})
rope_options = transformer_options.get("rope_options", None)
h_len, w_len = float(Hp), float(Wp)
h_offset, w_offset = 0.0, 0.0
if rope_options is not None:
h_len = (h_len - 1.0) * rope_options.get("scale_y", 1.0) + 1.0
w_len = (w_len - 1.0) * rope_options.get("scale_x", 1.0) + 1.0
index += rope_options.get("shift_t", 0.0)
h_offset += rope_options.get("shift_y", 0.0)
w_offset += rope_options.get("shift_x", 0.0)
image_ids = torch.zeros((Hp, Wp, 3), device=device, dtype=torch.float32)
image_ids[:, :, 0] = image_ids[:, :, 1] + index
image_ids[:, :, 1] = image_ids[:, :, 1] + torch.linspace(h_offset, h_len - 1 + h_offset, steps=Hp, device=device, dtype=torch.float32).unsqueeze(1)
image_ids[:, :, 2] = image_ids[:, :, 2] + torch.linspace(w_offset, w_len - 1 + w_offset, steps=Wp, device=device, dtype=torch.float32).unsqueeze(0)
image_ids = image_ids.view(1, N_img, 3).expand(B, -1, -1)
rotary_pos_emb = self.pos_embed(torch.cat([image_ids, text_ids], dim=1)).to(x.dtype)
del image_ids, text_ids
sample = self.time_proj(timesteps.to(dtype)).to(self.time_embedding.linear_1.weight.dtype)
c = self.time_embedding(sample)
shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = [
t.unsqueeze(1).contiguous() for t in self.adaLN_modulation(c).chunk(6, dim=-1)
]
temb = [shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp]
for layer in self.layers:
hidden_states = layer(hidden_states, rotary_pos_emb, temb)
hidden_states = self.final_norm(hidden_states, c).type_as(hidden_states)
patches = self.final_linear(hidden_states)[:, :N_img, :]
output = (
patches.view(B, Hp, Wp, p, p, self.out_channels)
.permute(0, 5, 1, 3, 2, 4)
.contiguous()
.view(B, self.out_channels, H, W)
)
return output

View File

@@ -16,7 +16,7 @@ def attention(q: Tensor, k: Tensor, v: Tensor, pe: Tensor, mask=None, transforme
def rope(pos: Tensor, dim: int, theta: int) -> Tensor:
assert dim % 2 == 0
if comfy.model_management.is_device_mps(pos.device) or comfy.model_management.is_intel_xpu() or comfy.model_management.is_directml_enabled():
if not comfy.model_management.supports_fp64(pos.device):
device = torch.device("cpu")
else:
device = pos.device

View File

@@ -90,7 +90,7 @@ class HeatmapHead(torch.nn.Module):
origin_max = np.max(hm[k])
dr = np.zeros((H + 2 * border, W + 2 * border), dtype=np.float32)
dr[border:-border, border:-border] = hm[k].copy()
dr = gaussian_filter(dr, sigma=2.0)
dr = gaussian_filter(dr, sigma=2.0, truncate=2.5)
hm[k] = dr[border:-border, border:-border].copy()
cur_max = np.max(hm[k])
if cur_max > 0:

View File

@@ -53,6 +53,7 @@ import comfy.ldm.kandinsky5.model
import comfy.ldm.anima.model
import comfy.ldm.ace.ace_step15
import comfy.ldm.rt_detr.rtdetr_v4
import comfy.ldm.ernie.model
import comfy.model_management
import comfy.patcher_extension
@@ -1962,3 +1963,14 @@ class Kandinsky5Image(Kandinsky5):
class RT_DETR_v4(BaseModel):
def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.rt_detr.rtdetr_v4.RTv4)
class ErnieImage(BaseModel):
def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.ernie.model.ErnieImageModel)
def extra_conds(self, **kwargs):
out = super().extra_conds(**kwargs)
cross_attn = kwargs.get("cross_attn", None)
if cross_attn is not None:
out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn)
return out

View File

@@ -713,6 +713,11 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
dit_config["enc_h"] = state_dict['{}encoder.pan_blocks.1.cv4.conv.weight'.format(key_prefix)].shape[0]
return dit_config
if '{}layers.0.mlp.linear_fc2.weight'.format(key_prefix) in state_dict_keys: # Ernie Image
dit_config = {}
dit_config["image_model"] = "ernie"
return dit_config
if '{}input_blocks.0.0.weight'.format(key_prefix) not in state_dict_keys:
return None

View File

@@ -1732,6 +1732,21 @@ def supports_mxfp8_compute(device=None):
return True
def supports_fp64(device=None):
if is_device_mps(device):
return False
if is_intel_xpu():
return False
if is_directml_enabled():
return False
if is_ixuca():
return False
return True
def extended_fp16_support():
# TODO: check why some models work with fp16 on newer torch versions but not on older
if torch_version_numeric < (2, 7):

View File

@@ -62,6 +62,7 @@ import comfy.text_encoders.anima
import comfy.text_encoders.ace15
import comfy.text_encoders.longcat_image
import comfy.text_encoders.qwen35
import comfy.text_encoders.ernie
import comfy.model_patcher
import comfy.lora
@@ -1235,6 +1236,7 @@ class TEModel(Enum):
QWEN35_4B = 25
QWEN35_9B = 26
QWEN35_27B = 27
MINISTRAL_3_3B = 28
def detect_te_model(sd):
@@ -1301,6 +1303,8 @@ def detect_te_model(sd):
return TEModel.MISTRAL3_24B
else:
return TEModel.MISTRAL3_24B_PRUNED_FLUX2
if weight.shape[0] == 3072:
return TEModel.MINISTRAL_3_3B
return TEModel.LLAMA3_8
return None
@@ -1458,6 +1462,10 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
elif te_model == TEModel.QWEN3_06B:
clip_target.clip = comfy.text_encoders.anima.te(**llama_detect(clip_data))
clip_target.tokenizer = comfy.text_encoders.anima.AnimaTokenizer
elif te_model == TEModel.MINISTRAL_3_3B:
clip_target.clip = comfy.text_encoders.ernie.te(**llama_detect(clip_data))
clip_target.tokenizer = comfy.text_encoders.ernie.ErnieTokenizer
tokenizer_data["tekken_model"] = clip_data[0].get("tekken_model", None)
else:
# clip_l
if clip_type == CLIPType.SD3:

View File

@@ -26,6 +26,7 @@ import comfy.text_encoders.z_image
import comfy.text_encoders.anima
import comfy.text_encoders.ace15
import comfy.text_encoders.longcat_image
import comfy.text_encoders.ernie
from . import supported_models_base
from . import latent_formats
@@ -1749,6 +1750,37 @@ class RT_DETR_v4(supported_models_base.BASE):
def clip_target(self, state_dict={}):
return None
models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, LongCatImage, FluxSchnell, GenmoMochi, LTXV, LTXAV, HunyuanVideo15_SR_Distilled, HunyuanVideo15, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, ZImagePixelSpace, ZImage, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, WAN22_Animate, WAN21_FlowRVS, WAN21_SCAIL, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, ACEStep15, Omnigen2, QwenImage, Flux2, Kandinsky5Image, Kandinsky5, Anima, RT_DETR_v4]
class ErnieImage(supported_models_base.BASE):
unet_config = {
"image_model": "ernie",
}
sampling_settings = {
"multiplier": 1000.0,
"shift": 3.0,
}
memory_usage_factor = 10.0
unet_extra_config = {}
latent_format = latent_formats.Flux2
supported_inference_dtypes = [torch.bfloat16, torch.float32]
vae_key_prefix = ["vae."]
text_encoder_key_prefix = ["text_encoders."]
def get_model(self, state_dict, prefix="", device=None):
out = model_base.ErnieImage(self, device=device)
return out
def clip_target(self, state_dict={}):
pref = self.text_encoder_key_prefix[0]
hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}ministral3_3b.transformer.".format(pref))
return supported_models_base.ClipTarget(comfy.text_encoders.ernie.ErnieTokenizer, comfy.text_encoders.ernie.te(**hunyuan_detect))
models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, LongCatImage, FluxSchnell, GenmoMochi, LTXV, LTXAV, HunyuanVideo15_SR_Distilled, HunyuanVideo15, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, ZImagePixelSpace, ZImage, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, WAN22_Animate, WAN21_FlowRVS, WAN21_SCAIL, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, ACEStep15, Omnigen2, QwenImage, Flux2, Kandinsky5Image, Kandinsky5, Anima, RT_DETR_v4, ErnieImage]
models += [SVD_img2vid]

View File

@@ -0,0 +1,38 @@
from .flux import Mistral3Tokenizer
from comfy import sd1_clip
import comfy.text_encoders.llama
class Ministral3_3BTokenizer(Mistral3Tokenizer):
def __init__(self, embedding_directory=None, embedding_size=5120, embedding_key='mistral3_24b', tokenizer_data={}):
return super().__init__(embedding_directory=embedding_directory, embedding_size=embedding_size, embedding_key=embedding_key, tokenizer_data=tokenizer_data)
class ErnieTokenizer(sd1_clip.SD1Tokenizer):
def __init__(self, embedding_directory=None, tokenizer_data={}):
super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, name="ministral3_3b", tokenizer=Mistral3Tokenizer)
def tokenize_with_weights(self, text, return_word_ids=False, llama_template=None, **kwargs):
tokens = super().tokenize_with_weights(text, return_word_ids=return_word_ids, disable_weights=True, **kwargs)
return tokens
class Ministral3_3BModel(sd1_clip.SDClipModel):
def __init__(self, device="cpu", layer="hidden", layer_idx=-2, dtype=None, attention_mask=True, model_options={}):
textmodel_json_config = {}
super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype, special_tokens={"start": 1, "pad": 0}, layer_norm_hidden_state=False, model_class=comfy.text_encoders.llama.Ministral3_3B, enable_attention_masks=attention_mask, return_attention_masks=attention_mask, model_options=model_options)
class ErnieTEModel(sd1_clip.SD1ClipModel):
def __init__(self, device="cpu", dtype=None, model_options={}, name="ministral3_3b", clip_model=Ministral3_3BModel):
super().__init__(device=device, dtype=dtype, name=name, clip_model=clip_model, model_options=model_options)
def te(dtype_llama=None, llama_quantization_metadata=None):
class ErnieTEModel_(ErnieTEModel):
def __init__(self, device="cpu", dtype=None, model_options={}):
if dtype_llama is not None:
dtype = dtype_llama
if llama_quantization_metadata is not None:
model_options = model_options.copy()
model_options["quantization_metadata"] = llama_quantization_metadata
super().__init__(device=device, dtype=dtype, model_options=model_options)
return ErnieTEModel

View File

@@ -116,9 +116,9 @@ class MistralTokenizerClass:
return LlamaTokenizerFast(**kwargs)
class Mistral3Tokenizer(sd1_clip.SDTokenizer):
def __init__(self, embedding_directory=None, tokenizer_data={}):
def __init__(self, embedding_directory=None, embedding_size=5120, embedding_key='mistral3_24b', tokenizer_data={}):
self.tekken_data = tokenizer_data.get("tekken_model", None)
super().__init__("", pad_with_end=False, embedding_directory=embedding_directory, embedding_size=5120, embedding_key='mistral3_24b', tokenizer_class=MistralTokenizerClass, has_end_token=False, pad_to_max_length=False, pad_token=11, start_token=1, max_length=99999999, min_length=1, pad_left=True, tokenizer_args=load_mistral_tokenizer(self.tekken_data), tokenizer_data=tokenizer_data)
super().__init__("", pad_with_end=False, embedding_directory=embedding_directory, embedding_size=embedding_size, embedding_key=embedding_key, tokenizer_class=MistralTokenizerClass, has_end_token=False, pad_to_max_length=False, pad_token=11, start_token=1, max_length=99999999, min_length=1, pad_left=True, disable_weights=True, tokenizer_args=load_mistral_tokenizer(self.tekken_data), tokenizer_data=tokenizer_data)
def state_dict(self):
return {"tekken_model": self.tekken_data}

View File

@@ -60,6 +60,29 @@ class Mistral3Small24BConfig:
final_norm: bool = True
lm_head: bool = False
@dataclass
class Ministral3_3BConfig:
vocab_size: int = 131072
hidden_size: int = 3072
intermediate_size: int = 9216
num_hidden_layers: int = 26
num_attention_heads: int = 32
num_key_value_heads: int = 8
max_position_embeddings: int = 262144
rms_norm_eps: float = 1e-5
rope_theta: float = 1000000.0
transformer_type: str = "llama"
head_dim = 128
rms_norm_add = False
mlp_activation = "silu"
qkv_bias = False
rope_dims = None
q_norm = None
k_norm = None
rope_scale = None
final_norm: bool = True
lm_head: bool = False
@dataclass
class Qwen25_3BConfig:
vocab_size: int = 151936
@@ -946,6 +969,15 @@ class Mistral3Small24B(BaseLlama, torch.nn.Module):
self.model = Llama2_(config, device=device, dtype=dtype, ops=operations)
self.dtype = dtype
class Ministral3_3B(BaseLlama, torch.nn.Module):
def __init__(self, config_dict, dtype, device, operations):
super().__init__()
config = Ministral3_3BConfig(**config_dict)
self.num_layers = config.num_hidden_layers
self.model = Llama2_(config, device=device, dtype=dtype, ops=operations)
self.dtype = dtype
class Qwen25_3B(BaseLlama, torch.nn.Module):
def __init__(self, config_dict, dtype, device, operations):
super().__init__()

View File

@@ -558,7 +558,7 @@ class GrokVideoReferenceNode(IO.ComfyNode):
(
$res := $lookup(widgets, "model.resolution");
$dur := $lookup(widgets, "model.duration");
$refs := inputGroups["model.reference_images"];
$refs := $lookup(inputGroups, "model.reference_images");
$rate := $res = "720p" ? 0.07 : 0.05;
$price := ($rate * $dur + 0.002 * $refs) * 1.43;
{"type":"usd","usd": $price}

View File

@@ -34,7 +34,7 @@ class Load3D(IO.ComfyNode):
essentials_category="Basics",
is_experimental=True,
inputs=[
IO.Combo.Input("model_file", options=sorted(files), upload=IO.UploadType.model),
IO.Combo.Input("model_file", options=["none"] + sorted(files), upload=IO.UploadType.model),
IO.Load3D.Input("image"),
IO.Int.Input("width", default=1024, min=1, max=4096, step=1),
IO.Int.Input("height", default=1024, min=1, max=4096, step=1),
@@ -68,8 +68,12 @@ class Load3D(IO.ComfyNode):
video = InputImpl.VideoFromFile(recording_video_path)
file_3d = Types.File3D(folder_paths.get_annotated_filepath(model_file))
return IO.NodeOutput(output_image, output_mask, model_file, normal_image, image['camera_info'], video, file_3d)
file_3d = None
mesh_path = ""
if model_file and model_file != "none":
file_3d = Types.File3D(folder_paths.get_annotated_filepath(model_file))
mesh_path = model_file
return IO.NodeOutput(output_image, output_mask, mesh_path, normal_image, image['camera_info'], video, file_3d)
process = execute # TODO: remove

View File

@@ -32,10 +32,12 @@ class RTDETR_detect(io.ComfyNode):
def execute(cls, model, image, threshold, class_name, max_detections) -> io.NodeOutput:
B, H, W, C = image.shape
image_in = comfy.utils.common_upscale(image.movedim(-1, 1), 640, 640, "bilinear", crop="disabled")
comfy.model_management.load_model_gpu(model)
results = model.model.diffusion_model(image_in, (W, H)) # list of B dicts
results = []
for i in range(0, B, 32):
batch = image[i:i + 32]
image_in = comfy.utils.common_upscale(batch.movedim(-1, 1), 640, 640, "bilinear", crop="disabled")
results.extend(model.model.diffusion_model(image_in, (W, H)))
all_bbox_dicts = []

View File

@@ -1,5 +1,6 @@
import torch
import comfy.utils
import comfy.model_management
import numpy as np
import math
import colorsys
@@ -410,7 +411,9 @@ class SDPoseDrawKeypoints(io.ComfyNode):
pose_outputs.append(canvas)
pose_outputs_np = np.stack(pose_outputs) if len(pose_outputs) > 1 else np.expand_dims(pose_outputs[0], 0)
final_pose_output = torch.from_numpy(pose_outputs_np).float() / 255.0
final_pose_output = torch.from_numpy(pose_outputs_np).to(
device=comfy.model_management.intermediate_device(),
dtype=comfy.model_management.intermediate_dtype()) / 255.0
return io.NodeOutput(final_pose_output)
class SDPoseKeypointExtractor(io.ComfyNode):
@@ -459,6 +462,27 @@ class SDPoseKeypointExtractor(io.ComfyNode):
model_h = int(head.heatmap_size[0]) * 4 # e.g. 192 * 4 = 768
model_w = int(head.heatmap_size[1]) * 4 # e.g. 256 * 4 = 1024
def _resize_to_model(imgs):
"""Aspect-preserving resize + zero-pad BHWC images to (model_h, model_w). Returns (resized_bhwc, scale, pad_top, pad_left)."""
h, w = imgs.shape[-3], imgs.shape[-2]
scale = min(model_h / h, model_w / w)
sh, sw = int(round(h * scale)), int(round(w * scale))
pt, pl = (model_h - sh) // 2, (model_w - sw) // 2
chw = imgs.permute(0, 3, 1, 2).float()
scaled = comfy.utils.common_upscale(chw, sw, sh, upscale_method="bilinear", crop="disabled")
padded = torch.zeros(scaled.shape[0], scaled.shape[1], model_h, model_w, dtype=scaled.dtype, device=scaled.device)
padded[:, :, pt:pt + sh, pl:pl + sw] = scaled
return padded.permute(0, 2, 3, 1), scale, pt, pl
def _remap_keypoints(kp, scale, pad_top, pad_left, offset_x=0, offset_y=0):
"""Remap keypoints from model space back to original image space."""
kp = kp.copy() if isinstance(kp, np.ndarray) else np.array(kp, dtype=np.float32)
invalid = kp[..., 0] < 0
kp[..., 0] = (kp[..., 0] - pad_left) / scale + offset_x
kp[..., 1] = (kp[..., 1] - pad_top) / scale + offset_y
kp[invalid] = -1
return kp
def _run_on_latent(latent_batch):
"""Run one forward pass and return (keypoints_list, scores_list) for the batch."""
nonlocal captured_feat
@@ -504,36 +528,19 @@ class SDPoseKeypointExtractor(io.ComfyNode):
if x2 <= x1 or y2 <= y1:
continue
crop_h_px, crop_w_px = y2 - y1, x2 - x1
crop = img[:, y1:y2, x1:x2, :] # (1, crop_h, crop_w, C)
# scale to fit inside (model_h, model_w) while preserving aspect ratio, then pad to exact model size.
scale = min(model_h / crop_h_px, model_w / crop_w_px)
scaled_h, scaled_w = int(round(crop_h_px * scale)), int(round(crop_w_px * scale))
pad_top, pad_left = (model_h - scaled_h) // 2, (model_w - scaled_w) // 2
crop_chw = crop.permute(0, 3, 1, 2).float() # BHWC → BCHW
scaled = comfy.utils.common_upscale(crop_chw, scaled_w, scaled_h, upscale_method="bilinear", crop="disabled")
padded = torch.zeros(1, scaled.shape[1], model_h, model_w, dtype=scaled.dtype, device=scaled.device)
padded[:, :, pad_top:pad_top + scaled_h, pad_left:pad_left + scaled_w] = scaled
crop_resized = padded.permute(0, 2, 3, 1) # BCHW → BHWC
crop_resized, scale, pad_top, pad_left = _resize_to_model(crop)
latent_crop = vae.encode(crop_resized)
kp_batch, sc_batch = _run_on_latent(latent_crop)
kp, sc = kp_batch[0], sc_batch[0] # (K, 2), coords in model pixel space
# remove padding offset, undo scale, offset to full-image coordinates.
kp = kp.copy() if isinstance(kp, np.ndarray) else np.array(kp, dtype=np.float32)
kp[..., 0] = (kp[..., 0] - pad_left) / scale + x1
kp[..., 1] = (kp[..., 1] - pad_top) / scale + y1
kp = _remap_keypoints(kp_batch[0], scale, pad_top, pad_left, x1, y1)
img_keypoints.append(kp)
img_scores.append(sc)
img_scores.append(sc_batch[0])
else:
# No bboxes for this image run on the full image
latent_img = vae.encode(img)
img_resized, scale, pad_top, pad_left = _resize_to_model(img)
latent_img = vae.encode(img_resized)
kp_batch, sc_batch = _run_on_latent(latent_img)
img_keypoints.append(kp_batch[0])
img_keypoints.append(_remap_keypoints(kp_batch[0], scale, pad_top, pad_left))
img_scores.append(sc_batch[0])
all_keypoints.append(img_keypoints)
@@ -541,19 +548,16 @@ class SDPoseKeypointExtractor(io.ComfyNode):
pbar.update(1)
else: # full-image mode, batched
tqdm_pbar = tqdm(total=total_images, desc="Extracting keypoints")
for batch_start in range(0, total_images, batch_size):
batch_end = min(batch_start + batch_size, total_images)
latent_batch = vae.encode(image[batch_start:batch_end])
for batch_start in tqdm(range(0, total_images, batch_size), desc="Extracting keypoints"):
batch_resized, scale, pad_top, pad_left = _resize_to_model(image[batch_start:batch_start + batch_size])
latent_batch = vae.encode(batch_resized)
kp_batch, sc_batch = _run_on_latent(latent_batch)
for kp, sc in zip(kp_batch, sc_batch):
all_keypoints.append([kp])
all_keypoints.append([_remap_keypoints(kp, scale, pad_top, pad_left)])
all_scores.append([sc])
tqdm_pbar.update(1)
pbar.update(batch_end - batch_start)
pbar.update(len(kp_batch))
openpose_frames = _to_openpose_frames(all_keypoints, all_scores, height, width)
return io.NodeOutput(openpose_frames)

View File

@@ -6,6 +6,7 @@ import comfy.utils
import folder_paths
from typing_extensions import override
from comfy_api.latest import ComfyExtension, io
import comfy.model_management
try:
from spandrel_extra_arches import EXTRA_REGISTRY
@@ -78,13 +79,15 @@ class ImageUpscaleWithModel(io.ComfyNode):
tile = 512
overlap = 32
output_device = comfy.model_management.intermediate_device()
oom = True
try:
while oom:
try:
steps = in_img.shape[0] * comfy.utils.get_tiled_scale_steps(in_img.shape[3], in_img.shape[2], tile_x=tile, tile_y=tile, overlap=overlap)
pbar = comfy.utils.ProgressBar(steps)
s = comfy.utils.tiled_scale(in_img, lambda a: upscale_model(a), tile_x=tile, tile_y=tile, overlap=overlap, upscale_amount=upscale_model.scale, pbar=pbar)
s = comfy.utils.tiled_scale(in_img, lambda a: upscale_model(a.float()), tile_x=tile, tile_y=tile, overlap=overlap, upscale_amount=upscale_model.scale, pbar=pbar, output_device=output_device)
oom = False
except Exception as e:
model_management.raise_non_oom(e)
@@ -94,7 +97,7 @@ class ImageUpscaleWithModel(io.ComfyNode):
finally:
upscale_model.to("cpu")
s = torch.clamp(s.movedim(-3,-1), min=0, max=1.0)
s = torch.clamp(s.movedim(-3,-1), min=0, max=1.0).to(comfy.model_management.intermediate_dtype())
return io.NodeOutput(s)
upscale = execute # TODO: remove

View File

@@ -1,5 +1,5 @@
comfyui-frontend-package==1.42.8
comfyui-workflow-templates==0.9.44
comfyui-frontend-package==1.42.10
comfyui-workflow-templates==0.9.45
comfyui-embedded-docs==0.4.3
torch
torchsde