mirror of
https://github.com/comfyanonymous/ComfyUI.git
synced 2026-02-13 19:50:02 +00:00
Compare commits
11 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
637221995f | ||
|
|
51697d50dc | ||
|
|
19f595b788 | ||
|
|
8a15568f10 | ||
|
|
9e984c48bc | ||
|
|
fc34c3d112 | ||
|
|
8aea746212 | ||
|
|
8c19910427 | ||
|
|
e77e0a8f8f | ||
|
|
a49007a7b0 | ||
|
|
6ae3515801 |
@@ -365,8 +365,8 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
|
||||
dit_config["patch_size"] = 2
|
||||
dit_config["in_channels"] = 16
|
||||
dit_config["dim"] = 2304
|
||||
dit_config["cap_feat_dim"] = 2304
|
||||
dit_config["n_layers"] = 26
|
||||
dit_config["cap_feat_dim"] = state_dict['{}cap_embedder.1.weight'.format(key_prefix)].shape[1]
|
||||
dit_config["n_layers"] = count_blocks(state_dict_keys, '{}layers.'.format(key_prefix) + '{}.')
|
||||
dit_config["n_heads"] = 24
|
||||
dit_config["n_kv_heads"] = 8
|
||||
dit_config["qk_norm"] = True
|
||||
|
||||
@@ -890,6 +890,7 @@ class TEModel(Enum):
|
||||
QWEN25_3B = 10
|
||||
QWEN25_7B = 11
|
||||
BYT5_SMALL_GLYPH = 12
|
||||
GEMMA_3_4B = 13
|
||||
|
||||
def detect_te_model(sd):
|
||||
if "text_model.encoder.layers.30.mlp.fc1.weight" in sd:
|
||||
@@ -912,6 +913,8 @@ def detect_te_model(sd):
|
||||
return TEModel.BYT5_SMALL_GLYPH
|
||||
return TEModel.T5_BASE
|
||||
if 'model.layers.0.post_feedforward_layernorm.weight' in sd:
|
||||
if 'model.layers.0.self_attn.q_norm.weight' in sd:
|
||||
return TEModel.GEMMA_3_4B
|
||||
return TEModel.GEMMA_2_2B
|
||||
if 'model.layers.0.self_attn.k_proj.bias' in sd:
|
||||
weight = sd['model.layers.0.self_attn.k_proj.bias']
|
||||
@@ -1016,6 +1019,10 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
|
||||
clip_target.clip = comfy.text_encoders.lumina2.te(**llama_detect(clip_data))
|
||||
clip_target.tokenizer = comfy.text_encoders.lumina2.LuminaTokenizer
|
||||
tokenizer_data["spiece_model"] = clip_data[0].get("spiece_model", None)
|
||||
elif te_model == TEModel.GEMMA_3_4B:
|
||||
clip_target.clip = comfy.text_encoders.lumina2.te(**llama_detect(clip_data), model_type="gemma3_4b")
|
||||
clip_target.tokenizer = comfy.text_encoders.lumina2.NTokenizer
|
||||
tokenizer_data["spiece_model"] = clip_data[0].get("spiece_model", None)
|
||||
elif te_model == TEModel.LLAMA3_8:
|
||||
clip_target.clip = comfy.text_encoders.hidream.hidream_clip(**llama_detect(clip_data),
|
||||
clip_l=False, clip_g=False, t5=False, llama=True, dtype_t5=None, t5xxl_scaled_fp8=None)
|
||||
|
||||
@@ -3,6 +3,7 @@ import torch.nn as nn
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional, Any
|
||||
import math
|
||||
import logging
|
||||
|
||||
from comfy.ldm.modules.attention import optimized_attention_for_device
|
||||
import comfy.model_management
|
||||
@@ -28,6 +29,9 @@ class Llama2Config:
|
||||
mlp_activation = "silu"
|
||||
qkv_bias = False
|
||||
rope_dims = None
|
||||
q_norm = None
|
||||
k_norm = None
|
||||
rope_scale = None
|
||||
|
||||
@dataclass
|
||||
class Qwen25_3BConfig:
|
||||
@@ -46,6 +50,9 @@ class Qwen25_3BConfig:
|
||||
mlp_activation = "silu"
|
||||
qkv_bias = True
|
||||
rope_dims = None
|
||||
q_norm = None
|
||||
k_norm = None
|
||||
rope_scale = None
|
||||
|
||||
@dataclass
|
||||
class Qwen25_7BVLI_Config:
|
||||
@@ -64,6 +71,9 @@ class Qwen25_7BVLI_Config:
|
||||
mlp_activation = "silu"
|
||||
qkv_bias = True
|
||||
rope_dims = [16, 24, 24]
|
||||
q_norm = None
|
||||
k_norm = None
|
||||
rope_scale = None
|
||||
|
||||
@dataclass
|
||||
class Gemma2_2B_Config:
|
||||
@@ -82,6 +92,32 @@ class Gemma2_2B_Config:
|
||||
mlp_activation = "gelu_pytorch_tanh"
|
||||
qkv_bias = False
|
||||
rope_dims = None
|
||||
q_norm = None
|
||||
k_norm = None
|
||||
sliding_attention = None
|
||||
rope_scale = None
|
||||
|
||||
@dataclass
|
||||
class Gemma3_4B_Config:
|
||||
vocab_size: int = 262208
|
||||
hidden_size: int = 2560
|
||||
intermediate_size: int = 10240
|
||||
num_hidden_layers: int = 34
|
||||
num_attention_heads: int = 8
|
||||
num_key_value_heads: int = 4
|
||||
max_position_embeddings: int = 131072
|
||||
rms_norm_eps: float = 1e-6
|
||||
rope_theta = [10000.0, 1000000.0]
|
||||
transformer_type: str = "gemma3"
|
||||
head_dim = 256
|
||||
rms_norm_add = True
|
||||
mlp_activation = "gelu_pytorch_tanh"
|
||||
qkv_bias = False
|
||||
rope_dims = None
|
||||
q_norm = "gemma3"
|
||||
k_norm = "gemma3"
|
||||
sliding_attention = [False, False, False, False, False, 1024]
|
||||
rope_scale = [1.0, 8.0]
|
||||
|
||||
class RMSNorm(nn.Module):
|
||||
def __init__(self, dim: int, eps: float = 1e-5, add=False, device=None, dtype=None):
|
||||
@@ -106,25 +142,40 @@ def rotate_half(x):
|
||||
return torch.cat((-x2, x1), dim=-1)
|
||||
|
||||
|
||||
def precompute_freqs_cis(head_dim, position_ids, theta, rope_dims=None, device=None):
|
||||
theta_numerator = torch.arange(0, head_dim, 2, device=device).float()
|
||||
inv_freq = 1.0 / (theta ** (theta_numerator / head_dim))
|
||||
def precompute_freqs_cis(head_dim, position_ids, theta, rope_scale=None, rope_dims=None, device=None):
|
||||
if not isinstance(theta, list):
|
||||
theta = [theta]
|
||||
|
||||
inv_freq_expanded = inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
|
||||
position_ids_expanded = position_ids[:, None, :].float()
|
||||
freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
|
||||
emb = torch.cat((freqs, freqs), dim=-1)
|
||||
cos = emb.cos()
|
||||
sin = emb.sin()
|
||||
if rope_dims is not None and position_ids.shape[0] > 1:
|
||||
mrope_section = rope_dims * 2
|
||||
cos = torch.cat([m[i % 3] for i, m in enumerate(cos.split(mrope_section, dim=-1))], dim=-1).unsqueeze(0)
|
||||
sin = torch.cat([m[i % 3] for i, m in enumerate(sin.split(mrope_section, dim=-1))], dim=-1).unsqueeze(0)
|
||||
else:
|
||||
cos = cos.unsqueeze(1)
|
||||
sin = sin.unsqueeze(1)
|
||||
out = []
|
||||
for index, t in enumerate(theta):
|
||||
theta_numerator = torch.arange(0, head_dim, 2, device=device).float()
|
||||
inv_freq = 1.0 / (t ** (theta_numerator / head_dim))
|
||||
|
||||
return (cos, sin)
|
||||
if rope_scale is not None:
|
||||
if isinstance(rope_scale, list):
|
||||
inv_freq /= rope_scale[index]
|
||||
else:
|
||||
inv_freq /= rope_scale
|
||||
|
||||
inv_freq_expanded = inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
|
||||
position_ids_expanded = position_ids[:, None, :].float()
|
||||
freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
|
||||
emb = torch.cat((freqs, freqs), dim=-1)
|
||||
cos = emb.cos()
|
||||
sin = emb.sin()
|
||||
if rope_dims is not None and position_ids.shape[0] > 1:
|
||||
mrope_section = rope_dims * 2
|
||||
cos = torch.cat([m[i % 3] for i, m in enumerate(cos.split(mrope_section, dim=-1))], dim=-1).unsqueeze(0)
|
||||
sin = torch.cat([m[i % 3] for i, m in enumerate(sin.split(mrope_section, dim=-1))], dim=-1).unsqueeze(0)
|
||||
else:
|
||||
cos = cos.unsqueeze(1)
|
||||
sin = sin.unsqueeze(1)
|
||||
out.append((cos, sin))
|
||||
|
||||
if len(out) == 1:
|
||||
return out[0]
|
||||
|
||||
return out
|
||||
|
||||
|
||||
def apply_rope(xq, xk, freqs_cis):
|
||||
@@ -152,6 +203,14 @@ class Attention(nn.Module):
|
||||
self.v_proj = ops.Linear(config.hidden_size, self.num_kv_heads * self.head_dim, bias=config.qkv_bias, device=device, dtype=dtype)
|
||||
self.o_proj = ops.Linear(self.inner_size, config.hidden_size, bias=False, device=device, dtype=dtype)
|
||||
|
||||
self.q_norm = None
|
||||
self.k_norm = None
|
||||
|
||||
if config.q_norm == "gemma3":
|
||||
self.q_norm = RMSNorm(self.head_dim, eps=config.rms_norm_eps, add=config.rms_norm_add, device=device, dtype=dtype)
|
||||
if config.k_norm == "gemma3":
|
||||
self.k_norm = RMSNorm(self.head_dim, eps=config.rms_norm_eps, add=config.rms_norm_add, device=device, dtype=dtype)
|
||||
|
||||
def forward(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
@@ -168,6 +227,11 @@ class Attention(nn.Module):
|
||||
xk = xk.view(batch_size, seq_length, self.num_kv_heads, self.head_dim).transpose(1, 2)
|
||||
xv = xv.view(batch_size, seq_length, self.num_kv_heads, self.head_dim).transpose(1, 2)
|
||||
|
||||
if self.q_norm is not None:
|
||||
xq = self.q_norm(xq)
|
||||
if self.k_norm is not None:
|
||||
xk = self.k_norm(xk)
|
||||
|
||||
xq, xk = apply_rope(xq, xk, freqs_cis=freqs_cis)
|
||||
|
||||
xk = xk.repeat_interleave(self.num_heads // self.num_kv_heads, dim=1)
|
||||
@@ -192,7 +256,7 @@ class MLP(nn.Module):
|
||||
return self.down_proj(self.activation(self.gate_proj(x)) * self.up_proj(x))
|
||||
|
||||
class TransformerBlock(nn.Module):
|
||||
def __init__(self, config: Llama2Config, device=None, dtype=None, ops: Any = None):
|
||||
def __init__(self, config: Llama2Config, index, device=None, dtype=None, ops: Any = None):
|
||||
super().__init__()
|
||||
self.self_attn = Attention(config, device=device, dtype=dtype, ops=ops)
|
||||
self.mlp = MLP(config, device=device, dtype=dtype, ops=ops)
|
||||
@@ -226,7 +290,7 @@ class TransformerBlock(nn.Module):
|
||||
return x
|
||||
|
||||
class TransformerBlockGemma2(nn.Module):
|
||||
def __init__(self, config: Llama2Config, device=None, dtype=None, ops: Any = None):
|
||||
def __init__(self, config: Llama2Config, index, device=None, dtype=None, ops: Any = None):
|
||||
super().__init__()
|
||||
self.self_attn = Attention(config, device=device, dtype=dtype, ops=ops)
|
||||
self.mlp = MLP(config, device=device, dtype=dtype, ops=ops)
|
||||
@@ -235,6 +299,13 @@ class TransformerBlockGemma2(nn.Module):
|
||||
self.pre_feedforward_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps, add=config.rms_norm_add, device=device, dtype=dtype)
|
||||
self.post_feedforward_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps, add=config.rms_norm_add, device=device, dtype=dtype)
|
||||
|
||||
if config.sliding_attention is not None: # TODO: implement. (Not that necessary since models are trained on less than 1024 tokens)
|
||||
self.sliding_attention = config.sliding_attention[index % len(config.sliding_attention)]
|
||||
else:
|
||||
self.sliding_attention = False
|
||||
|
||||
self.transformer_type = config.transformer_type
|
||||
|
||||
def forward(
|
||||
self,
|
||||
x: torch.Tensor,
|
||||
@@ -242,6 +313,14 @@ class TransformerBlockGemma2(nn.Module):
|
||||
freqs_cis: Optional[torch.Tensor] = None,
|
||||
optimized_attention=None,
|
||||
):
|
||||
if self.transformer_type == 'gemma3':
|
||||
if self.sliding_attention:
|
||||
if x.shape[1] > self.sliding_attention:
|
||||
logging.warning("Warning: sliding attention not implemented, results may be incorrect")
|
||||
freqs_cis = freqs_cis[1]
|
||||
else:
|
||||
freqs_cis = freqs_cis[0]
|
||||
|
||||
# Self Attention
|
||||
residual = x
|
||||
x = self.input_layernorm(x)
|
||||
@@ -276,7 +355,7 @@ class Llama2_(nn.Module):
|
||||
device=device,
|
||||
dtype=dtype
|
||||
)
|
||||
if self.config.transformer_type == "gemma2":
|
||||
if self.config.transformer_type == "gemma2" or self.config.transformer_type == "gemma3":
|
||||
transformer = TransformerBlockGemma2
|
||||
self.normalize_in = True
|
||||
else:
|
||||
@@ -284,8 +363,8 @@ class Llama2_(nn.Module):
|
||||
self.normalize_in = False
|
||||
|
||||
self.layers = nn.ModuleList([
|
||||
transformer(config, device=device, dtype=dtype, ops=ops)
|
||||
for _ in range(config.num_hidden_layers)
|
||||
transformer(config, index=i, device=device, dtype=dtype, ops=ops)
|
||||
for i in range(config.num_hidden_layers)
|
||||
])
|
||||
self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps, add=config.rms_norm_add, device=device, dtype=dtype)
|
||||
# self.lm_head = ops.Linear(config.hidden_size, config.vocab_size, bias=False, device=device, dtype=dtype)
|
||||
@@ -305,6 +384,7 @@ class Llama2_(nn.Module):
|
||||
freqs_cis = precompute_freqs_cis(self.config.head_dim,
|
||||
position_ids,
|
||||
self.config.rope_theta,
|
||||
self.config.rope_scale,
|
||||
self.config.rope_dims,
|
||||
device=x.device)
|
||||
|
||||
@@ -433,3 +513,12 @@ class Gemma2_2B(BaseLlama, torch.nn.Module):
|
||||
|
||||
self.model = Llama2_(config, device=device, dtype=dtype, ops=operations)
|
||||
self.dtype = dtype
|
||||
|
||||
class Gemma3_4B(BaseLlama, torch.nn.Module):
|
||||
def __init__(self, config_dict, dtype, device, operations):
|
||||
super().__init__()
|
||||
config = Gemma3_4B_Config(**config_dict)
|
||||
self.num_layers = config.num_hidden_layers
|
||||
|
||||
self.model = Llama2_(config, device=device, dtype=dtype, ops=operations)
|
||||
self.dtype = dtype
|
||||
|
||||
@@ -11,23 +11,41 @@ class Gemma2BTokenizer(sd1_clip.SDTokenizer):
|
||||
def state_dict(self):
|
||||
return {"spiece_model": self.tokenizer.serialize_model()}
|
||||
|
||||
class Gemma3_4BTokenizer(sd1_clip.SDTokenizer):
|
||||
def __init__(self, embedding_directory=None, tokenizer_data={}):
|
||||
tokenizer = tokenizer_data.get("spiece_model", None)
|
||||
super().__init__(tokenizer, pad_with_end=False, embedding_size=2560, embedding_key='gemma3_4b', tokenizer_class=SPieceTokenizer, has_end_token=False, pad_to_max_length=False, max_length=99999999, min_length=1, tokenizer_args={"add_bos": True, "add_eos": False}, tokenizer_data=tokenizer_data)
|
||||
|
||||
def state_dict(self):
|
||||
return {"spiece_model": self.tokenizer.serialize_model()}
|
||||
|
||||
class LuminaTokenizer(sd1_clip.SD1Tokenizer):
|
||||
def __init__(self, embedding_directory=None, tokenizer_data={}):
|
||||
super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, name="gemma2_2b", tokenizer=Gemma2BTokenizer)
|
||||
|
||||
class NTokenizer(sd1_clip.SD1Tokenizer):
|
||||
def __init__(self, embedding_directory=None, tokenizer_data={}):
|
||||
super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, name="gemma3_4b", tokenizer=Gemma3_4BTokenizer)
|
||||
|
||||
class Gemma2_2BModel(sd1_clip.SDClipModel):
|
||||
def __init__(self, device="cpu", layer="hidden", layer_idx=-2, dtype=None, attention_mask=True, model_options={}):
|
||||
super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config={}, dtype=dtype, special_tokens={"start": 2, "pad": 0}, layer_norm_hidden_state=False, model_class=comfy.text_encoders.llama.Gemma2_2B, enable_attention_masks=attention_mask, return_attention_masks=attention_mask, model_options=model_options)
|
||||
|
||||
class Gemma3_4BModel(sd1_clip.SDClipModel):
|
||||
def __init__(self, device="cpu", layer="hidden", layer_idx=-2, dtype=None, attention_mask=True, model_options={}):
|
||||
super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config={}, dtype=dtype, special_tokens={"start": 2, "pad": 0}, layer_norm_hidden_state=False, model_class=comfy.text_encoders.llama.Gemma3_4B, enable_attention_masks=attention_mask, return_attention_masks=attention_mask, model_options=model_options)
|
||||
|
||||
class LuminaModel(sd1_clip.SD1ClipModel):
|
||||
def __init__(self, device="cpu", dtype=None, model_options={}):
|
||||
super().__init__(device=device, dtype=dtype, name="gemma2_2b", clip_model=Gemma2_2BModel, model_options=model_options)
|
||||
def __init__(self, device="cpu", dtype=None, model_options={}, name="gemma2_2b", clip_model=Gemma2_2BModel):
|
||||
super().__init__(device=device, dtype=dtype, name=name, clip_model=clip_model, model_options=model_options)
|
||||
|
||||
|
||||
def te(dtype_llama=None, llama_scaled_fp8=None):
|
||||
def te(dtype_llama=None, llama_scaled_fp8=None, model_type="gemma2_2b"):
|
||||
if model_type == "gemma2_2b":
|
||||
model = Gemma2_2BModel
|
||||
elif model_type == "gemma3_4b":
|
||||
model = Gemma3_4BModel
|
||||
|
||||
class LuminaTEModel_(LuminaModel):
|
||||
def __init__(self, device="cpu", dtype=None, model_options={}):
|
||||
if llama_scaled_fp8 is not None and "scaled_fp8" not in model_options:
|
||||
@@ -35,5 +53,5 @@ def te(dtype_llama=None, llama_scaled_fp8=None):
|
||||
model_options["scaled_fp8"] = llama_scaled_fp8
|
||||
if dtype_llama is not None:
|
||||
dtype = dtype_llama
|
||||
super().__init__(device=device, dtype=dtype, model_options=model_options)
|
||||
super().__init__(device=device, dtype=dtype, name=model_type, model_options=model_options, clip_model=model)
|
||||
return LuminaTEModel_
|
||||
|
||||
@@ -18,7 +18,7 @@ from comfy_api_nodes.apis.client import (
|
||||
UploadResponse,
|
||||
)
|
||||
from server import PromptServer
|
||||
|
||||
from comfy.cli_args import args
|
||||
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
@@ -30,7 +30,9 @@ from io import BytesIO
|
||||
import av
|
||||
|
||||
|
||||
async def download_url_to_video_output(video_url: str, timeout: int = None) -> VideoFromFile:
|
||||
async def download_url_to_video_output(
|
||||
video_url: str, timeout: int = None, auth_kwargs: Optional[dict[str, str]] = None
|
||||
) -> VideoFromFile:
|
||||
"""Downloads a video from a URL and returns a `VIDEO` output.
|
||||
|
||||
Args:
|
||||
@@ -39,7 +41,7 @@ async def download_url_to_video_output(video_url: str, timeout: int = None) -> V
|
||||
Returns:
|
||||
A Comfy node `VIDEO` output.
|
||||
"""
|
||||
video_io = await download_url_to_bytesio(video_url, timeout)
|
||||
video_io = await download_url_to_bytesio(video_url, timeout, auth_kwargs=auth_kwargs)
|
||||
if video_io is None:
|
||||
error_msg = f"Failed to download video from {video_url}"
|
||||
logging.error(error_msg)
|
||||
@@ -152,7 +154,7 @@ def validate_aspect_ratio(
|
||||
raise TypeError(
|
||||
f"Aspect ratio cannot reduce to any less than {minimum_ratio_str} ({minimum_ratio}), but was {aspect_ratio} ({calculated_ratio})."
|
||||
)
|
||||
elif calculated_ratio > maximum_ratio:
|
||||
if calculated_ratio > maximum_ratio:
|
||||
raise TypeError(
|
||||
f"Aspect ratio cannot reduce to any greater than {maximum_ratio_str} ({maximum_ratio}), but was {aspect_ratio} ({calculated_ratio})."
|
||||
)
|
||||
@@ -164,7 +166,9 @@ def mimetype_to_extension(mime_type: str) -> str:
|
||||
return mime_type.split("/")[-1].lower()
|
||||
|
||||
|
||||
async def download_url_to_bytesio(url: str, timeout: int = None) -> BytesIO:
|
||||
async def download_url_to_bytesio(
|
||||
url: str, timeout: int = None, auth_kwargs: Optional[dict[str, str]] = None
|
||||
) -> BytesIO:
|
||||
"""Downloads content from a URL using requests and returns it as BytesIO.
|
||||
|
||||
Args:
|
||||
@@ -174,9 +178,18 @@ async def download_url_to_bytesio(url: str, timeout: int = None) -> BytesIO:
|
||||
Returns:
|
||||
BytesIO object containing the downloaded content.
|
||||
"""
|
||||
headers = {}
|
||||
if url.startswith("/proxy/"):
|
||||
url = str(args.comfy_api_base).rstrip("/") + url
|
||||
auth_token = auth_kwargs.get("auth_token")
|
||||
comfy_api_key = auth_kwargs.get("comfy_api_key")
|
||||
if auth_token:
|
||||
headers["Authorization"] = f"Bearer {auth_token}"
|
||||
elif comfy_api_key:
|
||||
headers["X-API-KEY"] = comfy_api_key
|
||||
timeout_cfg = aiohttp.ClientTimeout(total=timeout) if timeout else None
|
||||
async with aiohttp.ClientSession(timeout=timeout_cfg) as session:
|
||||
async with session.get(url) as resp:
|
||||
async with session.get(url, headers=headers) as resp:
|
||||
resp.raise_for_status() # Raises HTTPError for bad responses (4XX or 5XX)
|
||||
return BytesIO(await resp.read())
|
||||
|
||||
|
||||
@@ -220,13 +220,16 @@ class ApiClient:
|
||||
if multipart_parser and data:
|
||||
data = multipart_parser(data)
|
||||
|
||||
form = aiohttp.FormData(default_to_multipart=True)
|
||||
if data: # regular text fields
|
||||
for k, v in data.items():
|
||||
if v is None:
|
||||
continue # aiohttp fails to serialize "None" values
|
||||
# aiohttp expects strings or bytes; convert enums etc.
|
||||
form.add_field(k, str(v) if not isinstance(v, (bytes, bytearray)) else v)
|
||||
if isinstance(data, aiohttp.FormData):
|
||||
form = data # If the parser already returned a FormData, pass it through
|
||||
else:
|
||||
form = aiohttp.FormData(default_to_multipart=True)
|
||||
if data: # regular text fields
|
||||
for k, v in data.items():
|
||||
if v is None:
|
||||
continue # aiohttp fails to serialize "None" values
|
||||
# aiohttp expects strings or bytes; convert enums etc.
|
||||
form.add_field(k, str(v) if not isinstance(v, (bytes, bytearray)) else v)
|
||||
|
||||
if files:
|
||||
file_iter = files if isinstance(files, list) else files.items()
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -473,7 +473,7 @@ class MoonvalleyImg2VideoNode(comfy_io.ComfyNode):
|
||||
height=width_height["height"],
|
||||
use_negative_prompts=True,
|
||||
)
|
||||
"""Upload image to comfy backend to have a URL available for further processing"""
|
||||
|
||||
# Get MIME type from tensor - assuming PNG format for image tensors
|
||||
mime_type = "image/png"
|
||||
|
||||
@@ -591,7 +591,6 @@ class MoonvalleyVideo2VideoNode(comfy_io.ComfyNode):
|
||||
validated_video = validate_video_to_video_input(video)
|
||||
video_url = await upload_video_to_comfyapi(validated_video, auth_kwargs=auth)
|
||||
|
||||
"""Validate prompts and inference input"""
|
||||
validate_prompts(prompt, negative_prompt)
|
||||
|
||||
# Only include motion_intensity for Motion Transfer
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -146,7 +146,7 @@ class PixverseTextToVideoNode(comfy_io.ComfyNode):
|
||||
comfy_io.String.Input(
|
||||
"negative_prompt",
|
||||
default="",
|
||||
force_input=True,
|
||||
multiline=True,
|
||||
tooltip="An optional text description of undesired elements on an image.",
|
||||
optional=True,
|
||||
),
|
||||
@@ -284,7 +284,7 @@ class PixverseImageToVideoNode(comfy_io.ComfyNode):
|
||||
comfy_io.String.Input(
|
||||
"negative_prompt",
|
||||
default="",
|
||||
force_input=True,
|
||||
multiline=True,
|
||||
tooltip="An optional text description of undesired elements on an image.",
|
||||
optional=True,
|
||||
),
|
||||
@@ -425,7 +425,7 @@ class PixverseTransitionVideoNode(comfy_io.ComfyNode):
|
||||
comfy_io.String.Input(
|
||||
"negative_prompt",
|
||||
default="",
|
||||
force_input=True,
|
||||
multiline=True,
|
||||
tooltip="An optional text description of undesired elements on an image.",
|
||||
optional=True,
|
||||
),
|
||||
|
||||
@@ -35,6 +35,7 @@ from server import PromptServer
|
||||
import torch
|
||||
from io import BytesIO
|
||||
from PIL import UnidentifiedImageError
|
||||
import aiohttp
|
||||
|
||||
|
||||
async def handle_recraft_file_request(
|
||||
@@ -82,10 +83,16 @@ async def handle_recraft_file_request(
|
||||
return all_bytesio
|
||||
|
||||
|
||||
def recraft_multipart_parser(data, parent_key=None, formatter: callable=None, converted_to_check: list[list]=None, is_list=False) -> dict:
|
||||
def recraft_multipart_parser(
|
||||
data,
|
||||
parent_key=None,
|
||||
formatter: callable = None,
|
||||
converted_to_check: list[list] = None,
|
||||
is_list: bool = False,
|
||||
return_mode: str = "formdata" # "dict" | "formdata"
|
||||
) -> dict | aiohttp.FormData:
|
||||
"""
|
||||
Formats data such that multipart/form-data will work with requests library
|
||||
when both files and data are present.
|
||||
Formats data such that multipart/form-data will work with aiohttp library when both files and data are present.
|
||||
|
||||
The OpenAI client that Recraft uses has a bizarre way of serializing lists:
|
||||
|
||||
@@ -103,23 +110,23 @@ def recraft_multipart_parser(data, parent_key=None, formatter: callable=None, co
|
||||
# Modification of a function that handled a different type of multipart parsing, big ups:
|
||||
# https://gist.github.com/kazqvaizer/4cebebe5db654a414132809f9f88067b
|
||||
|
||||
def handle_converted_lists(data, parent_key, lists_to_check=tuple[list]):
|
||||
def handle_converted_lists(item, parent_key, lists_to_check=tuple[list]):
|
||||
# if list already exists exists, just extend list with data
|
||||
for check_list in lists_to_check:
|
||||
for conv_tuple in check_list:
|
||||
if conv_tuple[0] == parent_key and type(conv_tuple[1]) is list:
|
||||
conv_tuple[1].append(formatter(data))
|
||||
if conv_tuple[0] == parent_key and isinstance(conv_tuple[1], list):
|
||||
conv_tuple[1].append(formatter(item))
|
||||
return True
|
||||
return False
|
||||
|
||||
if converted_to_check is None:
|
||||
converted_to_check = []
|
||||
|
||||
|
||||
effective_mode = return_mode if parent_key is None else "dict"
|
||||
if formatter is None:
|
||||
formatter = lambda v: v # Multipart representation of value
|
||||
|
||||
if type(data) is not dict:
|
||||
if not isinstance(data, dict):
|
||||
# if list already exists exists, just extend list with data
|
||||
added = handle_converted_lists(data, parent_key, converted_to_check)
|
||||
if added:
|
||||
@@ -136,15 +143,24 @@ def recraft_multipart_parser(data, parent_key=None, formatter: callable=None, co
|
||||
|
||||
for key, value in data.items():
|
||||
current_key = key if parent_key is None else f"{parent_key}[{key}]"
|
||||
if type(value) is dict:
|
||||
if isinstance(value, dict):
|
||||
converted.extend(recraft_multipart_parser(value, current_key, formatter, next_check).items())
|
||||
elif type(value) is list:
|
||||
elif isinstance(value, list):
|
||||
for ind, list_value in enumerate(value):
|
||||
iter_key = f"{current_key}[]"
|
||||
converted.extend(recraft_multipart_parser(list_value, iter_key, formatter, next_check, is_list=True).items())
|
||||
else:
|
||||
converted.append((current_key, formatter(value)))
|
||||
|
||||
if effective_mode == "formdata":
|
||||
fd = aiohttp.FormData()
|
||||
for k, v in dict(converted).items():
|
||||
if isinstance(v, list):
|
||||
for item in v:
|
||||
fd.add_field(k, str(item))
|
||||
else:
|
||||
fd.add_field(k, str(v))
|
||||
return fd
|
||||
return dict(converted)
|
||||
|
||||
|
||||
|
||||
175
comfy_api_nodes/nodes_sora.py
Normal file
175
comfy_api_nodes/nodes_sora.py
Normal file
@@ -0,0 +1,175 @@
|
||||
from typing import Optional
|
||||
from typing_extensions import override
|
||||
|
||||
import torch
|
||||
from pydantic import BaseModel, Field
|
||||
from comfy_api.latest import ComfyExtension, io as comfy_io
|
||||
from comfy_api_nodes.apis.client import (
|
||||
ApiEndpoint,
|
||||
HttpMethod,
|
||||
SynchronousOperation,
|
||||
PollingOperation,
|
||||
EmptyRequest,
|
||||
)
|
||||
from comfy_api_nodes.util.validation_utils import get_number_of_images
|
||||
|
||||
from comfy_api_nodes.apinode_utils import (
|
||||
download_url_to_video_output,
|
||||
tensor_to_bytesio,
|
||||
)
|
||||
|
||||
class Sora2GenerationRequest(BaseModel):
|
||||
prompt: str = Field(...)
|
||||
model: str = Field(...)
|
||||
seconds: str = Field(...)
|
||||
size: str = Field(...)
|
||||
|
||||
|
||||
class Sora2GenerationResponse(BaseModel):
|
||||
id: str = Field(...)
|
||||
error: Optional[dict] = Field(None)
|
||||
status: Optional[str] = Field(None)
|
||||
|
||||
|
||||
class OpenAIVideoSora2(comfy_io.ComfyNode):
|
||||
@classmethod
|
||||
def define_schema(cls):
|
||||
return comfy_io.Schema(
|
||||
node_id="OpenAIVideoSora2",
|
||||
display_name="OpenAI Sora - Video",
|
||||
category="api node/video/Sora",
|
||||
description="OpenAI video and audio generation.",
|
||||
inputs=[
|
||||
comfy_io.Combo.Input(
|
||||
"model",
|
||||
options=["sora-2", "sora-2-pro"],
|
||||
default="sora-2",
|
||||
),
|
||||
comfy_io.String.Input(
|
||||
"prompt",
|
||||
multiline=True,
|
||||
default="",
|
||||
tooltip="Guiding text; may be empty if an input image is present.",
|
||||
),
|
||||
comfy_io.Combo.Input(
|
||||
"size",
|
||||
options=[
|
||||
"720x1280",
|
||||
"1280x720",
|
||||
"1024x1792",
|
||||
"1792x1024",
|
||||
],
|
||||
default="1280x720",
|
||||
),
|
||||
comfy_io.Combo.Input(
|
||||
"duration",
|
||||
options=[4, 8, 12],
|
||||
default=8,
|
||||
),
|
||||
comfy_io.Image.Input(
|
||||
"image",
|
||||
optional=True,
|
||||
),
|
||||
comfy_io.Int.Input(
|
||||
"seed",
|
||||
default=0,
|
||||
min=0,
|
||||
max=2147483647,
|
||||
step=1,
|
||||
display_mode=comfy_io.NumberDisplay.number,
|
||||
control_after_generate=True,
|
||||
optional=True,
|
||||
tooltip="Seed to determine if node should re-run; "
|
||||
"actual results are nondeterministic regardless of seed.",
|
||||
),
|
||||
],
|
||||
outputs=[
|
||||
comfy_io.Video.Output(),
|
||||
],
|
||||
hidden=[
|
||||
comfy_io.Hidden.auth_token_comfy_org,
|
||||
comfy_io.Hidden.api_key_comfy_org,
|
||||
comfy_io.Hidden.unique_id,
|
||||
],
|
||||
is_api_node=True,
|
||||
)
|
||||
|
||||
@classmethod
|
||||
async def execute(
|
||||
cls,
|
||||
model: str,
|
||||
prompt: str,
|
||||
size: str = "1280x720",
|
||||
duration: int = 8,
|
||||
seed: int = 0,
|
||||
image: Optional[torch.Tensor] = None,
|
||||
):
|
||||
if model == "sora-2" and size not in ("720x1280", "1280x720"):
|
||||
raise ValueError("Invalid size for sora-2 model, only 720x1280 and 1280x720 are supported.")
|
||||
files_input = None
|
||||
if image is not None:
|
||||
if get_number_of_images(image) != 1:
|
||||
raise ValueError("Currently only one input image is supported.")
|
||||
files_input = {"input_reference": ("image.png", tensor_to_bytesio(image), "image/png")}
|
||||
auth = {
|
||||
"auth_token": cls.hidden.auth_token_comfy_org,
|
||||
"comfy_api_key": cls.hidden.api_key_comfy_org,
|
||||
}
|
||||
payload = Sora2GenerationRequest(
|
||||
model=model,
|
||||
prompt=prompt,
|
||||
seconds=str(duration),
|
||||
size=size,
|
||||
)
|
||||
initial_operation = SynchronousOperation(
|
||||
endpoint=ApiEndpoint(
|
||||
path="/proxy/openai/v1/videos",
|
||||
method=HttpMethod.POST,
|
||||
request_model=Sora2GenerationRequest,
|
||||
response_model=Sora2GenerationResponse
|
||||
),
|
||||
request=payload,
|
||||
files=files_input,
|
||||
auth_kwargs=auth,
|
||||
content_type="multipart/form-data",
|
||||
)
|
||||
initial_response = await initial_operation.execute()
|
||||
if initial_response.error:
|
||||
raise Exception(initial_response.error.message)
|
||||
|
||||
model_time_multiplier = 1 if model == "sora-2" else 2
|
||||
poll_operation = PollingOperation(
|
||||
poll_endpoint=ApiEndpoint(
|
||||
path=f"/proxy/openai/v1/videos/{initial_response.id}",
|
||||
method=HttpMethod.GET,
|
||||
request_model=EmptyRequest,
|
||||
response_model=Sora2GenerationResponse
|
||||
),
|
||||
completed_statuses=["completed"],
|
||||
failed_statuses=["failed"],
|
||||
status_extractor=lambda x: x.status,
|
||||
auth_kwargs=auth,
|
||||
poll_interval=8.0,
|
||||
max_poll_attempts=160,
|
||||
node_id=cls.hidden.unique_id,
|
||||
estimated_duration=45 * (duration / 4) * model_time_multiplier,
|
||||
)
|
||||
await poll_operation.execute()
|
||||
return comfy_io.NodeOutput(
|
||||
await download_url_to_video_output(
|
||||
f"/proxy/openai/v1/videos/{initial_response.id}/content",
|
||||
auth_kwargs=auth,
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
class OpenAISoraExtension(ComfyExtension):
|
||||
@override
|
||||
async def get_node_list(self) -> list[type[comfy_io.ComfyNode]]:
|
||||
return [
|
||||
OpenAIVideoSora2,
|
||||
]
|
||||
|
||||
|
||||
async def comfy_entrypoint() -> OpenAISoraExtension:
|
||||
return OpenAISoraExtension()
|
||||
@@ -34,6 +34,7 @@ class EmptyLTXVLatentVideo(io.ComfyNode):
|
||||
latent = torch.zeros([batch_size, 128, ((length - 1) // 8) + 1, height // 32, width // 32], device=comfy.model_management.intermediate_device())
|
||||
return io.NodeOutput({"samples": latent})
|
||||
|
||||
generate = execute # TODO: remove
|
||||
|
||||
class LTXVImgToVideo(io.ComfyNode):
|
||||
@classmethod
|
||||
@@ -77,6 +78,8 @@ class LTXVImgToVideo(io.ComfyNode):
|
||||
|
||||
return io.NodeOutput(positive, negative, {"samples": latent, "noise_mask": conditioning_latent_frames_mask})
|
||||
|
||||
generate = execute # TODO: remove
|
||||
|
||||
|
||||
def conditioning_get_any_value(conditioning, key, default=None):
|
||||
for t in conditioning:
|
||||
@@ -264,6 +267,8 @@ class LTXVAddGuide(io.ComfyNode):
|
||||
|
||||
return io.NodeOutput(positive, negative, {"samples": latent_image, "noise_mask": noise_mask})
|
||||
|
||||
generate = execute # TODO: remove
|
||||
|
||||
|
||||
class LTXVCropGuides(io.ComfyNode):
|
||||
@classmethod
|
||||
@@ -300,6 +305,8 @@ class LTXVCropGuides(io.ComfyNode):
|
||||
|
||||
return io.NodeOutput(positive, negative, {"samples": latent_image, "noise_mask": noise_mask})
|
||||
|
||||
crop = execute # TODO: remove
|
||||
|
||||
|
||||
class LTXVConditioning(io.ComfyNode):
|
||||
@classmethod
|
||||
@@ -498,6 +505,7 @@ class LTXVPreprocess(io.ComfyNode):
|
||||
output_images.append(preprocess(image[i], img_compression))
|
||||
return io.NodeOutput(torch.stack(output_images))
|
||||
|
||||
preprocess = execute # TODO: remove
|
||||
|
||||
class LtxvExtension(ComfyExtension):
|
||||
@override
|
||||
|
||||
@@ -1,3 +1,3 @@
|
||||
# This file is automatically generated by the build process when version is
|
||||
# updated in pyproject.toml.
|
||||
__version__ = "0.3.63"
|
||||
__version__ = "0.3.64"
|
||||
|
||||
1
nodes.py
1
nodes.py
@@ -2357,6 +2357,7 @@ async def init_builtin_api_nodes():
|
||||
"nodes_stability.py",
|
||||
"nodes_pika.py",
|
||||
"nodes_runway.py",
|
||||
"nodes_sora.py",
|
||||
"nodes_tripo.py",
|
||||
"nodes_moonvalley.py",
|
||||
"nodes_rodin.py",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[project]
|
||||
name = "ComfyUI"
|
||||
version = "0.3.63"
|
||||
version = "0.3.64"
|
||||
readme = "README.md"
|
||||
license = { file = "LICENSE" }
|
||||
requires-python = ">=3.9"
|
||||
@@ -57,18 +57,14 @@ messages_control.disable = [
|
||||
"redefined-builtin",
|
||||
"unnecessary-lambda",
|
||||
"dangerous-default-value",
|
||||
"invalid-overridden-method",
|
||||
# next warnings should be fixed in future
|
||||
"bad-classmethod-argument", # Class method should have 'cls' as first argument
|
||||
"wrong-import-order", # Standard imports should be placed before third party imports
|
||||
"logging-fstring-interpolation", # Use lazy % formatting in logging functions
|
||||
"ungrouped-imports",
|
||||
"unnecessary-pass",
|
||||
"unidiomatic-typecheck",
|
||||
"unnecessary-lambda-assignment",
|
||||
"no-else-return",
|
||||
"no-else-raise",
|
||||
"invalid-overridden-method",
|
||||
"unused-variable",
|
||||
"pointless-string-statement",
|
||||
"redefined-outer-name",
|
||||
]
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
comfyui-frontend-package==1.27.7
|
||||
comfyui-workflow-templates==0.1.93
|
||||
comfyui-frontend-package==1.27.10
|
||||
comfyui-workflow-templates==0.1.94
|
||||
comfyui-embedded-docs==0.2.6
|
||||
torch
|
||||
torchsde
|
||||
|
||||
Reference in New Issue
Block a user