mirror of
https://github.com/comfyanonymous/ComfyUI.git
synced 2026-03-13 09:10:12 +00:00
Compare commits
1 Commits
feat/model
...
ImageCropV
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
79cd9d09eb |
@@ -925,25 +925,6 @@ class Flux(BaseModel):
|
||||
out['ref_latents'] = list([1, 16, sum(map(lambda a: math.prod(a.size()[2:]), ref_latents))])
|
||||
return out
|
||||
|
||||
class LongCatImage(Flux):
    """Flux subclass that injects default RoPE shift options and omits guidance."""

    # Defaults applied only for keys the caller did not provide.
    _ROPE_DEFAULTS = (("shift_t", 1.0), ("shift_y", 512.0), ("shift_x", 512.0))

    def _apply_model(self, x, t, c_concat=None, c_crossattn=None, control=None, transformer_options={}, **kwargs):
        """Call the parent ``_apply_model`` with ``rope_options`` defaults filled in.

        Both ``transformer_options`` and its ``rope_options`` entry are copied
        before being modified, so the caller's dictionaries are left untouched.
        """
        patched_options = transformer_options.copy()
        rope_settings = dict(patched_options.get("rope_options", {}))
        for key, fallback in self._ROPE_DEFAULTS:
            rope_settings.setdefault(key, fallback)
        patched_options["rope_options"] = rope_settings
        return super()._apply_model(x, t, c_concat, c_crossattn, control, patched_options, **kwargs)

    def encode_adm(self, **kwargs):
        """Always return ``None``; no ADM conditioning is produced here."""
        return None

    def extra_conds(self, **kwargs):
        """Return the parent's extra conditioning with any ``guidance`` entry removed."""
        conds = super().extra_conds(**kwargs)
        conds.pop('guidance', None)
        return conds
|
||||
|
||||
class Flux2(Flux):
|
||||
def extra_conds(self, **kwargs):
|
||||
out = super().extra_conds(**kwargs)
|
||||
|
||||
@@ -279,8 +279,6 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
|
||||
dit_config["txt_norm"] = any_suffix_in(state_dict_keys, key_prefix, 'txt_norm.', ["weight", "scale"])
|
||||
if dit_config["yak_mlp"] and dit_config["txt_norm"]: # Ovis model
|
||||
dit_config["txt_ids_dims"] = [1, 2]
|
||||
if dit_config.get("context_in_dim") == 3584 and dit_config["vec_in_dim"] is None: # LongCat-Image
|
||||
dit_config["txt_ids_dims"] = [1, 2]
|
||||
|
||||
return dit_config
|
||||
|
||||
|
||||
@@ -60,7 +60,6 @@ import comfy.text_encoders.jina_clip_2
|
||||
import comfy.text_encoders.newbie
|
||||
import comfy.text_encoders.anima
|
||||
import comfy.text_encoders.ace15
|
||||
import comfy.text_encoders.longcat_image
|
||||
|
||||
import comfy.model_patcher
|
||||
import comfy.lora
|
||||
@@ -1161,7 +1160,6 @@ class CLIPType(Enum):
|
||||
KANDINSKY5_IMAGE = 23
|
||||
NEWBIE = 24
|
||||
FLUX2 = 25
|
||||
LONGCAT_IMAGE = 26
|
||||
|
||||
|
||||
def load_clip(ckpt_paths, embedding_directory=None, clip_type=CLIPType.STABLE_DIFFUSION, model_options={}):
|
||||
@@ -1374,9 +1372,6 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
|
||||
if clip_type == CLIPType.HUNYUAN_IMAGE:
|
||||
clip_target.clip = comfy.text_encoders.hunyuan_image.te(byt5=False, **llama_detect(clip_data))
|
||||
clip_target.tokenizer = comfy.text_encoders.hunyuan_image.HunyuanImageTokenizer
|
||||
elif clip_type == CLIPType.LONGCAT_IMAGE:
|
||||
clip_target.clip = comfy.text_encoders.longcat_image.te(**llama_detect(clip_data))
|
||||
clip_target.tokenizer = comfy.text_encoders.longcat_image.LongCatImageTokenizer
|
||||
else:
|
||||
clip_target.clip = comfy.text_encoders.qwen_image.te(**llama_detect(clip_data))
|
||||
clip_target.tokenizer = comfy.text_encoders.qwen_image.QwenImageTokenizer
|
||||
|
||||
@@ -25,7 +25,6 @@ import comfy.text_encoders.kandinsky5
|
||||
import comfy.text_encoders.z_image
|
||||
import comfy.text_encoders.anima
|
||||
import comfy.text_encoders.ace15
|
||||
import comfy.text_encoders.longcat_image
|
||||
|
||||
from . import supported_models_base
|
||||
from . import latent_formats
|
||||
@@ -1679,37 +1678,6 @@ class ACEStep15(supported_models_base.BASE):
|
||||
return supported_models_base.ClipTarget(comfy.text_encoders.ace15.ACE15Tokenizer, comfy.text_encoders.ace15.te(**detect))
|
||||
|
||||
|
||||
class LongCatImage(supported_models_base.BASE):
    """Model config for Flux-architecture checkpoints with a 3584-wide context input,
    no guidance embedding and no ``vec_in`` layer."""

    # Keys that the detected unet config must match for this entry to be selected.
    unet_config = {
        "image_model": "flux",
        "guidance_embed": False,
        "vec_in_dim": None,
        "context_in_dim": 3584,
        "txt_ids_dims": [1, 2],
    }

    sampling_settings = {}
    unet_extra_config = {}

    latent_format = latent_formats.Flux
    memory_usage_factor = 2.5
    supported_inference_dtypes = [torch.bfloat16, torch.float16, torch.float32]

    vae_key_prefix = ["vae."]
    text_encoder_key_prefix = ["text_encoders."]

    def get_model(self, state_dict, prefix="", device=None):
        """Build the ``model_base.LongCatImage`` wrapper for this config."""
        return model_base.LongCatImage(self, device=device)

    def clip_target(self, state_dict={}):
        """Return the tokenizer / text-encoder pair for this model.

        Text-encoder options are detected from the ``qwen25_7b.transformer.``
        keys found under the first text-encoder key prefix.
        """
        te_prefix = self.text_encoder_key_prefix[0]
        detected = comfy.text_encoders.hunyuan_video.llama_detect(
            state_dict, f"{te_prefix}qwen25_7b.transformer."
        )
        return supported_models_base.ClipTarget(
            comfy.text_encoders.longcat_image.LongCatImageTokenizer,
            comfy.text_encoders.longcat_image.te(**detected),
        )
|
||||
|
||||
models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, LongCatImage, FluxSchnell, GenmoMochi, LTXV, LTXAV, HunyuanVideo15_SR_Distilled, HunyuanVideo15, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, ZImage, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, WAN22_Animate, WAN21_FlowRVS, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, ACEStep15, Omnigen2, QwenImage, Flux2, Kandinsky5Image, Kandinsky5, Anima]
|
||||
models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, LTXAV, HunyuanVideo15_SR_Distilled, HunyuanVideo15, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, ZImage, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, WAN22_Animate, WAN21_FlowRVS, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, ACEStep15, Omnigen2, QwenImage, Flux2, Kandinsky5Image, Kandinsky5, Anima]
|
||||
|
||||
models += [SVD_img2vid]
|
||||
|
||||
@@ -328,14 +328,14 @@ class ACE15TEModel(torch.nn.Module):
|
||||
return getattr(self, self.lm_model).load_sd(sd)
|
||||
|
||||
def memory_estimation_function(self, token_weight_pairs, device=None):
    """Estimate the memory needed to run the language model on a prompt.

    Args:
        token_weight_pairs: mapping that may contain ``"lm_prompt"`` (a list of
            token sequences) and ``"lm_metadata"`` (a mapping that may carry
            ``"min_tokens"``). Missing entries are treated as empty.
        device: device passed to ``comfy.model_management.should_use_bf16``.

    Returns:
        ``num_tokens * constant * 1024 * 1024`` where ``constant`` is
        ``self.constant``, halved when bf16 should be used on ``device``.
        NOTE(review): presumably a byte count - confirm against the caller.
    """
    # Read metadata exactly once and defensively: a second unguarded lookup
    # would raise KeyError when "lm_metadata" is absent.
    lm_metadata = token_weight_pairs.get("lm_metadata", {})

    constant = self.constant
    if comfy.model_management.should_use_bf16(device):
        constant *= 0.5

    lm_prompt = token_weight_pairs.get("lm_prompt", [])
    num_tokens = sum(map(len, lm_prompt))
    # Add min_tokens a single time; adding it twice would over-estimate.
    num_tokens += lm_metadata.get("min_tokens", 0)
    return num_tokens * constant * 1024 * 1024
|
||||
|
||||
def te(dtype_llama=None, llama_quantization_metadata=None, lm_model="qwen3_2b"):
|
||||
|
||||
@@ -1,184 +0,0 @@
|
||||
import re
|
||||
import numbers
|
||||
import torch
|
||||
from comfy import sd1_clip
|
||||
from comfy.text_encoders.qwen_image import Qwen25_7BVLITokenizer, Qwen25_7BVLIModel
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Opening/closing quote characters recognised as delimiters of quoted text.
QUOTE_PAIRS = [("'", "'"), ('"', '"'), ("\u2018", "\u2019"), ("\u201c", "\u201d")]
# One alternative per pair: opening quote, shortest run without either quote
# character, closing quote. The pattern contains no capturing groups.
QUOTE_PATTERN = "|".join(
    [
        re.escape(q1) + r"[^" + re.escape(q1 + q2) + r"]*?" + re.escape(q2)
        for q1, q2 in QUOTE_PAIRS
    ]
)
# Words with an apostrophe between letters (e.g. "don't"); that apostrophe
# must not be treated as a quote delimiter.
WORD_INTERNAL_QUOTE_RE = re.compile(r"[a-zA-Z]+'[a-zA-Z]+")


def split_quotation(prompt):
    """Split *prompt* into quoted and unquoted segments.

    Words containing an internal apostrophe are temporarily replaced by
    placeholders so their apostrophe cannot open or close a quotation, then
    restored after splitting.

    Args:
        prompt: the text to split.

    Returns:
        A list of ``(text, is_quoted)`` tuples in prompt order. Quoted
        segments include their surrounding quote characters. Empty segments
        are omitted, so an empty prompt yields ``[]``.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order, making the
    # placeholder assignment deterministic (a set would not be).
    protected_words = dict.fromkeys(WORD_INTERNAL_QUOTE_RE.findall(prompt))

    mapping = []
    for i, word_src in enumerate(protected_words):
        # The index is delimited on both sides so that no placeholder is a
        # substring of another one; otherwise restoring one placeholder
        # would corrupt the others.
        word_tgt = f"longcat_$##${i}$##$_longcat"
        prompt = prompt.replace(word_src, word_tgt)
        mapping.append((word_src, word_tgt))

    # With exactly one capturing group, re.split alternates between
    # unquoted text (even indices) and quoted matches (odd indices).
    parts = re.split(f"({QUOTE_PATTERN})", prompt)

    result = []
    for index, part in enumerate(parts):
        for word_src, word_tgt in mapping:
            part = part.replace(word_tgt, word_src)
        if not part:
            continue
        # Classify by split position rather than re-matching the restored
        # text, whose apostrophes could otherwise look like a quotation.
        result.append((part, index % 2 == 1))
    return result
|
||||
|
||||
|
||||
class LongCatImageBaseTokenizer(Qwen25_7BVLITokenizer):
    """Tokenizer that encodes quoted prompt segments one character at a time."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.max_length = 512

    def tokenize_with_weights(self, text, return_word_ids=False, **kwargs):
        """Tokenize *text* into a single sequence of ``(token_id, 1.0)`` pairs.

        Unquoted segments are encoded whole; quoted segments are encoded
        character by character. The result is truncated to ``self.max_length``
        tokens (with a warning) and then padded via ``self.pad_tokens``.
        ``return_word_ids`` and extra keyword arguments are accepted but unused.
        """

        def encode(fragment):
            return self.tokenizer(fragment, add_special_tokens=False)["input_ids"]

        token_ids = []
        for segment, quoted in split_quotation(text):
            # Iterating a string yields its characters one by one.
            fragments = segment if quoted else (segment,)
            for fragment in fragments:
                token_ids.extend(encode(fragment))

        if len(token_ids) > self.max_length:
            token_ids = token_ids[: self.max_length]
            logger.warning(f"Truncated prompt to {self.max_length} tokens")

        weighted = [(token_id, 1.0) for token_id in token_ids]
        # Pad to max length
        self.pad_tokens(weighted, self.max_length - len(weighted))
        return [weighted]
|
||||
|
||||
|
||||
class LongCatImageTokenizer(sd1_clip.SD1Tokenizer):
    """SD1-style tokenizer wrapper that surrounds prompts with a chat template."""

    def __init__(self, embedding_directory=None, tokenizer_data={}):
        super().__init__(
            embedding_directory=embedding_directory,
            tokenizer_data=tokenizer_data,
            name="qwen25_7b",
            tokenizer=LongCatImageBaseTokenizer,
        )
        self.longcat_template_prefix = "<|im_start|>system\nAs an image captioning expert, generate a descriptive text prompt based on an image content, suitable for input to a text-to-image model.<|im_end|>\n<|im_start|>user\n"
        self.longcat_template_suffix = "<|im_end|>\n<|im_start|>assistant\n"

    def tokenize_with_weights(self, text, return_word_ids=False, **kwargs):
        """Tokenize *text*, wrapping it in the template unless it already has one.

        Text starting with ``<|im_start|>`` or ``<|start_header_id|>`` is passed
        to the parent tokenizer with ``disable_weights=True``. Otherwise the
        template prefix and suffix are tokenized separately and placed around
        the base tokenizer's output. An empty prompt is replaced by one space.

        Returns:
            The parent's result, or ``{"qwen25_7b": [combined_pairs]}``.
        """
        skip_template = text.startswith(("<|im_start|>", "<|start_header_id|>"))
        if text == "":
            text = " "

        base_tok = self.qwen25_7b
        if skip_template:
            return super().tokenize_with_weights(
                text, return_word_ids=return_word_ids, disable_weights=True, **kwargs
            )

        def template_pairs(template):
            ids = base_tok.tokenizer(template, add_special_tokens=False)["input_ids"]
            return [(token_id, 1.0) for token_id in ids]

        prefix_pairs = template_pairs(self.longcat_template_prefix)
        suffix_pairs = template_pairs(self.longcat_template_suffix)
        # Only the first sequence returned by the base tokenizer is used.
        prompt_pairs = base_tok.tokenize_with_weights(
            text, return_word_ids=return_word_ids, **kwargs
        )[0]

        return {"qwen25_7b": [prefix_pairs + prompt_pairs + suffix_pairs]}
|
||||
|
||||
|
||||
class LongCatImageTEModel(sd1_clip.SD1ClipModel):
    """Text-encoder wrapper that trims template tokens from the encoded output."""

    def __init__(self, device="cpu", dtype=None, model_options={}):
        super().__init__(
            device=device,
            dtype=dtype,
            name="qwen25_7b",
            clip_model=Qwen25_7BVLIModel,
            model_options=model_options,
        )

    def encode_token_weights(self, token_weight_pairs, template_end=-1):
        """Encode tokens, then cut the leading template and trailing suffix.

        Args:
            token_weight_pairs: mapping whose ``"qwen25_7b"`` entry holds the
                token/weight sequences; only the first sequence is inspected.
            template_end: index at which the kept output starts. When ``-1``
                it is derived from the positions of token id 151644.

        Returns:
            ``(out, pooled, extra)`` from the parent encoder with ``out`` (and
            ``extra["attention_mask"]`` when present) sliced along dim 1. A mask
            that ends up all ones is removed from ``extra``.
        """
        out, pooled, extra = super().encode_token_weights(token_weight_pairs)
        tok_pairs = token_weight_pairs["qwen25_7b"][0]

        def is_token_id(value, token_id):
            # Tensor entries (e.g. embeddings) are never treated as token ids.
            return (
                not torch.is_tensor(value)
                and isinstance(value, numbers.Integral)
                and value == token_id
            )

        def trim_mask(selector):
            # Apply the same slice to the mask; drop it once it is all ones.
            if "attention_mask" not in extra:
                return
            extra["attention_mask"] = extra["attention_mask"][selector]
            if extra["attention_mask"].sum() == torch.numel(extra["attention_mask"]):
                extra.pop("attention_mask")

        if template_end == -1:
            # Remember the position of the second occurrence of 151644
            # (presumably <|im_start|> - confirm), or the first if only one.
            seen = 0
            for position, pair in enumerate(tok_pairs):
                if seen < 2 and is_token_id(pair[0], 151644):
                    template_end = position
                    seen += 1

        # Skip two following tokens (ids 872 and 198; presumably "user" and a
        # newline - confirm) together with the marker itself.
        if (
            out.shape[1] > (template_end + 3)
            and tok_pairs[template_end + 1][0] == 872
            and tok_pairs[template_end + 2][0] == 198
        ):
            template_end += 3

        if template_end == -1:
            template_end = 0

        # Last occurrence of 151645 (presumably <|im_end|> - confirm).
        suffix_start = next(
            (
                position
                for position in range(len(tok_pairs) - 1, -1, -1)
                if is_token_id(tok_pairs[position][0], 151645)
            ),
            None,
        )

        out = out[:, template_end:]
        trim_mask((slice(None), slice(template_end, None)))

        if suffix_start is not None:
            suffix_len = len(tok_pairs) - suffix_start
            if suffix_len > 0 and out.shape[1] > suffix_len:
                out = out[:, :-suffix_len]
                trim_mask((slice(None), slice(None, -suffix_len)))

        return out, pooled, extra
|
||||
|
||||
|
||||
def te(dtype_llama=None, llama_quantization_metadata=None):
    """Return a ``LongCatImageTEModel`` subclass with fixed dtype/quantization.

    Args:
        dtype_llama: when not ``None``, replaces the ``dtype`` passed to the
            model constructor.
        llama_quantization_metadata: when not ``None``, stored under
            ``"quantization_metadata"`` in a copy of ``model_options``.
    """

    class LongCatImageTEModel_(LongCatImageTEModel):
        def __init__(self, device="cpu", dtype=None, model_options={}):
            if llama_quantization_metadata is not None:
                # Build a new dict so the caller's options are not mutated.
                model_options = {
                    **model_options,
                    "quantization_metadata": llama_quantization_metadata,
                }
            effective_dtype = dtype if dtype_llama is None else dtype_llama
            super().__init__(device=device, dtype=effective_dtype, model_options=model_options)

    return LongCatImageTEModel_
|
||||
@@ -65,6 +65,8 @@ class ImageCropV2(IO.ComfyNode):
|
||||
outputs=[IO.Image.Output()],
|
||||
)
|
||||
|
||||
OUTPUT_NODE = True
|
||||
|
||||
@classmethod
|
||||
def execute(cls, image, crop_region) -> IO.NodeOutput:
|
||||
x = crop_region.get("x", 0)
|
||||
|
||||
@@ -16,15 +16,15 @@ class AspectRatio(str, Enum):
|
||||
WIDESCREEN_V = "9:16 (Portrait Widescreen)"
|
||||
|
||||
|
||||
ASPECT_RATIOS: dict[AspectRatio, tuple[int, int]] = {
|
||||
AspectRatio.SQUARE: (1, 1),
|
||||
AspectRatio.PHOTO_H: (3, 2),
|
||||
AspectRatio.STANDARD_H: (4, 3),
|
||||
AspectRatio.WIDESCREEN_H: (16, 9),
|
||||
AspectRatio.ULTRAWIDE_H: (21, 9),
|
||||
AspectRatio.PHOTO_V: (2, 3),
|
||||
AspectRatio.STANDARD_V: (3, 4),
|
||||
AspectRatio.WIDESCREEN_V: (9, 16),
|
||||
ASPECT_RATIOS: dict[str, tuple[int, int]] = {
|
||||
"1:1 (Square)": (1, 1),
|
||||
"3:2 (Photo)": (3, 2),
|
||||
"4:3 (Standard)": (4, 3),
|
||||
"16:9 (Widescreen)": (16, 9),
|
||||
"21:9 (Ultrawide)": (21, 9),
|
||||
"2:3 (Portrait Photo)": (2, 3),
|
||||
"3:4 (Portrait Standard)": (3, 4),
|
||||
"9:16 (Portrait Widescreen)": (9, 16),
|
||||
}
|
||||
|
||||
|
||||
@@ -55,12 +55,8 @@ class ResolutionSelector(io.ComfyNode):
|
||||
),
|
||||
],
|
||||
outputs=[
|
||||
io.Int.Output(
|
||||
"width", tooltip="Calculated width in pixels (multiple of 8)."
|
||||
),
|
||||
io.Int.Output(
|
||||
"height", tooltip="Calculated height in pixels (multiple of 8)."
|
||||
),
|
||||
io.Int.Output("width", tooltip="Calculated width in pixels (multiple of 8)."),
|
||||
io.Int.Output("height", tooltip="Calculated height in pixels (multiple of 8)."),
|
||||
],
|
||||
)
|
||||
|
||||
|
||||
@@ -472,26 +472,6 @@ def get_save_image_path(filename_prefix: str, output_dir: str, image_width=0, im
|
||||
counter = 1
|
||||
return full_output_folder, filename, counter, subfolder, filename_prefix
|
||||
|
||||
def get_model_placeholder(folder_name: str) -> str:
    """Build the placeholder shown in a model dropdown that has no entries.

    Args:
        folder_name: model folder name (e.g., "checkpoints", "loras"); it is
            first passed through ``map_legacy``.

    Returns:
        A message pointing at the first configured path for the folder, or a
        generic message naming the folder when no path is known.
    """
    folder_name = map_legacy(folder_name)
    try:
        search_paths = get_folder_paths(folder_name)
    except KeyError:
        # Unknown folder names are treated as having no search paths.
        search_paths = []

    if not search_paths:
        return f"No models found for '{folder_name}'..."
    return f"No models found — add to: {search_paths[0]}"
|
||||
|
||||
|
||||
def get_input_subfolders() -> list[str]:
|
||||
"""Returns a list of all subfolder paths in the input directory, recursively.
|
||||
|
||||
|
||||
60
nodes.py
60
nodes.py
@@ -589,10 +589,7 @@ class CheckpointLoaderSimple:
|
||||
def INPUT_TYPES(s):
|
||||
return {
|
||||
"required": {
|
||||
"ckpt_name": (folder_paths.get_filename_list("checkpoints"), {
|
||||
"tooltip": "The name of the checkpoint (model) to load.",
|
||||
"placeholder": folder_paths.get_model_placeholder("checkpoints")
|
||||
}),
|
||||
"ckpt_name": (folder_paths.get_filename_list("checkpoints"), {"tooltip": "The name of the checkpoint (model) to load."}),
|
||||
}
|
||||
}
|
||||
RETURN_TYPES = ("MODEL", "CLIP", "VAE")
|
||||
@@ -642,9 +639,7 @@ class DiffusersLoader:
|
||||
class unCLIPCheckpointLoader:
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
return {"required": { "ckpt_name": (folder_paths.get_filename_list("checkpoints"), {
|
||||
"placeholder": folder_paths.get_model_placeholder("checkpoints")
|
||||
}),
|
||||
return {"required": { "ckpt_name": (folder_paths.get_filename_list("checkpoints"), ),
|
||||
}}
|
||||
RETURN_TYPES = ("MODEL", "CLIP", "VAE", "CLIP_VISION")
|
||||
FUNCTION = "load_checkpoint"
|
||||
@@ -684,10 +679,7 @@ class LoraLoader:
|
||||
"required": {
|
||||
"model": ("MODEL", {"tooltip": "The diffusion model the LoRA will be applied to."}),
|
||||
"clip": ("CLIP", {"tooltip": "The CLIP model the LoRA will be applied to."}),
|
||||
"lora_name": (folder_paths.get_filename_list("loras"), {
|
||||
"tooltip": "The name of the LoRA.",
|
||||
"placeholder": folder_paths.get_model_placeholder("loras")
|
||||
}),
|
||||
"lora_name": (folder_paths.get_filename_list("loras"), {"tooltip": "The name of the LoRA."}),
|
||||
"strength_model": ("FLOAT", {"default": 1.0, "min": -100.0, "max": 100.0, "step": 0.01, "tooltip": "How strongly to modify the diffusion model. This value can be negative."}),
|
||||
"strength_clip": ("FLOAT", {"default": 1.0, "min": -100.0, "max": 100.0, "step": 0.01, "tooltip": "How strongly to modify the CLIP model. This value can be negative."}),
|
||||
}
|
||||
@@ -724,9 +716,7 @@ class LoraLoaderModelOnly(LoraLoader):
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
return {"required": { "model": ("MODEL",),
|
||||
"lora_name": (folder_paths.get_filename_list("loras"), {
|
||||
"placeholder": folder_paths.get_model_placeholder("loras")
|
||||
}),
|
||||
"lora_name": (folder_paths.get_filename_list("loras"), ),
|
||||
"strength_model": ("FLOAT", {"default": 1.0, "min": -100.0, "max": 100.0, "step": 0.01}),
|
||||
}}
|
||||
RETURN_TYPES = ("MODEL",)
|
||||
@@ -816,9 +806,7 @@ class VAELoader:
|
||||
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
return {"required": { "vae_name": (s.vae_list(s), {
|
||||
"placeholder": folder_paths.get_model_placeholder("vae")
|
||||
})}}
|
||||
return {"required": { "vae_name": (s.vae_list(s), )}}
|
||||
RETURN_TYPES = ("VAE",)
|
||||
FUNCTION = "load_vae"
|
||||
|
||||
@@ -845,9 +833,7 @@ class VAELoader:
|
||||
class ControlNetLoader:
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
return {"required": { "control_net_name": (folder_paths.get_filename_list("controlnet"), {
|
||||
"placeholder": folder_paths.get_model_placeholder("controlnet")
|
||||
})}}
|
||||
return {"required": { "control_net_name": (folder_paths.get_filename_list("controlnet"), )}}
|
||||
|
||||
RETURN_TYPES = ("CONTROL_NET",)
|
||||
FUNCTION = "load_controlnet"
|
||||
@@ -866,9 +852,7 @@ class DiffControlNetLoader:
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
return {"required": { "model": ("MODEL",),
|
||||
"control_net_name": (folder_paths.get_filename_list("controlnet"), {
|
||||
"placeholder": folder_paths.get_model_placeholder("controlnet")
|
||||
})}}
|
||||
"control_net_name": (folder_paths.get_filename_list("controlnet"), )}}
|
||||
|
||||
RETURN_TYPES = ("CONTROL_NET",)
|
||||
FUNCTION = "load_controlnet"
|
||||
@@ -966,9 +950,7 @@ class ControlNetApplyAdvanced:
|
||||
class UNETLoader:
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
return {"required": { "unet_name": (folder_paths.get_filename_list("diffusion_models"), {
|
||||
"placeholder": folder_paths.get_model_placeholder("diffusion_models")
|
||||
}),
|
||||
return {"required": { "unet_name": (folder_paths.get_filename_list("diffusion_models"), ),
|
||||
"weight_dtype": (["default", "fp8_e4m3fn", "fp8_e4m3fn_fast", "fp8_e5m2"],)
|
||||
}}
|
||||
RETURN_TYPES = ("MODEL",)
|
||||
@@ -993,10 +975,8 @@ class UNETLoader:
|
||||
class CLIPLoader:
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
return {"required": { "clip_name": (folder_paths.get_filename_list("text_encoders"), {
|
||||
"placeholder": folder_paths.get_model_placeholder("text_encoders")
|
||||
}),
|
||||
"type": (["stable_diffusion", "stable_cascade", "sd3", "stable_audio", "mochi", "ltxv", "pixart", "cosmos", "lumina2", "wan", "hidream", "chroma", "ace", "omnigen2", "qwen_image", "hunyuan_image", "flux2", "ovis", "longcat_image"], ),
|
||||
return {"required": { "clip_name": (folder_paths.get_filename_list("text_encoders"), ),
|
||||
"type": (["stable_diffusion", "stable_cascade", "sd3", "stable_audio", "mochi", "ltxv", "pixart", "cosmos", "lumina2", "wan", "hidream", "chroma", "ace", "omnigen2", "qwen_image", "hunyuan_image", "flux2", "ovis"], ),
|
||||
},
|
||||
"optional": {
|
||||
"device": (["default", "cpu"], {"advanced": True}),
|
||||
@@ -1022,12 +1002,8 @@ class CLIPLoader:
|
||||
class DualCLIPLoader:
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
return {"required": { "clip_name1": (folder_paths.get_filename_list("text_encoders"), {
|
||||
"placeholder": folder_paths.get_model_placeholder("text_encoders")
|
||||
}),
|
||||
"clip_name2": (folder_paths.get_filename_list("text_encoders"), {
|
||||
"placeholder": folder_paths.get_model_placeholder("text_encoders")
|
||||
}),
|
||||
return {"required": { "clip_name1": (folder_paths.get_filename_list("text_encoders"), ),
|
||||
"clip_name2": (folder_paths.get_filename_list("text_encoders"), ),
|
||||
"type": (["sdxl", "sd3", "flux", "hunyuan_video", "hidream", "hunyuan_image", "hunyuan_video_15", "kandinsky5", "kandinsky5_image", "ltxv", "newbie", "ace"], ),
|
||||
},
|
||||
"optional": {
|
||||
@@ -1056,9 +1032,7 @@ class DualCLIPLoader:
|
||||
class CLIPVisionLoader:
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
return {"required": { "clip_name": (folder_paths.get_filename_list("clip_vision"), {
|
||||
"placeholder": folder_paths.get_model_placeholder("clip_vision")
|
||||
}),
|
||||
return {"required": { "clip_name": (folder_paths.get_filename_list("clip_vision"), ),
|
||||
}}
|
||||
RETURN_TYPES = ("CLIP_VISION",)
|
||||
FUNCTION = "load_clip"
|
||||
@@ -1094,9 +1068,7 @@ class CLIPVisionEncode:
|
||||
class StyleModelLoader:
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
return {"required": { "style_model_name": (folder_paths.get_filename_list("style_models"), {
|
||||
"placeholder": folder_paths.get_model_placeholder("style_models")
|
||||
})}}
|
||||
return {"required": { "style_model_name": (folder_paths.get_filename_list("style_models"), )}}
|
||||
|
||||
RETURN_TYPES = ("STYLE_MODEL",)
|
||||
FUNCTION = "load_style_model"
|
||||
@@ -1195,9 +1167,7 @@ class unCLIPConditioning:
|
||||
class GLIGENLoader:
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
return {"required": { "gligen_name": (folder_paths.get_filename_list("gligen"), {
|
||||
"placeholder": folder_paths.get_model_placeholder("gligen")
|
||||
})}}
|
||||
return {"required": { "gligen_name": (folder_paths.get_filename_list("gligen"), )}}
|
||||
|
||||
RETURN_TYPES = ("GLIGEN",)
|
||||
FUNCTION = "load_gligen"
|
||||
|
||||
@@ -31,4 +31,5 @@ spandrel
|
||||
pydantic~=2.0
|
||||
pydantic-settings~=2.0
|
||||
PyOpenGL
|
||||
PyOpenGL-accelerate
|
||||
glfw
|
||||
|
||||
@@ -1,112 +0,0 @@
|
||||
import torch
|
||||
|
||||
from comfy.model_detection import detect_unet_config, model_config_from_unet_config
|
||||
import comfy.supported_models
|
||||
|
||||
|
||||
def _make_longcat_comfyui_sd():
|
||||
"""Minimal ComfyUI-format state dict for pre-converted LongCat-Image weights."""
|
||||
sd = {}
|
||||
H = 32 # Reduce hidden state dimension to reduce memory usage
|
||||
C_IN = 16
|
||||
C_CTX = 3584
|
||||
|
||||
sd["img_in.weight"] = torch.empty(H, C_IN * 4)
|
||||
sd["img_in.bias"] = torch.empty(H)
|
||||
sd["txt_in.weight"] = torch.empty(H, C_CTX)
|
||||
sd["txt_in.bias"] = torch.empty(H)
|
||||
|
||||
sd["time_in.in_layer.weight"] = torch.empty(H, 256)
|
||||
sd["time_in.in_layer.bias"] = torch.empty(H)
|
||||
sd["time_in.out_layer.weight"] = torch.empty(H, H)
|
||||
sd["time_in.out_layer.bias"] = torch.empty(H)
|
||||
|
||||
sd["final_layer.adaLN_modulation.1.weight"] = torch.empty(2 * H, H)
|
||||
sd["final_layer.adaLN_modulation.1.bias"] = torch.empty(2 * H)
|
||||
sd["final_layer.linear.weight"] = torch.empty(C_IN * 4, H)
|
||||
sd["final_layer.linear.bias"] = torch.empty(C_IN * 4)
|
||||
|
||||
for i in range(19):
|
||||
sd[f"double_blocks.{i}.img_attn.norm.key_norm.weight"] = torch.empty(128)
|
||||
sd[f"double_blocks.{i}.img_attn.qkv.weight"] = torch.empty(3 * H, H)
|
||||
sd[f"double_blocks.{i}.img_mod.lin.weight"] = torch.empty(H, H)
|
||||
for i in range(38):
|
||||
sd[f"single_blocks.{i}.modulation.lin.weight"] = torch.empty(H, H)
|
||||
|
||||
return sd
|
||||
|
||||
|
||||
def _make_flux_schnell_comfyui_sd():
|
||||
"""Minimal ComfyUI-format state dict for standard Flux Schnell."""
|
||||
sd = {}
|
||||
H = 32 # Reduce hidden state dimension to reduce memory usage
|
||||
C_IN = 16
|
||||
|
||||
sd["img_in.weight"] = torch.empty(H, C_IN * 4)
|
||||
sd["img_in.bias"] = torch.empty(H)
|
||||
sd["txt_in.weight"] = torch.empty(H, 4096)
|
||||
sd["txt_in.bias"] = torch.empty(H)
|
||||
|
||||
sd["double_blocks.0.img_attn.norm.key_norm.weight"] = torch.empty(128)
|
||||
sd["double_blocks.0.img_attn.qkv.weight"] = torch.empty(3 * H, H)
|
||||
sd["double_blocks.0.img_mod.lin.weight"] = torch.empty(H, H)
|
||||
|
||||
for i in range(19):
|
||||
sd[f"double_blocks.{i}.img_attn.norm.key_norm.weight"] = torch.empty(128)
|
||||
for i in range(38):
|
||||
sd[f"single_blocks.{i}.modulation.lin.weight"] = torch.empty(H, H)
|
||||
|
||||
return sd
|
||||
|
||||
|
||||
class TestModelDetection:
    """Verify that first-match model detection selects the correct model
    based on list ordering and unet_config specificity."""

    def test_longcat_before_schnell_in_models_list(self):
        """LongCatImage must appear before FluxSchnell in the models list."""
        models = comfy.supported_models.models

        def position(class_name):
            return next(i for i, m in enumerate(models) if m.__name__ == class_name)

        longcat_idx = position("LongCatImage")
        schnell_idx = position("FluxSchnell")
        assert longcat_idx < schnell_idx, (
            f"LongCatImage (index {longcat_idx}) must come before "
            f"FluxSchnell (index {schnell_idx}) in the models list"
        )

    def test_longcat_comfyui_detected_as_longcat(self):
        """A LongCat state dict yields the LongCat unet config and model class."""
        state = _make_longcat_comfyui_sd()
        detected = detect_unet_config(state, "")
        assert detected is not None
        assert detected["image_model"] == "flux"
        assert detected["context_in_dim"] == 3584
        assert detected["vec_in_dim"] is None
        assert detected["guidance_embed"] is False
        assert detected["txt_ids_dims"] == [1, 2]

        config = model_config_from_unet_config(detected, state)
        assert config is not None
        assert type(config).__name__ == "LongCatImage"

    def test_longcat_comfyui_keys_pass_through_unchanged(self):
        """Pre-converted weights should not be transformed by process_unet_state_dict."""
        state = _make_longcat_comfyui_sd()
        config = model_config_from_unet_config(detect_unet_config(state, ""), state)

        # Process a shallow copy so the fixture dict itself is not modified.
        processed = config.process_unet_state_dict(dict(state))
        for key in (
            "img_in.weight",
            "txt_in.weight",
            "time_in.in_layer.weight",
            "final_layer.linear.weight",
        ):
            assert key in processed

    def test_flux_schnell_comfyui_detected_as_flux_schnell(self):
        """A Flux Schnell state dict is still detected as FluxSchnell."""
        state = _make_flux_schnell_comfyui_sd()
        detected = detect_unet_config(state, "")
        assert detected is not None
        assert detected["image_model"] == "flux"
        assert detected["context_in_dim"] == 4096
        assert detected["txt_ids_dims"] == []

        config = model_config_from_unet_config(detected, state)
        assert config is not None
        assert type(config).__name__ == "FluxSchnell"
|
||||
Reference in New Issue
Block a user