Merge branch 'master' into worksplit-multigpu

2026-04-28 18:31:31 +00:00 · 2025-08-11 14:09:41 -07:00
parent b4f559b34d 2208aa616d
commit 962c3c832c
35 changed files with 3610 additions and 1192 deletions
--- a/comfy/ldm/qwen_image/model.py
+++ b/comfy/ldm/qwen_image/model.py
@@ -8,7 +8,7 @@ from einops import repeat
 from comfy.ldm.lightricks.model import TimestepEmbedding, Timesteps
 from comfy.ldm.modules.attention import optimized_attention_masked
 from comfy.ldm.flux.layers import EmbedND
-
+import comfy.ldm.common_dit

 class GELU(nn.Module):
    def __init__(self, dim_in: int, dim_out: int, approximate: str = "none", bias: bool = True, dtype=None, device=None, operations=None):
@@ -364,8 +364,9 @@ class QwenImageTransformer2DModel(nn.Module):

        image_rotary_emb = self.pos_embeds(x, context)

-        orig_shape = x.shape
-        hidden_states = x.view(orig_shape[0], orig_shape[1], orig_shape[-2] // 2, 2, orig_shape[-1] // 2, 2)
+        hidden_states = comfy.ldm.common_dit.pad_to_patch_size(x, (1, self.patch_size, self.patch_size))
+        orig_shape = hidden_states.shape
+        hidden_states = hidden_states.view(orig_shape[0], orig_shape[1], orig_shape[-2] // 2, 2, orig_shape[-1] // 2, 2)
        hidden_states = hidden_states.permute(0, 2, 4, 1, 3, 5)
        hidden_states = hidden_states.reshape(orig_shape[0], (orig_shape[-2] // 2) * (orig_shape[-1] // 2), orig_shape[1] * 4)

@@ -396,4 +397,4 @@ class QwenImageTransformer2DModel(nn.Module):

        hidden_states = hidden_states.view(orig_shape[0], orig_shape[-2] // 2, orig_shape[-1] // 2, orig_shape[1], 2, 2)
        hidden_states = hidden_states.permute(0, 3, 1, 4, 2, 5)
-        return hidden_states.reshape(orig_shape)
+        return hidden_states.reshape(orig_shape)[:, :, :, :x.shape[-2], :x.shape[-1]]
--- a/comfy/lora.py
+++ b/comfy/lora.py
@@ -293,6 +293,16 @@ def model_lora_keys_unet(model, key_map={}):
                key_lora = k[len("diffusion_model."):-len(".weight")]
                key_map["{}".format(key_lora)] = k

+    if isinstance(model, comfy.model_base.QwenImage):
+        for k in sdk:
+            if k.startswith("diffusion_model.") and k.endswith(".weight"): #QwenImage lora format
+                key_lora = k[len("diffusion_model."):-len(".weight")]
+                # Direct mapping for transformer_blocks format (QwenImage LoRA format)
+                key_map["{}".format(key_lora)] = k
+                # Support transformer prefix format
+                key_map["transformer.{}".format(key_lora)] = k
+                key_map["lycoris_{}".format(key_lora.replace(".", "_"))] = k #SimpleTuner lycoris format
+
    return key_map


--- a/comfy/model_management.py
+++ b/comfy/model_management.py
@@ -345,9 +345,9 @@ try:
            if torch_version_numeric >= (2, 7):  # works on 2.6 but doesn't actually seem to improve much
                if any((a in arch) for a in ["gfx90a", "gfx942", "gfx1100", "gfx1101", "gfx1151"]):  # TODO: more arches, TODO: gfx950
                    ENABLE_PYTORCH_ATTENTION = True
-            if torch_version_numeric >= (2, 8):
-                if any((a in arch) for a in ["gfx1201"]):
-                    ENABLE_PYTORCH_ATTENTION = True
+#            if torch_version_numeric >= (2, 8):
+#                if any((a in arch) for a in ["gfx1201"]):
+#                    ENABLE_PYTORCH_ATTENTION = True
        if torch_version_numeric >= (2, 7) and rocm_version >= (6, 4):
            if any((a in arch) for a in ["gfx1201", "gfx942", "gfx950"]):  # TODO: more arches
                SUPPORT_FP8_OPS = True
@@ -364,7 +364,7 @@ if ENABLE_PYTORCH_ATTENTION:

 PRIORITIZE_FP16 = False  # TODO: remove and replace with something that shows exactly which dtype is faster than the other
 try:
-    if is_nvidia() and PerformanceFeature.Fp16Accumulation in args.fast:
+    if (is_nvidia() or is_amd()) and PerformanceFeature.Fp16Accumulation in args.fast:
        torch.backends.cuda.matmul.allow_fp16_accumulation = True
        PRIORITIZE_FP16 = True  # TODO: limit to cards where it actually boosts performance
        logging.info("Enabled fp16 accumulation.")
--- a/comfy/supported_models.py
+++ b/comfy/supported_models.py
@@ -1237,7 +1237,7 @@ class QwenImage(supported_models_base.BASE):

    sampling_settings = {
        "multiplier": 1.0,
-        "shift": 2.6,
+        "shift": 1.15,
    }

    memory_usage_factor = 1.8 #TODO
--- a/comfy/weight_adapter/lora.py
+++ b/comfy/weight_adapter/lora.py
@@ -96,6 +96,7 @@ class LoRAAdapter(WeightAdapterBase):
        diffusers3_lora = "{}.lora.up.weight".format(x)
        mochi_lora = "{}.lora_B".format(x)
        transformers_lora = "{}.lora_linear_layer.up.weight".format(x)
+        qwen_default_lora = "{}.lora_B.default.weight".format(x)
        A_name = None

        if regular_lora in lora.keys():
@@ -122,6 +123,10 @@ class LoRAAdapter(WeightAdapterBase):
            A_name = transformers_lora
            B_name = "{}.lora_linear_layer.down.weight".format(x)
            mid_name = None
+        elif qwen_default_lora in lora.keys():
+            A_name = qwen_default_lora
+            B_name = "{}.lora_A.default.weight".format(x)
+            mid_name = None

        if A_name is not None:
            mid = None