Basic implementation of z image fun control union 2.0 (#11304)

The inpaint part is currently missing and will be implemented later. I think they messed up this model pretty badly: they added some control_noise_refiner blocks but don't actually use them. There is a typo in their code, so instead of running control_noise_refiner -> control_layers, it runs the whole control_layers twice. Unfortunately, they trained with this typo, so the model works but is kind of slow, and it would probably perform a lot better if they corrected their code and trained it again.
2026-02-11 02:30:04 +00:00 · 2025-12-12 22:39:11 -08:00
parent c5a47a1692
commit da2bfb5b0a
4 changed files with 142 additions and 44 deletions
--- a/comfy_extras/nodes_model_patch.py
+++ b/comfy_extras/nodes_model_patch.py
@@ -243,7 +243,13 @@ class ModelPatchLoader:
            model = SigLIPMultiFeatProjModel(device=comfy.model_management.unet_offload_device(), dtype=dtype, operations=comfy.ops.manual_cast)
        elif 'control_all_x_embedder.2-1.weight' in sd: # alipai z image fun controlnet
            sd = z_image_convert(sd)
-            model = comfy.ldm.lumina.controlnet.ZImage_Control(device=comfy.model_management.unet_offload_device(), dtype=dtype, operations=comfy.ops.manual_cast)
+            config = {}
+            if 'control_layers.14.adaLN_modulation.0.weight' in sd:
+                config['n_control_layers'] = 15
+                config['additional_in_dim'] = 17
+                config['refiner_control'] = True
+                config['broken'] = True
+            model = comfy.ldm.lumina.controlnet.ZImage_Control(device=comfy.model_management.unet_offload_device(), dtype=dtype, operations=comfy.ops.manual_cast, **config)

        model.load_state_dict(sd)
        model = comfy.model_patcher.ModelPatcher(model, load_device=comfy.model_management.get_torch_device(), offload_device=comfy.model_management.unet_offload_device())
@@ -297,56 +303,86 @@ class DiffSynthCnetPatch:
        return [self.model_patch]

 class ZImageControlPatch:
-    def __init__(self, model_patch, vae, image, strength):
+    def __init__(self, model_patch, vae, image, strength, inpaint_image=None, mask=None):
        self.model_patch = model_patch
        self.vae = vae
        self.image = image
+        self.inpaint_image = inpaint_image
+        self.mask = mask
        self.strength = strength
        self.encoded_image = self.encode_latent_cond(image)
        self.encoded_image_size = (image.shape[1], image.shape[2])
        self.temp_data = None

-    def encode_latent_cond(self, image):
-        latent_image = comfy.latent_formats.Flux().process_in(self.vae.encode(image))
-        return latent_image
+    def encode_latent_cond(self, control_image, inpaint_image=None):
+        latent_image = comfy.latent_formats.Flux().process_in(self.vae.encode(control_image))
+        if self.model_patch.model.additional_in_dim > 0:
+            if self.mask is None:
+                mask_ = torch.zeros_like(latent_image)[:, :1]
+            else:
+                mask_ = comfy.utils.common_upscale(self.mask.mean(dim=1, keepdim=True), latent_image.shape[-1], latent_image.shape[-2], "bilinear", "none")
+            if inpaint_image is None:
+                inpaint_image = torch.ones_like(control_image) * 0.5
+
+            inpaint_image_latent = comfy.latent_formats.Flux().process_in(self.vae.encode(inpaint_image))
+
+            return torch.cat([latent_image, mask_, inpaint_image_latent], dim=1)
+        else:
+            return latent_image

    def __call__(self, kwargs):
        x = kwargs.get("x")
        img = kwargs.get("img")
+        img_input = kwargs.get("img_input")
        txt = kwargs.get("txt")
        pe = kwargs.get("pe")
        vec = kwargs.get("vec")
        block_index = kwargs.get("block_index")
+        block_type = kwargs.get("block_type", "")
        spacial_compression = self.vae.spacial_compression_encode()
        if self.encoded_image is None or self.encoded_image_size != (x.shape[-2] * spacial_compression, x.shape[-1] * spacial_compression):
            image_scaled = comfy.utils.common_upscale(self.image.movedim(-1, 1), x.shape[-1] * spacial_compression, x.shape[-2] * spacial_compression, "area", "center")
+            inpaint_scaled = None
+            if self.inpaint_image is not None:
+                inpaint_scaled = comfy.utils.common_upscale(self.inpaint_image.movedim(-1, 1), x.shape[-1] * spacial_compression, x.shape[-2] * spacial_compression, "area", "center").movedim(1, -1)
            loaded_models = comfy.model_management.loaded_models(only_currently_used=True)
-            self.encoded_image = self.encode_latent_cond(image_scaled.movedim(1, -1))
+            self.encoded_image = self.encode_latent_cond(image_scaled.movedim(1, -1), inpaint_scaled)
            self.encoded_image_size = (image_scaled.shape[-2], image_scaled.shape[-1])
            comfy.model_management.load_models_gpu(loaded_models)

-        cnet_index = (block_index // 5)
-        cnet_index_float = (block_index / 5)
+        cnet_blocks = self.model_patch.model.n_control_layers
+        div = round(30 / cnet_blocks)
+
+        cnet_index = (block_index // div)
+        cnet_index_float = (block_index / div)

        kwargs.pop("img")  # we do ops in place
        kwargs.pop("txt")

-        cnet_blocks = self.model_patch.model.n_control_layers
        if cnet_index_float > (cnet_blocks - 1):
            self.temp_data = None
            return kwargs

        if self.temp_data is None or self.temp_data[0] > cnet_index:
-            self.temp_data = (-1, (None, self.model_patch.model(txt, self.encoded_image.to(img.dtype), pe, vec)))
+            if block_type == "noise_refiner":
+                self.temp_data = (-3, (None, self.model_patch.model(txt, self.encoded_image.to(img.dtype), pe, vec)))
+            else:
+                self.temp_data = (-1, (None, self.model_patch.model(txt, self.encoded_image.to(img.dtype), pe, vec)))

-        while self.temp_data[0] < cnet_index and (self.temp_data[0] + 1) < cnet_blocks:
+        if block_type == "noise_refiner":
            next_layer = self.temp_data[0] + 1
-            self.temp_data = (next_layer, self.model_patch.model.forward_control_block(next_layer, self.temp_data[1][1], img[:, :self.temp_data[1][1].shape[1]], None, pe, vec))
+            self.temp_data = (next_layer, self.model_patch.model.forward_noise_refiner_block(block_index, self.temp_data[1][1], img_input[:, :self.temp_data[1][1].shape[1]], None, pe, vec))
+            if self.temp_data[1][0] is not None:
+                img[:, :self.temp_data[1][0].shape[1]] += (self.temp_data[1][0] * self.strength)
+        else:
+            while self.temp_data[0] < cnet_index and (self.temp_data[0] + 1) < cnet_blocks:
+                next_layer = self.temp_data[0] + 1
+                self.temp_data = (next_layer, self.model_patch.model.forward_control_block(next_layer, self.temp_data[1][1], img_input[:, :self.temp_data[1][1].shape[1]], None, pe, vec))

-        if cnet_index_float == self.temp_data[0]:
-            img[:, :self.temp_data[1][0].shape[1]] += (self.temp_data[1][0] * self.strength)
-            if cnet_blocks == self.temp_data[0] + 1:
-                self.temp_data = None
+            if cnet_index_float == self.temp_data[0]:
+                img[:, :self.temp_data[1][0].shape[1]] += (self.temp_data[1][0] * self.strength)
+                if cnet_blocks == self.temp_data[0] + 1:
+                    self.temp_data = None

        return kwargs

@@ -386,7 +422,9 @@ class QwenImageDiffsynthControlnet:
            mask = 1.0 - mask

        if isinstance(model_patch.model, comfy.ldm.lumina.controlnet.ZImage_Control):
-            model_patched.set_model_double_block_patch(ZImageControlPatch(model_patch, vae, image, strength))
+            patch = ZImageControlPatch(model_patch, vae, image, strength, mask=mask)
+            model_patched.set_model_noise_refiner_patch(patch)
+            model_patched.set_model_double_block_patch(patch)
        else:
            model_patched.set_model_double_block_patch(DiffSynthCnetPatch(model_patch, vae, image, strength, mask))
        return (model_patched,)