control rework

2026-04-30 11:11:15 +00:00 · 2024-08-02 21:29:51 -07:00
parent 9a449a1d98
commit e722991752
11 changed files with 314 additions and 324 deletions
--- a/backend/patcher/controlnet.py
+++ b/backend/patcher/controlnet.py
@@ -0,0 +1,505 @@
+import torch
+import math
+
+from backend.misc import image_resize
+from backend import memory_management, state_dict, utils
+from backend.nn.cnets import cldm, t2i_adapter
+from backend.patcher.base import ModelPatcher
+from backend.operations import using_forge_operations, ForgeOperationsWithManualCast, main_stream_worker, weights_manual_cast
+
+
+def compute_controlnet_weighting(control, cnet):
+    positive_advanced_weighting = getattr(cnet, 'positive_advanced_weighting', None)
+    negative_advanced_weighting = getattr(cnet, 'negative_advanced_weighting', None)
+    advanced_frame_weighting = getattr(cnet, 'advanced_frame_weighting', None)
+    advanced_sigma_weighting = getattr(cnet, 'advanced_sigma_weighting', None)
+    advanced_mask_weighting = getattr(cnet, 'advanced_mask_weighting', None)
+
+    transformer_options = cnet.transformer_options
+
+    if positive_advanced_weighting is None and negative_advanced_weighting is None \
+            and advanced_frame_weighting is None and advanced_sigma_weighting is None \
+            and advanced_mask_weighting is None:
+        return control
+
+    cond_or_uncond = transformer_options['cond_or_uncond']
+    sigmas = transformer_options['sigmas']
+    cond_mark = transformer_options['cond_mark']
+
+    if advanced_frame_weighting is not None:
+        advanced_frame_weighting = torch.Tensor(advanced_frame_weighting * len(cond_or_uncond)).to(sigmas)
+        assert advanced_frame_weighting.shape[0] == cond_mark.shape[0], \
+            'Frame weighting list length is different from batch size!'
+
+    if advanced_sigma_weighting is not None:
+        advanced_sigma_weighting = torch.cat([advanced_sigma_weighting(sigmas)] * len(cond_or_uncond))
+
+    for k, v in control.items():
+        for i in range(len(v)):
+            control_signal = control[k][i]
+
+            if not isinstance(control_signal, torch.Tensor):
+                continue
+
+            B, C, H, W = control_signal.shape
+
+            positive_weight = 1.0
+            negative_weight = 1.0
+            sigma_weight = 1.0
+            frame_weight = 1.0
+
+            if positive_advanced_weighting is not None:
+                positive_weight = get_at(positive_advanced_weighting.get(k, []), i, 1.0)
+
+            if negative_advanced_weighting is not None:
+                negative_weight = get_at(negative_advanced_weighting.get(k, []), i, 1.0)
+
+            if advanced_sigma_weighting is not None:
+                sigma_weight = advanced_sigma_weighting
+
+            if advanced_frame_weighting is not None:
+                frame_weight = advanced_frame_weighting
+
+            final_weight = positive_weight * (1.0 - cond_mark) + negative_weight * cond_mark
+            final_weight = final_weight * sigma_weight * frame_weight
+
+            if isinstance(advanced_mask_weighting, torch.Tensor):
+                if advanced_mask_weighting.shape[0] != 1:
+                    k_ = int(control_signal.shape[0] // advanced_mask_weighting.shape[0])
+                    if control_signal.shape[0] == k_ * advanced_mask_weighting.shape[0]:
+                        advanced_mask_weighting = advanced_mask_weighting.repeat(k_, 1, 1, 1)
+                control_signal = control_signal * torch.nn.functional.interpolate(advanced_mask_weighting.to(control_signal), size=(H, W), mode='bilinear')
+
+            control[k][i] = control_signal * final_weight[:, None, None, None]
+
+    return control
+
+
+def broadcast_image_to(tensor, target_batch_size, batched_number):
+    current_batch_size = tensor.shape[0]
+    if current_batch_size == 1:
+        return tensor
+
+    per_batch = target_batch_size // batched_number
+    tensor = tensor[:per_batch]
+
+    if per_batch > tensor.shape[0]:
+        tensor = torch.cat([tensor] * (per_batch // tensor.shape[0]) + [tensor[:(per_batch % tensor.shape[0])]], dim=0)
+
+    current_batch_size = tensor.shape[0]
+    if current_batch_size == target_batch_size:
+        return tensor
+    else:
+        return torch.cat([tensor] * batched_number, dim=0)
+
+
+def get_at(array, index, default=None):
+    return array[index] if 0 <= index < len(array) else default
+
+
+class ControlBase:
+    def __init__(self, device=None):
+        self.cond_hint_original = None
+        self.cond_hint = None
+        self.strength = 1.0
+        self.timestep_percent_range = (0.0, 1.0)
+        self.global_average_pooling = False
+        self.timestep_range = None
+        self.transformer_options = {}
+
+        if device is None:
+            device = memory_management.get_torch_device()
+        self.device = device
+        self.previous_controlnet = None
+
+    def set_cond_hint(self, cond_hint, strength=1.0, timestep_percent_range=(0.0, 1.0)):
+        self.cond_hint_original = cond_hint
+        self.strength = strength
+        self.timestep_percent_range = timestep_percent_range
+        return self
+
+    def pre_run(self, model, percent_to_timestep_function):
+        self.timestep_range = (percent_to_timestep_function(self.timestep_percent_range[0]), percent_to_timestep_function(self.timestep_percent_range[1]))
+        if self.previous_controlnet is not None:
+            self.previous_controlnet.pre_run(model, percent_to_timestep_function)
+
+    def set_previous_controlnet(self, controlnet):
+        self.previous_controlnet = controlnet
+        return self
+
+    def cleanup(self):
+        if self.previous_controlnet is not None:
+            self.previous_controlnet.cleanup()
+        if self.cond_hint is not None:
+            del self.cond_hint
+            self.cond_hint = None
+        self.timestep_range = None
+
+    def get_models(self):
+        out = []
+        if self.previous_controlnet is not None:
+            out += self.previous_controlnet.get_models()
+        return out
+
+    def copy_to(self, c):
+        c.cond_hint_original = self.cond_hint_original
+        c.strength = self.strength
+        c.timestep_percent_range = self.timestep_percent_range
+        c.global_average_pooling = self.global_average_pooling
+
+    def inference_memory_requirements(self, dtype):
+        if self.previous_controlnet is not None:
+            return self.previous_controlnet.inference_memory_requirements(dtype)
+        return 0
+
+    def control_merge(self, control_input, control_output, control_prev, output_dtype):
+        out = {'input': [], 'middle': [], 'output': []}
+
+        if control_input is not None:
+            for i in range(len(control_input)):
+                key = 'input'
+                x = control_input[i]
+                if x is not None:
+                    x *= self.strength
+                    if x.dtype != output_dtype:
+                        x = x.to(output_dtype)
+                out[key].insert(0, x)
+
+        if control_output is not None:
+            for i in range(len(control_output)):
+                if i == (len(control_output) - 1):
+                    key = 'middle'
+                    index = 0
+                else:
+                    key = 'output'
+                    index = i
+                x = control_output[i]
+                if x is not None:
+                    if self.global_average_pooling:
+                        x = torch.mean(x, dim=(2, 3), keepdim=True).repeat(1, 1, x.shape[2], x.shape[3])
+
+                    x *= self.strength
+                    if x.dtype != output_dtype:
+                        x = x.to(output_dtype)
+
+                out[key].append(x)
+
+        out = compute_controlnet_weighting(out, self)
+
+        if control_prev is not None:
+            for x in ['input', 'middle', 'output']:
+                o = out[x]
+                for i in range(len(control_prev[x])):
+                    prev_val = control_prev[x][i]
+                    if i >= len(o):
+                        o.append(prev_val)
+                    elif prev_val is not None:
+                        if o[i] is None:
+                            o[i] = prev_val
+                        else:
+                            if o[i].shape[0] < prev_val.shape[0]:
+                                o[i] = prev_val + o[i]
+                            else:
+                                o[i] += prev_val
+        return out
+
+
+class ControlNet(ControlBase):
+    def __init__(self, control_model, global_average_pooling=False, device=None, load_device=None, manual_cast_dtype=None):
+        super().__init__(device)
+        self.control_model = control_model
+        self.load_device = load_device
+        self.control_model_wrapped = ModelPatcher(self.control_model, load_device=load_device, offload_device=memory_management.unet_offload_device())
+        self.global_average_pooling = global_average_pooling
+        self.model_sampling_current = None
+        self.manual_cast_dtype = manual_cast_dtype
+
+    def get_control(self, x_noisy, t, cond, batched_number):
+        to = self.transformer_options
+
+        for conditioning_modifier in to.get('controlnet_conditioning_modifiers', []):
+            x_noisy, t, cond, batched_number = conditioning_modifier(self, x_noisy, t, cond, batched_number)
+
+        control_prev = None
+        if self.previous_controlnet is not None:
+            control_prev = self.previous_controlnet.get_control(x_noisy, t, cond, batched_number)
+
+        if self.timestep_range is not None:
+            if t[0] > self.timestep_range[0] or t[0] < self.timestep_range[1]:
+                if control_prev is not None:
+                    return control_prev
+                else:
+                    return None
+
+        dtype = self.control_model.dtype
+        if self.manual_cast_dtype is not None:
+            dtype = self.manual_cast_dtype
+
+        output_dtype = x_noisy.dtype
+        if self.cond_hint is None or x_noisy.shape[2] * 8 != self.cond_hint.shape[2] or x_noisy.shape[3] * 8 != self.cond_hint.shape[3]:
+            if self.cond_hint is not None:
+                del self.cond_hint
+            self.cond_hint = None
+            self.cond_hint = image_resize.adaptive_resize(self.cond_hint_original, x_noisy.shape[3] * 8, x_noisy.shape[2] * 8, 'nearest-exact', "center").to(dtype)
+        if x_noisy.shape[0] != self.cond_hint.shape[0]:
+            self.cond_hint = broadcast_image_to(self.cond_hint, x_noisy.shape[0], batched_number)
+
+        context = cond['c_crossattn']
+        y = cond.get('y', None)
+        if y is not None:
+            y = y.to(dtype)
+        timestep = self.model_sampling_current.timestep(t)
+        x_noisy = self.model_sampling_current.calculate_input(t, x_noisy)
+
+        controlnet_model_function_wrapper = to.get('controlnet_model_function_wrapper', None)
+
+        if controlnet_model_function_wrapper is not None:
+            wrapper_args = dict(x=x_noisy.to(dtype), hint=self.cond_hint, timesteps=timestep.float(),
+                                context=context.to(dtype), y=y)
+            wrapper_args['model'] = self
+            wrapper_args['inner_model'] = self.control_model
+            control = controlnet_model_function_wrapper(**wrapper_args)
+        else:
+            control = self.control_model(x=x_noisy.to(dtype), hint=self.cond_hint.to(self.device), timesteps=timestep.float(), context=context.to(dtype), y=y)
+        return self.control_merge(None, control, control_prev, output_dtype)
+
+    def copy(self):
+        c = ControlNet(self.control_model, global_average_pooling=self.global_average_pooling, load_device=self.load_device, manual_cast_dtype=self.manual_cast_dtype)
+        self.copy_to(c)
+        return c
+
+    def get_models(self):
+        out = super().get_models()
+        out.append(self.control_model_wrapped)
+        return out
+
+    def pre_run(self, model, percent_to_timestep_function):
+        super().pre_run(model, percent_to_timestep_function)
+        self.model_sampling_current = model.predictor
+
+    def cleanup(self):
+        self.model_sampling_current = None
+        super().cleanup()
+
+
+class ControlLoraOps(ForgeOperationsWithManualCast):
+    class Linear(torch.nn.Module):
+        def __init__(self, in_features: int, out_features: int, bias: bool = True, device=None, dtype=None) -> None:
+            super().__init__()
+            self.in_features = in_features
+            self.out_features = out_features
+            self.weight = None
+            self.up = None
+            self.down = None
+            self.bias = None
+
+        def forward(self, input):
+            weight, bias, signal = weights_manual_cast(self, input)
+            with main_stream_worker(weight, bias, signal):
+                if self.up is not None:
+                    return torch.nn.functional.linear(input, weight + (torch.mm(self.up.flatten(start_dim=1), self.down.flatten(start_dim=1))).reshape(self.weight.shape).type(input.dtype), bias)
+                else:
+                    return torch.nn.functional.linear(input, weight, bias)
+
+    class Conv2d(torch.nn.Module):
+        def __init__(
+                self,
+                in_channels,
+                out_channels,
+                kernel_size,
+                stride=1,
+                padding=0,
+                dilation=1,
+                groups=1,
+                bias=True,
+                padding_mode='zeros',
+                device=None,
+                dtype=None
+        ):
+            super().__init__()
+            self.in_channels = in_channels
+            self.out_channels = out_channels
+            self.kernel_size = kernel_size
+            self.stride = stride
+            self.padding = padding
+            self.dilation = dilation
+            self.transposed = False
+            self.output_padding = 0
+            self.groups = groups
+            self.padding_mode = padding_mode
+
+            self.weight = None
+            self.bias = None
+            self.up = None
+            self.down = None
+
+        def forward(self, input):
+            weight, bias, signal = weights_manual_cast(self, input)
+            with main_stream_worker(weight, bias, signal):
+                if self.up is not None:
+                    return torch.nn.functional.conv2d(input, weight + (torch.mm(self.up.flatten(start_dim=1), self.down.flatten(start_dim=1))).reshape(self.weight.shape).type(input.dtype), bias, self.stride, self.padding, self.dilation, self.groups)
+                else:
+                    return torch.nn.functional.conv2d(input, weight, bias, self.stride, self.padding, self.dilation, self.groups)
+
+
+class ControlLora(ControlNet):
+    def __init__(self, control_weights, global_average_pooling=False, device=None):
+        ControlBase.__init__(self, device)
+        self.control_weights = control_weights
+        self.global_average_pooling = global_average_pooling
+
+    def pre_run(self, model, percent_to_timestep_function):
+        super().pre_run(model, percent_to_timestep_function)
+        controlnet_config = model.diffusion_model.legacy_config.copy()
+        controlnet_config.pop("out_channels")
+        controlnet_config["hint_channels"] = self.control_weights["input_hint_block.0.weight"].shape[1]
+        controlnet_config["dtype"] = dtype = model.storage_dtype
+
+        self.manual_cast_dtype = model.computation_dtype
+
+        with using_forge_operations(operations=ControlLoraOps):
+            self.control_model = cldm.ControlNet(**controlnet_config)
+
+        self.control_model.to(device=memory_management.get_torch_device(), dtype=dtype)
+        diffusion_model = model.diffusion_model
+        sd = diffusion_model.state_dict()
+
+        for k in sd:
+            weight = sd[k]
+            try:
+                utils.set_attr(self.control_model, k, weight)
+            except:
+                pass
+
+        for k in self.control_weights:
+            if k not in {"lora_controlnet"}:
+                utils.set_attr(self.control_model, k, self.control_weights[k].to(dtype).to(memory_management.get_torch_device()))
+
+    def copy(self):
+        c = ControlLora(self.control_weights, global_average_pooling=self.global_average_pooling)
+        self.copy_to(c)
+        return c
+
+    def cleanup(self):
+        del self.control_model
+        self.control_model = None
+        super().cleanup()
+
+    def get_models(self):
+        out = ControlBase.get_models(self)
+        return out
+
+    def inference_memory_requirements(self, dtype):
+        return utils.calculate_parameters(self.control_weights) * memory_management.dtype_size(dtype) + ControlBase.inference_memory_requirements(self, dtype)
+
+
+class T2IAdapter(ControlBase):
+    def __init__(self, t2i_model, channels_in, device=None):
+        super().__init__(device)
+        self.t2i_model = t2i_model
+        self.channels_in = channels_in
+        self.control_input = None
+
+    def scale_image_to(self, width, height):
+        unshuffle_amount = self.t2i_model.unshuffle_amount
+        width = math.ceil(width / unshuffle_amount) * unshuffle_amount
+        height = math.ceil(height / unshuffle_amount) * unshuffle_amount
+        return width, height
+
+    def get_control(self, x_noisy, t, cond, batched_number):
+        to = self.transformer_options
+
+        for conditioning_modifier in to.get('controlnet_conditioning_modifiers', []):
+            x_noisy, t, cond, batched_number = conditioning_modifier(self, x_noisy, t, cond, batched_number)
+
+        control_prev = None
+        if self.previous_controlnet is not None:
+            control_prev = self.previous_controlnet.get_control(x_noisy, t, cond, batched_number)
+
+        if self.timestep_range is not None:
+            if t[0] > self.timestep_range[0] or t[0] < self.timestep_range[1]:
+                if control_prev is not None:
+                    return control_prev
+                else:
+                    return None
+
+        if self.cond_hint is None or x_noisy.shape[2] * 8 != self.cond_hint.shape[2] or x_noisy.shape[3] * 8 != self.cond_hint.shape[3]:
+            if self.cond_hint is not None:
+                del self.cond_hint
+            self.control_input = None
+            self.cond_hint = None
+            width, height = self.scale_image_to(x_noisy.shape[3] * 8, x_noisy.shape[2] * 8)
+            self.cond_hint = image_resize.adaptive_resize(self.cond_hint_original, width, height, 'nearest-exact', "center").float()
+            if self.channels_in == 1 and self.cond_hint.shape[1] > 1:
+                self.cond_hint = torch.mean(self.cond_hint, 1, keepdim=True)
+        if x_noisy.shape[0] != self.cond_hint.shape[0]:
+            self.cond_hint = broadcast_image_to(self.cond_hint, x_noisy.shape[0], batched_number)
+        if self.control_input is None:
+            self.t2i_model.to(x_noisy.dtype)
+            self.t2i_model.to(self.device)
+
+            controlnet_model_function_wrapper = to.get('controlnet_model_function_wrapper', None)
+
+            if controlnet_model_function_wrapper is not None:
+                wrapper_args = dict(hint=self.cond_hint.to(x_noisy.dtype))
+                wrapper_args['model'] = self
+                wrapper_args['inner_model'] = self.t2i_model
+                wrapper_args['inner_t2i_model'] = self.t2i_model
+                self.control_input = controlnet_model_function_wrapper(**wrapper_args)
+            else:
+                self.control_input = self.t2i_model(self.cond_hint.to(x_noisy))
+
+            self.t2i_model.cpu()
+
+        control_input = list(map(lambda a: None if a is None else a.clone(), self.control_input))
+        mid = None
+        if self.t2i_model.xl == True:
+            mid = control_input[-1:]
+            control_input = control_input[:-1]
+        return self.control_merge(control_input, mid, control_prev, x_noisy.dtype)
+
+    def copy(self):
+        c = T2IAdapter(self.t2i_model, self.channels_in)
+        self.copy_to(c)
+        return c
+
+
+def load_t2i_adapter(t2i_data):
+    if 'adapter' in t2i_data:
+        t2i_data = t2i_data['adapter']
+    if 'adapter.body.0.resnets.0.block1.weight' in t2i_data:  # diffusers format
+        prefix_replace = {}
+        for i in range(4):
+            for j in range(2):
+                prefix_replace["adapter.body.{}.resnets.{}.".format(i, j)] = "body.{}.".format(i * 2 + j)
+            prefix_replace["adapter.body.{}.".format(i, j)] = "body.{}.".format(i * 2)
+        prefix_replace["adapter."] = ""
+        t2i_data = state_dict.state_dict_prefix_replace(t2i_data, prefix_replace)
+    keys = t2i_data.keys()
+
+    if "body.0.in_conv.weight" in keys:
+        cin = t2i_data['body.0.in_conv.weight'].shape[1]
+        model_ad = t2i_adapter.Adapter_light(cin=cin, channels=[320, 640, 1280, 1280], nums_rb=4)
+    elif 'conv_in.weight' in keys:
+        cin = t2i_data['conv_in.weight'].shape[1]
+        channel = t2i_data['conv_in.weight'].shape[0]
+        ksize = t2i_data['body.0.block2.weight'].shape[2]
+        use_conv = False
+        down_opts = list(filter(lambda a: a.endswith("down_opt.op.weight"), keys))
+        if len(down_opts) > 0:
+            use_conv = True
+        xl = False
+        if cin == 256 or cin == 768:
+            xl = True
+        model_ad = t2i_adapter.Adapter(cin=cin, channels=[channel, channel * 2, channel * 4, channel * 4][:4], nums_rb=2, ksize=ksize, sk=True, use_conv=use_conv, xl=xl)
+    else:
+        return None
+
+    missing, unexpected = model_ad.load_state_dict(t2i_data)
+    if len(missing) > 0:
+        print("t2i missing", missing)
+
+    if len(unexpected) > 0:
+        print("t2i unexpected", unexpected)
+
+    return T2IAdapter(model_ad, model_ad.input_channels)