Merge branch 'main' into dev

Author: lllyasviel
Date: 2024-01-27 15:48:30 -08:00
Committed by: GitHub

66 changed files with 2746 additions and 2892 deletions

View File

@@ -2,8 +2,9 @@ import argparse
import json
import os
from modules.paths_internal import models_path, script_path, data_path, extensions_dir, extensions_builtin_dir, sd_default_config, sd_model_file # noqa: F401
from ldm_patched.modules import args_parser
parser = argparse.ArgumentParser()
parser = args_parser.parser
parser.add_argument("-f", action='store_true', help=argparse.SUPPRESS) # allows running as root; implemented outside of webui
parser.add_argument("--update-all-extensions", action='store_true', help="launch.py argument: download updates for all extensions when starting the program")

View File

@@ -2,7 +2,7 @@ import os
from modules import modelloader, errors
from modules.shared import cmd_opts, opts
from modules.upscaler import Upscaler, UpscalerData
from modules.upscaler import Upscaler, UpscalerData, prepare_free_memory
from modules.upscaler_utils import upscale_with_model
@@ -23,6 +23,7 @@ class UpscalerDAT(Upscaler):
self.scalers.append(model)
def do_upscale(self, img, path):
prepare_free_memory()
try:
info = self.load_model(path)
except Exception:
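
The same prepare_free_memory() guard is added to every upscaler's do_upscale() in this commit (DAT, ESRGAN, HAT, RealESRGAN). The real helper lives in modules/upscaler.py; the following is only a hedged sketch of what such a guard might look like, assuming it defers to ldm_patched's model management:

```python
from ldm_patched.modules import model_management

def prepare_free_memory_sketch(aggressive=False):
    """Hedged stand-in for modules.upscaler.prepare_free_memory (an assumption,
    not the actual implementation): make room on the compute device before an
    upscale model is loaded."""
    if aggressive:
        model_management.unload_all_models()   # drop every managed model
        return
    model_management.free_memory(
        3 * 1024 ** 3,                         # illustrative 3 GiB budget
        model_management.get_torch_device(),   # the active compute device
    )
```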

View File

@@ -4,7 +4,10 @@ import re
import torch
import numpy as np
from modules import modelloader, paths, deepbooru_model, devices, images, shared
from modules import modelloader, paths, deepbooru_model, images, shared
from ldm_patched.modules import model_management
from ldm_patched.modules.model_patcher import ModelPatcher
re_special = re.compile(r'([\\()])')
@@ -12,6 +15,14 @@ re_special = re.compile(r'([\\()])')
class DeepDanbooru:
def __init__(self):
self.model = None
self.load_device = model_management.text_encoder_device()
self.offload_device = model_management.text_encoder_offload_device()
self.dtype = torch.float32
if model_management.should_use_fp16(device=self.load_device):
self.dtype = torch.float16
self.patcher = None
def load(self):
if self.model is not None:
@@ -28,16 +39,16 @@ class DeepDanbooru:
self.model.load_state_dict(torch.load(files[0], map_location="cpu"))
self.model.eval()
self.model.to(devices.cpu, devices.dtype)
self.model.to(self.offload_device, self.dtype)
self.patcher = ModelPatcher(self.model, load_device=self.load_device, offload_device=self.offload_device)
def start(self):
self.load()
self.model.to(devices.device)
model_management.load_models_gpu([self.patcher])
def stop(self):
if not shared.opts.interrogate_keep_models_in_memory:
self.model.to(devices.cpu)
devices.torch_gc()
pass
def tag(self, pil_image):
self.start()
@@ -56,8 +67,8 @@ class DeepDanbooru:
pic = images.resize_image(2, pil_image.convert("RGB"), 512, 512)
a = np.expand_dims(np.array(pic, dtype=np.float32), 0) / 255
with torch.no_grad(), devices.autocast():
x = torch.from_numpy(a).to(devices.device)
with torch.no_grad():
x = torch.from_numpy(a).to(self.load_device, self.dtype)
y = self.model(x)[0].detach().cpu().numpy()
probability_dict = {}
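
The tagger now follows the ldm_patched memory-management pattern: keep the model parked on an offload device, wrap it in a ModelPatcher, and let load_models_gpu move it to the compute device only when it is actually used. A small sketch of that pattern with a stand-in module (the Linear layer is purely illustrative):

```python
import torch
from ldm_patched.modules import model_management
from ldm_patched.modules.model_patcher import ModelPatcher

load_device = model_management.text_encoder_device()
offload_device = model_management.text_encoder_offload_device()
dtype = torch.float16 if model_management.should_use_fp16(device=load_device) else torch.float32

model = torch.nn.Linear(8, 8)              # stand-in for the DeepDanbooru network
model.eval()
model.to(offload_device, dtype)            # park it on the offload device
patcher = ModelPatcher(model, load_device=load_device, offload_device=offload_device)

model_management.load_models_gpu([patcher])   # moved to the compute device on demand
with torch.no_grad():
    y = model(torch.zeros(1, 8, device=load_device, dtype=dtype))
print(y.shape)  # torch.Size([1, 8])
```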

View File

@@ -1,211 +1,89 @@
import sys
import contextlib
from functools import lru_cache
import torch
from modules import errors, shared
from modules import torch_utils
if sys.platform == "darwin":
from modules import mac_specific
if shared.cmd_opts.use_ipex:
from modules import xpu_specific
import ldm_patched.modules.model_management as model_management
def has_xpu() -> bool:
return shared.cmd_opts.use_ipex and xpu_specific.has_xpu
return model_management.xpu_available
def has_mps() -> bool:
if sys.platform != "darwin":
return False
else:
return mac_specific.has_mps
return model_management.mps_mode()
def cuda_no_autocast(device_id=None) -> bool:
if device_id is None:
device_id = get_cuda_device_id()
return (
torch.cuda.get_device_capability(device_id) == (7, 5)
and torch.cuda.get_device_name(device_id).startswith("NVIDIA GeForce GTX 16")
)
return False
def get_cuda_device_id():
return (
int(shared.cmd_opts.device_id)
if shared.cmd_opts.device_id is not None and shared.cmd_opts.device_id.isdigit()
else 0
) or torch.cuda.current_device()
return model_management.get_torch_device().index
def get_cuda_device_string():
if shared.cmd_opts.device_id is not None:
return f"cuda:{shared.cmd_opts.device_id}"
return "cuda"
return str(model_management.get_torch_device())
def get_optimal_device_name():
if torch.cuda.is_available():
return get_cuda_device_string()
if has_mps():
return "mps"
if has_xpu():
return xpu_specific.get_xpu_device_string()
return "cpu"
return model_management.get_torch_device().type
def get_optimal_device():
return torch.device(get_optimal_device_name())
return model_management.get_torch_device()
def get_device_for(task):
if task in shared.cmd_opts.use_cpu or "all" in shared.cmd_opts.use_cpu:
return cpu
return get_optimal_device()
def torch_gc():
if torch.cuda.is_available():
with torch.cuda.device(get_cuda_device_string()):
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
if has_mps():
mac_specific.torch_mps_gc()
if has_xpu():
xpu_specific.torch_xpu_gc()
model_management.soft_empty_cache()
def enable_tf32():
if torch.cuda.is_available():
return
# enabling benchmark option seems to enable a range of cards to do fp16 when they otherwise can't
# see https://github.com/AUTOMATIC1111/stable-diffusion-webui/pull/4407
if cuda_no_autocast():
torch.backends.cudnn.benchmark = True
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
errors.run(enable_tf32, "Enabling TF32")
cpu: torch.device = torch.device("cpu")
fp8: bool = False
device: torch.device = None
device_interrogate: torch.device = None
device_gfpgan: torch.device = None
device_esrgan: torch.device = None
device_codeformer: torch.device = None
dtype: torch.dtype = torch.float16
dtype_vae: torch.dtype = torch.float16
dtype_unet: torch.dtype = torch.float16
dtype_inference: torch.dtype = torch.float16
device: torch.device = model_management.get_torch_device()
device_interrogate: torch.device = cpu # not used
device_gfpgan: torch.device = cpu
device_esrgan: torch.device = model_management.get_torch_device() # will be managed in special way
device_codeformer: torch.device = cpu
dtype: torch.dtype = model_management.unet_dtype()
dtype_vae: torch.dtype = model_management.vae_dtype()
dtype_unet: torch.dtype = model_management.unet_dtype()
dtype_inference: torch.dtype = model_management.unet_dtype()
unet_needs_upcast = False
def cond_cast_unet(input):
return input.to(dtype_unet) if unet_needs_upcast else input
return input
def cond_cast_float(input):
return input.float() if unet_needs_upcast else input
return input
nv_rng = None
patch_module_list = [
torch.nn.Linear,
torch.nn.Conv2d,
torch.nn.MultiheadAttention,
torch.nn.GroupNorm,
torch.nn.LayerNorm,
]
patch_module_list = []
def manual_cast_forward(target_dtype):
def forward_wrapper(self, *args, **kwargs):
if any(
isinstance(arg, torch.Tensor) and arg.dtype != target_dtype
for arg in args
):
args = [arg.to(target_dtype) if isinstance(arg, torch.Tensor) else arg for arg in args]
kwargs = {k: v.to(target_dtype) if isinstance(v, torch.Tensor) else v for k, v in kwargs.items()}
org_dtype = torch_utils.get_param(self).dtype
if org_dtype != target_dtype:
self.to(target_dtype)
result = self.org_forward(*args, **kwargs)
if org_dtype != target_dtype:
self.to(org_dtype)
if target_dtype != dtype_inference:
if isinstance(result, tuple):
result = tuple(
i.to(dtype_inference)
if isinstance(i, torch.Tensor)
else i
for i in result
)
elif isinstance(result, torch.Tensor):
result = result.to(dtype_inference)
return result
return forward_wrapper
return
@contextlib.contextmanager
def manual_cast(target_dtype):
applied = False
for module_type in patch_module_list:
if hasattr(module_type, "org_forward"):
continue
applied = True
org_forward = module_type.forward
if module_type == torch.nn.MultiheadAttention and has_xpu():
module_type.forward = manual_cast_forward(torch.float32)
else:
module_type.forward = manual_cast_forward(target_dtype)
module_type.org_forward = org_forward
try:
yield None
finally:
if applied:
for module_type in patch_module_list:
if hasattr(module_type, "org_forward"):
module_type.forward = module_type.org_forward
delattr(module_type, "org_forward")
return
def autocast(disable=False):
if disable:
return contextlib.nullcontext()
if fp8 and device==cpu:
return torch.autocast("cpu", dtype=torch.bfloat16, enabled=True)
if fp8 and dtype_inference == torch.float32:
return manual_cast(dtype)
if dtype == torch.float32 or dtype_inference == torch.float32:
return contextlib.nullcontext()
if has_xpu() or has_mps() or cuda_no_autocast():
return manual_cast(dtype)
return torch.autocast("cuda")
return contextlib.nullcontext()
def without_autocast(disable=False):
return torch.autocast("cuda", enabled=False) if torch.is_autocast_enabled() and not disable else contextlib.nullcontext()
return contextlib.nullcontext()
class NansException(Exception):
@@ -213,43 +91,9 @@ class NansException(Exception):
def test_for_nans(x, where):
if shared.cmd_opts.disable_nan_check:
return
if not torch.all(torch.isnan(x)).item():
return
if where == "unet":
message = "A tensor with all NaNs was produced in Unet."
if not shared.cmd_opts.no_half:
message += " This could be either because there's not enough precision to represent the picture, or because your video card does not support half type. Try setting the \"Upcast cross attention layer to float32\" option in Settings > Stable Diffusion or using the --no-half commandline argument to fix this."
elif where == "vae":
message = "A tensor with all NaNs was produced in VAE."
if not shared.cmd_opts.no_half and not shared.cmd_opts.no_half_vae:
message += " This could be because there's not enough precision to represent the picture. Try adding --no-half-vae commandline argument to fix this."
else:
message = "A tensor with all NaNs was produced."
message += " Use --disable-nan-check commandline argument to disable this check."
raise NansException(message)
return
@lru_cache
def first_time_calculation():
"""
just do any calculation with pytorch layers - the first time this is done it allocates about 700MB of memory and
spends about 2.7 seconds doing that, at least with NVidia.
"""
x = torch.zeros((1, 1)).to(device, dtype)
linear = torch.nn.Linear(1, 1).to(device, dtype)
linear(x)
x = torch.zeros((1, 1, 3, 3)).to(device, dtype)
conv2d = torch.nn.Conv2d(1, 1, (3, 3)).to(device, dtype)
conv2d(x)
return
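
The rewritten devices module is now a thin facade over ldm_patched.modules.model_management: device choice, dtypes, garbage collection, and autocast all defer to the backend. A short usage sketch showing how existing call sites continue to work:

```python
from modules import devices

print(devices.get_optimal_device())   # same object as model_management.get_torch_device()
print(devices.dtype, devices.dtype_vae, devices.dtype_unet)

devices.torch_gc()                    # now forwards to model_management.soft_empty_cache()

with devices.autocast():              # a nullcontext unless manual casting is required
    pass
```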

View File

@@ -1,6 +1,6 @@
from modules import modelloader, devices, errors
from modules.shared import opts
from modules.upscaler import Upscaler, UpscalerData
from modules.upscaler import Upscaler, UpscalerData, prepare_free_memory
from modules.upscaler_utils import upscale_with_model
@@ -27,6 +27,7 @@ class UpscalerESRGAN(Upscaler):
self.scalers.append(scaler_data)
def do_upscale(self, img, selected_model):
prepare_free_memory()
try:
model = self.load_model(selected_model)
except Exception:

View File

@@ -3,7 +3,7 @@ import sys
from modules import modelloader, devices
from modules.shared import opts
from modules.upscaler import Upscaler, UpscalerData
from modules.upscaler import Upscaler, UpscalerData, prepare_free_memory
from modules.upscaler_utils import upscale_with_model
@@ -20,6 +20,7 @@ class UpscalerHAT(Upscaler):
self.scalers.append(scaler_data)
def do_upscale(self, img, selected_model):
prepare_free_memory()
try:
model = self.load_model(selected_model)
except Exception as e:

View File

@@ -12,6 +12,10 @@ def imports():
logging.getLogger("torch.distributed.nn").setLevel(logging.ERROR) # sshh...
logging.getLogger("xformers").addFilter(lambda record: 'A matching Triton is not available' not in record.getMessage())
from modules_forge.initialization import initialize_forge
initialize_forge()
startup_timer.record("initialize forge")
import torch # noqa: F401
startup_timer.record("import torch")
import pytorch_lightning # noqa: F401

View File

@@ -10,7 +10,10 @@ import torch.hub
from torchvision import transforms
from torchvision.transforms.functional import InterpolationMode
from modules import devices, paths, shared, lowvram, modelloader, errors, torch_utils
from modules import devices, paths, shared, modelloader, errors
from ldm_patched.modules import model_management
from ldm_patched.modules.model_patcher import ModelPatcher
blip_image_eval_size = 384
clip_model_name = 'ViT-L/14'
@@ -53,7 +56,16 @@ class InterrogateModels:
self.loaded_categories = None
self.skip_categories = []
self.content_dir = content_dir
self.running_on_cpu = devices.device_interrogate == torch.device("cpu")
self.load_device = model_management.text_encoder_device()
self.offload_device = model_management.text_encoder_offload_device()
self.dtype = torch.float32
if model_management.should_use_fp16(device=self.load_device):
self.dtype = torch.float16
self.blip_patcher = None
self.clip_patcher = None
def categories(self):
if not os.path.exists(self.content_dir):
@@ -105,49 +117,37 @@ class InterrogateModels:
def load_clip_model(self):
import clip
import clip.model
if self.running_on_cpu:
model, preprocess = clip.load(clip_model_name, device="cpu", download_root=shared.cmd_opts.clip_models_path)
else:
model, preprocess = clip.load(clip_model_name, download_root=shared.cmd_opts.clip_models_path)
clip.model.LayerNorm = torch.nn.LayerNorm
model, preprocess = clip.load(clip_model_name, device="cpu", download_root=shared.cmd_opts.clip_models_path)
model.eval()
model = model.to(devices.device_interrogate)
return model, preprocess
def load(self):
if self.blip_model is None:
self.blip_model = self.load_blip_model()
if not shared.cmd_opts.no_half and not self.running_on_cpu:
self.blip_model = self.blip_model.half()
self.blip_model = self.blip_model.to(devices.device_interrogate)
self.blip_model = self.blip_model.to(device=self.offload_device, dtype=self.dtype)
self.blip_patcher = ModelPatcher(self.blip_model, load_device=self.load_device, offload_device=self.offload_device)
if self.clip_model is None:
self.clip_model, self.clip_preprocess = self.load_clip_model()
if not shared.cmd_opts.no_half and not self.running_on_cpu:
self.clip_model = self.clip_model.half()
self.clip_model = self.clip_model.to(device=self.offload_device, dtype=self.dtype)
self.clip_patcher = ModelPatcher(self.clip_model, load_device=self.load_device, offload_device=self.offload_device)
self.clip_model = self.clip_model.to(devices.device_interrogate)
self.dtype = torch_utils.get_param(self.clip_model).dtype
model_management.load_models_gpu([self.blip_patcher, self.clip_patcher])
return
def send_clip_to_ram(self):
if not shared.opts.interrogate_keep_models_in_memory:
if self.clip_model is not None:
self.clip_model = self.clip_model.to(devices.cpu)
pass
def send_blip_to_ram(self):
if not shared.opts.interrogate_keep_models_in_memory:
if self.blip_model is not None:
self.blip_model = self.blip_model.to(devices.cpu)
pass
def unload(self):
self.send_clip_to_ram()
self.send_blip_to_ram()
devices.torch_gc()
pass
def rank(self, image_features, text_array, top_count=1):
import clip
@@ -158,11 +158,11 @@ class InterrogateModels:
text_array = text_array[0:int(shared.opts.interrogate_clip_dict_limit)]
top_count = min(top_count, len(text_array))
text_tokens = clip.tokenize(list(text_array), truncate=True).to(devices.device_interrogate)
text_tokens = clip.tokenize(list(text_array), truncate=True).to(self.load_device)
text_features = self.clip_model.encode_text(text_tokens).type(self.dtype)
text_features /= text_features.norm(dim=-1, keepdim=True)
similarity = torch.zeros((1, len(text_array))).to(devices.device_interrogate)
similarity = torch.zeros((1, len(text_array))).to(self.load_device)
for i in range(image_features.shape[0]):
similarity += (100.0 * image_features[i].unsqueeze(0) @ text_features.T).softmax(dim=-1)
similarity /= image_features.shape[0]
@@ -175,7 +175,7 @@ class InterrogateModels:
transforms.Resize((blip_image_eval_size, blip_image_eval_size), interpolation=InterpolationMode.BICUBIC),
transforms.ToTensor(),
transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
])(pil_image).unsqueeze(0).type(self.dtype).to(devices.device_interrogate)
])(pil_image).unsqueeze(0).type(self.dtype).to(self.load_device)
with torch.no_grad():
caption = self.blip_model.generate(gpu_image, sample=False, num_beams=shared.opts.interrogate_clip_num_beams, min_length=shared.opts.interrogate_clip_min_length, max_length=shared.opts.interrogate_clip_max_length)
@@ -186,9 +186,6 @@ class InterrogateModels:
res = ""
shared.state.begin(job="interrogate")
try:
lowvram.send_everything_to_cpu()
devices.torch_gc()
self.load()
caption = self.generate_caption(pil_image)
@@ -197,7 +194,7 @@ class InterrogateModels:
res = caption
clip_image = self.clip_preprocess(pil_image).unsqueeze(0).type(self.dtype).to(devices.device_interrogate)
clip_image = self.clip_preprocess(pil_image).unsqueeze(0).type(self.dtype).to(self.load_device)
with torch.no_grad(), devices.autocast():
image_features = self.clip_model.encode_image(clip_image).type(self.dtype)

View File

@@ -15,6 +15,7 @@ from modules import cmd_args, errors
from modules.paths_internal import script_path, extensions_dir
from modules.timer import startup_timer
from modules import logging_config
from modules_forge import forge_version
args, _ = cmd_args.parser.parse_known_args()
logging_config.setup_logging(args.loglevel)
@@ -70,7 +71,7 @@ def commit_hash():
@lru_cache()
def git_tag():
def git_tag_a1111():
try:
return subprocess.check_output([git, "-C", script_path, "describe", "--tags"], shell=False, encoding='utf8').strip()
except Exception:
@@ -85,6 +86,10 @@ def git_tag():
return "<none>"
def git_tag():
return 'f' + forge_version.version + '-' + git_tag_a1111()
def run(command, desc=None, errdesc=None, custom_env=None, live: bool = default_command_live) -> str:
if desc is not None:
print(desc)
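
git_tag() now reports the Forge version prefixed with 'f', joined to the upstream tag from git_tag_a1111(). A tiny illustration with assumed values (both strings are hypothetical):

```python
forge_version = "0.0.1"   # assumed value of modules_forge.forge_version.version
a1111_tag = "v1.7.0"      # assumed result of git_tag_a1111()

print('f' + forge_version + '-' + a1111_tag)   # -> f0.0.1-v1.7.0
```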

View File

@@ -6,142 +6,20 @@ cpu = torch.device("cpu")
def send_everything_to_cpu():
global module_in_gpu
if module_in_gpu is not None:
module_in_gpu.to(cpu)
module_in_gpu = None
return
def is_needed(sd_model):
return shared.cmd_opts.lowvram or shared.cmd_opts.medvram or shared.cmd_opts.medvram_sdxl and hasattr(sd_model, 'conditioner')
return False
def apply(sd_model):
enable = is_needed(sd_model)
shared.parallel_processing_allowed = not enable
if enable:
setup_for_low_vram(sd_model, not shared.cmd_opts.lowvram)
else:
sd_model.lowvram = False
return
def setup_for_low_vram(sd_model, use_medvram):
if getattr(sd_model, 'lowvram', False):
return
sd_model.lowvram = True
parents = {}
def send_me_to_gpu(module, _):
"""send this module to GPU; send whatever tracked module was previous in GPU to CPU;
we add this as forward_pre_hook to a lot of modules and this way all but one of them will
be in CPU
"""
global module_in_gpu
module = parents.get(module, module)
if module_in_gpu == module:
return
if module_in_gpu is not None:
module_in_gpu.to(cpu)
module.to(devices.device)
module_in_gpu = module
# see below for register_forward_pre_hook;
# first_stage_model does not use forward(), it uses encode/decode, so register_forward_pre_hook is
# useless here, and we just replace those methods
first_stage_model = sd_model.first_stage_model
first_stage_model_encode = sd_model.first_stage_model.encode
first_stage_model_decode = sd_model.first_stage_model.decode
def first_stage_model_encode_wrap(x):
send_me_to_gpu(first_stage_model, None)
return first_stage_model_encode(x)
def first_stage_model_decode_wrap(z):
send_me_to_gpu(first_stage_model, None)
return first_stage_model_decode(z)
to_remain_in_cpu = [
(sd_model, 'first_stage_model'),
(sd_model, 'depth_model'),
(sd_model, 'embedder'),
(sd_model, 'model'),
(sd_model, 'embedder'),
]
is_sdxl = hasattr(sd_model, 'conditioner')
is_sd2 = not is_sdxl and hasattr(sd_model.cond_stage_model, 'model')
if is_sdxl:
to_remain_in_cpu.append((sd_model, 'conditioner'))
elif is_sd2:
to_remain_in_cpu.append((sd_model.cond_stage_model, 'model'))
else:
to_remain_in_cpu.append((sd_model.cond_stage_model, 'transformer'))
# remove several big modules: cond, first_stage, depth/embedder (if applicable), and unet from the model
stored = []
for obj, field in to_remain_in_cpu:
module = getattr(obj, field, None)
stored.append(module)
setattr(obj, field, None)
# send the model to GPU.
sd_model.to(devices.device)
# put modules back. the modules will be in CPU.
for (obj, field), module in zip(to_remain_in_cpu, stored):
setattr(obj, field, module)
# register hooks for the first three models
if is_sdxl:
sd_model.conditioner.register_forward_pre_hook(send_me_to_gpu)
elif is_sd2:
sd_model.cond_stage_model.model.register_forward_pre_hook(send_me_to_gpu)
sd_model.cond_stage_model.model.token_embedding.register_forward_pre_hook(send_me_to_gpu)
parents[sd_model.cond_stage_model.model] = sd_model.cond_stage_model
parents[sd_model.cond_stage_model.model.token_embedding] = sd_model.cond_stage_model
else:
sd_model.cond_stage_model.transformer.register_forward_pre_hook(send_me_to_gpu)
parents[sd_model.cond_stage_model.transformer] = sd_model.cond_stage_model
sd_model.first_stage_model.register_forward_pre_hook(send_me_to_gpu)
sd_model.first_stage_model.encode = first_stage_model_encode_wrap
sd_model.first_stage_model.decode = first_stage_model_decode_wrap
if sd_model.depth_model:
sd_model.depth_model.register_forward_pre_hook(send_me_to_gpu)
if sd_model.embedder:
sd_model.embedder.register_forward_pre_hook(send_me_to_gpu)
if use_medvram:
sd_model.model.register_forward_pre_hook(send_me_to_gpu)
else:
diff_model = sd_model.model.diffusion_model
# the third remaining model is still too big for 4 GB, so we also do the same for its submodules
# so that only one of them is in GPU at a time
stored = diff_model.input_blocks, diff_model.middle_block, diff_model.output_blocks, diff_model.time_embed
diff_model.input_blocks, diff_model.middle_block, diff_model.output_blocks, diff_model.time_embed = None, None, None, None
sd_model.model.to(devices.device)
diff_model.input_blocks, diff_model.middle_block, diff_model.output_blocks, diff_model.time_embed = stored
# install hooks for bits of third model
diff_model.time_embed.register_forward_pre_hook(send_me_to_gpu)
for block in diff_model.input_blocks:
block.register_forward_pre_hook(send_me_to_gpu)
diff_model.middle_block.register_forward_pre_hook(send_me_to_gpu)
for block in diff_model.output_blocks:
block.register_forward_pre_hook(send_me_to_gpu)
return
def is_enabled(sd_model):
return sd_model.lowvram
return False
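
After this change modules/lowvram.py is effectively a set of stubs: module-by-module offloading is handled by the ldm_patched model manager, and the old entry points remain only so callers keep working. The resulting surface, as implied by the diff above:

```python
# Effective behaviour of the slimmed-down module (inferred from the diff, not copied verbatim).
def send_everything_to_cpu():
    return              # offloading is owned by ldm_patched model management now

def is_needed(sd_model):
    return False        # lowvram/medvram routing no longer happens here

def apply(sd_model):
    return              # no forward_pre_hook juggling anymore

def is_enabled(sd_model):
    return False        # callers that gate on lowvram take the normal path
```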

View File

@@ -627,44 +627,7 @@ def decode_latent_batch(model, batch, target_device=None, check_for_nans=False):
for i in range(batch.shape[0]):
sample = decode_first_stage(model, batch[i:i + 1])[0]
if check_for_nans:
try:
devices.test_for_nans(sample, "vae")
except devices.NansException as e:
if shared.opts.auto_vae_precision_bfloat16:
autofix_dtype = torch.bfloat16
autofix_dtype_text = "bfloat16"
autofix_dtype_setting = "Automatically convert VAE to bfloat16"
autofix_dtype_comment = ""
elif shared.opts.auto_vae_precision:
autofix_dtype = torch.float32
autofix_dtype_text = "32-bit float"
autofix_dtype_setting = "Automatically revert VAE to 32-bit floats"
autofix_dtype_comment = "\nTo always start with 32-bit VAE, use --no-half-vae commandline flag."
else:
raise e
if devices.dtype_vae == autofix_dtype:
raise e
errors.print_error_explanation(
"A tensor with all NaNs was produced in VAE.\n"
f"Web UI will now convert VAE into {autofix_dtype_text} and retry.\n"
f"To disable this behavior, disable the '{autofix_dtype_setting}' setting.{autofix_dtype_comment}"
)
devices.dtype_vae = autofix_dtype
model.first_stage_model.to(devices.dtype_vae)
batch = batch.to(devices.dtype_vae)
sample = decode_first_stage(model, batch[i:i + 1])[0]
if target_device is not None:
sample = sample.to(target_device)
samples.append(sample)
samples.append(sample.to(target_device))
return samples
@@ -847,7 +810,7 @@ def process_images_inner(p: StableDiffusionProcessing) -> Processed:
infotexts = []
output_images = []
with torch.no_grad(), p.sd_model.ema_scope():
with torch.no_grad():
with devices.autocast():
p.init(p.all_prompts, p.all_seeds, p.all_subseeds)
@@ -871,6 +834,7 @@ def process_images_inner(p: StableDiffusionProcessing) -> Processed:
sd_models.reload_model_weights() # model can be changed for example by refiner
p.sd_model.forge_objects = p.sd_model.forge_objects_original.shallow_copy()
p.prompts = p.all_prompts[n * p.batch_size:(n + 1) * p.batch_size]
p.negative_prompts = p.all_negative_prompts[n * p.batch_size:(n + 1) * p.batch_size]
p.seeds = p.all_seeds[n * p.batch_size:(n + 1) * p.batch_size]
@@ -887,8 +851,9 @@ def process_images_inner(p: StableDiffusionProcessing) -> Processed:
p.parse_extra_network_prompts()
if not p.disable_extra_networks:
with devices.autocast():
extra_networks.activate(p, p.extra_network_data)
extra_networks.activate(p, p.extra_network_data)
p.sd_model.forge_objects = p.sd_model.forge_objects_after_applying_lora.shallow_copy()
if p.scripts is not None:
p.scripts.process_batch(p, batch_number=n, prompts=p.prompts, seeds=p.seeds, subseeds=p.subseeds)
@@ -940,8 +905,7 @@ def process_images_inner(p: StableDiffusionProcessing) -> Processed:
p.extra_generation_params['Noise Schedule'] = opts.sd_noise_schedule
p.sd_model.alphas_cumprod = rescale_zero_terminal_snr_abar(p.sd_model.alphas_cumprod).to(shared.device)
with devices.without_autocast() if devices.unet_needs_upcast else devices.autocast():
samples_ddim = p.sample(conditioning=p.c, unconditional_conditioning=p.uc, seeds=p.seeds, subseeds=p.subseeds, subseed_strength=p.subseed_strength, prompts=p.prompts)
samples_ddim = p.sample(conditioning=p.c, unconditional_conditioning=p.uc, seeds=p.seeds, subseeds=p.subseeds, subseed_strength=p.subseed_strength, prompts=p.prompts)
if p.scripts is not None:
ps = scripts.PostSampleArgs(samples_ddim)
@@ -960,9 +924,6 @@ def process_images_inner(p: StableDiffusionProcessing) -> Processed:
del samples_ddim
if lowvram.is_enabled(shared.sd_model):
lowvram.send_everything_to_cpu()
devices.torch_gc()
state.nextjob()
@@ -1269,7 +1230,7 @@ class StableDiffusionProcessingTxt2Img(StableDiffusionProcessing):
image = np.array(self.firstpass_image).astype(np.float32) / 255.0
image = np.moveaxis(image, 2, 0)
image = torch.from_numpy(np.expand_dims(image, axis=0))
image = image.to(shared.device, dtype=devices.dtype_vae)
image = image.to(shared.device, dtype=torch.float32)
if opts.sd_vae_encode_method != 'Full':
self.extra_generation_params['VAE Encoder'] = opts.sd_vae_encode_method
@@ -1353,7 +1314,7 @@ class StableDiffusionProcessingTxt2Img(StableDiffusionProcessing):
batch_images.append(image)
decoded_samples = torch.from_numpy(np.array(batch_images))
decoded_samples = decoded_samples.to(shared.device, dtype=devices.dtype_vae)
decoded_samples = decoded_samples.to(shared.device, dtype=torch.float32)
if opts.sd_vae_encode_method != 'Full':
self.extra_generation_params['VAE Encoder'] = opts.sd_vae_encode_method
@@ -1458,7 +1419,7 @@ class StableDiffusionProcessingTxt2Img(StableDiffusionProcessing):
if shared.opts.hires_fix_use_firstpass_conds:
self.calculate_hr_conds()
elif lowvram.is_enabled(shared.sd_model) and shared.sd_model.sd_checkpoint_info == sd_models.select_checkpoint(): # if in lowvram mode, we need to calculate conds right away, before the cond NN is unloaded
elif shared.sd_model.sd_checkpoint_info == sd_models.select_checkpoint(): # if in lowvram mode, we need to calculate conds right away, before the cond NN is unloaded
with devices.autocast():
extra_networks.activate(self, self.hr_extra_network_data)
@@ -1645,7 +1606,7 @@ class StableDiffusionProcessingImg2Img(StableDiffusionProcessing):
raise RuntimeError(f"bad number of images passed: {len(imgs)}; expecting {self.batch_size} or less")
image = torch.from_numpy(batch_images)
image = image.to(shared.device, dtype=devices.dtype_vae)
image = image.to(shared.device, dtype=torch.float32)
if opts.sd_vae_encode_method != 'Full':
self.extra_generation_params['VAE Encoder'] = opts.sd_vae_encode_method
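
Images handed to the VAE encoder are now kept in float32 (previously devices.dtype_vae); the backend casts later as needed. A small runnable sketch of the tensor preparation used in the hires/img2img paths:

```python
import numpy as np
import torch

image = np.zeros((512, 512, 3), dtype=np.float32) / 255.0   # HWC image in [0, 1]
image = np.moveaxis(image, 2, 0)                             # HWC -> CHW
tensor = torch.from_numpy(np.expand_dims(image, axis=0))     # add batch dimension
device = "cuda" if torch.cuda.is_available() else "cpu"
tensor = tensor.to(device, dtype=torch.float32)              # float32, not dtype_vae
print(tensor.shape, tensor.dtype)  # torch.Size([1, 3, 512, 512]) torch.float32
```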

View File

@@ -276,6 +276,12 @@ class DictWithShape(dict):
def shape(self):
return self["crossattn"].shape
def to(self, *args, **kwargs):
for k in self.keys():
if isinstance(self[k], torch.Tensor):
self[k] = self[k].to(*args, **kwargs)
return self
def reconstruct_cond_batch(c: list[list[ScheduledPromptConditioning]], current_step):
param = c[0][0].cond
@@ -317,15 +323,32 @@ def stack_conds(tensors):
return torch.stack(tensors)
def stack_conds_alter(tensors, weights):
token_count = max([x.shape[0] for x in tensors])
for i in range(len(tensors)):
if tensors[i].shape[0] != token_count:
last_vector = tensors[i][-1:]
last_vector_repeated = last_vector.repeat([token_count - tensors[i].shape[0], 1])
tensors[i] = torch.vstack([tensors[i], last_vector_repeated])
result = 0
full_weights = 0
for x, w in zip(tensors, weights):
result = result + x * float(w)
full_weights = full_weights + float(w)
result = result / full_weights
return result
def reconstruct_multicond_batch(c: MulticondLearnedConditioning, current_step):
param = c.batch[0][0].schedules[0].cond
tensors = []
conds_list = []
results = []
for composable_prompts in c.batch:
conds_for_batch = []
tensors = []
weights = []
for composable_prompt in composable_prompts:
target_index = 0
@@ -334,19 +357,24 @@ def reconstruct_multicond_batch(c: MulticondLearnedConditioning, current_step):
target_index = current
break
conds_for_batch.append((len(tensors), composable_prompt.weight))
weights.append(composable_prompt.weight)
tensors.append(composable_prompt.schedules[target_index].cond)
conds_list.append(conds_for_batch)
if isinstance(tensors[0], dict):
weighted = {k: stack_conds_alter([x[k] for x in tensors], weights) for k in tensors[0].keys()}
else:
weighted = stack_conds_alter(tensors, weights)
if isinstance(tensors[0], dict):
keys = list(tensors[0].keys())
stacked = {k: stack_conds([x[k] for x in tensors]) for k in keys}
stacked = DictWithShape(stacked, stacked['crossattn'].shape)
results.append(weighted)
if isinstance(results[0], dict):
results = {k: torch.stack([x[k] for x in results])
for k in results[0].keys()}
results = DictWithShape(results, results['crossattn'].shape)
else:
stacked = stack_conds(tensors).to(device=param.device, dtype=param.dtype)
results = torch.stack(results).to(device=param.device, dtype=param.dtype)
return conds_list, stacked
return results
re_attention = re.compile(r"""
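
reconstruct_multicond_batch now pre-combines AND-composed prompts into a single weighted conditioning per image via stack_conds_alter, instead of returning a conds_list for the sampler to assemble. A usage sketch of the new helper (assuming it is imported from a Forge checkout; the tensor shapes are illustrative):

```python
import torch
from modules.prompt_parser import stack_conds_alter

cond_a = torch.randn(77, 768)    # 77-token prompt embedding
cond_b = torch.randn(154, 768)   # 154-token prompt embedding

# The shorter tensor is padded by repeating its last token row, then the
# prompts are averaged with their AND weights (normalised by the weight sum).
weighted = stack_conds_alter([cond_a, cond_b], [1.0, 0.5])
print(weighted.shape)  # torch.Size([154, 768])
```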

View File

@@ -2,7 +2,7 @@ import os
from modules import modelloader, errors
from modules.shared import cmd_opts, opts
from modules.upscaler import Upscaler, UpscalerData
from modules.upscaler import Upscaler, UpscalerData, prepare_free_memory
from modules.upscaler_utils import upscale_with_model
@@ -27,6 +27,8 @@ class UpscalerRealESRGAN(Upscaler):
self.scalers.append(scaler)
def do_upscale(self, img, path):
prepare_free_memory()
if not self.enable:
return img

View File

@@ -57,57 +57,11 @@ def list_optimizers():
def apply_optimizations(option=None):
global current_optimizer
undo_optimizations()
if len(optimizers) == 0:
# a script can access the model very early, and optimizations would not be filled by then
current_optimizer = None
return ''
ldm.modules.diffusionmodules.model.nonlinearity = silu
ldm.modules.diffusionmodules.openaimodel.th = sd_hijack_unet.th
sgm.modules.diffusionmodules.model.nonlinearity = silu
sgm.modules.diffusionmodules.openaimodel.th = sd_hijack_unet.th
if current_optimizer is not None:
current_optimizer.undo()
current_optimizer = None
selection = option or shared.opts.cross_attention_optimization
if selection == "Automatic" and len(optimizers) > 0:
matching_optimizer = next(iter([x for x in optimizers if x.cmd_opt and getattr(shared.cmd_opts, x.cmd_opt, False)]), optimizers[0])
else:
matching_optimizer = next(iter([x for x in optimizers if x.title() == selection]), None)
if selection == "None":
matching_optimizer = None
elif selection == "Automatic" and shared.cmd_opts.disable_opt_split_attention:
matching_optimizer = None
elif matching_optimizer is None:
matching_optimizer = optimizers[0]
if matching_optimizer is not None:
print(f"Applying attention optimization: {matching_optimizer.name}... ", end='')
matching_optimizer.apply()
print("done.")
current_optimizer = matching_optimizer
return current_optimizer.name
else:
print("Disabling attention optimization")
return ''
return
def undo_optimizations():
ldm.modules.diffusionmodules.model.nonlinearity = diffusionmodules_model_nonlinearity
ldm.modules.attention.CrossAttention.forward = hypernetwork.attention_CrossAttention_forward
ldm.modules.diffusionmodules.model.AttnBlock.forward = diffusionmodules_model_AttnBlock_forward
sgm.modules.diffusionmodules.model.nonlinearity = diffusionmodules_model_nonlinearity
sgm.modules.attention.CrossAttention.forward = hypernetwork.attention_CrossAttention_forward
sgm.modules.diffusionmodules.model.AttnBlock.forward = diffusionmodules_model_AttnBlock_forward
return
def fix_checkpoint():
@@ -182,131 +136,16 @@ class StableDiffusionModelHijack:
self.embedding_db.add_embedding_dir(cmd_opts.embeddings_dir)
def apply_optimizations(self, option=None):
try:
self.optimization_method = apply_optimizations(option)
except Exception as e:
errors.display(e, "applying cross attention optimization")
undo_optimizations()
pass
def convert_sdxl_to_ssd(self, m):
"""Converts an SDXL model to a Segmind Stable Diffusion model (see https://huggingface.co/segmind/SSD-1B)"""
delattr(m.model.diffusion_model.middle_block, '1')
delattr(m.model.diffusion_model.middle_block, '2')
for i in ['9', '8', '7', '6', '5', '4']:
delattr(m.model.diffusion_model.input_blocks[7][1].transformer_blocks, i)
delattr(m.model.diffusion_model.input_blocks[8][1].transformer_blocks, i)
delattr(m.model.diffusion_model.output_blocks[0][1].transformer_blocks, i)
delattr(m.model.diffusion_model.output_blocks[1][1].transformer_blocks, i)
delattr(m.model.diffusion_model.output_blocks[4][1].transformer_blocks, '1')
delattr(m.model.diffusion_model.output_blocks[5][1].transformer_blocks, '1')
devices.torch_gc()
pass
def hijack(self, m):
conditioner = getattr(m, 'conditioner', None)
if conditioner:
text_cond_models = []
for i in range(len(conditioner.embedders)):
embedder = conditioner.embedders[i]
typename = type(embedder).__name__
if typename == 'FrozenOpenCLIPEmbedder':
embedder.model.token_embedding = EmbeddingsWithFixes(embedder.model.token_embedding, self)
conditioner.embedders[i] = sd_hijack_open_clip.FrozenOpenCLIPEmbedderWithCustomWords(embedder, self)
text_cond_models.append(conditioner.embedders[i])
if typename == 'FrozenCLIPEmbedder':
model_embeddings = embedder.transformer.text_model.embeddings
model_embeddings.token_embedding = EmbeddingsWithFixes(model_embeddings.token_embedding, self)
conditioner.embedders[i] = sd_hijack_clip.FrozenCLIPEmbedderForSDXLWithCustomWords(embedder, self)
text_cond_models.append(conditioner.embedders[i])
if typename == 'FrozenOpenCLIPEmbedder2':
embedder.model.token_embedding = EmbeddingsWithFixes(embedder.model.token_embedding, self, textual_inversion_key='clip_g')
conditioner.embedders[i] = sd_hijack_open_clip.FrozenOpenCLIPEmbedder2WithCustomWords(embedder, self)
text_cond_models.append(conditioner.embedders[i])
if len(text_cond_models) == 1:
m.cond_stage_model = text_cond_models[0]
else:
m.cond_stage_model = conditioner
if type(m.cond_stage_model) == xlmr.BertSeriesModelWithTransformation or type(m.cond_stage_model) == xlmr_m18.BertSeriesModelWithTransformation:
model_embeddings = m.cond_stage_model.roberta.embeddings
model_embeddings.token_embedding = EmbeddingsWithFixes(model_embeddings.word_embeddings, self)
m.cond_stage_model = sd_hijack_xlmr.FrozenXLMREmbedderWithCustomWords(m.cond_stage_model, self)
elif type(m.cond_stage_model) == ldm.modules.encoders.modules.FrozenCLIPEmbedder:
model_embeddings = m.cond_stage_model.transformer.text_model.embeddings
model_embeddings.token_embedding = EmbeddingsWithFixes(model_embeddings.token_embedding, self)
m.cond_stage_model = sd_hijack_clip.FrozenCLIPEmbedderWithCustomWords(m.cond_stage_model, self)
elif type(m.cond_stage_model) == ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder:
m.cond_stage_model.model.token_embedding = EmbeddingsWithFixes(m.cond_stage_model.model.token_embedding, self)
m.cond_stage_model = sd_hijack_open_clip.FrozenOpenCLIPEmbedderWithCustomWords(m.cond_stage_model, self)
apply_weighted_forward(m)
if m.cond_stage_key == "edit":
sd_hijack_unet.hijack_ddpm_edit()
self.apply_optimizations()
self.clip = m.cond_stage_model
def flatten(el):
flattened = [flatten(children) for children in el.children()]
res = [el]
for c in flattened:
res += c
return res
self.layers = flatten(m)
import modules.models.diffusion.ddpm_edit
if isinstance(m, ldm.models.diffusion.ddpm.LatentDiffusion):
sd_unet.original_forward = ldm_original_forward
elif isinstance(m, modules.models.diffusion.ddpm_edit.LatentDiffusion):
sd_unet.original_forward = ldm_original_forward
elif isinstance(m, sgm.models.diffusion.DiffusionEngine):
sd_unet.original_forward = sgm_original_forward
else:
sd_unet.original_forward = None
pass
def undo_hijack(self, m):
conditioner = getattr(m, 'conditioner', None)
if conditioner:
for i in range(len(conditioner.embedders)):
embedder = conditioner.embedders[i]
if isinstance(embedder, (sd_hijack_open_clip.FrozenOpenCLIPEmbedderWithCustomWords, sd_hijack_open_clip.FrozenOpenCLIPEmbedder2WithCustomWords)):
embedder.wrapped.model.token_embedding = embedder.wrapped.model.token_embedding.wrapped
conditioner.embedders[i] = embedder.wrapped
if isinstance(embedder, sd_hijack_clip.FrozenCLIPEmbedderForSDXLWithCustomWords):
embedder.wrapped.transformer.text_model.embeddings.token_embedding = embedder.wrapped.transformer.text_model.embeddings.token_embedding.wrapped
conditioner.embedders[i] = embedder.wrapped
if hasattr(m, 'cond_stage_model'):
delattr(m, 'cond_stage_model')
elif type(m.cond_stage_model) == sd_hijack_xlmr.FrozenXLMREmbedderWithCustomWords:
m.cond_stage_model = m.cond_stage_model.wrapped
elif type(m.cond_stage_model) == sd_hijack_clip.FrozenCLIPEmbedderWithCustomWords:
m.cond_stage_model = m.cond_stage_model.wrapped
model_embeddings = m.cond_stage_model.transformer.text_model.embeddings
if type(model_embeddings.token_embedding) == EmbeddingsWithFixes:
model_embeddings.token_embedding = model_embeddings.token_embedding.wrapped
elif type(m.cond_stage_model) == sd_hijack_open_clip.FrozenOpenCLIPEmbedderWithCustomWords:
m.cond_stage_model.wrapped.model.token_embedding = m.cond_stage_model.wrapped.model.token_embedding.wrapped
m.cond_stage_model = m.cond_stage_model.wrapped
undo_optimizations()
undo_weighted_forward(m)
self.apply_circular(False)
self.layers = None
self.clip = None
pass
def apply_circular(self, enable):
if self.circular_enabled == enable:
@@ -321,17 +160,12 @@ class StableDiffusionModelHijack:
self.comments = []
self.extra_generation_params = {}
def get_prompt_lengths(self, text):
if self.clip is None:
return "-", "-"
_, token_count = self.clip.process_texts([text])
return token_count, self.clip.get_target_prompt_token_count(token_count)
def get_prompt_lengths(self, text, cond_stage_model):
_, token_count = cond_stage_model.process_texts([text])
return token_count, cond_stage_model.get_target_prompt_token_count(token_count)
def redo_hijack(self, m):
self.undo_hijack(m)
self.hijack(m)
pass
class EmbeddingsWithFixes(torch.nn.Module):

View File

@@ -10,6 +10,7 @@ from omegaconf import OmegaConf, ListConfig
from os import mkdir
from urllib import request
import ldm.modules.midas as midas
import gc
from ldm.util import instantiate_from_config
@@ -17,6 +18,12 @@ from modules import paths, shared, modelloader, devices, script_callbacks, sd_va
from modules.timer import Timer
import tomesd
import numpy as np
from modules_forge import forge_loader
import modules_forge.ops as forge_ops
from ldm_patched.modules.ops import manual_cast
from ldm_patched.modules import model_management as model_management
import ldm_patched.modules.model_patcher
model_dir = "Stable-diffusion"
model_path = os.path.abspath(os.path.join(paths.models_path, model_dir))
@@ -366,26 +373,12 @@ def load_model_weights(model, checkpoint_info: CheckpointInfo, state_dict, timer
sd_model_hash = checkpoint_info.calculate_shorthash()
timer.record("calculate hash")
if devices.fp8:
# prevent model to load state dict in fp8
model.half()
if not SkipWritingToConfig.skip:
shared.opts.data["sd_model_checkpoint"] = checkpoint_info.title
if state_dict is None:
state_dict = get_checkpoint_state_dict(checkpoint_info, timer)
model.is_sdxl = hasattr(model, 'conditioner')
model.is_sd2 = not model.is_sdxl and hasattr(model.cond_stage_model, 'model')
model.is_sd1 = not model.is_sdxl and not model.is_sd2
model.is_ssd = model.is_sdxl and 'model.diffusion_model.middle_block.1.transformer_blocks.0.attn1.to_q.weight' not in state_dict.keys()
if model.is_sdxl:
sd_models_xl.extend_sdxl(model)
if model.is_ssd:
sd_hijack.model_hijack.convert_sdxl_to_ssd(model)
if shared.opts.sd_checkpoint_cache > 0:
# cache newly loaded model
checkpoints_loaded[checkpoint_info] = state_dict.copy()
@@ -395,65 +388,6 @@ def load_model_weights(model, checkpoint_info: CheckpointInfo, state_dict, timer
del state_dict
if shared.cmd_opts.opt_channelslast:
model.to(memory_format=torch.channels_last)
timer.record("apply channels_last")
if shared.cmd_opts.no_half:
model.float()
model.alphas_cumprod_original = model.alphas_cumprod
devices.dtype_unet = torch.float32
timer.record("apply float()")
else:
vae = model.first_stage_model
depth_model = getattr(model, 'depth_model', None)
# with --no-half-vae, remove VAE from model when doing half() to prevent its weights from being converted to float16
if shared.cmd_opts.no_half_vae:
model.first_stage_model = None
# with --upcast-sampling, don't convert the depth model weights to float16
if shared.cmd_opts.upcast_sampling and depth_model:
model.depth_model = None
alphas_cumprod = model.alphas_cumprod
model.alphas_cumprod = None
model.half()
model.alphas_cumprod = alphas_cumprod
model.alphas_cumprod_original = alphas_cumprod
model.first_stage_model = vae
if depth_model:
model.depth_model = depth_model
devices.dtype_unet = torch.float16
timer.record("apply half()")
for module in model.modules():
if hasattr(module, 'fp16_weight'):
del module.fp16_weight
if hasattr(module, 'fp16_bias'):
del module.fp16_bias
if check_fp8(model):
devices.fp8 = True
first_stage = model.first_stage_model
model.first_stage_model = None
for module in model.modules():
if isinstance(module, (torch.nn.Conv2d, torch.nn.Linear)):
if shared.opts.cache_fp16_weight:
module.fp16_weight = module.weight.data.clone().cpu().half()
if module.bias is not None:
module.fp16_bias = module.bias.data.clone().cpu().half()
module.to(torch.float8_e4m3fn)
model.first_stage_model = first_stage
timer.record("apply fp8")
else:
devices.fp8 = False
devices.unet_needs_upcast = shared.cmd_opts.upcast_sampling and devices.dtype == torch.float16 and devices.dtype_unet == torch.float16
model.first_stage_model.to(devices.dtype_vae)
timer.record("apply dtype to VAE")
# clean up cache if limit is reached
while len(checkpoints_loaded) > shared.opts.sd_checkpoint_cache:
checkpoints_loaded.popitem(last=False)
@@ -590,14 +524,6 @@ class SdModelData:
sd_vae.loaded_vae_file = getattr(v, "loaded_vae_file", None)
sd_vae.checkpoint_info = v.sd_checkpoint_info
try:
self.loaded_sd_models.remove(v)
except ValueError:
pass
if v is not None:
self.loaded_sd_models.insert(0, v)
model_data = SdModelData()
@@ -615,31 +541,19 @@ def get_empty_cond(sd_model):
def send_model_to_cpu(m):
if m.lowvram:
lowvram.send_everything_to_cpu()
else:
m.to(devices.cpu)
devices.torch_gc()
pass
def model_target_device(m):
if lowvram.is_needed(m):
return devices.cpu
else:
return devices.device
return devices.device
def send_model_to_device(m):
lowvram.apply(m)
if not m.lowvram:
m.to(shared.device)
pass
def send_model_to_trash(m):
m.to(device="meta")
devices.torch_gc()
pass
def load_model(checkpoint_info=None, already_loaded_state_dict=None):
@@ -649,9 +563,14 @@ def load_model(checkpoint_info=None, already_loaded_state_dict=None):
timer = Timer()
if model_data.sd_model:
send_model_to_trash(model_data.sd_model)
if model_data.sd_model.filename == checkpoint_info.filename:
return model_data.sd_model
model_data.sd_model = None
devices.torch_gc()
model_data.loaded_sd_models = []
model_management.unload_all_models()
model_management.soft_empty_cache()
gc.collect()
timer.record("unload existing model")
@@ -660,58 +579,27 @@ def load_model(checkpoint_info=None, already_loaded_state_dict=None):
else:
state_dict = get_checkpoint_state_dict(checkpoint_info, timer)
checkpoint_config = sd_models_config.find_checkpoint_config(state_dict, checkpoint_info)
clip_is_included_into_sd = any(x for x in [sd1_clip_weight, sd2_clip_weight, sdxl_clip_weight, sdxl_refiner_clip_weight] if x in state_dict)
if shared.opts.sd_checkpoint_cache > 0:
# cache newly loaded model
checkpoints_loaded[checkpoint_info] = state_dict.copy()
timer.record("find config")
sd_model = forge_loader.load_model_for_a1111(timer=timer, checkpoint_info=checkpoint_info, state_dict=state_dict)
sd_model.filename = checkpoint_info.filename
sd_config = OmegaConf.load(checkpoint_config)
repair_config(sd_config)
del state_dict
timer.record("load config")
# clean up cache if limit is reached
while len(checkpoints_loaded) > shared.opts.sd_checkpoint_cache:
checkpoints_loaded.popitem(last=False)
print(f"Creating model from config: {checkpoint_config}")
shared.opts.data["sd_checkpoint_hash"] = checkpoint_info.sha256
sd_model = None
try:
with sd_disable_initialization.DisableInitialization(disable_clip=clip_is_included_into_sd or shared.cmd_opts.do_not_download_clip):
with sd_disable_initialization.InitializeOnMeta():
sd_model = instantiate_from_config(sd_config.model)
sd_vae.delete_base_vae()
sd_vae.clear_loaded_vae()
vae_file, vae_source = sd_vae.resolve_vae(checkpoint_info.filename).tuple()
sd_vae.load_vae(sd_model, vae_file, vae_source)
timer.record("load VAE")
except Exception as e:
errors.display(e, "creating model quickly", full_traceback=True)
if sd_model is None:
print('Failed to create model quickly; will retry using slow method.', file=sys.stderr)
with sd_disable_initialization.InitializeOnMeta():
sd_model = instantiate_from_config(sd_config.model)
sd_model.used_config = checkpoint_config
timer.record("create model")
if shared.cmd_opts.no_half:
weight_dtype_conversion = None
else:
weight_dtype_conversion = {
'first_stage_model': None,
'alphas_cumprod': None,
'': torch.float16,
}
with sd_disable_initialization.LoadStateDictOnMeta(state_dict, device=model_target_device(sd_model), weight_dtype_conversion=weight_dtype_conversion):
load_model_weights(sd_model, checkpoint_info, state_dict, timer)
timer.record("load weights from state dict")
send_model_to_device(sd_model)
timer.record("move model to device")
sd_hijack.model_hijack.hijack(sd_model)
timer.record("hijack")
sd_model.eval()
model_data.set_sd_model(sd_model)
model_data.was_loaded_at_least_once = True
@@ -723,7 +611,7 @@ def load_model(checkpoint_info=None, already_loaded_state_dict=None):
timer.record("scripts callbacks")
with devices.autocast(), torch.no_grad():
with torch.no_grad():
sd_model.cond_stage_model_empty_prompt = get_empty_cond(sd_model)
timer.record("calculate empty prompt")
@@ -734,132 +622,14 @@ def load_model(checkpoint_info=None, already_loaded_state_dict=None):
def reuse_model_from_already_loaded(sd_model, checkpoint_info, timer):
"""
Checks if the desired checkpoint from checkpoint_info is not already loaded in model_data.loaded_sd_models.
If it is loaded, returns that (moving it to GPU if necessary, and moving the currently loaded model to CPU if necessary).
If not, returns the model that can be used to load weights from checkpoint_info's file.
If no such model exists, returns None.
Additionally deletes loaded models that are over the limit set in settings (sd_checkpoints_limit).
"""
already_loaded = None
for i in reversed(range(len(model_data.loaded_sd_models))):
loaded_model = model_data.loaded_sd_models[i]
if loaded_model.sd_checkpoint_info.filename == checkpoint_info.filename:
already_loaded = loaded_model
continue
if len(model_data.loaded_sd_models) > shared.opts.sd_checkpoints_limit > 0:
print(f"Unloading model {len(model_data.loaded_sd_models)} over the limit of {shared.opts.sd_checkpoints_limit}: {loaded_model.sd_checkpoint_info.title}")
model_data.loaded_sd_models.pop()
send_model_to_trash(loaded_model)
timer.record("send model to trash")
if shared.opts.sd_checkpoints_keep_in_cpu:
send_model_to_cpu(sd_model)
timer.record("send model to cpu")
if already_loaded is not None:
send_model_to_device(already_loaded)
timer.record("send model to device")
model_data.set_sd_model(already_loaded, already_loaded=True)
if not SkipWritingToConfig.skip:
shared.opts.data["sd_model_checkpoint"] = already_loaded.sd_checkpoint_info.title
shared.opts.data["sd_checkpoint_hash"] = already_loaded.sd_checkpoint_info.sha256
print(f"Using already loaded model {already_loaded.sd_checkpoint_info.title}: done in {timer.summary()}")
sd_vae.reload_vae_weights(already_loaded)
return model_data.sd_model
elif shared.opts.sd_checkpoints_limit > 1 and len(model_data.loaded_sd_models) < shared.opts.sd_checkpoints_limit:
print(f"Loading model {checkpoint_info.title} ({len(model_data.loaded_sd_models) + 1} out of {shared.opts.sd_checkpoints_limit})")
model_data.sd_model = None
load_model(checkpoint_info)
return model_data.sd_model
elif len(model_data.loaded_sd_models) > 0:
sd_model = model_data.loaded_sd_models.pop()
model_data.sd_model = sd_model
sd_vae.base_vae = getattr(sd_model, "base_vae", None)
sd_vae.loaded_vae_file = getattr(sd_model, "loaded_vae_file", None)
sd_vae.checkpoint_info = sd_model.sd_checkpoint_info
print(f"Reusing loaded model {sd_model.sd_checkpoint_info.title} to load {checkpoint_info.title}")
return sd_model
else:
return None
pass
def reload_model_weights(sd_model=None, info=None, forced_reload=False):
checkpoint_info = info or select_checkpoint()
timer = Timer()
if not sd_model:
sd_model = model_data.sd_model
if sd_model is None: # previous model load failed
current_checkpoint_info = None
else:
current_checkpoint_info = sd_model.sd_checkpoint_info
if check_fp8(sd_model) != devices.fp8:
# load from state dict again to prevent extra numerical errors
forced_reload = True
elif sd_model.sd_model_checkpoint == checkpoint_info.filename and not forced_reload:
return sd_model
sd_model = reuse_model_from_already_loaded(sd_model, checkpoint_info, timer)
if not forced_reload and sd_model is not None and sd_model.sd_checkpoint_info.filename == checkpoint_info.filename:
return sd_model
if sd_model is not None:
sd_unet.apply_unet("None")
send_model_to_cpu(sd_model)
sd_hijack.model_hijack.undo_hijack(sd_model)
state_dict = get_checkpoint_state_dict(checkpoint_info, timer)
checkpoint_config = sd_models_config.find_checkpoint_config(state_dict, checkpoint_info)
timer.record("find config")
if sd_model is None or checkpoint_config != sd_model.used_config:
if sd_model is not None:
send_model_to_trash(sd_model)
load_model(checkpoint_info, already_loaded_state_dict=state_dict)
return model_data.sd_model
try:
load_model_weights(sd_model, checkpoint_info, state_dict, timer)
except Exception:
print("Failed to load checkpoint, restoring previous")
load_model_weights(sd_model, current_checkpoint_info, None, timer)
raise
finally:
sd_hijack.model_hijack.hijack(sd_model)
timer.record("hijack")
if not sd_model.lowvram:
sd_model.to(devices.device)
timer.record("move model to device")
script_callbacks.model_loaded_callback(sd_model)
timer.record("script callbacks")
print(f"Weights loaded in {timer.summary()}.")
model_data.set_sd_model(sd_model)
sd_unet.apply_unet()
return sd_model
return load_model(info)
def unload_model_weights(sd_model=None, info=None):
send_model_to_cpu(sd_model or shared.sd_model)
return sd_model
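
Before a new checkpoint is built, the previous one is now discarded through the backend rather than by juggling loaded_sd_models; multi-checkpoint caching is gone and reload_model_weights simply delegates to load_model. The unload sequence used by load_model, isolated from the diff:

```python
import gc
from ldm_patched.modules import model_management

model_management.unload_all_models()   # drop every managed model from the devices
model_management.soft_empty_cache()    # release cached allocator blocks
gc.collect()                           # then let Python reclaim whatever is left
```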

View File

@@ -8,8 +8,12 @@ import sgm.modules.diffusionmodules.discretizer
from modules import devices, shared, prompt_parser
from modules import torch_utils
import ldm_patched.modules.model_management as model_management
def get_learned_conditioning(self: sgm.models.diffusion.DiffusionEngine, batch: prompt_parser.SdConditioning | list[str]):
model_management.load_model_gpu(self.forge_objects.clip.patcher)
for embedder in self.conditioner.embedders:
embedder.ucg_rate = 0.0
@@ -18,7 +22,7 @@ def get_learned_conditioning(self: sgm.models.diffusion.DiffusionEngine, batch:
is_negative_prompt = getattr(batch, 'is_negative_prompt', False)
aesthetic_score = shared.opts.sdxl_refiner_low_aesthetic_score if is_negative_prompt else shared.opts.sdxl_refiner_high_aesthetic_score
devices_args = dict(device=devices.device, dtype=devices.dtype)
devices_args = dict(device=self.forge_objects.clip.patcher.current_device, dtype=model_management.text_encoder_dtype())
sdxl_conds = {
"txt": batch,
@@ -34,14 +38,11 @@ def get_learned_conditioning(self: sgm.models.diffusion.DiffusionEngine, batch:
return c
def apply_model(self: sgm.models.diffusion.DiffusionEngine, x, t, cond):
sd = self.model.state_dict()
diffusion_model_input = sd.get('diffusion_model.input_blocks.0.0.weight', None)
if diffusion_model_input is not None:
if diffusion_model_input.shape[1] == 9:
x = torch.cat([x] + cond['c_concat'], dim=1)
def apply_model(self: sgm.models.diffusion.DiffusionEngine, x, t, cond, *args, **kwargs):
if self.model.diffusion_model.in_channels == 9:
x = torch.cat([x] + cond['c_concat'], dim=1)
return self.model(x, t, cond)
return self.model(x, t, cond, *args, **kwargs)
def get_first_stage_encoding(self, x): # SDXL's encode_first_stage does everything so get_first_stage_encoding is just there for compatibility
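
apply_model now checks the UNet's input channel count directly to decide whether the inpainting condition must be concatenated, and forwards any extra args straight through. A small illustration of the 9-channel rule with a hypothetical stand-in for the diffusion model (the 4+4+1 channel breakdown is standard for SD inpainting UNets):

```python
import torch

class FakeUNet:
    in_channels = 9   # inpainting UNets: 4 latent + 4 masked-latent + 1 mask channels

x = torch.randn(1, 4, 64, 64)
cond = {"c_concat": [torch.randn(1, 5, 64, 64)]}   # masked-latent + mask channels

if FakeUNet.in_channels == 9:
    x = torch.cat([x] + cond["c_concat"], dim=1)
print(x.shape)  # torch.Size([1, 9, 64, 64])
```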

View File

@@ -6,6 +6,7 @@ import modules.shared as shared
from modules.script_callbacks import CFGDenoiserParams, cfg_denoiser_callback
from modules.script_callbacks import CFGDenoisedParams, cfg_denoised_callback
from modules.script_callbacks import AfterCFGCallbackParams, cfg_after_cfg_callback
from modules_forge import forge_sampler
def catenate_conds(conds):
@@ -66,7 +67,7 @@ class CFGDenoiser(torch.nn.Module):
def inner_model(self):
raise NotImplementedError()
def combine_denoised(self, x_out, conds_list, uncond, cond_scale):
def combine_denoised(self, x_out, conds_list, uncond, cond_scale, timestep, x_in, cond):
denoised_uncond = x_out[-uncond.shape[0]:]
denoised = torch.clone(denoised_uncond)
@@ -152,19 +153,13 @@ class CFGDenoiser(torch.nn.Module):
if state.interrupted or state.skipped:
raise sd_samplers_common.InterruptedException
if sd_samplers_common.apply_refiner(self):
if sd_samplers_common.apply_refiner(self, x):
cond = self.sampler.sampler_extra_args['cond']
uncond = self.sampler.sampler_extra_args['uncond']
# at self.image_cfg_scale == 1.0, results produced for the edit model are the same as with normal sampling,
# so is_edit_model is set to False to support AND composition.
is_edit_model = shared.sd_model.cond_stage_key == "edit" and self.image_cfg_scale is not None and self.image_cfg_scale != 1.0
conds_list, tensor = prompt_parser.reconstruct_multicond_batch(cond, self.step)
cond = prompt_parser.reconstruct_multicond_batch(cond, self.step)
uncond = prompt_parser.reconstruct_cond_batch(uncond, self.step)
assert not is_edit_model or all(len(conds) == 1 for conds in conds_list), "AND is not supported for InstructPix2Pix checkpoint (unless using Image CFG scale = 1.0)"
# If we use masks, blending between the denoised and original latent images occurs here.
def apply_blend(current_latent):
blended_latent = current_latent * self.nmask + self.init_latent * self.mask
@@ -181,113 +176,12 @@ class CFGDenoiser(torch.nn.Module):
if self.mask_before_denoising and self.mask is not None:
x = apply_blend(x)
batch_size = len(conds_list)
repeats = [len(conds_list[i]) for i in range(batch_size)]
if shared.sd_model.model.conditioning_key == "crossattn-adm":
image_uncond = torch.zeros_like(image_cond)
make_condition_dict = lambda c_crossattn, c_adm: {"c_crossattn": [c_crossattn], "c_adm": c_adm}
else:
image_uncond = image_cond
if isinstance(uncond, dict):
make_condition_dict = lambda c_crossattn, c_concat: {**c_crossattn, "c_concat": [c_concat]}
else:
make_condition_dict = lambda c_crossattn, c_concat: {"c_crossattn": [c_crossattn], "c_concat": [c_concat]}
if not is_edit_model:
x_in = torch.cat([torch.stack([x[i] for _ in range(n)]) for i, n in enumerate(repeats)] + [x])
sigma_in = torch.cat([torch.stack([sigma[i] for _ in range(n)]) for i, n in enumerate(repeats)] + [sigma])
image_cond_in = torch.cat([torch.stack([image_cond[i] for _ in range(n)]) for i, n in enumerate(repeats)] + [image_uncond])
else:
x_in = torch.cat([torch.stack([x[i] for _ in range(n)]) for i, n in enumerate(repeats)] + [x] + [x])
sigma_in = torch.cat([torch.stack([sigma[i] for _ in range(n)]) for i, n in enumerate(repeats)] + [sigma] + [sigma])
image_cond_in = torch.cat([torch.stack([image_cond[i] for _ in range(n)]) for i, n in enumerate(repeats)] + [image_uncond] + [torch.zeros_like(self.init_latent)])
denoiser_params = CFGDenoiserParams(x_in, image_cond_in, sigma_in, state.sampling_step, state.sampling_steps, tensor, uncond, self)
denoiser_params = CFGDenoiserParams(x, image_cond, sigma, state.sampling_step, state.sampling_steps, cond, uncond, self)
cfg_denoiser_callback(denoiser_params)
x_in = denoiser_params.x
image_cond_in = denoiser_params.image_cond
sigma_in = denoiser_params.sigma
tensor = denoiser_params.text_cond
uncond = denoiser_params.text_uncond
skip_uncond = False
# alternating uncond allows for higher thresholds without the quality loss normally expected from raising it
if self.step % 2 and s_min_uncond > 0 and sigma[0] < s_min_uncond and not is_edit_model:
skip_uncond = True
x_in = x_in[:-batch_size]
sigma_in = sigma_in[:-batch_size]
self.padded_cond_uncond = False
self.padded_cond_uncond_v0 = False
if shared.opts.pad_cond_uncond and tensor.shape[1] != uncond.shape[1]:
tensor, uncond = self.pad_cond_uncond(tensor, uncond)
elif shared.opts.pad_cond_uncond_v0 and tensor.shape[1] != uncond.shape[1]:
tensor, uncond = self.pad_cond_uncond_v0(tensor, uncond)
if tensor.shape[1] == uncond.shape[1] or skip_uncond:
if is_edit_model:
cond_in = catenate_conds([tensor, uncond, uncond])
elif skip_uncond:
cond_in = tensor
else:
cond_in = catenate_conds([tensor, uncond])
if shared.opts.batch_cond_uncond:
x_out = self.inner_model(x_in, sigma_in, cond=make_condition_dict(cond_in, image_cond_in))
else:
x_out = torch.zeros_like(x_in)
for batch_offset in range(0, x_out.shape[0], batch_size):
a = batch_offset
b = a + batch_size
x_out[a:b] = self.inner_model(x_in[a:b], sigma_in[a:b], cond=make_condition_dict(subscript_cond(cond_in, a, b), image_cond_in[a:b]))
else:
x_out = torch.zeros_like(x_in)
batch_size = batch_size*2 if shared.opts.batch_cond_uncond else batch_size
for batch_offset in range(0, tensor.shape[0], batch_size):
a = batch_offset
b = min(a + batch_size, tensor.shape[0])
if not is_edit_model:
c_crossattn = subscript_cond(tensor, a, b)
else:
c_crossattn = torch.cat([tensor[a:b]], uncond)
x_out[a:b] = self.inner_model(x_in[a:b], sigma_in[a:b], cond=make_condition_dict(c_crossattn, image_cond_in[a:b]))
if not skip_uncond:
x_out[-uncond.shape[0]:] = self.inner_model(x_in[-uncond.shape[0]:], sigma_in[-uncond.shape[0]:], cond=make_condition_dict(uncond, image_cond_in[-uncond.shape[0]:]))
denoised_image_indexes = [x[0][0] for x in conds_list]
if skip_uncond:
fake_uncond = torch.cat([x_out[i:i+1] for i in denoised_image_indexes])
x_out = torch.cat([x_out, fake_uncond]) # we skipped uncond denoising, so we put the cond-denoised image where the uncond-denoised image should be
denoised_params = CFGDenoisedParams(x_out, state.sampling_step, state.sampling_steps, self.inner_model)
cfg_denoised_callback(denoised_params)
devices.test_for_nans(x_out, "unet")
if is_edit_model:
denoised = self.combine_denoised_for_edit_model(x_out, cond_scale)
elif skip_uncond:
denoised = self.combine_denoised(x_out, conds_list, uncond, 1.0)
else:
denoised = self.combine_denoised(x_out, conds_list, uncond, cond_scale)
# Blend in the original latents (after)
if not self.mask_before_denoising and self.mask is not None:
denoised = apply_blend(denoised)
self.sampler.last_latent = self.get_pred_x0(torch.cat([x_in[i:i + 1] for i in denoised_image_indexes]), torch.cat([x_out[i:i + 1] for i in denoised_image_indexes]), sigma)
if opts.live_preview_content == "Prompt":
preview = self.sampler.last_latent
elif opts.live_preview_content == "Negative prompt":
preview = self.get_pred_x0(x_in[-uncond.shape[0]:], x_out[-uncond.shape[0]:], sigma)
else:
preview = self.get_pred_x0(torch.cat([x_in[i:i+1] for i in denoised_image_indexes]), torch.cat([denoised[i:i+1] for i in denoised_image_indexes]), sigma)
denoised = forge_sampler.forge_sample(self, denoiser_params=denoiser_params, cond_scale=cond_scale)
preview = self.sampler.last_latent = denoised
sd_samplers_common.store_latent(preview)
after_cfg_callback_params = AfterCFGCallbackParams(denoised, state.sampling_step, state.sampling_steps)
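
Most of the removed block above (manual cond/uncond batching, padding, and the skip-uncond path) is now delegated to forge_sampler.forge_sample, which receives the full CFGDenoiserParams and the cond_scale. For reference, the guidance step that combine_denoised performs (its old four-argument form is visible above) is the standard classifier-free guidance update; a self-contained sketch with hypothetical tensors:

import torch

def combine_denoised_sketch(x_out, conds_list, uncond, cond_scale):
    # x_out stacks the cond-denoised images first and the uncond-denoised
    # images last, mirroring how x_in was assembled in the removed code.
    denoised_uncond = x_out[-uncond.shape[0]:]
    denoised = torch.clone(denoised_uncond)
    for i, conds in enumerate(conds_list):
        for cond_index, weight in conds:
            denoised[i] += (x_out[cond_index] - denoised_uncond[i]) * (weight * cond_scale)
    return denoised

# One image, one prompt of weight 1.0 (made-up data):
x_out = torch.randn(2, 4, 8, 8)      # [cond result, uncond result]
uncond = torch.zeros(1, 77, 768)     # only its batch size matters here
print(combine_denoised_sketch(x_out, [[(0, 1.0)]], uncond, cond_scale=7.0).shape)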

View File

@@ -5,6 +5,7 @@ import torch
from PIL import Image
from modules import devices, images, sd_vae_approx, sd_samplers, sd_vae_taesd, shared, sd_models
from modules.shared import opts, state
from ldm_patched.modules import model_management
import k_diffusion.sampling
@@ -39,9 +40,7 @@ def samples_to_images_tensor(sample, approximation=None, model=None):
if approximation is None or (shared.state.interrupted and opts.live_preview_fast_interrupt):
approximation = approximation_indexes.get(opts.show_progress_type, 0)
from modules import lowvram
if approximation == 0 and lowvram.is_enabled(shared.sd_model) and not shared.opts.live_preview_allow_lowvram_full:
if approximation == 0:
approximation = 1
if approximation == 2:
@@ -54,8 +53,7 @@ def samples_to_images_tensor(sample, approximation=None, model=None):
else:
if model is None:
model = shared.sd_model
with devices.without_autocast(): # fixes an issue with unstable VAEs that are flaky even in fp32
x_sample = model.decode_first_stage(sample.to(model.first_stage_model.dtype))
x_sample = model.decode_first_stage(sample)
return x_sample
@@ -71,7 +69,6 @@ def single_sample_to_image(sample, approximation=None):
def decode_first_stage(model, x):
x = x.to(devices.dtype_vae)
approx_index = approximation_indexes.get(opts.sd_vae_decode_method, 0)
return samples_to_images_tensor(x, approx_index, model)
@@ -95,7 +92,6 @@ def images_tensor_to_samples(image, approximation=None, model=None):
else:
if model is None:
model = shared.sd_model
model.first_stage_model.to(devices.dtype_vae)
image = image.to(shared.device, dtype=devices.dtype_vae)
image = image * 2 - 1
@@ -155,7 +151,7 @@ def replace_torchsde_browinan():
replace_torchsde_browinan()
def apply_refiner(cfg_denoiser):
def apply_refiner(cfg_denoiser, x):
completed_ratio = cfg_denoiser.step / cfg_denoiser.total_steps
refiner_switch_at = cfg_denoiser.p.refiner_switch_at
refiner_checkpoint_info = cfg_denoiser.p.refiner_checkpoint_info
@@ -184,10 +180,17 @@ def apply_refiner(cfg_denoiser):
with sd_models.SkipWritingToConfig():
sd_models.reload_model_weights(info=refiner_checkpoint_info)
refiner = sd_models.model_data.get_sd_model()
devices.torch_gc()
cfg_denoiser.p.setup_conds()
cfg_denoiser.update_inner_model()
inference_memory = refiner.current_controlnet_required_memory
unet_patcher = refiner.forge_objects.unet
model_management.load_models_gpu(
[unet_patcher],
unet_patcher.memory_required([x.shape[0]] + list(x.shape[1:])) + inference_memory)
return True
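
The memory handling added here is the same pattern the samplers below use: estimate the VRAM the UNet forward pass needs for the incoming latent shape via unet_patcher.memory_required, add the memory the active ControlNets require, and hand both to model_management.load_models_gpu so enough room is freed before inference. A schematic sketch of that call, with a helper name and signature that are not part of this codebase:

from ldm_patched.modules import model_management

def ensure_unet_loaded(unet_patcher, x, inference_memory, batch_multiplier=1):
    # batch_multiplier is 2 when cond and uncond are denoised in a single batch,
    # as in the sampler hunks below; the refiner switch above uses 1.
    shape = [x.shape[0] * batch_multiplier] + list(x.shape[1:])
    required = unet_patcher.memory_required(shape) + inference_memory
    model_management.load_models_gpu([unet_patcher], required)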

View File

@@ -7,6 +7,8 @@ from modules.script_callbacks import ExtraNoiseParams, extra_noise_callback
from modules.shared import opts
import modules.shared as shared
import ldm_patched.modules.model_management
samplers_k_diffusion = [
('DPM++ 2M Karras', 'sample_dpmpp_2m', ['k_dpmpp_2m_ka'], {'scheduler': 'karras'}),
@@ -139,11 +141,21 @@ class KDiffusionSampler(sd_samplers_common.Sampler):
return sigmas
def sample_img2img(self, p, x, noise, conditioning, unconditional_conditioning, steps=None, image_conditioning=None):
inference_memory = self.model_wrap.inner_model.current_controlnet_required_memory
unet_patcher = self.model_wrap.inner_model.forge_objects.unet
ldm_patched.modules.model_management.load_models_gpu(
[unet_patcher],
unet_patcher.memory_required([x.shape[0] * 2] + list(x.shape[1:])) + inference_memory)
self.model_wrap.log_sigmas = self.model_wrap.log_sigmas.to(unet_patcher.current_device)
self.model_wrap.sigmas = self.model_wrap.sigmas.to(unet_patcher.current_device)
steps, t_enc = sd_samplers_common.setup_img2img_steps(p, steps)
sigmas = self.get_sigmas(p, steps)
sigma_sched = sigmas[steps - t_enc - 1:]
x = x.to(noise)
xi = x + noise * sigma_sched[0]
if opts.img2img_extra_noise > 0:
@@ -192,6 +204,15 @@ class KDiffusionSampler(sd_samplers_common.Sampler):
return samples
def sample(self, p, x, conditioning, unconditional_conditioning, steps=None, image_conditioning=None):
inference_memory = self.model_wrap.inner_model.current_controlnet_required_memory
unet_patcher = self.model_wrap.inner_model.forge_objects.unet
ldm_patched.modules.model_management.load_models_gpu(
[unet_patcher],
unet_patcher.memory_required([x.shape[0] * 2] + list(x.shape[1:])) + inference_memory)
self.model_wrap.log_sigmas = self.model_wrap.log_sigmas.to(unet_patcher.current_device)
self.model_wrap.sigmas = self.model_wrap.sigmas.to(unet_patcher.current_device)
steps = steps or p.steps
sigmas = self.get_sigmas(p, steps)
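
Besides preloading the UNet, sample_img2img also moves log_sigmas and sigmas onto the patcher's current device so the schedule lives where the model does. The starting point itself is the k-diffusion img2img construction shown above: truncate the sigma schedule to the remaining steps and add noise scaled by its first sigma. A toy illustration with made-up sigmas in place of the real Karras schedule:

import torch

steps, t_enc = 20, 15                            # hypothetical img2img step split
sigmas = torch.linspace(10.0, 0.0, steps + 1)    # stand-in for get_sigmas(p, steps)
sigma_sched = sigmas[steps - t_enc - 1:]         # keep only the tail of the schedule

x = torch.randn(1, 4, 64, 64)                    # encoded init-image latent
noise = torch.randn_like(x)
xi = x + noise * sigma_sched[0]                  # noisy starting point for sampling
print(sigma_sched.shape, xi.shape)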

View File

@@ -34,7 +34,7 @@ class LCMCompVisDenoiser(DiscreteEpsDDPMDenoiser):
def sigma_to_t(self, sigma, quantize=None):
log_sigma = sigma.log()
dists = log_sigma - self.log_sigmas[:, None]
dists = log_sigma - self.log_sigmas.to(sigma)[:, None]
return dists.abs().argmin(dim=0).view(sigma.shape) * self.skip_steps + (self.skip_steps - 1)
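
The one-line fix moves log_sigmas onto sigma's device and dtype before the distance computation. The lookup itself maps a continuous sigma to the index of the nearest discrete log-sigma and then rescales that index by the LCM skip factor; a self-contained sketch with toy values:

import torch

log_sigmas = torch.linspace(-4.0, 2.0, 1000)     # toy discrete noise schedule
skip_steps = 1000 // 50                          # hypothetical LCM skip factor

def sigma_to_t(sigma, log_sigmas, skip_steps):
    log_sigma = sigma.log()
    # distance from every discrete log-sigma to each query sigma
    dists = log_sigma - log_sigmas.to(sigma)[:, None]
    return dists.abs().argmin(dim=0).view(sigma.shape) * skip_steps + (skip_steps - 1)

print(sigma_to_t(torch.tensor([0.5, 3.0]), log_sigmas, skip_steps))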

View File

@@ -7,6 +7,8 @@ from modules.script_callbacks import ExtraNoiseParams, extra_noise_callback
from modules.shared import opts
import modules.shared as shared
import ldm_patched.modules.model_management
samplers_timesteps = [
('DDIM', sd_samplers_timesteps_impl.ddim, ['ddim'], {}),
@@ -54,6 +56,7 @@ class CFGDenoiserTimesteps(CFGDenoiser):
def get_pred_x0(self, x_in, x_out, sigma):
ts = sigma.to(dtype=int)
self.alphas = self.alphas.to(ts.device)
a_t = self.alphas[ts][:, None, None, None]
sqrt_one_minus_at = (1 - a_t).sqrt()
@@ -95,6 +98,14 @@ class CompVisSampler(sd_samplers_common.Sampler):
return timesteps
def sample_img2img(self, p, x, noise, conditioning, unconditional_conditioning, steps=None, image_conditioning=None):
inference_memory = self.model_wrap.inner_model.current_controlnet_required_memory
unet_patcher = self.model_wrap.inner_model.forge_objects.unet
ldm_patched.modules.model_management.load_models_gpu(
[unet_patcher],
unet_patcher.memory_required([x.shape[0] * 2] + list(x.shape[1:])) + inference_memory)
self.model_wrap.inner_model.alphas_cumprod = self.model_wrap.inner_model.alphas_cumprod.to(unet_patcher.current_device)
steps, t_enc = sd_samplers_common.setup_img2img_steps(p, steps)
timesteps = self.get_timesteps(p, steps)
@@ -104,7 +115,7 @@ class CompVisSampler(sd_samplers_common.Sampler):
sqrt_alpha_cumprod = torch.sqrt(alphas_cumprod[timesteps[t_enc]])
sqrt_one_minus_alpha_cumprod = torch.sqrt(1 - alphas_cumprod[timesteps[t_enc]])
xi = x * sqrt_alpha_cumprod + noise * sqrt_one_minus_alpha_cumprod
xi = x.to(noise) * sqrt_alpha_cumprod + noise * sqrt_one_minus_alpha_cumprod
if opts.img2img_extra_noise > 0:
p.extra_generation_params["Extra noise"] = opts.img2img_extra_noise
@@ -138,6 +149,14 @@ class CompVisSampler(sd_samplers_common.Sampler):
return samples
def sample(self, p, x, conditioning, unconditional_conditioning, steps=None, image_conditioning=None):
inference_memory = self.model_wrap.inner_model.current_controlnet_required_memory
unet_patcher = self.model_wrap.inner_model.forge_objects.unet
ldm_patched.modules.model_management.load_models_gpu(
[unet_patcher],
unet_patcher.memory_required([x.shape[0] * 2] + list(x.shape[1:])) + inference_memory)
self.model_wrap.inner_model.alphas_cumprod = self.model_wrap.inner_model.alphas_cumprod.to(unet_patcher.current_device)
steps = steps or p.steps
timesteps = self.get_timesteps(p, steps)
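
In the timestep-based samplers the img2img starting point is the standard DDPM forward process: the init latent is scaled by sqrt(alphas_cumprod[t]) and mixed with noise scaled by sqrt(1 - alphas_cumprod[t]), with x first cast onto the noise tensor's device as in the hunk above. A toy sketch with a made-up alphas_cumprod schedule:

import torch

alphas_cumprod = torch.linspace(0.999, 0.01, 1000)   # stand-in schedule
t = 600                                               # hypothetical encode timestep

x = torch.randn(1, 4, 64, 64)                         # init-image latent
noise = torch.randn_like(x)
sqrt_alpha_cumprod = torch.sqrt(alphas_cumprod[t])
sqrt_one_minus_alpha_cumprod = torch.sqrt(1 - alphas_cumprod[t])
xi = x.to(noise) * sqrt_alpha_cumprod + noise * sqrt_one_minus_alpha_cumprod
print(xi.shape)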

View File

@@ -237,7 +237,6 @@ def load_vae(model, vae_file=None, vae_source="from unknown source"):
# don't call this from outside
def _load_vae_dict(model, vae_dict_1):
model.first_stage_model.load_state_dict(vae_dict_1)
model.first_stage_model.to(devices.dtype_vae)
def clear_loaded_vae():
@@ -263,20 +262,12 @@ def reload_vae_weights(sd_model=None, vae_file=unspecified):
if loaded_vae_file == vae_file:
return
if sd_model.lowvram:
lowvram.send_everything_to_cpu()
else:
sd_model.to(devices.cpu)
sd_hijack.model_hijack.undo_hijack(sd_model)
load_vae(sd_model, vae_file, vae_source)
sd_hijack.model_hijack.hijack(sd_model)
if not sd_model.lowvram:
sd_model.to(devices.device)
script_callbacks.model_loaded_callback(sd_model)
print("VAE weights loaded.")

View File

@@ -24,13 +24,6 @@ def initialize():
pass
from modules import devices
devices.device, devices.device_interrogate, devices.device_gfpgan, devices.device_esrgan, devices.device_codeformer = \
(devices.cpu if any(y in cmd_opts.use_cpu for y in [x, 'all']) else devices.get_optimal_device() for x in ['sd', 'interrogate', 'gfpgan', 'esrgan', 'codeformer'])
devices.dtype = torch.float32 if cmd_opts.no_half else torch.float16
devices.dtype_vae = torch.float32 if cmd_opts.no_half or cmd_opts.no_half_vae else torch.float16
devices.dtype_inference = torch.float32 if cmd_opts.precision == 'full' else devices.dtype
shared.device = devices.device
shared.weight_load_location = None if cmd_opts.lowram else "cpu"

View File

@@ -299,7 +299,7 @@ options_templates.update(options_section(('ui_alternatives', "UI alternatives",
options_templates.update(options_section(('ui', "User interface", "ui"), {
"localization": OptionInfo("None", "Localization", gr.Dropdown, lambda: {"choices": ["None"] + list(localization.localizations.keys())}, refresh=lambda: localization.list_localizations(cmd_opts.localizations_dir)).needs_reload_ui(),
"quicksettings_list": OptionInfo(["sd_model_checkpoint"], "Quicksettings list", ui_components.DropdownMulti, lambda: {"choices": list(shared.opts.data_labels.keys())}).js("info", "settingsHintsShowQuicksettings").info("setting entries that appear at the top of page rather than in settings tab").needs_reload_ui(),
"quicksettings_list": OptionInfo(["sd_model_checkpoint", "sd_vae", "CLIP_stop_at_last_layers"], "Quicksettings list", ui_components.DropdownMulti, lambda: {"choices": list(shared.opts.data_labels.keys())}).js("info", "settingsHintsShowQuicksettings").info("setting entries that appear at the top of page rather than in settings tab").needs_reload_ui(),
"ui_tab_order": OptionInfo([], "UI tab order", ui_components.DropdownMulti, lambda: {"choices": list(shared.tab_names)}).needs_reload_ui(),
"hidden_tabs": OptionInfo([], "Hidden UI tabs", ui_components.DropdownMulti, lambda: {"choices": list(shared.tab_names)}).needs_reload_ui(),
"ui_reorder_list": OptionInfo([], "UI item order for txt2img/img2img tabs", ui_components.DropdownMulti, lambda: {"choices": list(shared_items.ui_reorder_categories())}).info("selected items appear first").needs_reload_ui(),

View File

@@ -167,9 +167,15 @@ def update_token_counter(text, steps, *, is_positive=True):
# messages related to it in console
prompt_schedules = [[[steps, text]]]
try:
cond_stage_model = sd_models.model_data.sd_model.cond_stage_model
assert cond_stage_model is not None
except Exception:
return f"<span class='gr-box gr-text-input'>?/?</span>"
flat_prompts = reduce(lambda list1, list2: list1+list2, prompt_schedules)
prompts = [prompt_text for step, prompt_text in flat_prompts]
token_count, max_length = max([model_hijack.get_prompt_lengths(prompt) for prompt in prompts], key=lambda args: args[0])
token_count, max_length = max([model_hijack.get_prompt_lengths(prompt, cond_stage_model) for prompt in prompts], key=lambda args: args[0])
return f"<span class='gr-box gr-text-input'>{token_count}/{max_length}</span>"

View File

@@ -294,7 +294,6 @@ class UiSettings:
for _i, k, _item in self.quicksettings_list:
component = self.component_dict[k]
info = opts.data_labels[k]
if isinstance(component, gr.Textbox):
methods = [component.submit, component.blur]
@@ -308,7 +307,7 @@ class UiSettings:
fn=lambda value, k=k: self.run_settings_single(value, key=k),
inputs=[component],
outputs=[component, self.text_settings],
show_progress=info.refresh is not None,
show_progress=False,
)
button_set_checkpoint = gr.Button('Change checkpoint', elem_id='change_checkpoint', visible=False)

View File

@@ -6,6 +6,13 @@ from PIL import Image
import modules.shared
from modules import modelloader, shared
from ldm_patched.modules import model_management
def prepare_free_memory():
model_management.free_memory(memory_required=1024*1024*3, device=model_management.get_torch_device())
print('Upscale script freed memory successfully.')
LANCZOS = (Image.Resampling.LANCZOS if hasattr(Image, 'Resampling') else Image.LANCZOS)
NEAREST = (Image.Resampling.NEAREST if hasattr(Image, 'Resampling') else Image.NEAREST)
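
prepare_free_memory asks the ldm_patched model manager to make sure a small amount of VRAM (3 MiB here) is free on the active torch device, unloading idle cached models if necessary, before an upscale model runs. A hedged usage sketch with a hypothetical upscaler subclass (the class name and body are illustrative, not part of this diff):

from modules.upscaler import Upscaler, prepare_free_memory

class UpscalerExample(Upscaler):
    def do_upscale(self, img, selected_model):
        prepare_free_memory()   # evict idle models before allocating the upscaler
        # ... load the upscale model and run it on `img` here ...
        return img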