Initial support for RamTorch. Still a WIP

2026-05-01 03:31:35 +00:00 · 2025-10-05 13:03:26 -06:00
parent c6edd71a5b
commit 4e5707854f
8 changed files with 687 additions and 120 deletions
--- a/extensions_built_in/diffusion_models/qwen_image/qwen_image.py
+++ b/extensions_built_in/diffusion_models/qwen_image/qwen_image.py
@@ -9,63 +9,69 @@ from PIL import Image
 from toolkit.models.base_model import BaseModel
 from toolkit.basic import flush
 from toolkit.prompt_utils import PromptEmbeds
-from toolkit.samplers.custom_flowmatch_sampler import CustomFlowMatchEulerDiscreteScheduler
+from toolkit.samplers.custom_flowmatch_sampler import (
    CustomFlowMatchEulerDiscreteScheduler,
 )
 from toolkit.accelerator import get_accelerator, unwrap_model
 from optimum.quanto import freeze, QTensor
 from toolkit.util.quantize import quantize, get_qtype, quantize_model
 import torch.nn.functional as F
 from toolkit.memory_management import MemoryManager
 from safetensors.torch import load_file
-from diffusers import QwenImagePipeline, QwenImageTransformer2DModel, AutoencoderKLQwenImage
+from diffusers import (
-from transformers import Qwen2_5_VLForConditionalGeneration, Qwen2Tokenizer, Qwen2VLProcessor
+    QwenImagePipeline,
    QwenImageTransformer2DModel,
    AutoencoderKLQwenImage,
 )
 from transformers import (
    Qwen2_5_VLForConditionalGeneration,
    Qwen2Tokenizer,
    Qwen2VLProcessor,
 )
 from tqdm import tqdm
 if TYPE_CHECKING:
    from toolkit.data_transfer_object.data_loader import DataLoaderBatchDTO
 scheduler_config = {
-  "base_image_seq_len": 256,
+    "base_image_seq_len": 256,
-  "base_shift": 0.5,
+    "base_shift": 0.5,
-  "invert_sigmas": False,
+    "invert_sigmas": False,
-  "max_image_seq_len": 8192,
+    "max_image_seq_len": 8192,
-  "max_shift": 0.9,
+    "max_shift": 0.9,
-  "num_train_timesteps": 1000,
+    "num_train_timesteps": 1000,
-  "shift": 1.0,
+    "shift": 1.0,
-  "shift_terminal": 0.02,
+    "shift_terminal": 0.02,
-  "stochastic_sampling": False,
+    "stochastic_sampling": False,
-  "time_shift_type": "exponential",
+    "time_shift_type": "exponential",
-  "use_beta_sigmas": False,
+    "use_beta_sigmas": False,
-  "use_dynamic_shifting": True,
+    "use_dynamic_shifting": True,
-  "use_exponential_sigmas": False,
+    "use_exponential_sigmas": False,
-  "use_karras_sigmas": False
+    "use_karras_sigmas": False,
 }
 class QwenImageModel(BaseModel):
    arch = "qwen_image"
    _qwen_image_keep_visual = False
    _qwen_pipeline = QwenImagePipeline
    def __init__(
-            self,
+        self,
-            device,
+        device,
-            model_config: ModelConfig,
+        model_config: ModelConfig,
-            dtype='bf16',
+        dtype="bf16",
-            custom_pipeline=None,
+        custom_pipeline=None,
-            noise_scheduler=None,
+        noise_scheduler=None,
-            **kwargs
+        **kwargs,
    ):
        super().__init__(
-            device,
+            device, model_config, dtype, custom_pipeline, noise_scheduler, **kwargs
            model_config,
            dtype,
            custom_pipeline,
            noise_scheduler,
            **kwargs
        )
        self.is_flow_matching = True
        self.is_transformer = True
-        self.target_lora_modules = ['QwenImageTransformer2DModel']
+        self.target_lora_modules = ["QwenImageTransformer2DModel"]
    # static method to get the noise scheduler
    @staticmethod
@@ -73,40 +79,58 @@ class QwenImageModel(BaseModel):
        return CustomFlowMatchEulerDiscreteScheduler(**scheduler_config)
    def get_bucket_divisibility(self):
-        return 16 * 2 # 16 for the VAE, 2 for patch size
+        return 16 * 2  # 16 for the VAE, 2 for patch size
    def load_model(self):
        dtype = self.torch_dtype
        self.print_and_status_update("Loading Qwen Image model")
        model_path = self.model_config.name_or_path
        base_model_path = self.model_config.extras_name_or_path
        model_dtype = dtype
-        transformer_path = model_path
+        if base_model_path.endswith(".safetensors"):
-        transformer_subfolder = 'transformer'
+            # use the repo for extras
-        if os.path.exists(transformer_path):
+            base_model_path = "Qwen/Qwen-Image"
            transformer_subfolder = None
            transformer_path = os.path.join(transformer_path, 'transformer')
            # check if the path is a full checkpoint.
            te_folder_path = os.path.join(model_path, 'text_encoder')
            # if we have the te, this folder is a full checkpoint, use it as the base
            if os.path.exists(te_folder_path):
                base_model_path = model_path
        self.print_and_status_update("Loading transformer")
-        transformer = QwenImageTransformer2DModel.from_pretrained(
+
-            transformer_path,
+        if model_path.endswith(".safetensors"):
-            subfolder=transformer_subfolder,
+            # load the safetensors file
-            torch_dtype=dtype
+            transformer = QwenImageTransformer2DModel.from_single_file(
-        )
+                model_path,
                config="Qwen/Qwen-Image",
                subfolder="transformer",
                torch_dtype=model_dtype,
            )
            transformer.to(model_dtype)
        else:
            transformer_path = model_path
            transformer_subfolder = "transformer"
            if os.path.exists(transformer_path):
                transformer_subfolder = None
                transformer_path = os.path.join(transformer_path, "transformer")
                # check if the path is a full checkpoint.
                te_folder_path = os.path.join(model_path, "text_encoder")
                # if we have the te, this folder is a full checkpoint, use it as the base
                if os.path.exists(te_folder_path):
                    base_model_path = model_path
            transformer = QwenImageTransformer2DModel.from_pretrained(
                transformer_path, subfolder=transformer_subfolder, torch_dtype=dtype
            )
        if self.model_config.quantize:
            self.print_and_status_update("Quantizing Transformer")
            quantize_model(self, transformer)
            flush()
        if self.model_config.auto_memory:
            MemoryManager.attach(transformer, self.device_torch)
        if self.model_config.low_vram:
            self.print_and_status_update("Moving transformer to CPU")
-            transformer.to('cpu')
+            transformer.to("cpu")
        flush()
@@ -123,19 +147,22 @@ class QwenImageModel(BaseModel):
        if not self._qwen_image_keep_visual:
            text_encoder.model.visual = None
        if self.model_config.auto_memory:
            MemoryManager.attach(text_encoder, self.device_torch)
        text_encoder.to(self.device_torch, dtype=dtype)
        flush()
        if self.model_config.quantize_te:
            self.print_and_status_update("Quantizing Text Encoder")
-            quantize(text_encoder, weights=get_qtype(
+            quantize(text_encoder, weights=get_qtype(self.model_config.qtype_te))
                self.model_config.qtype_te))
            freeze(text_encoder)
            flush()
        self.print_and_status_update("Loading VAE")
        vae = AutoencoderKLQwenImage.from_pretrained(
-            base_model_path, subfolder="vae", torch_dtype=dtype)
+            base_model_path, subfolder="vae", torch_dtype=dtype
        )
        self.noise_scheduler = QwenImageModel.get_train_scheduler()
@@ -152,7 +179,7 @@ class QwenImageModel(BaseModel):
                self.processor = Qwen2VLProcessor.from_pretrained(
                    base_model_path, subfolder="processor"
                )
-            kwargs['processor'] = self.processor
+            kwargs["processor"] = self.processor
        pipe: QwenImagePipeline = self._qwen_pipeline(
            scheduler=self.noise_scheduler,
@@ -160,7 +187,7 @@ class QwenImageModel(BaseModel):
            tokenizer=tokenizer,
            vae=vae,
            transformer=None,
-            **kwargs
+            **kwargs,
        )
        # for quantization, it works best to do these after making the pipe
        pipe.text_encoder = text_encoder
@@ -198,7 +225,7 @@ class QwenImageModel(BaseModel):
            text_encoder=unwrap_model(self.text_encoder[0]),
            tokenizer=self.tokenizer[0],
            vae=unwrap_model(self.vae),
-            transformer=unwrap_model(self.transformer)
+            transformer=unwrap_model(self.transformer),
        )
        pipeline = pipeline.to(self.device_torch)
@@ -231,7 +258,8 @@ class QwenImageModel(BaseModel):
        # flush for low vram if we are doing that
        flush_between_steps = self.model_config.low_vram
-            # Fix a bug in diffusers/torch
+
        # Fix a bug in diffusers/torch
        def callback_on_step_end(pipe, i, t, callback_kwargs):
            if flush_between_steps:
                flush()
@@ -240,13 +268,17 @@ class QwenImageModel(BaseModel):
            return {"latents": latents}
        sc = self.get_bucket_divisibility()
-        gen_config.width = int(gen_config.width  // sc * sc)
+        gen_config.width = int(gen_config.width // sc * sc)
        gen_config.height = int(gen_config.height // sc * sc)
        img = pipeline(
            prompt_embeds=conditional_embeds.text_embeds,
-            prompt_embeds_mask=conditional_embeds.attention_mask.to(self.device_torch, dtype=torch.int64),
+            prompt_embeds_mask=conditional_embeds.attention_mask.to(
                self.device_torch, dtype=torch.int64
            ),
            negative_prompt_embeds=unconditional_embeds.text_embeds,
-            negative_prompt_embeds_mask=unconditional_embeds.attention_mask.to(self.device_torch, dtype=torch.int64),
+            negative_prompt_embeds_mask=unconditional_embeds.attention_mask.to(
                self.device_torch, dtype=torch.int64
            ),
            height=gen_config.height,
            width=gen_config.width,
            num_inference_steps=gen_config.num_inference_steps,
@@ -254,7 +286,7 @@ class QwenImageModel(BaseModel):
            latents=gen_config.latents,
            generator=generator,
            callback_on_step_end=callback_on_step_end,
-            **extra
+            **extra,
        ).images[0]
        return img
@@ -263,7 +295,7 @@ class QwenImageModel(BaseModel):
        latent_model_input: torch.Tensor,
        timestep: torch.Tensor,  # 0 to 1000 scale
        text_embeddings: PromptEmbeds,
-        **kwargs
+        **kwargs,
    ):
        self.model.to(self.device_torch)
        batch_size, num_channels_latents, height, width = latent_model_input.shape
@@ -271,20 +303,28 @@ class QwenImageModel(BaseModel):
        ps = self.transformer.config.patch_size
        # pack image tokens
-        latent_model_input = latent_model_input.view(batch_size, num_channels_latents, height // ps, ps, width // ps, ps)
+        latent_model_input = latent_model_input.view(
            batch_size, num_channels_latents, height // ps, ps, width // ps, ps
        )
        latent_model_input = latent_model_input.permute(0, 2, 4, 1, 3, 5)
-        latent_model_input = latent_model_input.reshape(batch_size, (height // ps) * (width // ps), num_channels_latents * (ps * ps))
+        latent_model_input = latent_model_input.reshape(
            batch_size, (height // ps) * (width // ps), num_channels_latents * (ps * ps)
        )
        # img_shapes passed to the model
        img_h2, img_w2 = height // ps, width // ps
        img_shapes = [[(1, img_h2, img_w2)]] * batch_size
        enc_hs = text_embeddings.text_embeds.to(self.device_torch, self.torch_dtype)
-        prompt_embeds_mask = text_embeddings.attention_mask.to(self.device_torch, dtype=torch.int64)
+        prompt_embeds_mask = text_embeddings.attention_mask.to(
            self.device_torch, dtype=torch.int64
        )
        txt_seq_lens = prompt_embeds_mask.sum(dim=1).tolist()
        noise_pred = self.transformer(
-            hidden_states=latent_model_input.to(self.device_torch, self.torch_dtype).detach(),
+            hidden_states=latent_model_input.to(
                self.device_torch, self.torch_dtype
            ).detach(),
            timestep=(timestep / 1000).detach(),
            guidance=None,
            encoder_hidden_states=enc_hs.detach(),
@@ -296,7 +336,9 @@ class QwenImageModel(BaseModel):
        )[0]
        # unpack
-        noise_pred = noise_pred.view(batch_size, height // ps, width // ps, num_channels_latents, ps, ps)
+        noise_pred = noise_pred.view(
            batch_size, height // ps, width // ps, num_channels_latents, ps, ps
        )
        noise_pred = noise_pred.permute(0, 3, 1, 4, 2, 5)
        noise_pred = noise_pred.reshape(batch_size, num_channels_latents, height, width)
        return noise_pred
@@ -310,9 +352,7 @@ class QwenImageModel(BaseModel):
            device=self.device_torch,
            num_images_per_prompt=1,
        )
-        pe = PromptEmbeds(
+        pe = PromptEmbeds(prompt_embeds)
            prompt_embeds
        )
        pe.attention_mask = prompt_embeds_mask
        return pe
@@ -326,25 +366,24 @@ class QwenImageModel(BaseModel):
        # only save the unet
        transformer: QwenImageTransformer2DModel = unwrap_model(self.model)
        transformer.save_pretrained(
-            save_directory=os.path.join(output_path, 'transformer'),
+            save_directory=os.path.join(output_path, "transformer"),
            safe_serialization=True,
        )
-        meta_path = os.path.join(output_path, 'aitk_meta.yaml')
+        meta_path = os.path.join(output_path, "aitk_meta.yaml")
-        with open(meta_path, 'w') as f:
+        with open(meta_path, "w") as f:
            yaml.dump(meta, f)
    def get_loss_target(self, *args, **kwargs):
-        noise = kwargs.get('noise')
+        noise = kwargs.get("noise")
-        batch = kwargs.get('batch')
+        batch = kwargs.get("batch")
        return (noise - batch.latents).detach()
    def get_base_model_version(self):
        return "qwen_image"
    def get_transformer_block_names(self) -> Optional[List[str]]:
-        return ['transformer_blocks']
+        return ["transformer_blocks"]
    def convert_lora_weights_before_save(self, state_dict):
        new_sd = {}
@@ -360,19 +399,14 @@ class QwenImageModel(BaseModel):
            new_sd[new_key] = value
        return new_sd
-    def encode_images(
+    def encode_images(self, image_list: List[torch.Tensor], device=None, dtype=None):
            self,
            image_list: List[torch.Tensor],
            device=None,
            dtype=None
    ):
        if device is None:
            device = self.vae_device_torch
        if dtype is None:
            dtype = self.vae_torch_dtype
        # Move to vae to device if on cpu
-        if self.vae.device == 'cpu':
+        if self.vae.device == "cpu":
            self.vae.to(device)
        self.vae.eval()
        self.vae.requires_grad_(False)
@@ -389,14 +423,13 @@ class QwenImageModel(BaseModel):
            .view(1, self.vae.config.z_dim, 1, 1, 1)
            .to(latents.device, latents.dtype)
        )
-        latents_std = 1.0 / torch.tensor(self.vae.config.latents_std).view(1, self.vae.config.z_dim, 1, 1, 1).to(
+        latents_std = 1.0 / torch.tensor(self.vae.config.latents_std).view(
-            latents.device, latents.dtype
+            1, self.vae.config.z_dim, 1, 1, 1
-        )
+        ).to(latents.device, latents.dtype)
        latents = (latents - latents_mean) * latents_std
        latents = latents.to(device, dtype=dtype)
        latents = latents.squeeze(2)  # remove the frame count dimension
        return latents
--- a/jobs/process/BaseSDTrainProcess.py
+++ b/jobs/process/BaseSDTrainProcess.py
@@ -1759,7 +1759,7 @@ class BaseSDTrainProcess(BaseTrainProcess):
                )
                # we cannot merge in if quantized
-                if self.model_config.quantize:
+                if self.model_config.quantize or self.model_config.auto_memory:
                    # todo find a way around this
                    self.network.can_merge_in = False
--- a/toolkit/config_modules.py
+++ b/toolkit/config_modules.py
@@ -624,6 +624,15 @@ class ModelConfig:
        self.arch: ModelArch = kwargs.get("arch", None)
        # auto memory management, only for some models
        self.auto_memory = kwargs.get("auto_memory", False)
        if self.auto_memory and self.qtype == "qfloat8":
            print(f"Auto memory is not compatible with qfloat8, switching to float8 for model")
            self.qtype = "float8"
        if self.auto_memory and not self.qtype_te == "qfloat8":
            print(f"Auto memory is not compatible with qfloat8, switching to float8 for te")
            self.qtype_te = "float8"
        # can be used to load the extras like text encoder or vae from here
        # only setup for some models but will prevent having to download the te for
        # 20 different model variants
@@ -651,6 +660,7 @@ class ModelConfig:
        if self.arch == "flex1":
            self.arch = "flux"
        # handle migrating to new model arch
        if self.arch is not None:
            # reverse the arch to the old style
--- a/toolkit/memory_management/manager.py
+++ b/toolkit/memory_management/manager.py
@@ -1,12 +1,92 @@
-from typing import TYPE_CHECKING
+import torch
 from .manager_modules import LinearLayerMemoryManager, ConvLayerMemoryManager
-if TYPE_CHECKING:
+LINEAR_MODULES = [
-    from toolkit.models.base_model import BaseModel
+    "Linear",
    "LoRACompatibleLinear",
    "QLinear",
 ]
 CONV_MODULES = [
    "Conv2d",
    "LoRACompatibleConv",
    "QConv2d",
 ]
 UNMANAGED_MODULES = [
    "LayerNorm",
    "BatchNorm1d",
    "BatchNorm2d",
    "BatchNorm3d",
    "GroupNorm",
    "InstanceNorm1d",
    "InstanceNorm2d",
    "InstanceNorm3d",
    "Embedding",
    "EmbeddingBag",
    "RNNBase",
    "LSTM",
    "GRU",
    "RNN",
 ]
 UNMANAGED_MODULES_INCLUDES = ["RotaryEmbedding", "Norm"]
 class MemoryManager:
    def __init__(
        self,
-        model: "BaseModel",
+        module: torch.nn.Module,
        process_device: torch.device = torch.device("cpu"),
    ):
-        self.model: "BaseModel" = model
+        self.module: torch.nn.Module = module
        self.process_device: torch.device = process_device
        self.unmanaged_modules: list[torch.nn.Module] = []
    def memory_managed_to(self, *args, **kwargs):
        # first move all the unmanaged modules
        for module in self.unmanaged_modules:
            module.to(*args, **kwargs)
        # check for a dtype argument
        dtype = None
        if "dtype" in kwargs:
            dtype = kwargs["dtype"]
        elif len(args) > 0:
            for i, arg in enumerate(args):
                if isinstance(arg, torch.dtype):
                    dtype = arg
                    break
        if dtype is not None:
            return self.module._mm_to(dtype=dtype)
        return self.module
    @classmethod
    def attach(cls, module: torch.nn.Module, device: torch.device):
        if hasattr(module, "_memory_manager"):
            # already attached
            return
        module._memory_manager = cls(module, device)
        # override the to method to handle memory management
        module._mm_to = module.to
        module.to = module._memory_manager.memory_managed_to
        # attach to all modules
        for name, sub_module in module.named_modules():
            for child_name, child_module in sub_module.named_modules():
                if child_module.__class__.__name__ in LINEAR_MODULES:
                    # linear
                    LinearLayerMemoryManager.attach(
                        child_module, module._memory_manager
                    )
                elif child_module.__class__.__name__ in CONV_MODULES:
                    # conv
                    ConvLayerMemoryManager.attach(child_module, module._memory_manager)
                elif child_module.__class__.__name__ in UNMANAGED_MODULES or any(
                    inc in child_module.__class__.__name__
                    for inc in UNMANAGED_MODULES_INCLUDES
                ):
                    # unmanaged
                    module._memory_manager.unmanaged_modules.append(child_module)
                else:
                    continue
--- a/toolkit/memory_management/manager_modules.py
+++ b/toolkit/memory_management/manager_modules.py
@@ -0,0 +1,450 @@
 """
 This code was heavily inspired by the work of Lodestone-Rock, pretty much all credit goes
 to them. The original code can be found here:
 https://github.com/lodestone-rock/RamTorch/blob/main/ramtorch/modules/linear.py
 I simply modified it to work with a memory management model and with AI Toolkit's models
 """
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from typing import TYPE_CHECKING, Optional, Tuple
 if TYPE_CHECKING:
    from .manager import MemoryManager
 # --- Per-device global state registry ---
 _DEVICE_STATE = {}
 def _get_device_state(device: torch.device):
    """Get or initialize per-device state."""
    if isinstance(device, str):
        device = torch.device(device)
    # CPU path needs no CUDA state
    if device.type != "cuda":
        if device not in _DEVICE_STATE:
            _DEVICE_STATE[device] = {}
        return _DEVICE_STATE[device]
    if device not in _DEVICE_STATE:
        with torch.cuda.device(device):
            _DEVICE_STATE[device] = {
                # streams & events
                "transfer_stream": torch.cuda.Stream(device=device),
                "transfer_grad_stream": torch.cuda.Stream(device=device),
                "transfer_forward_finished_event": torch.cuda.Event(),
                "compute_forward_start_event": torch.cuda.Event(),
                "transfer_backward_finished_event": torch.cuda.Event(),
                "transfer_weight_backward_finished_event": torch.cuda.Event(),
                "compute_backward_start_event": torch.cuda.Event(),
                "compute_backward_finished_event": torch.cuda.Event(),
                # ping-pong buffers
                "w_buffers": [None, None],
                "b_buffers": [None, None],
                "w_bwd_buffers": [None, None],
                # device-side staging for grads to be sent to CPU
                "w_grad_buffers": [None, None],
                "b_grad_buffers": [None, None],
                # clocks
                "forward_clk": 0,
                "backward_clk": 0,
            }
    return _DEVICE_STATE[device]
 def _ensure_cpu_pinned(t: Optional[torch.Tensor]) -> Optional[torch.Tensor]:
    if t is None:
        return None
    if t.device.type != "cpu":
        t = t.to("cpu", copy=True)
    if torch.cuda.is_available():
        try:
            t = t.pin_memory()
        except RuntimeError:
            pass
    return t
 def _move_params_to_cpu_and_pin(module: nn.Module):
    """Force parameters to CPU (+pinned) so we can 'bounce' them per forward/backward."""
    with torch.no_grad():
        if hasattr(module, "weight") and isinstance(module.weight, nn.Parameter):
            module.weight.data = _ensure_cpu_pinned(module.weight.data).detach()
        if hasattr(module, "bias") and isinstance(module.bias, nn.Parameter):
            if module.bias is not None:
                module.bias.data = _ensure_cpu_pinned(module.bias.data).detach()
 # ==========================
 # Autograd functions (CUDA)
 # ==========================
 class _BouncingLinearFn(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x, weight_cpu, bias_cpu, device: torch.device):
        if device.type != "cuda":
            out = F.linear(x.to("cpu"), weight_cpu, bias_cpu)
            ctx.save_for_backward(x.to("cpu"), weight_cpu, bias_cpu)
            ctx.device = torch.device("cpu")
            return out.to(x.device)
        state = _get_device_state(device)
        ts = state["transfer_stream"]
        w_bufs, b_bufs = state["w_buffers"], state["b_buffers"]
        ev_tx_f = state["transfer_forward_finished_event"]
        ev_cu_s = state["compute_forward_start_event"]
        idx = state["forward_clk"]
        with torch.cuda.stream(ts):
            ts.wait_event(ev_cu_s)
            w_bufs[idx] = weight_cpu.to(device, non_blocking=True)
            b_bufs[idx] = (
                bias_cpu.to(device, non_blocking=True) if bias_cpu is not None else None
            )
            state["forward_clk"] ^= 1
            ev_tx_f.record()
        torch.cuda.current_stream().wait_event(ev_tx_f)
        ev_cu_s.record()
        out = F.linear(x, w_bufs[idx], b_bufs[idx])
        ctx.save_for_backward(x, weight_cpu, bias_cpu)
        ctx.device = device
        return out
    @staticmethod
    def backward(ctx, grad_out):
        x, weight_cpu, bias_cpu = ctx.saved_tensors
        device = ctx.device
        if device.type != "cuda":
            go_cpu = grad_out.to("cpu")
            x_cpu = x.to("cpu")
            grad_input = go_cpu @ weight_cpu
            grad_weight = go_cpu.flatten(0, -2).T @ x_cpu.flatten(0, -2)
            grad_bias = (
                go_cpu.sum(dim=tuple(range(go_cpu.ndim - 1)))
                if bias_cpu is not None
                else None
            )
            return grad_input.to(grad_out.device), grad_weight, grad_bias, None
        state = _get_device_state(device)
        transfer_stream = state["transfer_stream"]
        transfer_grad_stream = state["transfer_grad_stream"]
        w_bwd_buffers = state["w_bwd_buffers"]
        w_grad_buffers = state["w_grad_buffers"]
        b_grad_buffers = state["b_grad_buffers"]
        ev_tx_b = state["transfer_backward_finished_event"]
        ev_tx_w_bwd_done = state["transfer_weight_backward_finished_event"]
        ev_cu_b_start = state["compute_backward_start_event"]
        ev_cu_b_finish = state["compute_backward_finished_event"]
        idx = state["backward_clk"]
        # Stage weights onto device (transfer stream), ping-pong to avoid races
        with torch.cuda.stream(transfer_stream):
            transfer_stream.wait_event(ev_cu_b_start)
            w_bwd_buffers[idx] = weight_cpu.to(device, non_blocking=True)
            state["backward_clk"] ^= 1
            ev_tx_b.record()
        # Compute stream waits for weights to arrive, then start compute
        torch.cuda.current_stream().wait_event(ev_tx_b)
        ev_cu_b_start.record()
        # 1) Compute grad_input using the freshly transferred weights
        grad_input = grad_out @ w_bwd_buffers[idx]
        # 2) Ensure previous grad-to-CPU transfer that used this slot finished
        torch.cuda.current_stream().wait_event(ev_tx_w_bwd_done)
        # 3) Compute weight/bias grads on GPU into staging buffers
        w_grad_buffers[idx] = grad_out.flatten(0, -2).T @ x.flatten(0, -2)
        if bias_cpu is not None:
            reduce_dims = tuple(range(grad_out.ndim - 1))
            b_grad_buffers[idx] = grad_out.sum(dim=reduce_dims)
        # Mark end of GPU compute
        ev_cu_b_finish.record()
        # 4) Launch non-blocking H2D->CPU transfers on a separate grad stream (full-duplex)
        with torch.cuda.stream(transfer_grad_stream):
            transfer_grad_stream.wait_event(ev_cu_b_finish)
            grad_weight = w_grad_buffers[idx].to("cpu", non_blocking=True)
            grad_bias = (
                b_grad_buffers[idx].to("cpu", non_blocking=True)
                if bias_cpu is not None
                else None
            )
            # signal that this slot's CPU transfer is complete (safe for next reuse)
            state["transfer_weight_backward_finished_event"].record()
        return grad_input, grad_weight, grad_bias, None
 class _BouncingConv2dFn(torch.autograd.Function):
    @staticmethod
    def forward(
        ctx,
        x,
        weight_cpu,
        bias_cpu,
        device: torch.device,
        stride: Tuple[int, int],
        padding: Tuple[int, int],
        dilation: Tuple[int, int],
        groups: int,
    ):
        if device.type != "cuda":
            out = F.conv2d(
                x.to("cpu"), weight_cpu, bias_cpu, stride, padding, dilation, groups
            )
            ctx.save_for_backward(x.to("cpu"), weight_cpu, bias_cpu)
            ctx.meta = ("cpu", stride, padding, dilation, groups)
            return out.to(x.device)
        state = _get_device_state(device)
        ts = state["transfer_stream"]
        w_bufs, b_bufs = state["w_buffers"], state["b_buffers"]
        ev_tx_f = state["transfer_forward_finished_event"]
        ev_cu_s = state["compute_forward_start_event"]
        idx = state["forward_clk"]
        with torch.cuda.stream(ts):
            ts.wait_event(ev_cu_s)
            w_bufs[idx] = weight_cpu.to(device, non_blocking=True)
            b_bufs[idx] = (
                bias_cpu.to(device, non_blocking=True) if bias_cpu is not None else None
            )
            state["forward_clk"] ^= 1
            ev_tx_f.record()
        torch.cuda.current_stream().wait_event(ev_tx_f)
        ev_cu_s.record()
        out = F.conv2d(x, w_bufs[idx], b_bufs[idx], stride, padding, dilation, groups)
        ctx.save_for_backward(x, weight_cpu, bias_cpu)
        ctx.meta = (device, stride, padding, dilation, groups)
        return out
    @staticmethod
    def backward(ctx, grad_out):
        x, weight_cpu, bias_cpu = ctx.saved_tensors
        meta = ctx.meta
        device, stride, padding, dilation, groups = meta
        if (
            isinstance(device, torch.device) and device.type != "cuda"
        ) or device == "cpu":
            # CPU grads
            go = grad_out.to("cpu")
            x_cpu = x.to("cpu")
            w_cpu = weight_cpu
            from torch.nn.grad import conv2d_input, conv2d_weight  # type: ignore
            grad_input = conv2d_input(
                x_cpu.shape,
                w_cpu,
                go,
                stride=stride,
                padding=padding,
                dilation=dilation,
                groups=groups,
            )
            grad_weight = conv2d_weight(
                x_cpu,
                w_cpu.shape,
                go,
                stride=stride,
                padding=padding,
                dilation=dilation,
                groups=groups,
            )
            grad_bias = go.sum(dim=(0, 2, 3)) if bias_cpu is not None else None
            return (
                grad_input.to(grad_out.device),
                grad_weight,
                grad_bias,
                None,
                None,
                None,
                None,
                None,
            )
        # CUDA path (full-duplex)
        state = _get_device_state(device)
        transfer_stream = state["transfer_stream"]
        transfer_grad_stream = state["transfer_grad_stream"]
        # device-side buffers
        w_bwd_buffers = state["w_bwd_buffers"]
        w_grad_buffers = state["w_grad_buffers"]
        b_grad_buffers = state["b_grad_buffers"]
        ev_tx_b = state["transfer_backward_finished_event"]
        ev_tx_w_bwd_done = state["transfer_weight_backward_finished_event"]
        ev_cu_b_start = state["compute_backward_start_event"]
        ev_cu_b_finish = state["compute_backward_finished_event"]
        idx = state["backward_clk"]
        # Stage weights for input-grad compute
        with torch.cuda.stream(transfer_stream):
            transfer_stream.wait_event(ev_cu_b_start)
            w_bwd_buffers[idx] = weight_cpu.to(device, non_blocking=True)
            state["backward_clk"] ^= 1
            ev_tx_b.record()
        torch.cuda.current_stream().wait_event(ev_tx_b)
        ev_cu_b_start.record()
        # grad wrt input on GPU with streamed weights
        from torch.nn.grad import conv2d_input, conv2d_weight  # type: ignore
        grad_input = conv2d_input(
            x.shape,
            w_bwd_buffers[idx],
            grad_out,
            stride=stride,
            padding=padding,
            dilation=dilation,
            groups=groups,
        )
        # Ensure previous grad transfer that used this slot is done
        torch.cuda.current_stream().wait_event(ev_tx_w_bwd_done)
        # Compute heavy grads on GPU into staging buffers
        w_grad_buffers[idx] = conv2d_weight(
            x,
            weight_cpu.shape,
            grad_out,
            stride=stride,
            padding=padding,
            dilation=dilation,
            groups=groups,
        )
        if bias_cpu is not None:
            b_grad_buffers[idx] = grad_out.sum(dim=(0, 2, 3))
        # Mark end of GPU math
        ev_cu_b_finish.record()
        # Launch CPU copies on the dedicated grad stream (overlaps with next H2D)
        with torch.cuda.stream(transfer_grad_stream):
            transfer_grad_stream.wait_event(ev_cu_b_finish)
            grad_weight = w_grad_buffers[idx].to("cpu", non_blocking=True)
            grad_bias = (
                b_grad_buffers[idx].to("cpu", non_blocking=True)
                if bias_cpu is not None
                else None
            )
            state["transfer_weight_backward_finished_event"].record()
        return grad_input, grad_weight, grad_bias, None, None, None, None, None
 class BaseLayerMemoryManager:
    def __init__(
        self,
        module: nn.Module,
        manager: "MemoryManager",
    ):
        self.module: nn.Module = module
        self.manager: "MemoryManager" = manager
    @classmethod
    def attach(cls, module: nn.Module, manager: "MemoryManager"):
        if hasattr(module, "_layer_memory_manager"):
            return
        module._layer_memory_manager = cls(module, manager)
        # mark parameters as memory managed
        for param in module.parameters(recurse=False):
            param._is_memory_managed = True
 class LinearLayerMemoryManager(BaseLayerMemoryManager):
    def __init__(
        self,
        module: nn.Module,
        manager: "MemoryManager",
    ):
        super().__init__(module, manager)
        # 1) Move params to CPU + pin memory for fast H2D
        _move_params_to_cpu_and_pin(self.module)
        # 2) Hijack forward
        self._original_forward = getattr(self.module, "forward")
        def _mm_forward(x, *args, **kwargs):
            # ensure we only use expected signature (Linear: x)
            if args or kwargs:
                # fall back to original if a custom signature is used
                return self._original_forward(x, *args, **kwargs)
            weight_cpu = self.module.weight
            bias_cpu = getattr(self.module, "bias", None)
            device = self.manager.process_device
            # NOTE: do NOT move params to device here; autograd fn streams & bounces them
            return _BouncingLinearFn.apply(x, weight_cpu, bias_cpu, device)
        self.module.forward = _mm_forward
 class ConvLayerMemoryManager(BaseLayerMemoryManager):
    def __init__(
        self,
        module: nn.Module,
        manager: "MemoryManager",
    ):
        super().__init__(module, manager)
        # 1) Move params to CPU + pin memory for fast H2D
        _move_params_to_cpu_and_pin(self.module)
        # Cache static conv attributes from the module
        stride = (
            self.module.stride
            if isinstance(self.module.stride, tuple)
            else (self.module.stride, self.module.stride)
        )
        padding = (
            self.module.padding
            if isinstance(self.module.padding, tuple)
            else (self.module.padding, self.module.padding)
        )
        dilation = (
            self.module.dilation
            if isinstance(self.module.dilation, tuple)
            else (self.module.dilation, self.module.dilation)
        )
        groups = self.module.groups
        # 2) Hijack forward
        self._original_forward = getattr(self.module, "forward")
        def _mm_forward(x, *args, **kwargs):
            # Support the typical Conv2d(x) call; if user passes uncommon extras, fallback.
            if args or kwargs:
                return self._original_forward(x, *args, **kwargs)
            weight_cpu = self.module.weight
            bias_cpu = getattr(self.module, "bias", None)
            device = self.manager.process_device
            return _BouncingConv2dFn.apply(
                x, weight_cpu, bias_cpu, device, stride, padding, dilation, groups
            )
        self.module.forward = _mm_forward
--- a/toolkit/models/base_model.py
+++ b/toolkit/models/base_model.py
@@ -41,7 +41,6 @@ from torchvision.transforms import functional as TF
 from toolkit.accelerator import get_accelerator, unwrap_model
 from typing import TYPE_CHECKING
 from toolkit.print import print_acc
 from toolkit.memory_management import MemoryManager
 if TYPE_CHECKING:
    from toolkit.lora_special import LoRASpecialNetwork
@@ -187,8 +186,6 @@ class BaseModel:
        # do not resize control images
        self.use_raw_control_images = False
        self.memory_manager = MemoryManager(self)
    # properties for old arch for backwards compatibility
    @property
    def unet(self):
--- a/toolkit/stable_diffusion_model.py
+++ b/toolkit/stable_diffusion_model.py
@@ -70,7 +70,6 @@ from typing import TYPE_CHECKING
 from toolkit.print import print_acc
 from diffusers import FluxFillPipeline
 from transformers import AutoModel, AutoTokenizer, Gemma2Model, Qwen2Model, LlamaModel
 from toolkit.memory_management import MemoryManager
 if TYPE_CHECKING:
    from toolkit.lora_special import LoRASpecialNetwork
@@ -225,8 +224,6 @@ class StableDiffusion:
        # do not resize control images
        self.use_raw_control_images = False
        self.memory_manager = MemoryManager(self)
    # properties for old arch for backwards compatibility
    @property
    def is_xl(self):
--- a/toolkit/util/quantize.py
+++ b/toolkit/util/quantize.py
@@ -301,14 +301,14 @@ def quantize_model(
            f" - quantizing {len(all_blocks)} transformer blocks"
        )
        for block in tqdm(all_blocks):
-            block.to(base_model.device_torch, dtype=base_model.torch_dtype)
+            block.to(base_model.device_torch, dtype=base_model.torch_dtype, non_blocking=True)
            quantize(block, weights=quantization_type)
            freeze(block)
-            block.to("cpu")
+            block.to("cpu", non_blocking=True)
        # todo, on extras find a universal way to quantize them on device and move them back to their original
        # device without having to move the transformer blocks to the device first
        base_model.print_and_status_update(" - quantizing extras")
-        model_to_quantize.to(base_model.device_torch, dtype=base_model.torch_dtype)
+        # model_to_quantize.to(base_model.device_torch, dtype=base_model.torch_dtype)
        quantize(model_to_quantize, weights=quantization_type)
        freeze(model_to_quantize)