Added the ability to split Flux across GPUs (experimental). Changed the way timestep scheduling works to prepare for more specific schedules.

Jaret Burkett
2024-12-31 07:06:55 -07:00
parent 8ef07a9c36
commit 4723f23c0d
5 changed files with 182 additions and 7 deletions


@@ -60,7 +60,7 @@ from transformers import CLIPTextModel, CLIPTokenizer, CLIPTextModelWithProjection
 from toolkit.paths import ORIG_CONFIGS_ROOT, DIFFUSERS_CONFIGS_ROOT
 from huggingface_hub import hf_hub_download
-from toolkit.models.flux import bypass_flux_guidance, restore_flux_guidance
+from toolkit.models.flux import add_model_gpu_splitter_to_flux, bypass_flux_guidance, restore_flux_guidance
 from optimum.quanto import freeze, qfloat8, quantize, QTensor, qint4
 from typing import TYPE_CHECKING
@@ -553,6 +553,10 @@ class StableDiffusion:
                 # low_cpu_mem_usage=False,
                 # device_map=None
             )
+            # hack in model gpu splitter
+            if self.model_config.split_model_over_gpus:
+                add_model_gpu_splitter_to_flux(transformer)
             if not self.low_vram:
                 # for low vram, we leave it on the cpu. Quantizes slower, but allows training on the primary gpu
                 transformer.to(torch.device(self.quantize_device), dtype=dtype)
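
The splitter itself is imported from toolkit/models/flux.py and its internals are not part of this hunk. A minimal sketch of the general technique, assuming the splitter assigns contiguous groups of transformer blocks to the available CUDA devices and uses forward pre-hooks to move each block's inputs onto that block's device (the helper name split_blocks_over_gpus, the even split, and the hook strategy below are illustrative assumptions, not the toolkit's actual implementation):

# Minimal sketch, not the toolkit's actual code: spread transformer blocks
# across all visible GPUs and move inputs to the right device per block.
import torch
import torch.nn as nn


def split_blocks_over_gpus(blocks: nn.ModuleList) -> None:
    """Distribute a list of transformer blocks evenly across visible GPUs."""
    num_gpus = torch.cuda.device_count()
    if num_gpus < 2:
        return  # single GPU (or CPU only): nothing to split
    per_gpu = (len(blocks) + num_gpus - 1) // num_gpus
    for i, block in enumerate(blocks):
        device = torch.device(f"cuda:{i // per_gpu}")
        block.to(device)

        def move_inputs(module, args, kwargs, _device=device):
            # relocate tensor inputs (positional and keyword) to this block's device
            args = tuple(a.to(_device) if torch.is_tensor(a) else a for a in args)
            kwargs = {k: v.to(_device) if torch.is_tensor(v) else v for k, v in kwargs.items()}
            return args, kwargs

        block.register_forward_pre_hook(move_inputs, with_kwargs=True)

In the hunk above, the call sits right after the transformer is loaded and before the quantize/low_vram device handling, so any block-to-device assignment of this kind would happen once, up front, with activations flowing from GPU to GPU during the forward pass.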