Added the ability to split Flux across GPUs (experimental). Changed the way timestep scheduling works to prepare for more specific schedules.

Jaret Burkett
2024-12-31 07:06:55 -07:00
parent 8ef07a9c36
commit 4723f23c0d
5 changed files with 182 additions and 7 deletions


@@ -60,7 +60,7 @@ from transformers import CLIPTextModel, CLIPTokenizer, CLIPTextModelWithProjection
 from toolkit.paths import ORIG_CONFIGS_ROOT, DIFFUSERS_CONFIGS_ROOT
 from huggingface_hub import hf_hub_download
-from toolkit.models.flux import bypass_flux_guidance, restore_flux_guidance
+from toolkit.models.flux import add_model_gpu_splitter_to_flux, bypass_flux_guidance, restore_flux_guidance
 from optimum.quanto import freeze, qfloat8, quantize, QTensor, qint4
 from typing import TYPE_CHECKING
@@ -553,6 +553,10 @@ class StableDiffusion:
                 # low_cpu_mem_usage=False,
                 # device_map=None
             )
+            # hack in model gpu splitter
+            if self.model_config.split_model_over_gpus:
+                add_model_gpu_splitter_to_flux(transformer)
             if not self.low_vram:
                 # for low vram, we leave it on the cpu. Quantizes slower, but allows training on the primary gpu
                 transformer.to(torch.device(self.quantize_device), dtype=dtype)
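
The splitter itself is imported from toolkit/models/flux.py and its internals are not part of this hunk. A minimal sketch of the general technique, assuming the splitter assigns contiguous groups of transformer blocks to the available CUDA devices and uses forward pre-hooks to move each block's inputs onto that block's device (the helper name split_blocks_over_gpus, the even split, and the hook strategy below are illustrative assumptions, not the toolkit's actual implementation):

# Minimal sketch, not the toolkit's actual code: spread transformer blocks
# across all visible GPUs and move inputs to the right device per block.
import torch
import torch.nn as nn


def split_blocks_over_gpus(blocks: nn.ModuleList) -> None:
    """Distribute a list of transformer blocks evenly across visible GPUs."""
    num_gpus = torch.cuda.device_count()
    if num_gpus < 2:
        return  # single GPU (or CPU only): nothing to split
    per_gpu = (len(blocks) + num_gpus - 1) // num_gpus
    for i, block in enumerate(blocks):
        device = torch.device(f"cuda:{i // per_gpu}")
        block.to(device)

        def move_inputs(module, args, kwargs, _device=device):
            # relocate tensor inputs (positional and keyword) to this block's device
            args = tuple(a.to(_device) if torch.is_tensor(a) else a for a in args)
            kwargs = {k: v.to(_device) if torch.is_tensor(v) else v for k, v in kwargs.items()}
            return args, kwargs

        block.register_forward_pre_hook(move_inputs, with_kwargs=True)

In the hunk above, the call sits right after the transformer is loaded and before the quantize/low_vram device handling, so any block-to-device assignment of this kind would happen once, up front, with activations flowing from GPU to GPU during the forward pass.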