From 5c27f89af5e16431557480ec8edfd659af3ad5bc Mon Sep 17 00:00:00 2001
From: Jaret Burkett
Date: Sat, 23 Aug 2025 18:20:36 -0600
Subject: [PATCH] Add example config for qwen image edit

---
 .../examples/train_lora_qwen_image_24gb.yaml  |   2 +-
 .../train_lora_qwen_image_edit_32gb.yaml      | 102 ++++++++++++++++++
 toolkit/dataloader_mixins.py                  |   4 +-
 version.py                                    |   2 +-
 4 files changed, 107 insertions(+), 3 deletions(-)
 create mode 100644 config/examples/train_lora_qwen_image_edit_32gb.yaml

diff --git a/config/examples/train_lora_qwen_image_24gb.yaml b/config/examples/train_lora_qwen_image_24gb.yaml
index 692fcd24..e464020f 100644
--- a/config/examples/train_lora_qwen_image_24gb.yaml
+++ b/config/examples/train_lora_qwen_image_24gb.yaml
@@ -39,7 +39,7 @@ config:
       train:
         batch_size: 1
         # caching text embeddings is required for 24GB
-        cache_text_embeddings: false
+        cache_text_embeddings: true
 
         steps: 2000 # total number of steps to train 500 - 4000 is a good range
         gradient_accumulation: 1
diff --git a/config/examples/train_lora_qwen_image_edit_32gb.yaml b/config/examples/train_lora_qwen_image_edit_32gb.yaml
new file mode 100644
index 00000000..81a999d7
--- /dev/null
+++ b/config/examples/train_lora_qwen_image_edit_32gb.yaml
@@ -0,0 +1,102 @@
+---
+job: extension
+config:
+  # this name will be used for the output folder and file names
+  name: "my_first_qwen_image_edit_lora_v1"
+  process:
+    - type: 'sd_trainer'
+      # root folder to save training sessions/samples/weights
+      training_folder: "output"
+      # uncomment to see performance stats in the terminal every N steps
+#      performance_log_every: 1000
+      device: cuda:0
+      # if a trigger word is specified, it will be added to captions of training data if it does not already exist
+      # alternatively, in your captions you can add [trigger] and it will be replaced with the trigger word
+      # Trigger words will not work when caching text embeddings
+#      trigger_word: "p3r5on"
+      network:
+        type: "lora"
+        linear: 16
+        linear_alpha: 16
+      save:
+        dtype: float16 # precision to save
+        save_every: 250 # save every this many steps
+        max_step_saves_to_keep: 4 # how many intermittent saves to keep
+      datasets:
+        # datasets are a folder of images. captions need to be txt files with the same name as the image
+        # for instance image2.jpg and image2.txt. Only jpg, jpeg, and png are supported currently
+        # images will automatically be resized and bucketed into the resolution specified
+        # on windows, escape back slashes with another backslash, e.g.
+        # "C:\\path\\to\\images\\folder"
+        - folder_path: "/path/to/images/folder"
+          control_path: "/path/to/control/images/folder"
+          caption_ext: "txt"
+#          default_caption: "a person" # when caching text embeddings, if you don't have captions, this will get cached
+          caption_dropout_rate: 0.05 # will drop out the caption 5% of the time
+          resolution: [ 512, 768, 1024 ] # qwen image enjoys multiple resolutions
+      train:
+        batch_size: 1
+        # caching text embeddings is required for 32GB
+        cache_text_embeddings: true
+
+        steps: 3000 # total number of steps to train; 500 - 4000 is a good range
+        gradient_accumulation: 1
+        timestep_type: "weighted"
+        train_unet: true
+        train_text_encoder: false # probably won't work with qwen image
+        gradient_checkpointing: true # need this on unless you have a ton of vram
+        noise_scheduler: "flowmatch" # for training only
+        optimizer: "adamw8bit"
+        lr: 1e-4
+        # uncomment this to skip the pre-training sample
+#        skip_first_sample: true
+        # uncomment to completely disable sampling
+#        disable_sampling: true
+        dtype: bf16
+      model:
+        # huggingface model name or path
+        name_or_path: "Qwen/Qwen-Image-Edit"
+        arch: "qwen_image_edit"
+        quantize: true
+        # qtype_te: "qfloat8" # default float8 quantization
+        # to use the ARA, use the | pipe to point to a hf path, or a local path if you have one
+        # 3bit is required for 32GB
+        qtype: "uint3|qwen_image_edit_torchao_uint3.safetensors"
+        quantize_te: true
+        qtype_te: "qfloat8"
+        low_vram: true
+      sample:
+        sampler: "flowmatch" # must match train.noise_scheduler
+        sample_every: 250 # sample every this many steps
+        width: 1024
+        height: 1024
+        samples:
+          - prompt: "do the thing to it"
+            ctrl_img: "/path/to/control/image.jpg"
+          - prompt: "do the thing to it"
+            ctrl_img: "/path/to/control/image.jpg"
+          - prompt: "do the thing to it"
+            ctrl_img: "/path/to/control/image.jpg"
+          - prompt: "do the thing to it"
+            ctrl_img: "/path/to/control/image.jpg"
+          - prompt: "do the thing to it"
+            ctrl_img: "/path/to/control/image.jpg"
+          - prompt: "do the thing to it"
+            ctrl_img: "/path/to/control/image.jpg"
+          - prompt: "do the thing to it"
+            ctrl_img: "/path/to/control/image.jpg"
+          - prompt: "do the thing to it"
+            ctrl_img: "/path/to/control/image.jpg"
+          - prompt: "do the thing to it"
+            ctrl_img: "/path/to/control/image.jpg"
+          - prompt: "do the thing to it"
+            ctrl_img: "/path/to/control/image.jpg"
+        neg: ""
+        seed: 42
+        walk_seed: true
+        guidance_scale: 3
+        sample_steps: 25
+# you can add any additional meta info here. [name] is replaced with config name at top
+meta:
+  name: "[name]"
+  version: '1.0'
diff --git a/toolkit/dataloader_mixins.py b/toolkit/dataloader_mixins.py
index 4a01d752..69df9cbd 100644
--- a/toolkit/dataloader_mixins.py
+++ b/toolkit/dataloader_mixins.py
@@ -1865,7 +1865,9 @@ class TextEmbeddingCachingMixin:
                     self.sd.set_device_state_preset('cache_text_encoder')
                     did_move = True
 
-                if file_item.encode_control_in_text_embeddings and file_item.control_path is not None:
+                if file_item.encode_control_in_text_embeddings:
+                    if file_item.control_path is None:
+                        raise Exception(f"Could not find a control image for {file_item.path} which is needed for this model")
                     # load the control image and feed it into the text encoder
                     ctrl_img = Image.open(file_item.control_path).convert("RGB")
                     # convert to 0 to 1 tensor
diff --git a/version.py b/version.py
index e023ab46..d613b95e 100644
--- a/version.py
+++ b/version.py
@@ -1 +1 @@
-VERSION = "0.5.4"
\ No newline at end of file
+VERSION = "0.5.5"
\ No newline at end of file
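
A quick usage sketch, as an assumption based on the repo's existing example configs rather than anything stated in the patch: after copying the new config and pointing folder_path, control_path, and each ctrl_img at real data, training would be launched with the usual entrypoint:

    # hypothetical invocation; adjust the config path to wherever you saved your copy
    python run.py config/examples/train_lora_qwen_image_edit_32gb.yaml

Per the comments in the config itself, the 3-bit ARA checkpoint referenced by qtype, together with qfloat8 text-encoder quantization and cached text embeddings, is what keeps the edit model inside the 32GB VRAM budget the filename advertises.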