mirror of
https://github.com/ostris/ai-toolkit.git
synced 2026-04-30 11:11:37 +00:00
Allow for matching target resolution with control images for Qwen Image Edit 2509
This commit is contained in:
@@ -200,6 +200,11 @@ class QwenImageEditPlusModel(QwenImageModel):
|
|||||||
):
|
):
|
||||||
with torch.no_grad():
|
with torch.no_grad():
|
||||||
batch_size, num_channels_latents, height, width = latent_model_input.shape
|
batch_size, num_channels_latents, height, width = latent_model_input.shape
|
||||||
|
|
||||||
|
control_image_res = VAE_IMAGE_SIZE
|
||||||
|
if self.model_config.model_kwargs.get("match_target_res", False):
|
||||||
|
# use the current target size to set the control image res
|
||||||
|
# height/width come from latent_model_input.shape, so they are latent dims.
# Scale each axis to pixels before multiplying so the result is an area
# comparable to VAE_IMAGE_SIZE, which this value replaces below
# (assumes vae_scale_factor is the per-axis latent->pixel factor — TODO confirm).
control_image_res = (height * self.pipeline.vae_scale_factor) * (
    width * self.pipeline.vae_scale_factor
)
|
||||||
|
|
||||||
# pack image tokens
|
# pack image tokens
|
||||||
latent_model_input = latent_model_input.view(
|
latent_model_input = latent_model_input.view(
|
||||||
@@ -244,7 +249,7 @@ class QwenImageEditPlusModel(QwenImageModel):
|
|||||||
if len(control_img.shape) == 3:
|
if len(control_img.shape) == 3:
|
||||||
control_img = control_img.unsqueeze(0)
|
control_img = control_img.unsqueeze(0)
|
||||||
ratio = control_img.shape[2] / control_img.shape[3]
|
ratio = control_img.shape[2] / control_img.shape[3]
|
||||||
c_width = math.sqrt(VAE_IMAGE_SIZE * ratio)
|
c_width = math.sqrt(control_image_res * ratio)
|
||||||
c_height = c_width / ratio
|
c_height = c_width / ratio
|
||||||
|
|
||||||
c_width = round(c_width / 32) * 32
|
c_width = round(c_width / 32) * 32
|
||||||
|
|||||||
@@ -214,6 +214,14 @@ export default function SimpleJob({
|
|||||||
/>
|
/>
|
||||||
</FormGroup>
|
</FormGroup>
|
||||||
)}
|
)}
|
||||||
|
{modelArch?.additionalSections?.includes('model.qie.match_target_res') && (
|
||||||
|
<Checkbox
|
||||||
|
label="Match Target Res"
|
||||||
|
docKey="model.qie.match_target_res"
|
||||||
|
checked={jobConfig.config.process[0].model.model_kwargs.match_target_res}
|
||||||
|
onChange={value => setJobConfig(value, 'config.process[0].model.model_kwargs.match_target_res')}
|
||||||
|
/>
|
||||||
|
)}
|
||||||
{modelArch?.additionalSections?.includes('model.layer_offloading') && (
|
{modelArch?.additionalSections?.includes('model.layer_offloading') && (
|
||||||
<>
|
<>
|
||||||
<Checkbox
|
<Checkbox
|
||||||
|
|||||||
@@ -21,7 +21,8 @@ type AdditionalSections =
|
|||||||
| 'datasets.num_frames'
|
| 'datasets.num_frames'
|
||||||
| 'model.multistage'
|
| 'model.multistage'
|
||||||
| 'model.layer_offloading'
|
| 'model.layer_offloading'
|
||||||
| 'model.low_vram';
|
| 'model.low_vram'
|
||||||
|
| 'model.qie.match_target_res';
|
||||||
type ModelGroup = 'image' | 'instruction' | 'video';
|
type ModelGroup = 'image' | 'instruction' | 'video';
|
||||||
|
|
||||||
export interface ModelArch {
|
export interface ModelArch {
|
||||||
@@ -354,6 +355,12 @@ export const modelArchs: ModelArch[] = [
|
|||||||
'config.process[0].train.noise_scheduler': ['flowmatch', 'flowmatch'],
|
'config.process[0].train.noise_scheduler': ['flowmatch', 'flowmatch'],
|
||||||
'config.process[0].train.timestep_type': ['weighted', 'sigmoid'],
|
'config.process[0].train.timestep_type': ['weighted', 'sigmoid'],
|
||||||
'config.process[0].model.qtype': ['qfloat8', 'qfloat8'],
|
'config.process[0].model.qtype': ['qfloat8', 'qfloat8'],
|
||||||
|
'config.process[0].model.model_kwargs': [
|
||||||
|
{
|
||||||
|
match_target_res: false,
|
||||||
|
},
|
||||||
|
{},
|
||||||
|
],
|
||||||
},
|
},
|
||||||
disableSections: ['network.conv', 'train.unload_text_encoder'],
|
disableSections: ['network.conv', 'train.unload_text_encoder'],
|
||||||
additionalSections: [
|
additionalSections: [
|
||||||
@@ -361,6 +368,7 @@ export const modelArchs: ModelArch[] = [
|
|||||||
'sample.multi_ctrl_imgs',
|
'sample.multi_ctrl_imgs',
|
||||||
'model.low_vram',
|
'model.low_vram',
|
||||||
'model.layer_offloading',
|
'model.layer_offloading',
|
||||||
|
'model.qie.match_target_res',
|
||||||
],
|
],
|
||||||
accuracyRecoveryAdapters: {
|
accuracyRecoveryAdapters: {
|
||||||
'3 bit with ARA': 'uint3|ostris/accuracy_recovery_adapters/qwen_image_edit_2509_torchao_uint3.safetensors',
|
'3 bit with ARA': 'uint3|ostris/accuracy_recovery_adapters/qwen_image_edit_2509_torchao_uint3.safetensors',
|
||||||
|
|||||||
@@ -204,14 +204,27 @@ const docs: { [key: string]: ConfigDoc } = {
|
|||||||
one update to the next. It will also only work with certain models.
|
one update to the next. It will also only work with certain models.
|
||||||
<br />
|
<br />
|
||||||
<br />
|
<br />
|
||||||
Layer Offloading uses the CPU RAM instead of the GPU ram to hold most of the model weights. This allows training a
|
Layer Offloading uses the CPU RAM instead of the GPU ram to hold most of the model weights. This allows training
|
||||||
much larger model on a smaller GPU, assuming you have enough CPU RAM. This is slower than training on pure GPU
|
a much larger model on a smaller GPU, assuming you have enough CPU RAM. This is slower than training on pure GPU
|
||||||
RAM, but CPU RAM is cheaper and upgradeable. You will still need GPU RAM to hold the optimizer states and LoRA weights,
|
RAM, but CPU RAM is cheaper and upgradeable. You will still need GPU RAM to hold the optimizer states and LoRA
|
||||||
so a larger card is usually still needed.
|
weights, so a larger card is usually still needed.
|
||||||
<br />
|
<br />
|
||||||
<br />
|
<br />
|
||||||
You can also select the percentage of the layers to offload. It is generally best to offload as few as possible (close to 0%)
|
You can also select the percentage of the layers to offload. It is generally best to offload as few as possible
|
||||||
for best performance, but you can offload more if you need the memory.
|
(close to 0%) for best performance, but you can offload more if you need the memory.
|
||||||
|
</>
|
||||||
|
),
|
||||||
|
},
|
||||||
|
'model.qie.match_target_res': {
|
||||||
|
title: 'Match Target Res',
|
||||||
|
description: (
|
||||||
|
<>
|
||||||
|
This setting will make the control images match the resolution of the target image. The official inference
|
||||||
|
example for Qwen-Image-Edit-2509 feeds the control image in at 1MP resolution, no matter what size you are
|
||||||
|
generating. Doing this makes training at lower res difficult because 1MP control images are fed in despite how
|
||||||
|
large your target image is. Match Target Res will match the resolution of your target to feed in the control
|
||||||
|
images allowing you to use less VRAM when training with smaller resolutions. You can still use different aspect
|
||||||
|
ratios; the image will just be resized to match the number of pixels in the target image.
|
||||||
</>
|
</>
|
||||||
),
|
),
|
||||||
},
|
},
|
||||||
|
|||||||
Reference in New Issue
Block a user