From 1c74ca5d229813ba5aad2b76ff22942dab001f47 Mon Sep 17 00:00:00 2001 From: Jaret Burkett Date: Thu, 19 Feb 2026 11:57:44 -0700 Subject: [PATCH] Add audio_loss_multiplier to scale audio loss to larger values if desired. --- extensions_built_in/sd_trainer/SDTrainer.py | 1 + toolkit/config_modules.py | 2 ++ ui/src/app/jobs/new/SimpleJob.tsx | 11 ++++++++ ui/src/app/jobs/new/options.ts | 4 ++- ui/src/docs.tsx | 30 ++++++++++++++------- ui/src/types.ts | 1 + 6 files changed, 39 insertions(+), 10 deletions(-) diff --git a/extensions_built_in/sd_trainer/SDTrainer.py b/extensions_built_in/sd_trainer/SDTrainer.py index 152cd131..2ca7f5fc 100644 --- a/extensions_built_in/sd_trainer/SDTrainer.py +++ b/extensions_built_in/sd_trainer/SDTrainer.py @@ -863,6 +863,7 @@ class SDTrainer(BaseSDTrainProcess): # check for audio loss if batch.audio_pred is not None and batch.audio_target is not None: audio_loss = torch.nn.functional.mse_loss(batch.audio_pred.float(), batch.audio_target.float(), reduction="mean") + audio_loss = audio_loss * self.train_config.audio_loss_multiplier loss = loss + audio_loss # check for additional losses diff --git a/toolkit/config_modules.py b/toolkit/config_modules.py index a48cbe09..51355fee 100644 --- a/toolkit/config_modules.py +++ b/toolkit/config_modules.py @@ -683,6 +683,8 @@ class ModelConfig: # model paths for models that support it self.model_paths = kwargs.get("model_paths", {}) + self.audio_loss_multiplier = kwargs.get("audio_loss_multiplier", 1.0) + # allow frontend to pass arch with a color like arch:tag # but remove the tag if self.arch is not None: diff --git a/ui/src/app/jobs/new/SimpleJob.tsx b/ui/src/app/jobs/new/SimpleJob.tsx index 5db650c3..0bbb2a2b 100644 --- a/ui/src/app/jobs/new/SimpleJob.tsx +++ b/ui/src/app/jobs/new/SimpleJob.tsx @@ -551,6 +551,17 @@ export default function SimpleJob({ { value: 'stepped', label: 'Stepped Recovery' }, ]} /> + {modelArch?.additionalSections?.includes('train.audio_loss_multiplier') && ( + 
setJobConfig(value, 'config.process[0].train.audio_loss_multiplier')} + placeholder="eg. 1.0" + docKey={'train.audio_loss_multiplier'} + min={0} + /> + )}
diff --git a/ui/src/app/jobs/new/options.ts b/ui/src/app/jobs/new/options.ts index 67be1aa8..c82d3ad3 100644 --- a/ui/src/app/jobs/new/options.ts +++ b/ui/src/app/jobs/new/options.ts @@ -22,6 +22,7 @@ type AdditionalSections = | 'datasets.audio_preserve_pitch' | 'sample.ctrl_img' | 'sample.multi_ctrl_imgs' + | 'train.audio_loss_multiplier' | 'datasets.num_frames' | 'model.multistage' | 'model.layer_offloading' @@ -642,13 +643,14 @@ export const modelArchs: ModelArch[] = [ 'config.process[0].sample.fps': [24, 1], 'config.process[0].sample.width': [768, 1024], 'config.process[0].sample.height': [768, 1024], + 'config.process[0].train.audio_loss_multiplier': [1.0, undefined], 'config.process[0].train.timestep_type': ['weighted', 'sigmoid'], 'config.process[0].datasets[x].do_i2v': [false, undefined], 'config.process[0].datasets[x].do_audio': [true, undefined], 'config.process[0].datasets[x].fps': [24, undefined], }, disableSections: ['network.conv'], - additionalSections: ['sample.ctrl_img', 'datasets.num_frames', 'model.layer_offloading', 'model.low_vram', 'datasets.do_audio', 'datasets.audio_normalize', 'datasets.audio_preserve_pitch', 'datasets.do_i2v'], + additionalSections: ['sample.ctrl_img', 'datasets.num_frames', 'model.layer_offloading', 'model.low_vram', 'datasets.do_audio', 'datasets.audio_normalize', 'datasets.audio_preserve_pitch', 'datasets.do_i2v', 'train.audio_loss_multiplier'], }, { name: 'flux2_klein_4b', diff --git a/ui/src/docs.tsx b/ui/src/docs.tsx index c82d800b..0a9a3a83 100644 --- a/ui/src/docs.tsx +++ b/ui/src/docs.tsx @@ -112,8 +112,8 @@ const docs: { [key: string]: ConfigDoc } = { description: ( <> For models that support audio with video, this option will load the audio from the video and resize it to match - the video sequence. Since the video is automatically resized, the audio may drop or raise in pitch to match the new - speed of the video. It is important to prep your dataset to have the proper length before training. 
+ the video sequence. Since the video is automatically resized, the audio may drop or rise in pitch to match the + new speed of the video. It is important to prep your dataset to have the proper length before training. ), }, @@ -121,8 +121,9 @@ title: 'Audio Normalize', description: ( <> - When loading audio, this will normalize the audio volume to the max peaks. Useful if your dataset has varying audio - volumes. Warning, do not use if you have clips with full silence you want to keep, as it will raise the volume of those clips. + When loading audio, this will normalize the audio volume to the max peaks. Useful if your dataset has varying + audio volumes. Warning, do not use if you have clips with full silence you want to keep, as it will raise the + volume of those clips. ), }, @@ -132,7 +133,7 @@ <> When loading audio to match the number of frames requested, this option will preserve the pitch of the audio if the length does not match training target. It is recommended to have a dataset that matches your target length, - as this option can add sound distortions. + as this option can add sound distortions. ), }, @@ -310,10 +311,21 @@ const docs: { [key: string]: ConfigDoc } = { title: 'Num Repeats', description: ( <> - Number of Repeats will allow you to repeate the items in a dataset multiple times. This is useful when you are using multiple - datasets and want to balance the number of samples from each dataset. For instance, if you have a small dataset of 10 images - and a large dataset of 100 images, you can set the small dataset to have 10 repeats to effectively make it 100 images, making - the two datasets occour equally during training. + Number of Repeats will allow you to repeat the items in a dataset multiple times. This is useful when you are + using multiple datasets and want to balance the number of samples from each dataset. 
For instance, if you have a + small dataset of 10 images and a large dataset of 100 images, you can set the small dataset to have 10 repeats + to effectively make it 100 images, making the two datasets occur equally during training. + + ), }, + 'train.audio_loss_multiplier': { + title: 'Audio Loss Multiplier', + description: ( + <> + When training audio and video, sometimes the video loss is so great that it outweighs the audio loss, causing + the audio to become distorted. If you are noticing this happen, you can increase the audio loss multiplier to + give more weight to the audio loss. You could try something like 2.0, 10.0 etc. Warning, setting this too high + could overfit and damage the model. ), }, diff --git a/ui/src/types.ts b/ui/src/types.ts index 07f1e1d1..5cc0c446 100644 --- a/ui/src/types.ts +++ b/ui/src/types.ts @@ -146,6 +146,7 @@ export interface TrainConfig { loss_type: 'mse' | 'mae' | 'wavelet' | 'stepped'; do_differential_guidance?: boolean; differential_guidance_scale?: number; + audio_loss_multiplier?: number; } export interface QuantizeKwargsConfig {