Added stochastic rounding to adafactor. ILora adjustments

Jaret Burkett
2024-03-05 07:07:09 -07:00
parent 1325613583
commit b01e8d889a
7 changed files with 153 additions and 3 deletions
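Note: the stochastic-rounding change itself lives in the optimizer code, which is not among the hunks shown below. For context, stochastic rounding rounds an fp32 update to bf16 up or down at random, with probability proportional to the distance to each neighbor, so small updates survive in expectation instead of being truncated away on every step. A minimal sketch of the common bit-manipulation implementation (the helper name copy_stochastic_ is illustrative, not taken from this commit):

import torch

def copy_stochastic_(target: torch.Tensor, source: torch.Tensor):
    # source must be fp32; target is the bf16 tensor to update in place
    assert source.dtype == torch.float32
    # random 16-bit value covering the mantissa bits that bf16 discards
    rand = torch.randint_like(source, low=0, high=1 << 16, dtype=torch.int32)
    # reinterpret the fp32 bits as int32 and add the noise: a carry into the
    # upper 16 bits fires with probability proportional to the truncated part
    rand.add_(source.view(dtype=torch.int32))
    # keep only the upper 16 bits (-65536 == 0xFFFF0000 as signed int32)
    rand.bitwise_and_(-65536)
    target.copy_(rand.view(dtype=torch.float32))

An optimizer would keep the update math in fp32 and call a helper like this when writing the result back into a bf16 parameter.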


@@ -83,6 +83,8 @@ class SDTrainer(BaseSDTrainProcess):
         self.taesd.requires_grad_(False)
 
     def hook_before_train_loop(self):
+        if self.train_config.do_prior_divergence:
+            self.do_prior_prediction = True
         # move vae to device if we did not cache latents
         if not self.is_latents_cached:
             self.sd.vae.eval()
@@ -290,7 +292,7 @@ class SDTrainer(BaseSDTrainProcess):
             #     target = (noise * mask_multiplier) + (prior_pred * prior_mask_multiplier)
             #     set masked multiplier to 1.0 so we dont double apply it
             #     mask_multiplier = 1.0
-        elif prior_pred is not None:
+        elif prior_pred is not None and not self.train_config.do_prior_divergence:
             assert not self.train_config.train_turbo
             # matching adapter prediction
             target = prior_pred
@@ -347,6 +349,9 @@ class SDTrainer(BaseSDTrainProcess):
         else:
             loss = torch.nn.functional.mse_loss(pred.float(), target.float(), reduction="none")
 
+        if self.train_config.do_prior_divergence and prior_pred is not None:
+            loss = loss + (torch.nn.functional.mse_loss(pred.float(), prior_pred.float(), reduction="none") * -1.0)
+
         # multiply by our mask
         loss = loss * mask_multiplier
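Taken together, the three hunks implement a repulsion term: when do_prior_divergence is on, the prior prediction is always computed (first hunk), is no longer used as the training target (second hunk), and its MSE against the prediction is subtracted from the loss (third hunk), so the model is pulled toward the noise target while being pushed away from what the unmodified model predicts. A self-contained sketch of the resulting objective (tensor names follow the diff; shapes are illustrative):

import torch
import torch.nn.functional as F

def divergence_loss(pred, target, prior_pred, mask_multiplier=1.0):
    # pull toward the denoising target
    loss = F.mse_loss(pred.float(), target.float(), reduction="none")
    # push away from the prior (unmodified-model) prediction
    loss = loss + F.mse_loss(pred.float(), prior_pred.float(), reduction="none") * -1.0
    # per-element mask, then reduce to a scalar
    return (loss * mask_multiplier).mean()

pred = torch.randn(1, 4, 64, 64, requires_grad=True)
target = torch.randn(1, 4, 64, 64)
prior_pred = torch.randn(1, 4, 64, 64)
divergence_loss(pred, target, prior_pred).backward()

Note that the combined loss can go negative, since the divergence term is unbounded below.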