Add initial support for training the i2v adapter (WIP)

This commit is contained in:
Jaret Burkett
2025-04-09 08:06:29 -06:00
parent a8680c75eb
commit 615b0d0e94
7 changed files with 682 additions and 12 deletions

View File

@@ -161,7 +161,8 @@ class DiffusionFeatureExtractor3(nn.Module):
vae = AutoencoderTiny.from_pretrained(
"madebyollin/taef1", torch_dtype=torch.bfloat16)
self.vae = vae
image_encoder_path = "google/siglip-so400m-patch14-384"
# image_encoder_path = "google/siglip-so400m-patch14-384"
image_encoder_path = "google/siglip2-so400m-patch16-512"
try:
self.image_processor = SiglipImageProcessor.from_pretrained(
image_encoder_path)
@@ -182,7 +183,11 @@ class DiffusionFeatureExtractor3(nn.Module):
dtype = torch.bfloat16
device = self.vae.device
# resize to 384x384
images = F.interpolate(tensors_0_1, size=(384, 384),
if 'height' in self.image_processor.size:
size = self.image_processor.size['height']
else:
size = self.image_processor.crop_size['height']
images = F.interpolate(tensors_0_1, size=(size, size),
mode='bicubic', align_corners=False)
mean = torch.tensor(self.image_processor.image_mean).to(