ai-toolkit (mirror of https://github.com/ostris/ai-toolkit.git)

Commit: Add support for Wan2.2 5B
@@ -3,13 +3,15 @@ from .hidream import HidreamModel, HidreamE1Model
 from .f_light import FLiteModel
 from .omnigen2 import OmniGen2Model
 from .flux_kontext import FluxKontextModel
+from .wan22 import Wan22Model
 
 AI_TOOLKIT_MODELS = [
     # put a list of models here
-    ChromaModel,
-    HidreamModel,
-    HidreamE1Model,
-    FLiteModel,
-    OmniGen2Model,
-    FluxKontextModel
+    ChromaModel,
+    HidreamModel,
+    HidreamE1Model,
+    FLiteModel,
+    OmniGen2Model,
+    FluxKontextModel,
+    Wan22Model,
 ]
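For context, AI_TOOLKIT_MODELS acts as a plain class registry: each entry exposes an `arch` string (Wan22Model uses "wan22_5b") that the toolkit can match against a config value. Below is a minimal, self-contained sketch of that resolution step; the `get_model_class` helper is hypothetical, for illustration only, and is not part of this commit.

# Hypothetical sketch: resolving a registry entry by its `arch` attribute.
class Wan22Model:
    arch = "wan22_5b"

AI_TOOLKIT_MODELS = [Wan22Model]

def get_model_class(arch: str):
    # a linear scan is fine for a registry this small
    for cls in AI_TOOLKIT_MODELS:
        if getattr(cls, "arch", None) == arch:
            return cls
    raise ValueError(f"unknown arch: {arch!r}")

assert get_model_class("wan22_5b") is Wan22Model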
extensions_built_in/diffusion_models/wan22/__init__.py (new file, 1 line)
@@ -0,0 +1 @@
from .wan22_model import Wan22Model
extensions_built_in/diffusion_models/wan22/wan22_model.py (new file, 259 lines)
@@ -0,0 +1,259 @@
import torch
from toolkit.prompt_utils import PromptEmbeds
from PIL import Image
from diffusers import UniPCMultistepScheduler
from toolkit.config_modules import GenerateImageConfig, ModelConfig
from toolkit.samplers.custom_flowmatch_sampler import (
    CustomFlowMatchEulerDiscreteScheduler,
)
from .wan22_pipeline import Wan22Pipeline

from toolkit.data_transfer_object.data_loader import DataLoaderBatchDTO
from torchvision.transforms import functional as TF

from toolkit.models.wan21.wan21 import Wan21, AggressiveWanUnloadPipeline
from toolkit.models.wan21.wan_utils import add_first_frame_conditioning_v22


# scheduler config used only for generation (see get_generation_pipeline)
scheduler_configUniPC = {
    "_class_name": "UniPCMultistepScheduler",
    "_diffusers_version": "0.35.0.dev0",
    "beta_end": 0.02,
    "beta_schedule": "linear",
    "beta_start": 0.0001,
    "disable_corrector": [],
    "dynamic_thresholding_ratio": 0.995,
    "final_sigmas_type": "zero",
    "flow_shift": 5.0,
    "lower_order_final": True,
    "num_train_timesteps": 1000,
    "predict_x0": True,
    "prediction_type": "flow_prediction",
    "rescale_betas_zero_snr": False,
    "sample_max_value": 1.0,
    "solver_order": 2,
    "solver_p": None,
    "solver_type": "bh2",
    "steps_offset": 0,
    "thresholding": False,
    "time_shift_type": "exponential",
    "timestep_spacing": "linspace",
    "trained_betas": None,
    "use_beta_sigmas": False,
    "use_dynamic_shifting": False,
    "use_exponential_sigmas": False,
    "use_flow_sigmas": True,
    "use_karras_sigmas": False,
}

# scheduler config for training (see get_train_scheduler)
scheduler_config = {
    "num_train_timesteps": 1000,
    "shift": 5.0,
    "use_dynamic_shifting": False,
}


class Wan22Model(Wan21):
    arch = "wan22_5b"
    _wan_generation_scheduler_config = scheduler_configUniPC
    _wan_expand_timesteps = True

    def __init__(
        self,
        device,
        model_config: ModelConfig,
        dtype="bf16",
        custom_pipeline=None,
        noise_scheduler=None,
        **kwargs,
    ):
        super().__init__(
            device=device,
            model_config=model_config,
            dtype=dtype,
            custom_pipeline=custom_pipeline,
            noise_scheduler=noise_scheduler,
            **kwargs,
        )

        self._wan_cache = None

    def get_bucket_divisibility(self):
        # 16x VAE compression and 2x2 patch size
        return 32

    def get_generation_pipeline(self):
        scheduler = UniPCMultistepScheduler(**self._wan_generation_scheduler_config)
        pipeline = Wan22Pipeline(
            vae=self.vae,
            transformer=self.model,
            # the 5B model has a single transformer; reuse it for both slots
            transformer_2=self.model,
            text_encoder=self.text_encoder,
            tokenizer=self.tokenizer,
            scheduler=scheduler,
            expand_timesteps=self._wan_expand_timesteps,
            device=self.device_torch,
            aggressive_offload=self.model_config.low_vram,
        )

        pipeline = pipeline.to(self.device_torch)

        return pipeline

    # static method to get the scheduler
    @staticmethod
    def get_train_scheduler():
        scheduler = CustomFlowMatchEulerDiscreteScheduler(**scheduler_config)
        return scheduler

    def get_base_model_version(self):
        return "wan_2.2_5b"

    def generate_single_image(
        self,
        pipeline: AggressiveWanUnloadPipeline,
        gen_config: GenerateImageConfig,
        conditional_embeds: PromptEmbeds,
        unconditional_embeds: PromptEmbeds,
        generator: torch.Generator,
        extra: dict,
    ):
        # reactivate progress bar since this is slooooow
        pipeline.set_progress_bar_config(disable=False)

        num_frames = (
            (gen_config.num_frames - 1) // 4
        ) * 4 + 1  # snap num_frames to the 4n + 1 grid the VAE expects
        gen_config.num_frames = num_frames

        height = gen_config.height
        width = gen_config.width
        noise_mask = None
        if gen_config.ctrl_img is not None:
            control_img = Image.open(gen_config.ctrl_img).convert("RGB")

            d = self.get_bucket_divisibility()

            # make sure they are divisible by d
            height = height // d * d
            width = width // d * d

            # resize the control image
            control_img = control_img.resize((width, height), Image.LANCZOS)

            # 5. Prepare latent variables
            num_channels_latents = self.transformer.config.in_channels
            latents = pipeline.prepare_latents(
                1,
                num_channels_latents,
                height,
                width,
                gen_config.num_frames,
                torch.float32,
                self.device_torch,
                generator,
                None,
            ).to(self.torch_dtype)

            first_frame_n1p1 = (
                TF.to_tensor(control_img)
                .unsqueeze(0)
                .to(self.device_torch, dtype=self.torch_dtype)
                * 2.0
                - 1.0
            )  # normalize to [-1, 1]

            gen_config.latents, noise_mask = add_first_frame_conditioning_v22(
                latent_model_input=latents, first_frame=first_frame_n1p1, vae=self.vae
            )

        output = pipeline(
            prompt_embeds=conditional_embeds.text_embeds.to(
                self.device_torch, dtype=self.torch_dtype
            ),
            negative_prompt_embeds=unconditional_embeds.text_embeds.to(
                self.device_torch, dtype=self.torch_dtype
            ),
            height=height,
            width=width,
            num_inference_steps=gen_config.num_inference_steps,
            guidance_scale=gen_config.guidance_scale,
            latents=gen_config.latents,
            num_frames=gen_config.num_frames,
            generator=generator,
            return_dict=False,
            output_type="pil",
            noise_mask=noise_mask,
            **extra,
        )[0]

        # shape = [1, frames, channels, height, width]
        batch_item = output[0]  # list of pil images
        if gen_config.num_frames > 1:
            return batch_item  # return the frames.
        else:
            # get just the first image
            img = batch_item[0]
            return img

    def get_noise_prediction(
        self,
        latent_model_input: torch.Tensor,
        timestep: torch.Tensor,  # 0 to 1000 scale
        text_embeddings: PromptEmbeds,
        batch: DataLoaderBatchDTO,
        **kwargs,
    ):
        # videos come in (bs, num_frames, channels, height, width)
        # images come in (bs, channels, height, width)

        # for wan, only do i2v for video for now. Images do normal t2i
        conditioned_latent = latent_model_input
        noise_mask = None

        with torch.no_grad():
            frames = batch.tensor
            if len(frames.shape) == 4:
                # images: plain t2i, no first-frame conditioning
                first_frames = frames
            elif len(frames.shape) == 5:
                first_frames = frames[:, 0]
                # Add conditioning using the standalone function
                conditioned_latent, noise_mask = add_first_frame_conditioning_v22(
                    latent_model_input=latent_model_input.to(
                        self.device_torch, self.torch_dtype
                    ),
                    first_frame=first_frames.to(self.device_torch, self.torch_dtype),
                    vae=self.vae,
                )
            else:
                raise ValueError(f"Unknown frame shape {frames.shape}")

        # make the noise mask
        if noise_mask is None:
            noise_mask = torch.ones(
                conditioned_latent.shape,
                dtype=conditioned_latent.dtype,
                device=conditioned_latent.device,
            )
        # todo write this better
        t_chunks = torch.chunk(timestep, timestep.shape[0])
        out_t_chunks = []
        for t in t_chunks:
            # seq_len: num_latent_frames * latent_height//2 * latent_width//2
            temp_ts = (noise_mask[0][0][:, ::2, ::2] * t).flatten()
            # batch_size, seq_len
            temp_ts = temp_ts.unsqueeze(0)
            out_t_chunks.append(temp_ts)
        timestep = torch.cat(out_t_chunks, dim=0)

        noise_pred = self.model(
            hidden_states=conditioned_latent,
            timestep=timestep,
            encoder_hidden_states=text_embeddings.text_embeds,
            return_dict=False,
            **kwargs,
        )[0]
        return noise_pred
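Two shape conventions in this file are easy to miss: sampled frame counts are snapped to the 4n + 1 grid the Wan VAE expects, and when `_wan_expand_timesteps` is set the scalar timestep is expanded to one value per 2x2 latent patch, scaled by the noise mask so fully conditioned patches stay at t = 0. A standalone sketch of both rules follows; the tensor shapes are illustrative, not taken from the model config.

import torch

def round_num_frames(n: int) -> int:
    # same arithmetic as generate_single_image: snap to 4k + 1
    return ((n - 1) // 4) * 4 + 1

assert round_num_frames(81) == 81
assert round_num_frames(16) == 13

# noise mask shaped (batch, channels, latent_frames, h, w); zeros mark the
# conditioned first latent frame, as add_first_frame_conditioning_v22 would
mask = torch.ones(1, 48, 3, 8, 8)
mask[:, :, 0] = 0.0

t = torch.tensor(500.0)
temp_ts = (mask[0][0][:, ::2, ::2] * t).flatten()   # one value per 2x2 patch
assert temp_ts.shape[0] == 3 * (8 // 2) * (8 // 2)  # frames * h//2 * w//2
assert temp_ts[: 4 * 4].sum() == 0                  # conditioned frame stays at t=0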
extensions_built_in/diffusion_models/wan22/wan22_pipeline.py (new file, 263 lines)
@@ -0,0 +1,263 @@
import torch
from toolkit.basic import flush
from transformers import AutoTokenizer, UMT5EncoderModel
from diffusers import WanPipeline, WanTransformer3DModel, AutoencoderKLWan
from diffusers import FlowMatchEulerDiscreteScheduler
from diffusers.pipelines.wan.pipeline_output import WanPipelineOutput
from diffusers.pipelines.wan.pipeline_wan import XLA_AVAILABLE
from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback
from typing import Any, Callable, Dict, List, Optional, Union

if XLA_AVAILABLE:
    # torch_xla is only importable when XLA_AVAILABLE is True; needed for
    # xm.mark_step() in the denoising loop (mirrors the guard in diffusers)
    import torch_xla.core.xla_model as xm


class Wan22Pipeline(WanPipeline):
    def __init__(
        self,
        tokenizer: AutoTokenizer,
        text_encoder: UMT5EncoderModel,
        transformer: WanTransformer3DModel,
        vae: AutoencoderKLWan,
        scheduler: FlowMatchEulerDiscreteScheduler,
        transformer_2: Optional[WanTransformer3DModel] = None,
        boundary_ratio: Optional[float] = None,
        expand_timesteps: bool = False,  # Wan2.2 ti2v
        device: torch.device = torch.device("cuda"),
        aggressive_offload: bool = False,
    ):
        super().__init__(
            tokenizer=tokenizer,
            text_encoder=text_encoder,
            transformer=transformer,
            transformer_2=transformer_2,
            boundary_ratio=boundary_ratio,
            expand_timesteps=expand_timesteps,
            vae=vae,
            scheduler=scheduler,
        )
        self._aggressive_offload = aggressive_offload
        self._exec_device = device

    @property
    def _execution_device(self):
        return self._exec_device

    def __call__(
        self,
        prompt: Union[str, List[str]] = None,
        negative_prompt: Union[str, List[str]] = None,
        height: int = 480,
        width: int = 832,
        num_frames: int = 81,
        num_inference_steps: int = 50,
        guidance_scale: float = 5.0,
        num_videos_per_prompt: Optional[int] = 1,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        latents: Optional[torch.Tensor] = None,
        prompt_embeds: Optional[torch.Tensor] = None,
        negative_prompt_embeds: Optional[torch.Tensor] = None,
        output_type: Optional[str] = "np",
        return_dict: bool = True,
        attention_kwargs: Optional[Dict[str, Any]] = None,
        callback_on_step_end: Optional[
            Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
        ] = None,
        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
        max_sequence_length: int = 512,
        noise_mask: Optional[torch.Tensor] = None,
    ):
        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs

        # remember where each module lives so it can be restored after offloading
        vae_device = self.vae.device
        transformer_device = self.transformer.device
        text_encoder_device = self.text_encoder.device
        device = self.transformer.device

        if self._aggressive_offload:
            print("Unloading vae")
            self.vae.to("cpu")
            self.text_encoder.to(device)
            flush()

        # 1. Check inputs. Raise error if not correct
        self.check_inputs(
            prompt,
            negative_prompt,
            height,
            width,
            prompt_embeds,
            negative_prompt_embeds,
            callback_on_step_end_tensor_inputs,
        )

        self._guidance_scale = guidance_scale
        self._attention_kwargs = attention_kwargs
        self._current_timestep = None
        self._interrupt = False

        # 2. Define call parameters
        if prompt is not None and isinstance(prompt, str):
            batch_size = 1
        elif prompt is not None and isinstance(prompt, list):
            batch_size = len(prompt)
        else:
            batch_size = prompt_embeds.shape[0]

        # 3. Encode input prompt
        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
            prompt=prompt,
            negative_prompt=negative_prompt,
            do_classifier_free_guidance=self.do_classifier_free_guidance,
            num_videos_per_prompt=num_videos_per_prompt,
            prompt_embeds=prompt_embeds,
            negative_prompt_embeds=negative_prompt_embeds,
            max_sequence_length=max_sequence_length,
            device=device,
        )
        if self._aggressive_offload:
            # unload text encoder
            print("Unloading text encoder")
            self.text_encoder.to("cpu")
            self.transformer.to(device)
            flush()

        transformer_dtype = self.transformer.dtype
        prompt_embeds = prompt_embeds.to(device, transformer_dtype)
        if negative_prompt_embeds is not None:
            negative_prompt_embeds = negative_prompt_embeds.to(device, transformer_dtype)

        # 4. Prepare timesteps
        self.scheduler.set_timesteps(num_inference_steps, device=device)
        timesteps = self.scheduler.timesteps

        # 5. Prepare latent variables
        num_channels_latents = self.transformer.config.in_channels
        latents = self.prepare_latents(
            batch_size * num_videos_per_prompt,
            num_channels_latents,
            height,
            width,
            num_frames,
            torch.float32,
            device,
            generator,
            latents,
        )

        mask = noise_mask
        if mask is None:
            mask = torch.ones(latents.shape, dtype=torch.float32, device=device)

        # 6. Denoising loop
        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
        self._num_timesteps = len(timesteps)

        with self.progress_bar(total=num_inference_steps) as progress_bar:
            for i, t in enumerate(timesteps):
                if self.interrupt:
                    continue

                self._current_timestep = t
                latent_model_input = latents.to(device, transformer_dtype)
                if self.config.expand_timesteps:
                    # seq_len: num_latent_frames * latent_height//2 * latent_width//2
                    temp_ts = (mask[0][0][:, ::2, ::2] * t).flatten()
                    # batch_size, seq_len
                    timestep = temp_ts.unsqueeze(0).expand(latents.shape[0], -1)
                else:
                    timestep = t.expand(latents.shape[0])

                noise_pred = self.transformer(
                    hidden_states=latent_model_input,
                    timestep=timestep,
                    encoder_hidden_states=prompt_embeds,
                    attention_kwargs=attention_kwargs,
                    return_dict=False,
                )[0]

                if self.do_classifier_free_guidance:
                    noise_uncond = self.transformer(
                        hidden_states=latent_model_input,
                        timestep=timestep,
                        encoder_hidden_states=negative_prompt_embeds,
                        attention_kwargs=attention_kwargs,
                        return_dict=False,
                    )[0]
                    noise_pred = noise_uncond + guidance_scale * (noise_pred - noise_uncond)

                # compute the previous noisy sample x_t -> x_t-1
                latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]

                # apply i2v mask: positions with mask == 0 are pinned back to
                # the conditioned input, everything else keeps denoising
                latents = (latent_model_input * (1 - mask)) + (latents * mask)

                if callback_on_step_end is not None:
                    callback_kwargs = {}
                    for k in callback_on_step_end_tensor_inputs:
                        callback_kwargs[k] = locals()[k]
                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)

                    latents = callback_outputs.pop("latents", latents)
                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
                    negative_prompt_embeds = callback_outputs.pop(
                        "negative_prompt_embeds", negative_prompt_embeds
                    )

                # call the callback, if provided
                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                    progress_bar.update()

                if XLA_AVAILABLE:
                    xm.mark_step()

        self._current_timestep = None

        if self._aggressive_offload:
            # unload transformer
            print("Unloading transformer")
            self.transformer.to("cpu")
            if self.transformer_2 is not None:
                self.transformer_2.to("cpu")
            # load vae
            print("Loading Vae")
            self.vae.to(vae_device)
            flush()

        if not output_type == "latent":
            latents = latents.to(self.vae.dtype)
            # undo the VAE's per-channel latent normalization before decoding
            latents_mean = (
                torch.tensor(self.vae.config.latents_mean)
                .view(1, self.vae.config.z_dim, 1, 1, 1)
                .to(latents.device, latents.dtype)
            )
            latents_std = 1.0 / torch.tensor(self.vae.config.latents_std).view(
                1, self.vae.config.z_dim, 1, 1, 1
            ).to(latents.device, latents.dtype)
            latents = latents / latents_std + latents_mean
            video = self.vae.decode(latents, return_dict=False)[0]
            video = self.video_processor.postprocess_video(video, output_type=output_type)
        else:
            video = latents

        # Offload all models
        self.maybe_free_model_hooks()

        if not return_dict:
            return (video,)

        return WanPipelineOutput(frames=video)
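The per-step masking in the denoising loop is what turns this into an i2v sampler: wherever the noise mask is 0, the latents are reset to the conditioned input after each scheduler step, so only masked-in positions are actually denoised. An isolated sketch of that one line follows; the shapes and tensors are made up for illustration.

import torch

latents = torch.randn(1, 48, 3, 8, 8)      # current noisy latents
latent_model_input = latents.clone()       # holds the conditioning values
mask = torch.ones_like(latents)
mask[:, :, 0] = 0.0                        # first latent frame is conditioning

stepped = torch.randn_like(latents)        # stand-in for scheduler.step(...)[0]
latents = (latent_model_input * (1 - mask)) + (stepped * mask)

# conditioned positions are untouched; the rest took the scheduler update
assert torch.equal(latents[:, :, 0], latent_model_input[:, :, 0])
assert torch.equal(latents[:, :, 1:], stepped[:, :, 1:])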
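Likewise, the decode path has to undo the per-channel latent normalization baked into AutoencoderKLWan. Note that the stored latents_std is inverted first, so `latents / latents_std + latents_mean` works out to `latents * std + mean`. A worked sketch with toy statistics (z_dim and the values are illustrative):

import torch

z_dim = 4                                  # toy value; Wan's VAE uses more channels
latents = torch.randn(1, z_dim, 3, 8, 8)
cfg_mean = torch.randn(z_dim)
cfg_std = torch.rand(z_dim) + 0.5

latents_mean = cfg_mean.view(1, z_dim, 1, 1, 1)
latents_std = 1.0 / cfg_std.view(1, z_dim, 1, 1, 1)  # note the inversion

denorm = latents / latents_std + latents_mean
expected = latents * cfg_std.view(1, z_dim, 1, 1, 1) + latents_mean
assert torch.allclose(denorm, expected)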