From a9eb16c742127e8428aca47a65593ae1107e3dad Mon Sep 17 00:00:00 2001 From: lllyasviel Date: Fri, 26 Jan 2024 00:41:31 -0800 Subject: [PATCH] i --- README.md | 118 ++++++++++++++++++ .../sd_forge_svd/scripts/forge_svd.py | 15 ++- 2 files changed, 129 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 2c6ba5ab..72c11c91 100644 --- a/README.md +++ b/README.md @@ -214,6 +214,124 @@ Implementing Stable Video Diffusion and Zero123 are also super simple now (see a *Stable Video Diffusion:* +`extensions-builtin/sd_forge_svd/scripts/forge_svd.py` + +```python +import torch +import gradio as gr +import os +import pathlib + +from modules import script_callbacks +from modules.paths import models_path +from modules.ui_common import ToolButton, refresh_symbol +from modules import shared + +from modules_forge.forge_util import numpy_to_pytorch, pytorch_to_numpy +from ldm_patched.modules.sd import load_checkpoint_guess_config +from ldm_patched.contrib.external_video_model import VideoLinearCFGGuidance, SVD_img2vid_Conditioning +from ldm_patched.contrib.external import KSampler, VAEDecode + + +opVideoLinearCFGGuidance = VideoLinearCFGGuidance() +opSVD_img2vid_Conditioning = SVD_img2vid_Conditioning() +opKSampler = KSampler() +opVAEDecode = VAEDecode() + +svd_root = os.path.join(models_path, 'svd') +os.makedirs(svd_root, exist_ok=True) +svd_filenames = [] + + +def update_svd_filenames(): + global svd_filenames + svd_filenames = [ + pathlib.Path(x).name for x in + shared.walk_files(svd_root, allowed_extensions=[".pt", ".ckpt", ".safetensors"]) + ] + return svd_filenames + + +@torch.inference_mode() +@torch.no_grad() +def predict(filename, width, height, video_frames, motion_bucket_id, fps, augmentation_level, + sampling_seed, sampling_steps, sampling_cfg, sampling_sampler_name, sampling_scheduler, + sampling_denoise, guidance_min_cfg, input_image): + filename = os.path.join(svd_root, filename) + model_raw, _, vae, clip_vision = \ + 
load_checkpoint_guess_config(filename, output_vae=True, output_clip=False, output_clipvision=True) + model = opVideoLinearCFGGuidance.patch(model_raw, guidance_min_cfg)[0] + init_image = numpy_to_pytorch(input_image) + positive, negative, latent_image = opSVD_img2vid_Conditioning.encode( + clip_vision, init_image, vae, width, height, video_frames, motion_bucket_id, fps, augmentation_level) + output_latent = opKSampler.sample(model, sampling_seed, sampling_steps, sampling_cfg, + sampling_sampler_name, sampling_scheduler, positive, + negative, latent_image, sampling_denoise)[0] + output_pixels = opVAEDecode.decode(vae, output_latent)[0] + outputs = pytorch_to_numpy(output_pixels) + return outputs + + +def on_ui_tabs(): + with gr.Blocks() as svd_block: + with gr.Row(): + with gr.Column(): + input_image = gr.Image(label='Input Image', source='upload', type='numpy', height=400) + + with gr.Row(): + filename = gr.Dropdown(label="SVD Checkpoint Filename", + choices=svd_filenames, + value=svd_filenames[0] if len(svd_filenames) > 0 else None) + refresh_button = ToolButton(value=refresh_symbol, tooltip="Refresh") + refresh_button.click( + fn=lambda: gr.update(choices=update_svd_filenames()), + inputs=[], outputs=filename) + + width = gr.Slider(label='Width', minimum=16, maximum=8192, step=8, value=1024) + height = gr.Slider(label='Height', minimum=16, maximum=8192, step=8, value=576) + video_frames = gr.Slider(label='Video Frames', minimum=1, maximum=4096, step=1, value=14) + motion_bucket_id = gr.Slider(label='Motion Bucket Id', minimum=1, maximum=1023, step=1, value=127) + fps = gr.Slider(label='Fps', minimum=1, maximum=1024, step=1, value=6) + augmentation_level = gr.Slider(label='Augmentation Level', minimum=0.0, maximum=10.0, step=0.01, + value=0.0) + sampling_steps = gr.Slider(label='Sampling Steps', minimum=1, maximum=200, step=1, value=20) + sampling_cfg = gr.Slider(label='CFG Scale', minimum=0.0, maximum=50.0, step=0.1, value=2.5) + sampling_denoise = 
gr.Slider(label='Sampling Denoise', minimum=0.0, maximum=1.0, step=0.01, value=1.0) + guidance_min_cfg = gr.Slider(label='Guidance Min Cfg', minimum=0.0, maximum=100.0, step=0.5, value=1.0) + sampling_sampler_name = gr.Radio(label='Sampler Name', + choices=['euler', 'euler_ancestral', 'heun', 'heunpp2', 'dpm_2', + 'dpm_2_ancestral', 'lms', 'dpm_fast', 'dpm_adaptive', + 'dpmpp_2s_ancestral', 'dpmpp_sde', 'dpmpp_sde_gpu', + 'dpmpp_2m', 'dpmpp_2m_sde', 'dpmpp_2m_sde_gpu', + 'dpmpp_3m_sde', 'dpmpp_3m_sde_gpu', 'ddpm', 'lcm', 'ddim', + 'uni_pc', 'uni_pc_bh2'], value='euler') + sampling_scheduler = gr.Radio(label='Scheduler', + choices=['normal', 'karras', 'exponential', 'sgm_uniform', 'simple', + 'ddim_uniform'], value='karras') + sampling_seed = gr.Number(label='Seed', value=12345, precision=0) + + generate_button = gr.Button(value="Generate") + + ctrls = [filename, width, height, video_frames, motion_bucket_id, fps, augmentation_level, + sampling_seed, sampling_steps, sampling_cfg, sampling_sampler_name, sampling_scheduler, + sampling_denoise, guidance_min_cfg, input_image] + + with gr.Column(): + output_gallery = gr.Gallery(label='Gallery', show_label=False, object_fit='contain', + visible=True, height=1024, columns=4) + + generate_button.click(predict, inputs=ctrls, outputs=[output_gallery]) + return [(svd_block, "SVD", "svd")] + + +update_svd_filenames() +script_callbacks.on_ui_tabs(on_ui_tabs) +``` + +Note that although the above code looks like an independent script, it actually offloads/unloads any other loaded models automatically. For example, below I open the webui, load SDXL, and generate an image; then I switch to the SVD tab and generate video frames. You can see that the GPU memory is perfectly managed: SDXL is moved to RAM and then SVD is moved to the GPU. + +Note that this management is fully automatic. This makes writing extensions super simple. 
+ ![image](https://github.com/lllyasviel/stable-diffusion-webui-forge/assets/19834515/ac7ed152-cd33-4645-94af-4c43bb8c3d88) diff --git a/extensions-builtin/sd_forge_svd/scripts/forge_svd.py b/extensions-builtin/sd_forge_svd/scripts/forge_svd.py index a062c56c..4a6ee121 100644 --- a/extensions-builtin/sd_forge_svd/scripts/forge_svd.py +++ b/extensions-builtin/sd_forge_svd/scripts/forge_svd.py @@ -26,7 +26,10 @@ svd_filenames = [] def update_svd_filenames(): global svd_filenames - svd_filenames = [pathlib.Path(x).name for x in shared.walk_files(svd_root, allowed_extensions=[".pt", ".ckpt", ".safetensors"])] + svd_filenames = [ + pathlib.Path(x).name for x in + shared.walk_files(svd_root, allowed_extensions=[".pt", ".ckpt", ".safetensors"]) + ] return svd_filenames @@ -36,11 +39,15 @@ def predict(filename, width, height, video_frames, motion_bucket_id, fps, augmen sampling_seed, sampling_steps, sampling_cfg, sampling_sampler_name, sampling_scheduler, sampling_denoise, guidance_min_cfg, input_image): filename = os.path.join(svd_root, filename) - model_raw, _, vae, clip_vision = load_checkpoint_guess_config(filename, output_vae=True, output_clip=False, output_clipvision=True) + model_raw, _, vae, clip_vision = \ + load_checkpoint_guess_config(filename, output_vae=True, output_clip=False, output_clipvision=True) model = opVideoLinearCFGGuidance.patch(model_raw, guidance_min_cfg)[0] init_image = numpy_to_pytorch(input_image) - positive, negative, latent_image = opSVD_img2vid_Conditioning.encode(clip_vision, init_image, vae, width, height, video_frames, motion_bucket_id, fps, augmentation_level) - output_latent = opKSampler.sample(model, sampling_seed, sampling_steps, sampling_cfg, sampling_sampler_name, sampling_scheduler, positive, negative, latent_image, sampling_denoise)[0] + positive, negative, latent_image = opSVD_img2vid_Conditioning.encode( + clip_vision, init_image, vae, width, height, video_frames, motion_bucket_id, fps, augmentation_level) + output_latent = 
opKSampler.sample(model, sampling_seed, sampling_steps, sampling_cfg, + sampling_sampler_name, sampling_scheduler, positive, + negative, latent_image, sampling_denoise)[0] output_pixels = opVAEDecode.decode(vae, output_latent)[0] outputs = pytorch_to_numpy(output_pixels) return outputs