mirror of
https://github.com/ostris/ai-toolkit.git
synced 2026-04-24 16:29:26 +00:00
203 lines
10 KiB
Python
203 lines
10 KiB
Python
from typing import Union, List, Optional, Dict, Any, Tuple
|
|
|
|
import torch
|
|
from diffusers import StableDiffusionXLPipeline
|
|
from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl import rescale_noise_cfg
|
|
|
|
|
|
class CustomStableDiffusionXLPipeline(StableDiffusionXLPipeline):
    """SDXL pipeline variant exposing a single-step noise prediction helper.

    Adds `predict_noise`, which performs one UNet forward pass instead of a
    full denoising loop, and replaces `enable_model_cpu_offload` with a
    logging no-op.
    """

    def predict_noise(
        self,
        prompt: Union[str, List[str]] = None,
        prompt_2: Optional[Union[str, List[str]]] = None,
        guidance_scale: float = 5.0,
        negative_prompt: Optional[Union[str, List[str]]] = None,
        negative_prompt_2: Optional[Union[str, List[str]]] = None,
        latents: Optional[torch.FloatTensor] = None,
        prompt_embeds: Optional[torch.FloatTensor] = None,
        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
        negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
        guidance_rescale: float = 0.0,
        timestep: Optional[int] = 1,
    ) -> torch.FloatTensor:
        """Run one UNet forward pass and return the predicted noise.

        Unlike the pipeline's generation entry point, this does not iterate
        over a denoising schedule, decode latents, or produce images. The
        spatial size is always derived from
        `self.default_sample_size * self.vae_scale_factor`, and exactly one
        sample is produced per prompt.

        Note: this method calls `self.scheduler.set_timesteps(1, ...)` on the
        pipeline's scheduler.

        Args:
            prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts to condition on. If not defined,
                `prompt_embeds` must be passed instead.
            prompt_2 (`str` or `List[str]`, *optional*):
                Forwarded to `encode_prompt` as `prompt_2`.
            guidance_scale (`float`, *optional*, defaults to 5.0):
                Classifier-free guidance weight (see
                https://arxiv.org/abs/2207.12598). Guidance is applied only
                when `guidance_scale > 1.0`.
            negative_prompt (`str` or `List[str]`, *optional*):
                Forwarded to `encode_prompt` as `negative_prompt`.
            negative_prompt_2 (`str` or `List[str]`, *optional*):
                Forwarded to `encode_prompt` as `negative_prompt_2`.
            latents (`torch.FloatTensor`, *optional*):
                Latents forwarded to `self.prepare_latents`. No random
                generator is supplied to that call (`None` is passed).
            prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-computed text embeddings. When `prompt` is neither a
                string nor a list, the batch size is taken from
                `prompt_embeds.shape[0]`.
            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-computed negative text embeddings, forwarded to
                `encode_prompt`.
            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-computed pooled text embeddings, forwarded to
                `encode_prompt`.
            negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-computed negative pooled text embeddings, forwarded to
                `encode_prompt`.
            cross_attention_kwargs (`dict`, *optional*):
                Passed through to the UNet call. Its `"scale"` entry, if
                present, is also forwarded to `encode_prompt` as
                `lora_scale`.
            guidance_rescale (`float`, *optional*, defaults to 0.0):
                Rescale factor from section 3.4 of
                https://arxiv.org/pdf/2305.08891.pdf. Applied only when
                guidance is active and the value is greater than 0.
            timestep (`int`, *optional*, defaults to `1`):
                The timestep value handed to the UNet.

        Returns:
            torch.FloatTensor: The predicted noise, after classifier-free
            guidance (and optional rescaling) when guidance is enabled.

        Raises:
            ValueError: If `prompt` is neither a string nor a list and
                `prompt_embeds` is not provided, so no batch size can be
                determined.
        """
        # 1. Default height and width come from the pipeline's sample size.
        height = self.default_sample_size * self.vae_scale_factor
        width = self.default_sample_size * self.vae_scale_factor

        original_size = (height, width)
        target_size = (height, width)

        # 2. Determine the batch size from whichever prompt input was given.
        if isinstance(prompt, str):
            batch_size = 1
        elif isinstance(prompt, list):
            batch_size = len(prompt)
        elif prompt_embeds is not None:
            batch_size = prompt_embeds.shape[0]
        else:
            # Previously this path failed with an opaque AttributeError on
            # `None.shape`; fail early with an actionable message instead.
            raise ValueError(
                "Either `prompt` (a str or a list of str) or `prompt_embeds` must be provided."
            )

        device = self._execution_device

        # `guidance_scale` is the guidance weight `w` of equation (2) of the
        # Imagen paper: https://arxiv.org/pdf/2205.11487.pdf .
        # `guidance_scale = 1` corresponds to no classifier-free guidance.
        do_classifier_free_guidance = guidance_scale > 1.0

        # 3. Encode the input prompt.
        text_encoder_lora_scale = (
            cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
        )

        num_images_per_prompt = 1

        (
            prompt_embeds,
            negative_prompt_embeds,
            pooled_prompt_embeds,
            negative_pooled_prompt_embeds,
        ) = self.encode_prompt(
            prompt=prompt,
            prompt_2=prompt_2,
            device=device,
            num_images_per_prompt=num_images_per_prompt,
            do_classifier_free_guidance=do_classifier_free_guidance,
            negative_prompt=negative_prompt,
            negative_prompt_2=negative_prompt_2,
            prompt_embeds=prompt_embeds,
            negative_prompt_embeds=negative_prompt_embeds,
            pooled_prompt_embeds=pooled_prompt_embeds,
            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
            lora_scale=text_encoder_lora_scale,
        )

        # 4. Prepare timesteps: the scheduler is configured for one step.
        self.scheduler.set_timesteps(1, device=device)
        timesteps = self.scheduler.timesteps

        # 5. Prepare latent variables (no generator is supplied).
        num_channels_latents = self.unet.config.in_channels
        latents = self.prepare_latents(
            batch_size * num_images_per_prompt,
            num_channels_latents,
            height,
            width,
            prompt_embeds.dtype,
            device,
            None,
            latents,
        )

        # 6. Prepare added time ids & embeddings.
        add_text_embeds = pooled_prompt_embeds
        crops_coords_top_left: Tuple[int, int] = (0, 0)
        add_time_ids = self._get_add_time_ids(
            original_size, crops_coords_top_left, target_size, dtype=prompt_embeds.dtype
        ).to(device)

        if do_classifier_free_guidance:
            # Unconditional inputs go first so `chunk(2)` below yields
            # (uncond, text) in that order.
            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
            add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0)
            add_time_ids = torch.cat([add_time_ids, add_time_ids], dim=0)

        prompt_embeds = prompt_embeds.to(device)
        add_text_embeds = add_text_embeds.to(device)
        add_time_ids = add_time_ids.to(device).repeat(batch_size * num_images_per_prompt, 1)

        # 7. Build the UNet input; duplicated when guidance is active so the
        # unconditional and conditional passes share one forward call.
        latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents

        # NOTE(review): input scaling uses the scheduler's own `timesteps`
        # (from `set_timesteps(1)`), while the UNet below receives the
        # caller-supplied `timestep`. Kept as-is; confirm this is intended.
        latent_model_input = self.scheduler.scale_model_input(latent_model_input, timesteps)

        # 8. Predict the noise residual.
        added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
        noise_pred = self.unet(
            latent_model_input,
            timestep=timestep,
            encoder_hidden_states=prompt_embeds,
            cross_attention_kwargs=cross_attention_kwargs,
            added_cond_kwargs=added_cond_kwargs,
            return_dict=False,
        )[0]

        # 9. Perform guidance.
        if do_classifier_free_guidance:
            noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
            noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

            if guidance_rescale > 0.0:
                # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
                noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale)

        return noise_pred

    def enable_model_cpu_offload(self, gpu_id=0) -> None:
        """Log the request and do nothing else.

        This override deliberately performs no offloading; it only prints
        that it was called, shadowing any inherited implementation of the
        same name.

        Args:
            gpu_id: Included in the printed message; otherwise unused.
        """
        print('Called cpu offload', gpu_id)
|