diff --git a/comfy/sd.py b/comfy/sd.py index 722c0c154..aa93e101d 100644 --- a/comfy/sd.py +++ b/comfy/sd.py @@ -554,6 +554,8 @@ class VAE: elif "decoder.layers.1.layers.0.beta" in sd: config = {} param_key = None + self.upscale_ratio = 2048 + self.downscale_ratio = 2048 if "decoder.layers.2.layers.1.weight_v" in sd: param_key = "decoder.layers.2.layers.1.weight_v" if "decoder.layers.2.layers.1.parametrizations.weight.original1" in sd: @@ -562,6 +564,8 @@ class VAE: if sd[param_key].shape[-1] == 12: config["strides"] = [2, 4, 4, 6, 10] self.audio_sample_rate = 48000 + self.upscale_ratio = 1920 + self.downscale_ratio = 1920 self.first_stage_model = AudioOobleckVAE(**config) self.memory_used_encode = lambda shape, dtype: (1000 * shape[2]) * model_management.dtype_size(dtype) @@ -569,8 +573,6 @@ class VAE: self.latent_channels = 64 self.output_channels = 2 self.pad_channel_value = "replicate" - self.upscale_ratio = 2048 - self.downscale_ratio = 2048 self.latent_dim = 1 self.process_output = lambda audio: audio self.process_input = lambda audio: audio @@ -870,7 +872,7 @@ class VAE: / 3.0) return output - def decode_tiled_1d(self, samples, tile_x=128, overlap=32): + def decode_tiled_1d(self, samples, tile_x=256, overlap=32): if samples.ndim == 3: decode_fn = lambda a: self.first_stage_model.decode(a.to(self.vae_dtype).to(self.device)).float() else: