fix: remove normalization of audio in LTX Mel spectrogram creation (#11990)

For LTX Audio VAE, remove normalization of audio during MEL spectrogram creation.
This aligns inference with training and prevents loud audio from being attenuated.
This commit is contained in:
Ivan Zorin
2026-01-21 01:44:28 +02:00
committed by GitHub
parent ddc541ffda
commit 965d0ed509

View File

@@ -103,20 +103,10 @@ class AudioPreprocessor:
return waveform return waveform
return torchaudio.functional.resample(waveform, source_rate, self.target_sample_rate) return torchaudio.functional.resample(waveform, source_rate, self.target_sample_rate)
# Remove the DC offset from a waveform and cap its peak amplitude at
# `max_amplitude`. Signals whose peak is already at or below the cap are
# only mean-centred; louder signals are scaled down to the cap.
@staticmethod
def normalize_amplitude(
waveform: torch.Tensor, max_amplitude: float = 0.5, eps: float = 1e-5
) -> torch.Tensor:
# Subtract the mean taken along dim=2 (kept as a size-1 dim so it broadcasts).
# NOTE(review): presumably dim 2 is the sample/time axis - confirm with callers.
waveform = waveform - waveform.mean(dim=2, keepdim=True)
# Largest absolute value over the ENTIRE tensor (no dim argument), so one
# scale factor is shared by every element; eps keeps the divisor non-zero
# when the input is all zeros.
peak = torch.max(torch.abs(waveform)) + eps
# scale == 1 when peak <= max_amplitude, otherwise max_amplitude / peak (< 1):
# the signal is never amplified, only attenuated.
scale = peak.clamp(max=max_amplitude) / peak
return waveform * scale
def waveform_to_mel( def waveform_to_mel(
self, waveform: torch.Tensor, waveform_sample_rate: int, device self, waveform: torch.Tensor, waveform_sample_rate: int, device
) -> torch.Tensor: ) -> torch.Tensor:
waveform = self.resample(waveform, waveform_sample_rate) waveform = self.resample(waveform, waveform_sample_rate)
waveform = self.normalize_amplitude(waveform)
mel_transform = torchaudio.transforms.MelSpectrogram( mel_transform = torchaudio.transforms.MelSpectrogram(
sample_rate=self.target_sample_rate, sample_rate=self.target_sample_rate,