diff --git a/README.md b/README.md index ea884c9..149493a 100644 --- a/README.md +++ b/README.md @@ -36,8 +36,8 @@ Original repo: https://github.com/index-tts/index-tts ``` ## Nodes -- **IndexTTS2 Simple** – speaker audio, text, optional emotion audio/vector; outputs audio + status string. Auto-selects device, FP16 on CUDA. -- **IndexTTS2 Advanced** – Simple inputs plus overrides for sampling, speech speed, pauses, CFG, seed. +- **IndexTTS2 Simple** - speaker audio, text, optional emotion audio/vector; outputs audio + status string. Auto-selects device (FP32 by default; optional FP16 toggle) and includes an output gain scaler. +- **IndexTTS2 Advanced** - Simple inputs plus overrides for sampling, speech speed, pauses, CFG, seed, FP16 toggle, and output gain. - **IndexTTS2 Emotion Vector** – eight sliders (0.0–1.4, sum <= 1.5) producing an emotion vector. - **IndexTTS2 Emotion From Text** – requires ModelScope and local QwenEmotion; turns short text into an emotion vector + summary. @@ -56,6 +56,8 @@ Original repo: https://github.com/index-tts/index-tts - 404 or load failures usually mean a missing file in `checkpoints/`; re-check the tree above. - Emotion vector sum must stay <= 1.5. - BigVGAN CUDA kernel warnings are expected; PyTorch fallback kicks in automatically. +- Hearing metallic warble? Leave `use_fp16` off; enable it only if you really need more speed and accept the artifacts. +- Need more level? Raise `output_gain` (the gain itself is clamped to [0, 4]; any scaled samples outside [-1, 1] are hard-clipped). 
## Logs you should see - `Loading config.json from local directory` @@ -66,3 +68,5 @@ Original repo: https://github.com/index-tts/index-tts + + diff --git a/nodes/indextts2_node.py b/nodes/indextts2_node.py index d5e36c8..cfb181d 100644 --- a/nodes/indextts2_node.py +++ b/nodes/indextts2_node.py @@ -1,4 +1,4 @@ -import gc +import gc import os import sys import tempfile @@ -340,6 +340,8 @@ class IndexTTS2Simple: "optional": { "emotion_audio": ("AUDIO",), "emotion_vector": ("EMOTION_VECTOR",), + "use_fp16": ("BOOLEAN", {"default": False}), + "output_gain": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 4.0, "step": 0.05}), }, } @@ -352,7 +354,7 @@ class IndexTTS2Simple: text: str, emotion_control_weight: float, emotion_audio=None, - emotion_vector=None): + emotion_vector=None, use_fp16=False, output_gain=1.0): if not isinstance(text, str) or len(text.strip()) == 0: raise ValueError("Text is empty. Please provide text to synthesize.") @@ -377,17 +379,28 @@ class IndexTTS2Simple: raise FileNotFoundError(f"Model directory not found: {resolved_model_dir}") resolved_device = _resolve_device("auto") + use_fp16_flag = bool(use_fp16) tts2 = _get_tts2_model( config_path=resolved_config, model_dir=resolved_model_dir, device=resolved_device, use_cuda_kernel=False, - use_fp16=True, + use_fp16=use_fp16_flag, ) emo_alpha = max(0.0, min(1.0, float(emotion_control_weight))) - emo_vector = None ui_msgs = [] + ui_msgs.append(f"Model precision: {'FP16' if use_fp16_flag else 'FP32'}") + + try: + gain_value = float(output_gain) + except (TypeError, ValueError): + gain_value = 1.0 + if not math.isfinite(gain_value): + gain_value = 1.0 + gain_value = max(0.0, min(4.0, gain_value)) + + emo_vector = None if emotion_vector is not None: try: vec = list(emotion_vector) @@ -461,6 +474,13 @@ class IndexTTS2Simple: if mono.ndim != 1: mono = mono.flatten() + if gain_value != 1.0: + mono = np.clip(mono * gain_value, -1.0, 1.0) + ui_msgs.append(f"Output gain applied: {gain_value:.2f}x") + waveform = 
torch.from_numpy(mono[None, None, :].astype(np.float32)) #(B=1, C=1, N) info_text = "\n".join(ui_msgs) if ui_msgs else "" return ({"sample_rate": int(sr), "waveform": waveform}, info_text) + + + diff --git a/nodes/indextts2_node_advanced.py b/nodes/indextts2_node_advanced.py index d569d19..139a09b 100644 --- a/nodes/indextts2_node_advanced.py +++ b/nodes/indextts2_node_advanced.py @@ -1,4 +1,4 @@ -import os +import os import random import numpy as np @@ -123,6 +123,8 @@ class IndexTTS2Advanced: "typical_sampling": ("BOOLEAN", {"default": False}), "typical_mass": ("FLOAT", {"default": 0.9, "min": 0.0, "max": 2000.0, "step": 0.01}), "speech_speed": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 4.0, "step": 0.05}), + "use_fp16": ("BOOLEAN", {"default": False}), + "output_gain": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 4.0, "step": 0.05}), }, } @@ -150,7 +152,9 @@ class IndexTTS2Advanced: max_mel_tokens: int = 1500, typical_sampling: bool = False, typical_mass: float = 0.9, - speech_speed: float = 1.0): + speech_speed: float = 1.0, + use_fp16: bool = False, + output_gain: float = 1.0): if not isinstance(text, str) or len(text.strip()) == 0: raise ValueError("Text is empty. 
Please provide text to synthesize.") @@ -181,12 +185,13 @@ class IndexTTS2Advanced: raise FileNotFoundError(f"Model directory not found: {resolved_model_dir}") resolved_device = _resolve_device("auto") + use_fp16_flag = _coerce_bool(use_fp16, False) tts2 = _get_tts2_model( config_path=resolved_config, model_dir=resolved_model_dir, device=resolved_device, use_cuda_kernel=False, - use_fp16=True, + use_fp16=use_fp16_flag, ) torch_mod = None @@ -219,6 +224,9 @@ class IndexTTS2Advanced: emo_alpha = max(0.0, min(1.0, float(emotion_control_weight))) emo_audio_prompt = emo_path if emo_path else prompt_path ui_msgs = [] + ui_msgs.append(f"Model precision: {'FP16' if use_fp16_flag else 'FP32'}") + + gain_value = _coerce_float(output_gain, 1.0, clamp=(0.0, 4.0)) emo_vector_arg = None if emotion_vector is not None: @@ -330,11 +338,16 @@ class IndexTTS2Advanced: if mono.ndim != 1: mono = mono.flatten() - waveform = torch_lib.from_numpy(mono[None, None, :].astype(np.float32)) - info_lines = [] if ui_msgs: info_lines.extend(ui_msgs) + + if gain_value != 1.0: + mono = np.clip(mono * gain_value, -1.0, 1.0) + info_lines.append(f"Output gain applied: {gain_value:.2f}x") + + waveform = torch_lib.from_numpy(mono[None, None, :].astype(np.float32)) + info_lines.append(f"Seed: {seed_info}") if do_sample: info_lines.append(f"Sampling: temp={temperature:.2f}, top_p={top_p:.2f}, top_k={top_k}") @@ -348,3 +361,5 @@ class IndexTTS2Advanced: info_text = "\n".join(info_lines) return ({"sample_rate": int(sr), "waveform": waveform}, info_text) + +