Merge branch 'dev-fp32-gain-toggles'

This commit is contained in:
snicolast
2025-10-08 15:35:06 +13:00
5 changed files with 231 additions and 17 deletions

View File

@@ -17,11 +17,27 @@ Original repo: https://github.com/index-tts/index-tts
```
## Models
- Create `checkpoints/` in the repo root and copy the IndexTTS-2 release there (https://huggingface.co/IndexTeam/IndexTTS-2/tree/main). Missing files will be cached from Hugging Face automatically.
- Create `checkpoints/` in the repo root and copy the IndexTTS-2 release there (https://huggingface.co/IndexTeam/IndexTTS-2/tree/main). Missing files will be cached from Hugging Face automatically, but a full local copy keeps everything offline.
- For full offline use download once and place the files below:
- `facebook/w2v-bert-2.0` -> `checkpoints/w2v-bert-2.0/` (the loader checks this folder before contacting Hugging Face)
- BigVGAN config and weights -> `checkpoints/bigvgan/`
- MaskGCT semantic codec -> `checkpoints/semantic_codec/model.safetensors`
- CAMPPlus model -> `checkpoints/campplus_cn_common.bin`
- Optional: QwenEmotion (`qwen0.6bemo4-merge/`) for the text-to-emotion helper node
- Typical layout:
```
checkpoints/
config.yaml, gpt.pth, s2mel.pth, bpe.model, feat*.pt, wav2vec2bert_stats.pt
bigvgan/{config.json,bigvgan_generator.pt}
semantic_codec/model.safetensors
campplus_cn_common.bin
qwen0.6bemo4-merge/[model files]
w2v-bert-2.0/[HF files]
```
## Nodes
- **IndexTTS2 Simple** speaker audio, text, optional emotion audio/vector; outputs audio + status string. Auto-selects device, FP16 on CUDA.
- **IndexTTS2 Advanced** Simple inputs plus overrides for sampling, speech speed, pauses, CFG, seed.
- **IndexTTS2 Simple** - speaker audio, text, optional emotion audio/vector; outputs audio + status string. Auto-selects device (FP32 by default; optional FP16 toggle) and includes an output gain scaler.
- **IndexTTS2 Advanced** - Simple inputs plus overrides for sampling, speech speed, pauses, CFG, seed, FP16 toggle, and output gain.
- **IndexTTS2 Emotion Vector** - eight sliders (0.0-1.4 each, sum <= 1.5) producing an emotion vector.
- **IndexTTS2 Emotion From Text** - requires ModelScope and a local QwenEmotion model; turns short text into an emotion vector + summary.
@@ -36,4 +52,14 @@ Original repo: https://github.com/index-tts/index-tts
## Troubleshooting
- Windows only so far; DeepSpeed is disabled.
- Install `wetext` if the module is missing on first launch.
- If w2v-bert keeps downloading, confirm `checkpoints/w2v-bert-2.0/` exists (or set `W2V_BERT_LOCAL_DIR`).
- 404 or load failures usually mean a missing file in `checkpoints/`; re-check the tree above.
- Emotion vector sum must stay <= 1.5.
- BigVGAN CUDA kernel warnings are expected; PyTorch fallback kicks in automatically.
- Hearing metallic warble? Leave `use_fp16` off; enable it only if you really need more speed and accept the artifacts.
- Need more level? Raise `output_gain`. Samples that exceed full scale after the gain is applied are hard-clipped to [-1, 1], so large gains can distort.
## Logs you should see
- `Loading config.json from local directory`
- `SeamlessM4TFeatureExtractor loaded from: checkpoints/w2v-bert-2.0/`
- Model paths pointing at your `checkpoints/` tree.

View File

@@ -1,18 +1,22 @@
from .nodes.indextts2_node import IndexTTS2Simple
from .nodes.indextts2_node import IndexTTS2Simple
from .nodes.indextts2_node_advanced import IndexTTS2Advanced
from .nodes.indextts2_node_emovec import IndexTTS2EmotionVector
from .nodes.indextts2_node_emotext import IndexTTS2EmotionFromText
from .nodes.indextts2_save_audio import IndexTTS2SaveAudio
NODE_CLASS_MAPPINGS = {
"IndexTTS2Simple": IndexTTS2Simple,
"IndexTTS2Advanced": IndexTTS2Advanced,
"IndexTTS2EmotionVector": IndexTTS2EmotionVector,
"IndexTTS2EmotionFromText": IndexTTS2EmotionFromText,
"IndexTTS2EmotionVector": IndexTTS2EmotionVector,
"IndexTTS2SaveAudio": IndexTTS2SaveAudio,
"IndexTTS2Simple": IndexTTS2Simple,
}
NODE_DISPLAY_NAME_MAPPINGS = {
"IndexTTS2Simple": "IndexTTS2 Simple",
"IndexTTS2Advanced": "IndexTTS2 Advanced",
"IndexTTS2EmotionVector": "IndexTTS2 Emotion Vector",
"IndexTTS2EmotionFromText": "IndexTTS2 Emotion From Text",
"IndexTTS2EmotionVector": "IndexTTS2 Emotion Vector",
"IndexTTS2SaveAudio": "IndexTTS2 Save Audio",
"IndexTTS2Simple": "IndexTTS2 Simple",
}

View File

@@ -1,8 +1,9 @@
import gc
import gc
import os
import sys
import tempfile
import threading
import math
from functools import wraps
from typing import Any, Dict, Tuple
@@ -340,6 +341,8 @@ class IndexTTS2Simple:
"optional": {
"emotion_audio": ("AUDIO",),
"emotion_vector": ("EMOTION_VECTOR",),
"use_fp16": ("BOOLEAN", {"default": False}),
"output_gain": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 4.0, "step": 0.05}),
},
}
@@ -352,7 +355,7 @@ class IndexTTS2Simple:
text: str,
emotion_control_weight: float,
emotion_audio=None,
emotion_vector=None):
emotion_vector=None, use_fp16=False, output_gain=1.0):
if not isinstance(text, str) or len(text.strip()) == 0:
raise ValueError("Text is empty. Please provide text to synthesize.")
@@ -377,17 +380,28 @@ class IndexTTS2Simple:
raise FileNotFoundError(f"Model directory not found: {resolved_model_dir}")
resolved_device = _resolve_device("auto")
use_fp16_flag = bool(use_fp16)
tts2 = _get_tts2_model(
config_path=resolved_config,
model_dir=resolved_model_dir,
device=resolved_device,
use_cuda_kernel=False,
use_fp16=True,
use_fp16=use_fp16_flag,
)
emo_alpha = max(0.0, min(1.0, float(emotion_control_weight)))
emo_vector = None
ui_msgs = []
ui_msgs.append(f"Model precision: {'FP16' if use_fp16_flag else 'FP32'}")
try:
gain_value = float(output_gain)
except (TypeError, ValueError):
gain_value = 1.0
if not math.isfinite(gain_value):
gain_value = 1.0
gain_value = max(0.0, min(4.0, gain_value))
emo_vector = None
if emotion_vector is not None:
try:
vec = list(emotion_vector)
@@ -461,6 +475,13 @@ class IndexTTS2Simple:
if mono.ndim != 1:
mono = mono.flatten()
if gain_value != 1.0:
mono = np.clip(mono * gain_value, -1.0, 1.0)
ui_msgs.append(f"Output gain applied: {gain_value:.2f}x")
waveform = torch.from_numpy(mono[None, None, :].astype(np.float32)) #(B=1, C=1, N)
info_text = "\n".join(ui_msgs) if ui_msgs else ""
return ({"sample_rate": int(sr), "waveform": waveform}, info_text)

View File

@@ -1,4 +1,4 @@
import os
import os
import random
import numpy as np
@@ -123,6 +123,8 @@ class IndexTTS2Advanced:
"typical_sampling": ("BOOLEAN", {"default": False}),
"typical_mass": ("FLOAT", {"default": 0.9, "min": 0.0, "max": 2000.0, "step": 0.01}),
"speech_speed": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 4.0, "step": 0.05}),
"use_fp16": ("BOOLEAN", {"default": False}),
"output_gain": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 4.0, "step": 0.05}),
},
}
@@ -150,7 +152,9 @@ class IndexTTS2Advanced:
max_mel_tokens: int = 1500,
typical_sampling: bool = False,
typical_mass: float = 0.9,
speech_speed: float = 1.0):
speech_speed: float = 1.0,
use_fp16: bool = False,
output_gain: float = 1.0):
if not isinstance(text, str) or len(text.strip()) == 0:
raise ValueError("Text is empty. Please provide text to synthesize.")
@@ -181,12 +185,13 @@ class IndexTTS2Advanced:
raise FileNotFoundError(f"Model directory not found: {resolved_model_dir}")
resolved_device = _resolve_device("auto")
use_fp16_flag = _coerce_bool(use_fp16, False)
tts2 = _get_tts2_model(
config_path=resolved_config,
model_dir=resolved_model_dir,
device=resolved_device,
use_cuda_kernel=False,
use_fp16=True,
use_fp16=use_fp16_flag,
)
torch_mod = None
@@ -219,6 +224,9 @@ class IndexTTS2Advanced:
emo_alpha = max(0.0, min(1.0, float(emotion_control_weight)))
emo_audio_prompt = emo_path if emo_path else prompt_path
ui_msgs = []
ui_msgs.append(f"Model precision: {'FP16' if use_fp16_flag else 'FP32'}")
gain_value = _coerce_float(output_gain, 1.0, clamp=(0.0, 4.0))
emo_vector_arg = None
if emotion_vector is not None:
@@ -330,11 +338,16 @@ class IndexTTS2Advanced:
if mono.ndim != 1:
mono = mono.flatten()
waveform = torch_lib.from_numpy(mono[None, None, :].astype(np.float32))
info_lines = []
if ui_msgs:
info_lines.extend(ui_msgs)
if gain_value != 1.0:
mono = np.clip(mono * gain_value, -1.0, 1.0)
info_lines.append(f"Output gain applied: {gain_value:.2f}x")
waveform = torch_lib.from_numpy(mono[None, None, :].astype(np.float32))
info_lines.append(f"Seed: {seed_info}")
if do_sample:
info_lines.append(f"Sampling: temp={temperature:.2f}, top_p={top_p:.2f}, top_k={top_k}")
@@ -348,3 +361,5 @@ class IndexTTS2Advanced:
info_text = "\n".join(info_lines)
return ({"sample_rate": int(sr), "waveform": waveform}, info_text)

View File

@@ -0,0 +1,148 @@
import os
from typing import List
import numpy as np
import folder_paths
class IndexTTS2SaveAudio:
    """ComfyUI node that writes an AUDIO input to disk as WAV or MP3.

    The node returns the incoming AUDIO dict unchanged together with a
    newline-separated string of the file paths that were written, so it can
    sit in the middle of a graph. Optional peak normalization only affects
    the data written to disk; the input waveform is never modified.
    """

    @classmethod
    def INPUT_TYPES(cls):
        """Return the node's input specification (required and optional sockets)."""
        return {
            "required": {
                "audio": ("AUDIO",),
                "name": ("STRING", {"default": "tts2", "placeholder": "file name prefix"}),
                "format": ("COMBO", {"options": ["wav", "mp3"], "default": "wav"}),
            },
            "optional": {
                "normalize_peak": ("BOOLEAN", {"default": False, "tooltip": "Normalize peak to ~0.98 before saving."}),
                # WAV
                "wav_pcm": ("COMBO", {"options": ["pcm16", "pcm24", "f32"], "default": "pcm16"}),
                # MP3
                "mp3_bitrate": ("COMBO", {"options": ["128k", "192k", "256k", "320k"], "default": "320k"}),
            },
        }

    RETURN_TYPES = ("AUDIO", "STRING")
    RETURN_NAMES = ("audio", "saved_path")
    FUNCTION = "save"
    CATEGORY = "Audio/IndexTTS"

    def _normalize(self, mono: np.ndarray):
        """Return ``mono`` scaled so its absolute peak is 0.98.

        The peak is taken over every element, so passing a multi-channel
        array scales all channels by the same factor. Arrays whose peak is
        at most 1e-6 (silence or empty) are returned as-is. A scaled result
        is always a new array; the argument is not written to.
        """
        peak = float(np.max(np.abs(mono))) if mono.size else 0.0
        if peak > 1e-6:
            mono = np.clip(mono * (0.98 / peak), -1.0, 1.0)
        return mono

    def _save_wav(self, path: str, data: np.ndarray, sr: int, pcm: str):
        """Write ``data`` (channels, samples) to ``path`` as a WAV file.

        ``soundfile`` is tried first. If it cannot be imported or the write
        fails, 16-bit output falls back to the standard-library ``wave``
        module; for any other ``pcm`` value the original error is re-raised.

        Returns:
            True once the file has been written.
        """
        try:
            import soundfile as sf  # type: ignore

            subtype = {
                "pcm16": "PCM_16",
                "pcm24": "PCM_24",
                "f32": "FLOAT",
            }.get(pcm, "PCM_16")
            # soundfile expects (frames, channels), hence the transpose.
            sf.write(path, data.T, sr, subtype=subtype, format="WAV")
            return True
        except Exception:
            # Deliberate best-effort: fall back to `wave`, which can only
            # produce the 16-bit variant here.
            if pcm != "pcm16":
                raise
            import wave

            pcm16 = (np.clip(data, -1.0, 1.0) * 32767.0).astype(np.int16)
            with wave.open(path, "wb") as wf:
                wf.setnchannels(int(data.shape[0]))
                wf.setsampwidth(2)
                wf.setframerate(int(sr))
                # Interleave channels: (samples, channels) in C order.
                wf.writeframes(pcm16.T.tobytes())
            return True

    def _compose_paths(self, name_prefix: str, batch_count: int) -> List[str]:
        """Build one extension-less output path per batch item.

        Uses ``folder_paths.get_save_image_path`` to obtain the output
        folder, base filename and starting counter, then increments the
        counter for each batch item.
        """
        output_dir = folder_paths.get_output_directory()
        # Use Comfy's helper to build prefix and a counter
        full_output_folder, filename, counter, _subfolder, _prefix = folder_paths.get_save_image_path(
            f"audio/{name_prefix}", output_dir
        )
        paths = []
        for b in range(batch_count):
            filename_with_batch = filename.replace("%batch_num%", str(b))
            file = f"{filename_with_batch}_{counter:05}_"
            paths.append(os.path.join(full_output_folder, file))
            counter += 1
        return paths

    def _save_with_av(self, fmt: str, audio, filename_prefix: str, quality: str = "320k") -> List[str]:
        """Save ``audio`` as MP3 through ``comfy_extras.nodes_audio.SaveAudioMP3``.

        Returns:
            The paths reported by the saver, joined onto Comfy's output
            directory.

        Raises:
            RuntimeError: ``comfy_extras.nodes_audio`` cannot be imported.
            ValueError: ``fmt`` is anything other than ``"mp3"``.
        """
        try:
            from comfy_extras import nodes_audio as ce_audio  # type: ignore
        except Exception as e:
            raise RuntimeError(f"PyAV save requires comfy_extras.nodes_audio: {e}") from e
        if fmt == "mp3":
            saver = ce_audio.SaveAudioMP3()
            # NOTE(review): confirm the installed saver honors every bitrate
            # offered in INPUT_TYPES (e.g. "192k", "256k").
            ui = saver.save_mp3(audio, filename_prefix=filename_prefix, format="mp3", quality=quality)
        else:
            raise ValueError(f"Unsupported format for AV saver (mp3 only): {fmt}")
        results = ui.get("ui", {}).get("audio", [])
        base = folder_paths.get_output_directory()
        out: List[str] = []
        for item in results:
            sub = item.get("subfolder") or ""
            out.append(os.path.join(base, sub, item.get("filename", "")))
        return out

    def save(self, audio, name: str, format: str,
             normalize_peak: bool = False,
             wav_pcm: str = "pcm16",
             mp3_bitrate: str = "320k"):
        """Save ``audio`` to disk and pass it through unchanged.

        Args:
            audio: dict with a ``"waveform"`` shaped (B, C, N) and a
                ``"sample_rate"``.
            name: file name prefix; blank values fall back to ``"tts2"``.
            format: ``"wav"`` or ``"mp3"``.
            normalize_peak: scale each batch item so its peak is ~0.98
                before writing (all channels share one scale factor).
            wav_pcm: WAV sample format (``"pcm16"``, ``"pcm24"``, ``"f32"``).
            mp3_bitrate: quality string forwarded to the MP3 saver.

        Returns:
            ``(audio, saved)`` where ``audio`` is the untouched input and
            ``saved`` is the newline-joined list of written paths.

        Raises:
            ValueError: the waveform is not 3-D or ``format`` is unsupported.
        """
        wav = audio["waveform"]
        # Accept anything int() can convert (e.g. NumPy integer scalars)
        # rather than silently falling back for non-builtin numbers.
        try:
            sr = int(audio.get("sample_rate"))
        except (TypeError, ValueError, OverflowError):
            sr = 22050
        if hasattr(wav, "cpu"):
            wav = wav.cpu().numpy()
        wav = np.asarray(wav)
        # Shape: (B, C, N)
        if wav.ndim != 3:
            raise ValueError("AUDIO input must be shaped (B, C, N)")

        # Prepare per-batch data as float32 in [-1,1]
        batch = []
        for b in range(wav.shape[0]):
            np_w = wav[b]
            if np_w.dtype == np.int16:
                np_w = np_w.astype(np.float32) / 32767.0
            elif np_w.dtype != np.float32:
                np_w = np_w.astype(np.float32)
            if normalize_peak:
                # `np_w` may be a view that shares memory with the caller's
                # tensor, so rebind to the scaled copy instead of assigning
                # into it. One joint peak keeps the channel balance.
                np_w = self._normalize(np_w)
            batch.append(np_w)

        name_prefix = (name or "tts2").strip() or "tts2"
        paths: List[str] = []
        if format == "wav":
            base_paths = self._compose_paths(name_prefix, len(batch))
            for np_w, base in zip(batch, base_paths):
                out_path = base + ".wav"
                os.makedirs(os.path.dirname(out_path), exist_ok=True)
                self._save_wav(out_path, np_w, sr, wav_pcm)
                paths.append(out_path)
        elif format == "mp3":
            audio_to_save = audio
            if normalize_peak and batch:
                # Hand the saver an explicit normalized copy so the option
                # applies to MP3 output without touching the input tensor.
                import torch

                audio_to_save = dict(audio)
                audio_to_save["waveform"] = torch.from_numpy(np.stack(batch))
            paths = self._save_with_av(
                "mp3", audio_to_save, filename_prefix=f"audio/{name_prefix}", quality=mp3_bitrate
            )
        else:
            raise ValueError(f"Unsupported format: {format}")

        saved = "\n".join(paths)
        # passthrough audio so the graph can continue if needed
        return (audio, saved)