mirror of
https://github.com/snicolast/ComfyUI-IndexTTS2.git
synced 2026-01-26 14:39:44 +00:00
node changes for audio
This commit is contained in:
@@ -36,8 +36,8 @@ Original repo: https://github.com/index-tts/index-tts
|
||||
```
|
||||
|
||||
## Nodes
|
||||
- **IndexTTS2 Simple** – speaker audio, text, optional emotion audio/vector; outputs audio + status string. Auto-selects device, FP16 on CUDA.
|
||||
- **IndexTTS2 Advanced** – Simple inputs plus overrides for sampling, speech speed, pauses, CFG, seed.
|
||||
- **IndexTTS2 Simple** - speaker audio, text, optional emotion audio/vector; outputs audio + status string. Auto-selects the device, runs in FP32 by default (optional `use_fp16` toggle), and includes an `output_gain` scaler.
|
||||
- **IndexTTS2 Advanced** - Simple inputs plus overrides for sampling, speech speed, pauses, CFG, seed, FP16 toggle, and output gain.
|
||||
- **IndexTTS2 Emotion Vector** – eight sliders (0.0–1.4, sum <= 1.5) producing an emotion vector.
|
||||
- **IndexTTS2 Emotion From Text** – requires ModelScope and local QwenEmotion; turns short text into an emotion vector + summary.
|
||||
|
||||
@@ -56,6 +56,8 @@ Original repo: https://github.com/index-tts/index-tts
|
||||
- 404 or load failures usually mean a missing file in `checkpoints/`; re-check the tree above.
|
||||
- Emotion vector sum must stay <= 1.5.
|
||||
- BigVGAN CUDA kernel warnings are expected; PyTorch fallback kicks in automatically.
|
||||
- Hearing metallic warble? Leave `use_fp16` off; enable it only if you really need more speed and accept the artifacts.
|
||||
- Need more level? Raise `output_gain` (0.0–4.0, default 1.0). Samples that exceed [-1, 1] after scaling are hard-clipped, so large gains can distort.
|
||||
|
||||
## Logs you should see
|
||||
- `Loading config.json from local directory`
|
||||
@@ -66,3 +68,5 @@ Original repo: https://github.com/index-tts/index-tts
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
import gc
|
||||
import gc
|
||||
import os
|
||||
import sys
|
||||
import tempfile
|
||||
@@ -340,6 +340,8 @@ class IndexTTS2Simple:
|
||||
"optional": {
|
||||
"emotion_audio": ("AUDIO",),
|
||||
"emotion_vector": ("EMOTION_VECTOR",),
|
||||
"use_fp16": ("BOOLEAN", {"default": False}),
|
||||
"output_gain": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 4.0, "step": 0.05}),
|
||||
},
|
||||
}
|
||||
|
||||
@@ -352,7 +354,7 @@ class IndexTTS2Simple:
|
||||
text: str,
|
||||
emotion_control_weight: float,
|
||||
emotion_audio=None,
|
||||
emotion_vector=None):
|
||||
emotion_vector=None, use_fp16=False, output_gain=1.0):
|
||||
|
||||
if not isinstance(text, str) or len(text.strip()) == 0:
|
||||
raise ValueError("Text is empty. Please provide text to synthesize.")
|
||||
@@ -377,17 +379,28 @@ class IndexTTS2Simple:
|
||||
raise FileNotFoundError(f"Model directory not found: {resolved_model_dir}")
|
||||
|
||||
resolved_device = _resolve_device("auto")
|
||||
use_fp16_flag = bool(use_fp16)
|
||||
tts2 = _get_tts2_model(
|
||||
config_path=resolved_config,
|
||||
model_dir=resolved_model_dir,
|
||||
device=resolved_device,
|
||||
use_cuda_kernel=False,
|
||||
use_fp16=True,
|
||||
use_fp16=use_fp16_flag,
|
||||
)
|
||||
|
||||
emo_alpha = max(0.0, min(1.0, float(emotion_control_weight)))
|
||||
emo_vector = None
|
||||
ui_msgs = []
|
||||
ui_msgs.append(f"Model precision: {'FP16' if use_fp16_flag else 'FP32'}")
|
||||
|
||||
try:
|
||||
gain_value = float(output_gain)
|
||||
except (TypeError, ValueError):
|
||||
gain_value = 1.0
|
||||
if not math.isfinite(gain_value):
|
||||
gain_value = 1.0
|
||||
gain_value = max(0.0, min(4.0, gain_value))
|
||||
|
||||
emo_vector = None
|
||||
if emotion_vector is not None:
|
||||
try:
|
||||
vec = list(emotion_vector)
|
||||
@@ -461,6 +474,13 @@ class IndexTTS2Simple:
|
||||
if mono.ndim != 1:
|
||||
mono = mono.flatten()
|
||||
|
||||
if gain_value != 1.0:
|
||||
mono = np.clip(mono * gain_value, -1.0, 1.0)
|
||||
ui_msgs.append(f"Output gain applied: {gain_value:.2f}x")
|
||||
|
||||
waveform = torch.from_numpy(mono[None, None, :].astype(np.float32)) #(B=1, C=1, N)
|
||||
info_text = "\n".join(ui_msgs) if ui_msgs else ""
|
||||
return ({"sample_rate": int(sr), "waveform": waveform}, info_text)
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
import os
|
||||
import os
|
||||
import random
|
||||
import numpy as np
|
||||
|
||||
@@ -123,6 +123,8 @@ class IndexTTS2Advanced:
|
||||
"typical_sampling": ("BOOLEAN", {"default": False}),
|
||||
"typical_mass": ("FLOAT", {"default": 0.9, "min": 0.0, "max": 2000.0, "step": 0.01}),
|
||||
"speech_speed": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 4.0, "step": 0.05}),
|
||||
"use_fp16": ("BOOLEAN", {"default": False}),
|
||||
"output_gain": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 4.0, "step": 0.05}),
|
||||
},
|
||||
}
|
||||
|
||||
@@ -150,7 +152,9 @@ class IndexTTS2Advanced:
|
||||
max_mel_tokens: int = 1500,
|
||||
typical_sampling: bool = False,
|
||||
typical_mass: float = 0.9,
|
||||
speech_speed: float = 1.0):
|
||||
speech_speed: float = 1.0,
|
||||
use_fp16: bool = False,
|
||||
output_gain: float = 1.0):
|
||||
|
||||
if not isinstance(text, str) or len(text.strip()) == 0:
|
||||
raise ValueError("Text is empty. Please provide text to synthesize.")
|
||||
@@ -181,12 +185,13 @@ class IndexTTS2Advanced:
|
||||
raise FileNotFoundError(f"Model directory not found: {resolved_model_dir}")
|
||||
|
||||
resolved_device = _resolve_device("auto")
|
||||
use_fp16_flag = _coerce_bool(use_fp16, False)
|
||||
tts2 = _get_tts2_model(
|
||||
config_path=resolved_config,
|
||||
model_dir=resolved_model_dir,
|
||||
device=resolved_device,
|
||||
use_cuda_kernel=False,
|
||||
use_fp16=True,
|
||||
use_fp16=use_fp16_flag,
|
||||
)
|
||||
|
||||
torch_mod = None
|
||||
@@ -219,6 +224,9 @@ class IndexTTS2Advanced:
|
||||
emo_alpha = max(0.0, min(1.0, float(emotion_control_weight)))
|
||||
emo_audio_prompt = emo_path if emo_path else prompt_path
|
||||
ui_msgs = []
|
||||
ui_msgs.append(f"Model precision: {'FP16' if use_fp16_flag else 'FP32'}")
|
||||
|
||||
gain_value = _coerce_float(output_gain, 1.0, clamp=(0.0, 4.0))
|
||||
|
||||
emo_vector_arg = None
|
||||
if emotion_vector is not None:
|
||||
@@ -330,11 +338,16 @@ class IndexTTS2Advanced:
|
||||
if mono.ndim != 1:
|
||||
mono = mono.flatten()
|
||||
|
||||
waveform = torch_lib.from_numpy(mono[None, None, :].astype(np.float32))
|
||||
|
||||
info_lines = []
|
||||
if ui_msgs:
|
||||
info_lines.extend(ui_msgs)
|
||||
|
||||
if gain_value != 1.0:
|
||||
mono = np.clip(mono * gain_value, -1.0, 1.0)
|
||||
info_lines.append(f"Output gain applied: {gain_value:.2f}x")
|
||||
|
||||
waveform = torch_lib.from_numpy(mono[None, None, :].astype(np.float32))
|
||||
|
||||
info_lines.append(f"Seed: {seed_info}")
|
||||
if do_sample:
|
||||
info_lines.append(f"Sampling: temp={temperature:.2f}, top_p={top_p:.2f}, top_k={top_k}")
|
||||
@@ -348,3 +361,5 @@ class IndexTTS2Advanced:
|
||||
info_text = "\n".join(info_lines)
|
||||
return ({"sample_rate": int(sr), "waveform": waveform}, info_text)
|
||||
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user