save audio - embedded player

This commit is contained in:
snicolast
2025-10-13 20:15:52 +13:00
parent 7f426f5033
commit 5b33e02bc0
5 changed files with 223 additions and 24 deletions

View File

@@ -8,6 +8,7 @@ Original repo: https://github.com/index-tts/index-tts
![ComfyUI-IndexTTS2 nodes](images/overview.png)
## Updates
- 2025-10-13: Save Audio node now acts as an output node with an embedded player overlay for instant preview inside the graph (no need for downstream preview nodes).
- 2025-10-08: Default FP32 with optional FP16 toggle, output gain control, and a Save Audio helper node (wav/mp3 + quality parameters).
- 2025-09-22: Added IndexTTS2 Advanced node exposing sampling, speed, seed, and other generation controls.
@@ -27,7 +28,7 @@ Original repo: https://github.com/index-tts/index-tts
- **IndexTTS2 Advanced** - Simple inputs plus overrides for sampling, speech speed, pauses, CFG, seed, FP16 toggle, and output gain.
- **IndexTTS2 Emotion Vector** - eight sliders (0.0-1.4, sum <= 1.5) producing an emotion vector.
- **IndexTTS2 Emotion From Text** requires ModelScope and local QwenEmotion; turns short text into an emotion vector + summary.
- **IndexTTS2 Save Audio** - saves generated audio tensors to disk with wav/mp3 options.
- **IndexTTS2 Save Audio** - saves generated audio tensors to disk with wav/mp3 options and surfaces an inline player directly on the node after execution.
## Examples
- Speaker audio -> IndexTTS2 Simple -> Preview/Save Audio

View File

@@ -20,3 +20,7 @@ NODE_DISPLAY_NAME_MAPPINGS = {
"IndexTTS2Simple": "IndexTTS2 Simple",
}
WEB_DIRECTORY = "./web"
__all__ = ["NODE_CLASS_MAPPINGS", "NODE_DISPLAY_NAME_MAPPINGS", "WEB_DIRECTORY"]

Binary file not shown.

Before

Width:  |  Height:  |  Size: 233 KiB

After

Width:  |  Height:  |  Size: 293 KiB

View File

@@ -4,6 +4,9 @@ import numpy as np
import folder_paths
class IndexTTS2SaveAudio:
def __init__(self):
self._ui_type = "output"
@classmethod
def INPUT_TYPES(cls):
return {
@@ -21,9 +24,9 @@ class IndexTTS2SaveAudio:
},
}
RETURN_TYPES = ("AUDIO", "STRING")
RETURN_NAMES = ("audio", "saved_path")
RETURN_TYPES: tuple = ()
FUNCTION = "save"
OUTPUT_NODE = True
CATEGORY = "Audio/IndexTTS"
def _normalize(self, mono: np.ndarray):
@@ -55,21 +58,29 @@ class IndexTTS2SaveAudio:
wf.writeframes(pcm16.T.tobytes())
return True
def _compose_paths(self, name_prefix: str, batch_count: int) -> List[str]:
def _compose_paths(self, name_prefix: str, batch_count: int, extension: str):
output_dir = folder_paths.get_output_directory()
# Use Comfy's helper to build prefix and a counter
full_output_folder, filename, counter, subfolder, filename_prefix = folder_paths.get_save_image_path(
f"audio/{name_prefix}", output_dir
)
paths = []
entries = []
normalized_subfolder = subfolder or ""
for b in range(batch_count):
filename_with_batch = filename.replace("%batch_num%", str(b))
file = f"{filename_with_batch}_{counter:05}_"
paths.append(os.path.join(full_output_folder, file))
file = f"{filename_with_batch}_{counter:05}_.{extension}"
entries.append(
{
"abs_path": os.path.join(full_output_folder, file),
"filename": file,
"subfolder": normalized_subfolder,
"type": self._ui_type,
}
)
counter += 1
return paths
return entries
def _save_with_av(self, fmt: str, audio, filename_prefix: str, quality: str = "320k") -> List[str]:
def _save_with_av(self, fmt: str, audio, filename_prefix: str, quality: str = "320k") -> List[dict]:
try:
from comfy_extras import nodes_audio as ce_audio # type: ignore
except Exception as e:
@@ -82,12 +93,14 @@ class IndexTTS2SaveAudio:
raise ValueError(f"Unsupported format for AV saver (mp3 only): {fmt}")
results = ui.get("ui", {}).get("audio", [])
base = folder_paths.get_output_directory()
out: List[str] = []
for item in results:
sub = item.get("subfolder") or ""
out.append(os.path.join(base, sub, item.get("filename", "")))
return out
return [
{
"filename": item.get("filename", ""),
"subfolder": item.get("subfolder") or "",
"type": item.get("type") or self._ui_type,
}
for item in results
]
def save(self, audio, name: str, format: str,
normalize_peak: bool = False,
@@ -124,23 +137,27 @@ class IndexTTS2SaveAudio:
batch.append(np_w)
name_prefix = (name or "tts2").strip() or "tts2"
paths: List[str] = []
ui_results: List[dict] = []
if format == "wav":
base_paths = self._compose_paths(name_prefix, len(batch))
for np_w, base in zip(batch, base_paths):
out_path = base + ".wav"
entries = self._compose_paths(name_prefix, len(batch), "wav")
for np_w, entry in zip(batch, entries):
out_path = entry["abs_path"]
os.makedirs(os.path.dirname(out_path), exist_ok=True)
self._save_wav(out_path, np_w, sr, wav_pcm)
paths.append(out_path)
ui_results.append(
{
"filename": entry["filename"],
"subfolder": entry["subfolder"],
"type": entry["type"],
}
)
elif format == "mp3":
paths = self._save_with_av("mp3", audio, filename_prefix=f"audio/{name_prefix}", quality=mp3_bitrate)
ui_results = self._save_with_av("mp3", audio, filename_prefix=f"audio/{name_prefix}", quality=mp3_bitrate)
else:
raise ValueError(f"Unsupported format: {format}")
saved = "\n".join(paths)
# passthrough audio so the graph can continue if needed
return (audio, saved)
return {"ui": {"audio": ui_results}}

177
web/js/save_audio_player.js Normal file
View File

@@ -0,0 +1,177 @@
import { app } from '../../../scripts/app.js';
import { api } from '../../../scripts/api.js';
// Gap (px) between the node's border and the floating player overlay.
const MARGIN = 6;
// Module-level guard so the shared stylesheet is added to <head> only once.
let stylesInjected = false;
// Inject the overlay's CSS into the document head. Idempotent: repeat
// calls are no-ops thanks to the stylesInjected flag.
function ensureStyles() {
  if (stylesInjected) return;
  stylesInjected = true;
  const style = document.createElement('style');
  style.textContent = `
  .indextts2-audio-player {
    position: absolute;
    z-index: 12;
    pointer-events: auto;
    display: flex;
    flex-direction: column;
    gap: 4px;
    padding: 6px;
    background: var(--bg-color, var(--comfy-menu-bg, #2a2a2a));
    border: 1px solid var(--border-color, #444);
    border-radius: 8px;
    box-shadow: 0 2px 8px rgba(0, 0, 0, 0.35);
    min-width: 180px;
    transition: opacity 0.2s ease, filter 0.2s ease;
  }
  .indextts2-audio-player[data-state="inactive"] {
    opacity: 0.9;
  }
  .indextts2-audio-player[data-state="inactive"] audio {
    pointer-events: none;
    filter: grayscale(0.8);
    opacity: 0.42;
  }
  .indextts2-audio-player__title {
    font-size: 11px;
    font-weight: 600;
    color: var(--descrip-text, #c7c7c7);
  }
  `;
  document.head.appendChild(style);
}
// Compute inline CSS that pins the DOM overlay onto the node's widget row,
// tracking the graph canvas's current pan/zoom.
// `ctx` is the LiteGraph canvas 2D context, `y` the widget's vertical offset
// within the node. `widgetWidth` is currently unused here (width is set by
// the caller in draw()).
function buildTransformStyle(ctx, widgetWidth, y) {
  const { canvas } = ctx;
  const rect = canvas.getBoundingClientRect();
  // Compose: backing-store -> CSS pixel scale, then the canvas's own
  // transform (pan/zoom), then a shift into the widget area. The resulting
  // matrix is applied as a CSS transform on the absolutely positioned div.
  const matrix = new DOMMatrix()
    .scaleSelf(rect.width / canvas.width, rect.height / canvas.height)
    .multiplySelf(ctx.getTransform())
    .translateSelf(MARGIN, y + MARGIN);
  return {
    transformOrigin: '0 0',
    transform: matrix.toString(),
    // Anchor at the canvas element's document-space position so the
    // transform above operates in canvas-local coordinates.
    left: `${rect.left + window.scrollX}px`,
    top: `${rect.top + window.scrollY}px`,
  };
}
// Build the backend /view URL for one saved-audio UI entry
// ({filename, subfolder, type}). Returns null when the entry has no
// filename. Appends the frontend's cache-busting parameter when available.
function buildAudioUrl(item) {
  if (!(item && item.filename)) {
    return null;
  }
  const query = new URLSearchParams({
    filename: item.filename,
    type: item.type || 'output',
  });
  if (item.subfolder) {
    query.set('subfolder', item.subfolder);
  }
  const base = api.apiURL(`/view?${query.toString()}`);
  // app.getRandParam (when present) returns a query-string suffix that
  // defeats browser caching of the audio file.
  const bust = typeof app.getRandParam === 'function' ? app.getRandParam() : '';
  return base + bust;
}
// Sync the player DOM with the given clip. With a clip ({url, title?}) the
// player becomes active and (re)loads the source only when the URL changed;
// with no clip the player is cleared and greyed out via data-state.
function setState(container, audioEl, titleEl, clip) {
  const url = clip?.url;
  if (!url) {
    // Empty state: stop playback, drop the source, show the placeholder.
    audioEl.pause();
    audioEl.removeAttribute('src');
    audioEl.load();
    titleEl.textContent = 'No audio yet';
    container.dataset.state = 'inactive';
    return;
  }
  if (audioEl.src !== url) {
    audioEl.src = url;
    audioEl.load();
  }
  titleEl.textContent = clip.title ?? 'Audio preview';
  container.dataset.state = 'active';
}
// Frontend extension: overlays an HTML <audio> player on the
// IndexTTS2SaveAudio node and feeds it the files the backend reports via
// its {"ui": {"audio": [...]}} execution result.
app.registerExtension({
  name: 'IndexTTS2.SaveAudioPlayer',
  beforeRegisterNodeDef(nodeType, nodeData) {
    // Decorate only the Save Audio node type.
    if (nodeData?.name !== 'IndexTTS2SaveAudio') {
      return;
    }
    const originalOnNodeCreated = nodeType.prototype.onNodeCreated;
    nodeType.prototype.onNodeCreated = function (...args) {
      originalOnNodeCreated?.apply(this, args);
      ensureStyles();
      // Build the floating player DOM once per node instance; it lives in
      // document.body and is positioned over the node every draw.
      const container = document.createElement('div');
      container.className = 'indextts2-audio-player';
      const title = document.createElement('div');
      title.className = 'indextts2-audio-player__title';
      title.textContent = 'No audio yet';
      const audio = document.createElement('audio');
      audio.controls = true;
      audio.style.width = '100%';
      container.appendChild(title);
      container.appendChild(audio);
      document.body.appendChild(container);
      // Custom LiteGraph widget whose draw() callback repositions the DOM
      // overlay to follow the canvas pan/zoom each frame.
      const widget = {
        name: 'indextts2_audio_widget',
        type: 'indextts2_audio_widget',
        draw(ctx, node, widgetWidth, y) {
          // Re-attach if something removed the element from the document.
          if (!container.isConnected) {
            document.body.appendChild(container);
          }
          const baseWidth = Math.max(160, (node.size?.[0] ?? widgetWidth) - MARGIN * 2);
          container.style.width = `${baseWidth}px`;
          const style = buildTransformStyle(ctx, widgetWidth, y);
          Object.assign(container.style, style);
        },
        serialize: false, // UI-only widget; never saved with the graph
        computeSize() {
          return [220, 90]; // reserve node space for the overlay
        },
      };
      this.addCustomWidget(widget);
      this.size = [this.size[0], Math.max(this.size[1], 120)];
      // Remove the detached DOM overlay when the node is deleted.
      const originalOnRemoved = this.onRemoved;
      this.onRemoved = function () {
        container.remove();
        originalOnRemoved?.apply(this, arguments);
      };
      setState(container, audio, title, null);
      // After each execution, load the most recently saved clip into the player.
      const originalOnExecuted = this.onExecuted;
      this.onExecuted = function (message) {
        originalOnExecuted?.apply(this, arguments);
        // NOTE(review): the payload arrives either as message.ui.audio or as
        // message.audio depending on the frontend version — both are accepted;
        // confirm against the target ComfyUI release.
        const audioResults =
          message?.ui?.audio ||
          message?.audio ||
          [];
        if (Array.isArray(audioResults) && audioResults.length > 0) {
          const latest = audioResults[audioResults.length - 1];
          const url = buildAudioUrl(latest);
          if (url) {
            setState(container, audio, title, {
              url,
              title: latest.filename || 'Audio preview',
            });
            return;
          }
        }
        // No usable result: reset to the inactive placeholder state.
        setState(container, audio, title, null);
      };
    };
  },
});