Fix voice bleeding, improve audio quality, add input speaker tags, support zero-shot voices

WildAi
2025-09-24 17:42:30 +03:00
parent d04665d073
commit 696ef69152
6 changed files with 260 additions and 247 deletions


@@ -1,9 +1,71 @@
{
"id": "b91265e5-1b03-4b63-8dc3-4abd9a030e08",
"revision": 0,
"last_node_id": 11,
"last_link_id": 29,
"last_node_id": 14,
"last_link_id": 44,
"nodes": [
{
"id": 3,
"type": "SaveAudio",
"pos": [
-1040,
-1130
],
"size": [
270,
112
],
"flags": {},
"order": 6,
"mode": 0,
"inputs": [
{
"name": "audio",
"type": "AUDIO",
"link": 27
}
],
"outputs": [],
"properties": {
"Node name for S&R": "SaveAudio",
"cnr_id": "comfy-core",
"ver": "0.3.52",
"ue_properties": {
"widget_ue_connectable": {
"filename_prefix": true,
"audioUI": true
},
"version": "7.0.1"
}
},
"widgets_values": [
"audio/VibeVoice"
]
},
{
"id": 13,
"type": "MarkdownNote",
"pos": [
-1898.1748046875,
-1409.22314453125
],
"size": [
1035.619873046875,
211.96694946289062
],
"flags": {},
"order": 0,
"mode": 0,
"inputs": [],
"outputs": [],
"title": "Note",
"properties": {},
"widgets_values": [
"# ComfyUI-VibeVoice\n\nVibeVoice is a novel framework by Microsoft for generating expressive, long-form, multi-speaker conversational audio. It excels at creating natural-sounding dialogue, podcasts, and more, with consistent voices for up to 4 speakers.\n\n**✨ Key Features:**\n* **Multi-Speaker TTS:** Generate conversations with up to 4 distinct voices in a single audio output.\n* **High-Fidelity Voice Cloning:** Use any audio file (`.wav`, `.mp3`) as a reference for a speaker's voice.\n* **Hybrid Generation Mode:** Mix and match cloned voices with high-quality, zero-shot generated voices in the same script.\n* **Flexible Scripting:** Use simple `[1]` tags or the classic `Speaker 1:` format to write your dialogue.\n* **Advanced Attention Mechanisms:** Choose between `eager`, `sdpa`, `flash_attention_2`, and the high-performance `sage` attention for fine-tuned control over speed and compatibility.\n* **Robust 4-Bit Quantization:** Run the large language model component in 4-bit mode to significantly reduce VRAM usage.\n* **Automatic Model Management:** Models are downloaded automatically and managed efficiently by ComfyUI to save VRAM."
],
"color": "#233",
"bgcolor": "#355"
},
{
"id": 4,
"type": "LoadAudio",
@@ -12,26 +74,24 @@
-1130
],
"size": [
274.080078125,
272.9800720214844,
136
],
"flags": {},
"order": 0,
"order": 1,
"mode": 0,
"inputs": [],
"outputs": [
{
"name": "AUDIO",
"type": "AUDIO",
"links": [
28
]
"links": []
}
],
"properties": {
"Node name for S&R": "LoadAudio",
"cnr_id": "comfy-core",
"ver": "0.3.52",
"Node name for S&R": "LoadAudio",
"ue_properties": {
"widget_ue_connectable": {
"audio": true,
@@ -51,30 +111,28 @@
"id": 8,
"type": "LoadAudio",
"pos": [
-1900,
-940
-1901.10009765625,
-948.7998046875
],
"size": [
274.080078125,
136
],
"flags": {},
"order": 1,
"order": 2,
"mode": 0,
"inputs": [],
"outputs": [
{
"name": "AUDIO",
"type": "AUDIO",
"links": [
29
]
"links": []
}
],
"properties": {
"Node name for S&R": "LoadAudio",
"cnr_id": "comfy-core",
"ver": "0.3.52",
"Node name for S&R": "LoadAudio",
"ue_properties": {
"widget_ue_connectable": {
"audio": true,
@@ -91,71 +149,52 @@
]
},
{
"id": 10,
"id": 12,
"type": "MarkdownNote",
"pos": [
-1030,
-960
-1915.701904296875,
-762.380126953125
],
"size": [
420,
210
312.85455322265625,
292.8734130859375
],
"flags": {},
"order": 2,
"order": 3,
"mode": 0,
"inputs": [],
"outputs": [],
"title": "Notes",
"properties": {
"ue_properties": {
"widget_ue_connectable": {},
"version": "7.0.1"
}
},
"title": "Note",
"properties": {},
"widgets_values": [
"## Models\n\nWill be downloaded on the first run, or download them manually and place them into the directory: /models/tts/VibeVoice\n\n| Model | Context Length | Generation Length | Weight |\n|-------|----------------|----------|----------|\n| VibeVoice-0.5B-Streaming | - | - | On the way |\n| VibeVoice-1.5B | 64K | ~90 min | [HF link](https://huggingface.co/microsoft/VibeVoice-1.5B) |\n| VibeVoice-Large| 32K | ~45 min | [HF link](https://huggingface.co/microsoft/VibeVoice-Large) |"
"### Scripting and Voice Modes\n\n#### Speaker Tagging\nYou can assign lines to speakers in two ways. Both are treated identically.\n\n* **Modern Format (Recommended):** `[1] This is the first speaker.`\n* **Classic Format:** `Speaker 1: This is the first speaker.`\n\nYou can also add an optional colon to the modern format (e.g., `[1]: ...`). The node handles all variations consistently.\n\n#### Hybrid Voice Generation\nThis is a powerful feature that lets you mix cloned voices and generated (zero-shot) voices.\n\n* **To Clone a Voice:** Connect a `Load Audio` node to the speaker's input (e.g., `speaker_1_voice`).\n* **To Generate a Voice:** Leave the speaker's input empty. The model will create a unique, high-quality voice for that speaker."
],
"color": "#432",
"bgcolor": "#653"
"color": "#233",
"bgcolor": "#355"
},
{
"id": 3,
"type": "SaveAudio",
"id": 14,
"type": "MarkdownNote",
"pos": [
-1040,
-1130
-1048.3660888671875,
-960.8771362304688
],
"size": [
270,
112
280.797607421875,
487.02728271484375
],
"flags": {},
"order": 4,
"mode": 0,
"inputs": [
{
"name": "audio",
"type": "AUDIO",
"link": 27
}
],
"inputs": [],
"outputs": [],
"properties": {
"cnr_id": "comfy-core",
"ver": "0.3.52",
"Node name for S&R": "SaveAudio",
"ue_properties": {
"widget_ue_connectable": {
"filename_prefix": true,
"audioUI": true
},
"version": "7.0.1"
}
},
"title": "Note",
"properties": {},
"widgets_values": [
"audio/VibeVoice"
]
"## Models\n\nWill be downloaded on the first run, or download them manually and place them into the directory: /models/tts/VibeVoice\n\n| Model | Context Length | Generation Length | Weight |\n|-------|----------------|----------|----------|\n| VibeVoice-1.5B | 64K | ~90 min | [HF link](https://huggingface.co/microsoft/VibeVoice-1.5B) |\n| VibeVoice-Large| 32K | ~45 min | [HF link](https://huggingface.co/microsoft/VibeVoice-Large) |\n\n## Support \n\n- Don't know how to update PyTorch?\n- Need help with ComfyUI?\n- Need technical support?\n\n### Or do you just have questions? Then join the [@TokenDiffusion Hub](https://t.me/TokenDiff_hub) group\n\n### AI news [TokenDiffusion](https://t.me/TokenDiff)"
],
"color": "#233",
"bgcolor": "#355"
},
{
"id": 11,
@@ -165,24 +204,24 @@
-1130
],
"size": [
460,
510
475.3999938964844,
662.9000244140625
],
"flags": {},
"order": 3,
"order": 5,
"mode": 0,
"inputs": [
{
"name": "speaker_1_voice",
"shape": 7,
"type": "AUDIO",
"link": 28
"link": null
},
{
"name": "speaker_2_voice",
"shape": 7,
"type": "AUDIO",
"link": 29
"link": null
},
{
"name": "speaker_3_voice",
@@ -207,9 +246,9 @@
}
],
"properties": {
"Node name for S&R": "VibeVoiceTTS",
"cnr_id": "ComfyUI-VibeVoice",
"ver": "37803a884fb8f9b43c38286f6d654c7f97181a73",
"Node name for S&R": "VibeVoiceTTS",
"ue_properties": {
"widget_ue_connectable": {
"model_name": true,
@@ -229,12 +268,12 @@
},
"widgets_values": [
"VibeVoice-1.5B",
"Speaker 1: I can't believe you did it again. I waited for two hours. Two hours! Not a single call, not a text. Do you have any idea how embarrassing that was, just sitting there alone?\nSpeaker 2: Look, I know, I'm sorry, alright? Work was a complete nightmare. My boss dropped a critical deadline on me at the last minute. I didn't even have a second to breathe, let alone check my phone.\nSpeaker 1: A nightmare? That's the same excuse you used last time. I'm starting to think you just don't care. It's easier to say 'work was crazy' than to just admit that I'm not a priority for you anymore.",
"[1] I can't believe you did it again. I waited for two hours. Two hours! Not a single call, not a text. Do you have any idea how embarrassing that was, just sitting there alone?\n[2] Look, I know, I'm sorry, alright? Work was a complete nightmare. My boss dropped a critical deadline on me at the last minute. I didn't even have a second to breathe, let alone check my phone.\n",
false,
"flash_attention_2",
1.3,
10,
1,
471935335072093,
"fixed",
true,
0.95,
@@ -254,37 +293,21 @@
3,
0,
"AUDIO"
],
[
28,
4,
0,
11,
0,
"AUDIO"
],
[
29,
8,
0,
11,
1,
"AUDIO"
]
],
"groups": [],
"config": {},
"extra": {
"ue_links": [],
"links_added_by_ue": [],
"ds": {
"scale": 1.2100000000000002,
"scale": 0.8264462809917354,
"offset": [
2000,
1230
2015.701904296875,
1509.22314453125
]
},
"frontendVersion": "1.25.11",
"ue_links": [],
"links_added_by_ue": [],
"frontendVersion": "1.26.11",
"VHS_latentpreview": false,
"VHS_latentpreviewrate": 0,
"VHS_MetadataImage": true,

Binary image file changed (not shown): 145 KiB before, 154 KiB after.


@@ -34,27 +34,52 @@ def parse_script_1_based(script: str) -> tuple[list[tuple[int, str]], list[int]]
Parses a 1-based speaker script into a list of (speaker_id, text) tuples
and a list of unique speaker IDs in the order of their first appearance.
Internally, it converts speaker IDs to 0-based for the model.
Supports two formats:
1. Speaker 1: Some text...
2. [1] Some text...
If no speaker markers are found, the entire script is assigned to Speaker 1.
"""
parsed_lines = []
speaker_ids_in_script = [] # This will store the 1-based IDs from the script
line_format_regex = re.compile(r'^(?:Speaker\s+(\d+)\s*:|\[(\d+)\])\s*(.*)$', re.IGNORECASE)
for line in script.strip().split("\n"):
if not (line := line.strip()): continue
match = re.match(r'^Speaker\s+(\d+)\s*:\s*(.*)$', line, re.IGNORECASE)
match = line_format_regex.match(line)
if match:
speaker_id = int(match.group(1))
speaker_id_str = match.group(1) or match.group(2)
speaker_id = int(speaker_id_str)
text_content = match.group(3)
if match.group(1) is None and text_content.lstrip().startswith(':'):
colon_index = text_content.find(':')
text_content = text_content[colon_index + 1:]
if speaker_id < 1:
logger.warning(f"Speaker ID must be 1 or greater. Skipping line: '{line}'")
continue
text = ' ' + match.group(2).strip()
# Internally, the model expects 0-based indexing for speakers
text = text_content.strip() # REMOVED the prepended space ' ' +
internal_speaker_id = speaker_id - 1
parsed_lines.append((internal_speaker_id, text))
if speaker_id not in speaker_ids_in_script:
speaker_ids_in_script.append(speaker_id)
else:
logger.warning(f"Could not parse line, skipping: '{line}'")
logger.warning(f"Could not parse speaker marker, treating as part of previous line if any, or ignoring: '{line}'")
if not parsed_lines and script.strip():
logger.info("No speaker markers found. Treating entire text as a single utterance for Speaker 1.")
parsed_lines.append((0, ' ' + script.strip()))
speaker_ids_in_script.append(1)
return parsed_lines, sorted(list(set(speaker_ids_in_script)))
def preprocess_comfy_audio(audio_dict: dict, target_sr: int = 24000) -> np.ndarray:
"""
Converts a ComfyUI AUDIO dict to a mono NumPy array, resampling if necessary.
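The diff truncates preprocess_comfy_audio after its docstring, but the docstring pins down the contract. A hedged sketch of such a helper, assuming ComfyUI's usual AUDIO dict layout {"waveform": Tensor[batch, channels, samples], "sample_rate": int} and torchaudio for resampling; the committed body may differ:

import numpy as np
import torchaudio

def preprocess_comfy_audio_sketch(audio_dict: dict, target_sr: int = 24000):
    # Hypothetical stand-in, not the committed code. None passes through,
    # so zero-shot speakers (no reference wired in) stay None downstream.
    if audio_dict is None:
        return None
    wav = audio_dict["waveform"][0]          # (channels, samples), first batch item
    wav = wav.mean(dim=0, keepdim=True)      # mono mixdown
    if audio_dict["sample_rate"] != target_sr:
        wav = torchaudio.functional.resample(wav, audio_dict["sample_rate"], target_sr)
    return wav.squeeze(0).cpu().numpy().astype(np.float32)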


@@ -480,11 +480,14 @@ class VibeVoiceForConditionalGenerationInference(VibeVoicePreTrainedModel, Gener
model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
if is_prefill:
# we process the speech inputs only during the first generation step
prefill_inputs = {
"speech_tensors": speech_tensors.to(device=device),
"speech_masks": speech_masks.to(device),
"speech_input_mask": speech_input_mask.to(device),
}
# Conditionally add speech tensors to prefill_inputs only if they exist.
prefill_inputs = {}
if speech_tensors is not None:
prefill_inputs["speech_tensors"] = speech_tensors.to(device=device)
if speech_masks is not None:
prefill_inputs["speech_masks"] = speech_masks.to(device)
if speech_input_mask is not None:
prefill_inputs["speech_input_mask"] = speech_input_mask.to(device)
is_prefill = False
else:
_ = model_inputs.pop('inputs_embeds', None)
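Why the guard matters: with the processor changes later in this commit, a fully zero-shot run yields speech_tensors=None and speech_masks=None, and the old unconditional .to(device) calls would raise AttributeError on None. The pattern, reduced to a hedged one-liner:

# Illustration only, not the committed code: keep a prefill kwarg only when
# the corresponding tensor exists, so a fully zero-shot prefill passes nothing.
def build_prefill_inputs(device, **maybe_tensors):
    return {k: v.to(device) for k, v in maybe_tensors.items() if v is not None}

# build_prefill_inputs(device, speech_tensors=None, speech_masks=None) == {}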


@@ -147,8 +147,10 @@ class VibeVoiceProcessor:
def __call__(
self,
text: Optional[Union[str, List[str], TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None,
voice_samples: Optional[Union[List[Union[str, np.ndarray]], List[List[Union[str, np.ndarray]]]]] = None,
text: Optional[List[str]] = None,
parsed_scripts: Optional[List[List[Tuple[int, str]]]] = None, # <-- ADDED
voice_samples: Optional[List[List[Optional[Union[str, np.ndarray]]]]] = None,
speaker_ids_for_prompt: Optional[List[List[int]]] = None,
padding: Union[bool, str, PaddingStrategy] = True,
truncation: Union[bool, str, TruncationStrategy] = False,
max_length: Optional[int] = None,
@@ -189,31 +191,26 @@ class VibeVoiceProcessor:
- **speech_masks** -- Speech masks (if voice_samples provided)
- **speech_input_mask** -- Boolean masks indicating speech token positions
"""
# Handle single vs batch input
if isinstance(text, str) or (isinstance(text, list) and len(text) > 0 and not isinstance(text[0], str)):
# Single input
texts = [text]
is_batched = False
else:
# Batch input
texts = text
is_batched = True
# Handle voice samples
if voice_samples is not None:
if not is_batched or (isinstance(voice_samples[0], (str, np.ndarray))):
# Single set of voice samples
voice_samples_list = [voice_samples]
else:
# Batch of voice samples
voice_samples_list = voice_samples
else:
voice_samples_list = [None] * len(texts)
# Process each input
if parsed_scripts is None:
if text is None:
raise ValueError("Either 'text' or 'parsed_scripts' must be provided.")
# Fallback for raw text input (though the node won't use this path)
from ..modules.utils import parse_script_1_based
parsed_scripts = [parse_script_1_based(t)[0] for t in text]
num_scripts = len(parsed_scripts)
voice_samples_list = voice_samples if voice_samples is not None else [[] for _ in range(num_scripts)]
speaker_ids_list = speaker_ids_for_prompt if speaker_ids_for_prompt is not None else [[] for _ in range(num_scripts)]
all_encodings = []
for text_input, voice_input in zip(texts, voice_samples_list):
encoding = self._process_single(text_input, voice_input)
for i in range(num_scripts):
# Pass all three corresponding items to _process_single
encoding = self._process_single(
parsed_scripts[i],
voice_samples_list[i],
speaker_ids_list[i]
)
all_encodings.append(encoding)
# Combine batch
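A hedged usage sketch of the reworked __call__, mirroring the call the node makes later in this commit (processor is assumed to be an already-constructed VibeVoiceProcessor, and ref_wav a mono float32 NumPy array):

parsed = [(0, "I waited for two hours."),         # 0-based ids, as returned
          (1, "Work was a complete nightmare.")]  # by parse_script_1_based

inputs = processor(
    parsed_scripts=[parsed],                # one script in the batch
    voice_samples=[[ref_wav, None]],        # speaker 1 cloned, speaker 2 zero-shot
    speaker_ids_for_prompt=[[1, 2]],        # 1-based ids present in the script
    padding=True,
    return_tensors="pt",
    return_attention_mask=True,
)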
@@ -230,62 +227,38 @@ class VibeVoiceProcessor:
def _process_single(
self,
text: Union[str, TextInput],
voice_samples: Optional[List[Union[str, np.ndarray]]] = None,
parsed_script: List[Tuple[int, str]],
voice_samples: List[Optional[Union[str, np.ndarray]]],
speaker_ids: List[int],
) -> Dict[str, Any]:
"""Process a single podcast script."""
# Determine if text is a file path or direct script
script = None
if isinstance(text, str):
# Check if it's a file path
if text.endswith('.json') and os.path.exists(text):
script = self._convert_json_to_script(text)
elif text.endswith('.txt') and os.path.exists(text):
script = self._convert_text_to_script(text)
else:
# Assume it's the script content directly
script = text
if script is None:
raise ValueError(f"Could not process input text: {text}")
# Parse the script
parsed_lines = self._parse_script(script)
all_speakers = list(set(speaker_id for speaker_id, _ in parsed_lines))
# Create system prompt
# system_tokens = self.tokenizer.encode(self.system_prompt, add_special_tokens=False)
system_tokens = self.tokenizer.encode(self.system_prompt)
# Process voice samples if provided
if voice_samples:
voice_tokens, voice_speech_inputs, voice_speech_masks = self._create_voice_prompt(voice_samples[:len(all_speakers)])
else:
voice_tokens, voice_speech_inputs, voice_speech_masks = [], [], []
# Build full token sequence
voice_tokens, voice_speech_inputs, voice_speech_masks = self._create_voice_prompt(
voice_samples, speaker_ids
)
full_tokens = system_tokens + voice_tokens
speech_input_mask = [False] * len(system_tokens) + voice_speech_masks
# Add text input section
full_tokens += self.tokenizer.encode(' Text input:\n', add_special_tokens=False)
speech_input_mask += [False] * len(self.tokenizer.encode(' Text input:\n', add_special_tokens=False))
for speaker_id, speaker_text in parsed_lines:
speaker_text_tokens = self.tokenizer.encode(f" Speaker {speaker_id}:{speaker_text}\n", add_special_tokens=False)
full_tokens += speaker_text_tokens
speech_input_mask += [False] * len(speaker_text_tokens)
# Add speech output section
full_tokens += self.tokenizer.encode(' Speech output:\n', add_special_tokens=False) + [self.tokenizer.speech_start_id]
speech_input_mask += [False] * (len(self.tokenizer.encode(' Speech output:\n', add_special_tokens=False)) + 1)
dialogue_lines = []
for speaker_id_0_based, text_chunk in parsed_script:
speaker_id_1_based = speaker_id_0_based + 1
dialogue_lines.append(f"Speaker {speaker_id_1_based}: : {text_chunk}")
full_dialogue_script = "\n".join(dialogue_lines)
final_prompt_text = f" Text input:\n{full_dialogue_script}\n Speech output:\n"
prompt_tokens = self.tokenizer.encode(final_prompt_text, add_special_tokens=False)
full_tokens += prompt_tokens + [self.tokenizer.speech_start_id]
speech_input_mask += [False] * (len(prompt_tokens) + 1)
return {
"input_ids": full_tokens,
"speech_inputs": voice_speech_inputs if voice_speech_inputs else None,
"speech_input_mask": speech_input_mask,
"parsed_script": parsed_lines,
"all_speakers": all_speakers,
}
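Traced by hand, the new single-pass prompt assembly above turns a two-line script into one string (the doubled colon comes straight from the format string):

parsed_script = [(0, "Hello."), (1, "Hi there.")]
dialogue = "\n".join(f"Speaker {i + 1}: : {t}" for i, t in parsed_script)
prompt = f" Text input:\n{dialogue}\n Speech output:\n"
# " Text input:\nSpeaker 1: : Hello.\nSpeaker 2: : Hi there.\n Speech output:\n"

Tokenizing this once, instead of line by line as before, reduces the speech_input_mask bookkeeping to a single prompt_tokens length plus the speech_start_id slot.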
def _batch_encode(
@@ -298,11 +271,9 @@ class VibeVoiceProcessor:
return_attention_mask: bool = True,
) -> BatchEncoding:
"""Combine multiple encodings into a batch with padding."""
# Extract input_ids and create attention_mask
input_ids_list = [enc["input_ids"] for enc in encodings]
speech_input_masks_list = [enc["speech_input_mask"] for enc in encodings]
# Determine padding strategy
if isinstance(padding, bool):
padding_strategy = PaddingStrategy.LONGEST if padding else PaddingStrategy.DO_NOT_PAD
elif isinstance(padding, str):
@@ -347,15 +318,11 @@ class VibeVoiceProcessor:
# No padding, just create attention masks
attention_masks = [[1] * len(ids) for ids in input_ids_list] if return_attention_mask else None
# Process speech inputs
all_speech_inputs = []
has_speech = False
for enc in encodings:
if enc["speech_inputs"] is not None:
if enc.get("speech_inputs"):
all_speech_inputs.extend(enc["speech_inputs"])
has_speech = True
# Prepare batch encoding
batch_encoding = BatchEncoding()
# Handle tensor conversion
@@ -370,79 +337,79 @@ class VibeVoiceProcessor:
batch_encoding["attention_mask"] = attention_masks
batch_encoding["speech_input_mask"] = speech_input_masks_list
# Process speech tensors if present
if has_speech:
speech_dict = self.prepare_speech_inputs(
all_speech_inputs,
return_tensors=return_tensors,
)
if all_speech_inputs:
speech_dict = self.prepare_speech_inputs(all_speech_inputs, return_tensors=return_tensors)
batch_encoding["speech_tensors"] = speech_dict["padded_speeches"]
batch_encoding["speech_masks"] = speech_dict["speech_masks"]
else:
batch_encoding["speech_tensors"] = None
batch_encoding["speech_masks"] = None
# Add metadata
batch_encoding["parsed_scripts"] = [enc["parsed_script"] for enc in encodings]
batch_encoding["all_speakers_list"] = [enc["all_speakers"] for enc in encodings]
return batch_encoding
def _create_voice_prompt(
self,
speaker_samples: List[Union[str, np.ndarray]]
speaker_samples: List[Optional[Union[str, np.ndarray]]],
speaker_ids: List[int]
) -> Tuple[List[int], List[np.ndarray], List[bool]]:
"""
Create voice prompt tokens and process audio samples.
This function now handles `None` in the speaker_samples list for zero-shot speakers.
Returns:
tuple: (voice_tokens, voice_speech_inputs, voice_speech_masks)
"""
if not any(s is not None for s in speaker_samples):
return [], [], []
vae_token_id = self.tokenizer.speech_diffusion_id
voice_full_tokens = self.tokenizer.encode(' Voice input:\n', add_special_tokens=False)
voice_speech_inputs = []
voice_speech_masks = [False] * len(voice_full_tokens)
for speaker_id, speaker_audio in enumerate(speaker_samples):
prefix_tokens = self.tokenizer.encode(f" Speaker {speaker_id}:", add_special_tokens=False)
for speaker_id, speaker_audio in zip(speaker_ids, speaker_samples):
# Process audio
if isinstance(speaker_audio, str):
# Load audio from file
wav = self.audio_processor._load_audio_from_path(speaker_audio)
if speaker_audio is not None:
logger.info(f"Creating voice prompt for Speaker {speaker_id} from reference audio.")
prefix_tokens = self.tokenizer.encode(f" Speaker {speaker_id}:", add_special_tokens=False)
newline_tokens = self.tokenizer.encode('\n', add_special_tokens=False)
if isinstance(speaker_audio, str):
wav = self.audio_processor._load_audio_from_path(speaker_audio)
else:
wav = np.array(speaker_audio, dtype=np.float32)
if self.db_normalize and self.audio_normalizer:
wav = self.audio_normalizer(wav)
vae_tok_len = math.ceil(wav.shape[0] / self.speech_tok_compress_ratio)
speaker_tokens = (
prefix_tokens +
[self.tokenizer.speech_start_id] +
[vae_token_id] * vae_tok_len +
[self.tokenizer.speech_end_id] +
newline_tokens
)
vae_input_mask = (
[False] * len(prefix_tokens) +
[False] + # for speech_start_id
[True] * vae_tok_len +
[False] + # for speech_end_id
[False] * len(newline_tokens)
)
voice_speech_inputs.append(wav)
voice_full_tokens.extend(speaker_tokens)
voice_speech_masks.extend(vae_input_mask)
else:
wav = np.array(speaker_audio, dtype=np.float32)
# Apply normalization if needed
if self.db_normalize and self.audio_normalizer:
wav = self.audio_normalizer(wav)
# Calculate token length based on compression ratio
# if speaker_audio.endswith('.pt') or speaker_audio.endswith('.npy'):
# vae_tok_len = wav.shape[0]
# else:
vae_tok_len = math.ceil(wav.shape[0] / self.speech_tok_compress_ratio)
# Build tokens and masks
speaker_tokens = (prefix_tokens +
[self.tokenizer.speech_start_id] +
[vae_token_id] * vae_tok_len +
[self.tokenizer.speech_end_id] +
self.tokenizer.encode('\n', add_special_tokens=False))
vae_input_mask = ([False] * len(prefix_tokens) +
[False] +
[True] * vae_tok_len +
[False] +
[False])
voice_full_tokens.extend(speaker_tokens)
voice_speech_masks.extend(vae_input_mask)
voice_speech_inputs.append(wav)
logger.info(f"Skipping voice prompt for Speaker {speaker_id} (zero-shot).")
return voice_full_tokens, voice_speech_inputs, voice_speech_masks
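A hedged shape sketch for the reworked _create_voice_prompt: entries with audio contribute a speech_start + VAE-token span + speech_end block, while None entries are skipped so those speakers fall back to zero-shot. Here wav_a stands in for a mono float32 array at 24 kHz:

tokens, speech_inputs, masks = processor._create_voice_prompt(
    speaker_samples=[wav_a, None],   # Speaker 1 cloned, Speaker 2 zero-shot
    speaker_ids=[1, 2],
)
# len(speech_inputs) == 1: only the cloned speaker contributes audio, and
# masks is True exactly over that speaker's VAE-token span.
# If every entry were None, the early return would yield ([], [], []).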
def prepare_speech_inputs(
self,
speech_inputs: List[np.ndarray],
@@ -481,10 +448,7 @@ class VibeVoiceProcessor:
padded_speeches[i, :len(speech)] = speech
speech_masks[i, :vae_tok_length] = True
result = {
"padded_speeches": padded_speeches,
"speech_masks": speech_masks,
}
result = {"padded_speeches": padded_speeches, "speech_masks": speech_masks}
# Convert to tensors if requested
if return_tensors == "pt":
@@ -584,12 +548,10 @@ class VibeVoiceProcessor:
parsed_lines = []
speaker_ids = []
# First pass: parse all lines and collect speaker IDs
for line in lines:
if not line.strip():
continue
# Use regex to handle edge cases like multiple colons
match = re.match(r'^Speaker\s+(\d+)\s*:\s*(.*)$', line.strip(), re.IGNORECASE)
if match:


@@ -27,8 +27,8 @@ class VibeVoiceTTSNode:
}),
"text": ("STRING", {
"multiline": True,
"default": "Speaker 1: Hello from ComfyUI!\nSpeaker 2: VibeVoice sounds amazing.",
"tooltip": "The script for the conversation. Use 'Speaker 1:', 'Speaker 2:', etc. to assign lines to different voices. Each speaker line should be on a new line."
"default": "[1] Hello, this is a cloned voice.\n[2] And this is a generated voice, how cool is that?",
"tooltip": "The script for generation. Use '[1]' or 'Speaker 1:' for speakers. If a speaker in the script lacks a reference voice, it will be generated via zero-shot TTS."
}),
"quantize_llm_4bit": ("BOOLEAN", {
"default": False, "label_on": "Q4 (LLM only)", "label_off": "Full precision",
@@ -72,10 +72,10 @@ class VibeVoiceTTSNode:
}),
},
"optional": {
"speaker_1_voice": ("AUDIO", {"tooltip": "Reference audio for 'Speaker 1' in the script."}),
"speaker_2_voice": ("AUDIO", {"tooltip": "Reference audio for 'Speaker 2' in the script."}),
"speaker_3_voice": ("AUDIO", {"tooltip": "Reference audio for 'Speaker 3' in the script."}),
"speaker_4_voice": ("AUDIO", {"tooltip": "Reference audio for 'Speaker 4' in the script."}),
"speaker_1_voice": ("AUDIO", {"tooltip": "Reference audio for 'Speaker 1' or '[1]' in the script."}),
"speaker_2_voice": ("AUDIO", {"tooltip": "Reference audio for 'Speaker 2' or '[2]' in the script."}),
"speaker_3_voice": ("AUDIO", {"tooltip": "Reference audio for 'Speaker 3' or '[3]' in the script."}),
"speaker_4_voice": ("AUDIO", {"tooltip": "Reference audio for 'Speaker 4' or '[4]' in the script."}),
}
}
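Putting the new default text and the optional inputs together, a hedged recipe for hybrid generation: connect a reference only where you want cloning, and tag the script with either style, one utterance per line.

# Example contents for the "text" widget (either tag style works):
script = (
    "[1] This speaker clones whatever audio is wired into speaker_1_voice.\n"
    "Speaker 2: No reference is connected here, so a zero-shot voice is generated."
)

Leaving speaker_2_voice unconnected is what triggers the zero-shot path; the old behavior, removed later in this commit, was to raise an error for missing voices.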
@@ -113,23 +113,23 @@ class VibeVoiceTTSNode:
parsed_lines_0_based, speaker_ids_1_based = parse_script_1_based(text)
if not parsed_lines_0_based:
raise ValueError("Script is empty or invalid. Use 'Speaker 1:', 'Speaker 2:', etc. format.")
raise ValueError("Script is empty or invalid. Please provide text to generate.")
full_script = "\n".join([f"Speaker {spk+1}: {txt}" for spk, txt in parsed_lines_0_based])
# full_script = "\n".join([f"Speaker {spk+1}: {txt}" for spk, txt in parsed_lines_0_based]) # <-- REMOVED: This was the cause of the bug.
speaker_inputs = {i: kwargs.get(f"speaker_{i}_voice") for i in range(1, 5)}
voice_samples_np = [preprocess_comfy_audio(speaker_inputs[sid]) for sid in speaker_ids_1_based]
if any(v is None for v in voice_samples_np):
missing_ids = [sid for sid, v in zip(speaker_ids_1_based, voice_samples_np) if v is None]
raise ValueError(f"Script requires voices for Speakers {missing_ids}, but they were not provided.")
voice_samples_np = [preprocess_comfy_audio(speaker_inputs.get(sid)) for sid in speaker_ids_1_based]
set_vibevoice_seed(seed)
try:
inputs = processor(
text=[full_script], voice_samples=[voice_samples_np], padding=True,
return_tensors="pt", return_attention_mask=True
parsed_scripts=[parsed_lines_0_based],
voice_samples=[voice_samples_np],
speaker_ids_for_prompt=[speaker_ids_1_based],
padding=True,
return_tensors="pt",
return_attention_mask=True
)
for key, value in inputs.items():
@@ -155,7 +155,7 @@ class VibeVoiceTTSNode:
def progress_callback(step, total_steps):
pbar.update(1)
if model_management.interrupt_current_processing:
raise comfy.model_management.InterruptProcessingException()
raise model_management.InterruptProcessingException()
try:
outputs = model.generate(
@@ -172,13 +172,13 @@ class VibeVoiceTTSNode:
logger.error("This might be due to invalid input data, GPU memory issues, or incompatible attention mode.")
logger.error("Try restarting ComfyUI, using different audio files, or switching to 'eager' attention mode.")
raise e
except comfy.model_management.InterruptProcessingException:
except model_management.InterruptProcessingException:
logger.info("VibeVoice generation interrupted by user")
raise
finally:
pbar.update_absolute(inference_steps)
except comfy.model_management.InterruptProcessingException:
except model_management.InterruptProcessingException:
logger.info("VibeVoice TTS generation was cancelled")
return ({"waveform": torch.zeros((1, 1, 24000), dtype=torch.float32), "sample_rate": 24000},)