mirror of
https://github.com/wildminder/ComfyUI-VibeVoice.git
synced 2026-04-30 11:41:35 +00:00
voice bleeding fix, audio quality, input speakers tags, zero-shot voices
This commit is contained in:
@@ -1,9 +1,71 @@
|
||||
{
|
||||
"id": "b91265e5-1b03-4b63-8dc3-4abd9a030e08",
|
||||
"revision": 0,
|
||||
"last_node_id": 11,
|
||||
"last_link_id": 29,
|
||||
"last_node_id": 14,
|
||||
"last_link_id": 44,
|
||||
"nodes": [
|
||||
{
|
||||
"id": 3,
|
||||
"type": "SaveAudio",
|
||||
"pos": [
|
||||
-1040,
|
||||
-1130
|
||||
],
|
||||
"size": [
|
||||
270,
|
||||
112
|
||||
],
|
||||
"flags": {},
|
||||
"order": 6,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"name": "audio",
|
||||
"type": "AUDIO",
|
||||
"link": 27
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"properties": {
|
||||
"Node name for S&R": "SaveAudio",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.3.52",
|
||||
"ue_properties": {
|
||||
"widget_ue_connectable": {
|
||||
"filename_prefix": true,
|
||||
"audioUI": true
|
||||
},
|
||||
"version": "7.0.1"
|
||||
}
|
||||
},
|
||||
"widgets_values": [
|
||||
"audio/VibeVoice"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 13,
|
||||
"type": "MarkdownNote",
|
||||
"pos": [
|
||||
-1898.1748046875,
|
||||
-1409.22314453125
|
||||
],
|
||||
"size": [
|
||||
1035.619873046875,
|
||||
211.96694946289062
|
||||
],
|
||||
"flags": {},
|
||||
"order": 0,
|
||||
"mode": 0,
|
||||
"inputs": [],
|
||||
"outputs": [],
|
||||
"title": "Note",
|
||||
"properties": {},
|
||||
"widgets_values": [
|
||||
"# ComfyUI-VibeVoice\n\nVibeVoice is a novel framework by Microsoft for generating expressive, long-form, multi-speaker conversational audio. It excels at creating natural-sounding dialogue, podcasts, and more, with consistent voices for up to 4 speakers.\n\n**✨ Key Features:**\n* **Multi-Speaker TTS:** Generate conversations with up to 4 distinct voices in a single audio output.\n* **High-Fidelity Voice Cloning:** Use any audio file (`.wav`, `.mp3`) as a reference for a speaker's voice.\n* **Hybrid Generation Mode:** Mix and match cloned voices with high-quality, zero-shot generated voices in the same script.\n* **Flexible Scripting:** Use simple `[1]` tags or the classic `Speaker 1:` format to write your dialogue.\n* **Advanced Attention Mechanisms:** Choose between `eager`, `sdpa`, `flash_attention_2`, and the high-performance `sage` attention for fine-tuned control over speed and compatibility.\n* **Robust 4-Bit Quantization:** Run the large language model component in 4-bit mode to significantly reduce VRAM usage.\n* **Automatic Model Management:** Models are downloaded automatically and managed efficiently by ComfyUI to save VRAM."
|
||||
],
|
||||
"color": "#233",
|
||||
"bgcolor": "#355"
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"type": "LoadAudio",
|
||||
@@ -12,26 +74,24 @@
|
||||
-1130
|
||||
],
|
||||
"size": [
|
||||
274.080078125,
|
||||
272.9800720214844,
|
||||
136
|
||||
],
|
||||
"flags": {},
|
||||
"order": 0,
|
||||
"order": 1,
|
||||
"mode": 0,
|
||||
"inputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "AUDIO",
|
||||
"type": "AUDIO",
|
||||
"links": [
|
||||
28
|
||||
]
|
||||
"links": []
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "LoadAudio",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.3.52",
|
||||
"Node name for S&R": "LoadAudio",
|
||||
"ue_properties": {
|
||||
"widget_ue_connectable": {
|
||||
"audio": true,
|
||||
@@ -51,30 +111,28 @@
|
||||
"id": 8,
|
||||
"type": "LoadAudio",
|
||||
"pos": [
|
||||
-1900,
|
||||
-940
|
||||
-1901.10009765625,
|
||||
-948.7998046875
|
||||
],
|
||||
"size": [
|
||||
274.080078125,
|
||||
136
|
||||
],
|
||||
"flags": {},
|
||||
"order": 1,
|
||||
"order": 2,
|
||||
"mode": 0,
|
||||
"inputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "AUDIO",
|
||||
"type": "AUDIO",
|
||||
"links": [
|
||||
29
|
||||
]
|
||||
"links": []
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "LoadAudio",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.3.52",
|
||||
"Node name for S&R": "LoadAudio",
|
||||
"ue_properties": {
|
||||
"widget_ue_connectable": {
|
||||
"audio": true,
|
||||
@@ -91,71 +149,52 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 10,
|
||||
"id": 12,
|
||||
"type": "MarkdownNote",
|
||||
"pos": [
|
||||
-1030,
|
||||
-960
|
||||
-1915.701904296875,
|
||||
-762.380126953125
|
||||
],
|
||||
"size": [
|
||||
420,
|
||||
210
|
||||
312.85455322265625,
|
||||
292.8734130859375
|
||||
],
|
||||
"flags": {},
|
||||
"order": 2,
|
||||
"order": 3,
|
||||
"mode": 0,
|
||||
"inputs": [],
|
||||
"outputs": [],
|
||||
"title": "Notes",
|
||||
"properties": {
|
||||
"ue_properties": {
|
||||
"widget_ue_connectable": {},
|
||||
"version": "7.0.1"
|
||||
}
|
||||
},
|
||||
"title": "Note",
|
||||
"properties": {},
|
||||
"widgets_values": [
|
||||
"## Models\n\nWill be downloaded on the first run, or download them manually and place them into the directory: /models/tts/VibeVoice\n\n| Model | Context Length | Generation Length | Weight |\n|-------|----------------|----------|----------|\n| VibeVoice-0.5B-Streaming | - | - | On the way |\n| VibeVoice-1.5B | 64K | ~90 min | [HF link](https://huggingface.co/microsoft/VibeVoice-1.5B) |\n| VibeVoice-Large| 32K | ~45 min | [HF link](https://huggingface.co/microsoft/VibeVoice-Large) |"
|
||||
"### Scripting and Voice Modes\n\n#### Speaker Tagging\nYou can assign lines to speakers in two ways. Both are treated identically.\n\n* **Modern Format (Recommended):** `[1] This is the first speaker.`\n* **Classic Format:** `Speaker 1: This is the first speaker.`\n\nYou can also add an optional colon to the modern format (e.g., `[1]: ...`). The node handles all variations consistently.\n\n#### Hybrid Voice Generation\nThis is a powerful feature that lets you mix cloned voices and generated (zero-shot) voices.\n\n* **To Clone a Voice:** Connect a `Load Audio` node to the speaker's input (e.g., `speaker_1_voice`).\n* **To Generate a Voice:** Leave the speaker's input empty. The model will create a unique, high-quality voice for that speaker."
|
||||
],
|
||||
"color": "#432",
|
||||
"bgcolor": "#653"
|
||||
"color": "#233",
|
||||
"bgcolor": "#355"
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"type": "SaveAudio",
|
||||
"id": 14,
|
||||
"type": "MarkdownNote",
|
||||
"pos": [
|
||||
-1040,
|
||||
-1130
|
||||
-1048.3660888671875,
|
||||
-960.8771362304688
|
||||
],
|
||||
"size": [
|
||||
270,
|
||||
112
|
||||
280.797607421875,
|
||||
487.02728271484375
|
||||
],
|
||||
"flags": {},
|
||||
"order": 4,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"name": "audio",
|
||||
"type": "AUDIO",
|
||||
"link": 27
|
||||
}
|
||||
],
|
||||
"inputs": [],
|
||||
"outputs": [],
|
||||
"properties": {
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.3.52",
|
||||
"Node name for S&R": "SaveAudio",
|
||||
"ue_properties": {
|
||||
"widget_ue_connectable": {
|
||||
"filename_prefix": true,
|
||||
"audioUI": true
|
||||
},
|
||||
"version": "7.0.1"
|
||||
}
|
||||
},
|
||||
"title": "Note",
|
||||
"properties": {},
|
||||
"widgets_values": [
|
||||
"audio/VibeVoice"
|
||||
]
|
||||
"## Models\n\nWill be downloaded on the first run, or download them manually and place them into the directory: /models/tts/VibeVoice\n\n| Model | Context Length | Generation Length | Weight |\n|-------|----------------|----------|----------|\n| VibeVoice-1.5B | 64K | ~90 min | [HF link](https://huggingface.co/microsoft/VibeVoice-1.5B) |\n| VibeVoice-Large| 32K | ~45 min | [HF link](https://huggingface.co/microsoft/VibeVoice-Large) |\n\n## Support \n\n- Don't know how to update PyTorch?\n- Need help with ComfyUI?\n- Need technical support?\n\n### Or do you just have questions? Then join the [@TokenDiffusion Hub](https://t.me/TokenDiff_hub) group\n\n### AI news [TokenDiffusion](https://t.me/TokenDiff)"
|
||||
],
|
||||
"color": "#233",
|
||||
"bgcolor": "#355"
|
||||
},
|
||||
{
|
||||
"id": 11,
|
||||
@@ -165,24 +204,24 @@
|
||||
-1130
|
||||
],
|
||||
"size": [
|
||||
460,
|
||||
510
|
||||
475.3999938964844,
|
||||
662.9000244140625
|
||||
],
|
||||
"flags": {},
|
||||
"order": 3,
|
||||
"order": 5,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"name": "speaker_1_voice",
|
||||
"shape": 7,
|
||||
"type": "AUDIO",
|
||||
"link": 28
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "speaker_2_voice",
|
||||
"shape": 7,
|
||||
"type": "AUDIO",
|
||||
"link": 29
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "speaker_3_voice",
|
||||
@@ -207,9 +246,9 @@
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "VibeVoiceTTS",
|
||||
"cnr_id": "ComfyUI-VibeVoice",
|
||||
"ver": "37803a884fb8f9b43c38286f6d654c7f97181a73",
|
||||
"Node name for S&R": "VibeVoiceTTS",
|
||||
"ue_properties": {
|
||||
"widget_ue_connectable": {
|
||||
"model_name": true,
|
||||
@@ -229,12 +268,12 @@
|
||||
},
|
||||
"widgets_values": [
|
||||
"VibeVoice-1.5B",
|
||||
"Speaker 1: I can't believe you did it again. I waited for two hours. Two hours! Not a single call, not a text. Do you have any idea how embarrassing that was, just sitting there alone?\nSpeaker 2: Look, I know, I'm sorry, alright? Work was a complete nightmare. My boss dropped a critical deadline on me at the last minute. I didn't even have a second to breathe, let alone check my phone.\nSpeaker 1: A nightmare? That's the same excuse you used last time. I'm starting to think you just don't care. It's easier to say 'work was crazy' than to just admit that I'm not a priority for you anymore.",
|
||||
"[1] I can't believe you did it again. I waited for two hours. Two hours! Not a single call, not a text. Do you have any idea how embarrassing that was, just sitting there alone?\n[2] Look, I know, I'm sorry, alright? Work was a complete nightmare. My boss dropped a critical deadline on me at the last minute. I didn't even have a second to breathe, let alone check my phone.\n",
|
||||
false,
|
||||
"flash_attention_2",
|
||||
1.3,
|
||||
10,
|
||||
1,
|
||||
471935335072093,
|
||||
"fixed",
|
||||
true,
|
||||
0.95,
|
||||
@@ -254,37 +293,21 @@
|
||||
3,
|
||||
0,
|
||||
"AUDIO"
|
||||
],
|
||||
[
|
||||
28,
|
||||
4,
|
||||
0,
|
||||
11,
|
||||
0,
|
||||
"AUDIO"
|
||||
],
|
||||
[
|
||||
29,
|
||||
8,
|
||||
0,
|
||||
11,
|
||||
1,
|
||||
"AUDIO"
|
||||
]
|
||||
],
|
||||
"groups": [],
|
||||
"config": {},
|
||||
"extra": {
|
||||
"ue_links": [],
|
||||
"links_added_by_ue": [],
|
||||
"ds": {
|
||||
"scale": 1.2100000000000002,
|
||||
"scale": 0.8264462809917354,
|
||||
"offset": [
|
||||
2000,
|
||||
1230
|
||||
2015.701904296875,
|
||||
1509.22314453125
|
||||
]
|
||||
},
|
||||
"frontendVersion": "1.25.11",
|
||||
"ue_links": [],
|
||||
"links_added_by_ue": [],
|
||||
"frontendVersion": "1.26.11",
|
||||
"VHS_latentpreview": false,
|
||||
"VHS_latentpreviewrate": 0,
|
||||
"VHS_MetadataImage": true,
|
||||
|
||||
Binary file not shown.
|
Before Width: | Height: | Size: 145 KiB After Width: | Height: | Size: 154 KiB |
@@ -34,27 +34,52 @@ def parse_script_1_based(script: str) -> tuple[list[tuple[int, str]], list[int]]
|
||||
Parses a 1-based speaker script into a list of (speaker_id, text) tuples
|
||||
and a list of unique speaker IDs in the order of their first appearance.
|
||||
Internally, it converts speaker IDs to 0-based for the model.
|
||||
|
||||
Supports two formats:
|
||||
1. Speaker 1: Some text...
|
||||
2. [1] Some text...
|
||||
|
||||
If no speaker markers are found, the entire script is assigned to Speaker 1.
|
||||
"""
|
||||
parsed_lines = []
|
||||
speaker_ids_in_script = [] # This will store the 1-based IDs from the script
|
||||
|
||||
line_format_regex = re.compile(r'^(?:Speaker\s+(\d+)\s*:|\[(\d+)\])\s*(.*)$', re.IGNORECASE)
|
||||
|
||||
for line in script.strip().split("\n"):
|
||||
if not (line := line.strip()): continue
|
||||
match = re.match(r'^Speaker\s+(\d+)\s*:\s*(.*)$', line, re.IGNORECASE)
|
||||
|
||||
match = line_format_regex.match(line)
|
||||
if match:
|
||||
speaker_id = int(match.group(1))
|
||||
speaker_id_str = match.group(1) or match.group(2)
|
||||
speaker_id = int(speaker_id_str)
|
||||
text_content = match.group(3)
|
||||
|
||||
if match.group(1) is None and text_content.lstrip().startswith(':'):
|
||||
colon_index = text_content.find(':')
|
||||
text_content = text_content[colon_index + 1:]
|
||||
|
||||
if speaker_id < 1:
|
||||
logger.warning(f"Speaker ID must be 1 or greater. Skipping line: '{line}'")
|
||||
continue
|
||||
text = ' ' + match.group(2).strip()
|
||||
# Internally, the model expects 0-based indexing for speakers
|
||||
|
||||
text = text_content.strip() # REMOVED the prepended space ' ' +
|
||||
internal_speaker_id = speaker_id - 1
|
||||
parsed_lines.append((internal_speaker_id, text))
|
||||
|
||||
if speaker_id not in speaker_ids_in_script:
|
||||
speaker_ids_in_script.append(speaker_id)
|
||||
else:
|
||||
logger.warning(f"Could not parse line, skipping: '{line}'")
|
||||
logger.warning(f"Could not parse speaker marker, treating as part of previous line if any, or ignoring: '{line}'")
|
||||
|
||||
if not parsed_lines and script.strip():
|
||||
logger.info("No speaker markers found. Treating entire text as a single utterance for Speaker 1.")
|
||||
parsed_lines.append((0, ' ' + script.strip()))
|
||||
speaker_ids_in_script.append(1)
|
||||
|
||||
return parsed_lines, sorted(list(set(speaker_ids_in_script)))
|
||||
|
||||
|
||||
def preprocess_comfy_audio(audio_dict: dict, target_sr: int = 24000) -> np.ndarray:
|
||||
"""
|
||||
Converts a ComfyUI AUDIO dict to a mono NumPy array, resampling if necessary.
|
||||
|
||||
@@ -480,11 +480,14 @@ class VibeVoiceForConditionalGenerationInference(VibeVoicePreTrainedModel, Gener
|
||||
model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
|
||||
if is_prefill:
|
||||
# we process the speech inputs only during the first generation step
|
||||
prefill_inputs = {
|
||||
"speech_tensors": speech_tensors.to(device=device),
|
||||
"speech_masks": speech_masks.to(device),
|
||||
"speech_input_mask": speech_input_mask.to(device),
|
||||
}
|
||||
# Conditionally add speech tensors to prefill_inputs only if they exist.
|
||||
prefill_inputs = {}
|
||||
if speech_tensors is not None:
|
||||
prefill_inputs["speech_tensors"] = speech_tensors.to(device=device)
|
||||
if speech_masks is not None:
|
||||
prefill_inputs["speech_masks"] = speech_masks.to(device)
|
||||
if speech_input_mask is not None:
|
||||
prefill_inputs["speech_input_mask"] = speech_input_mask.to(device)
|
||||
is_prefill = False
|
||||
else:
|
||||
_ = model_inputs.pop('inputs_embeds', None)
|
||||
|
||||
@@ -147,8 +147,10 @@ class VibeVoiceProcessor:
|
||||
|
||||
def __call__(
|
||||
self,
|
||||
text: Optional[Union[str, List[str], TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None,
|
||||
voice_samples: Optional[Union[List[Union[str, np.ndarray]], List[List[Union[str, np.ndarray]]]]] = None,
|
||||
text: Optional[List[str]] = None,
|
||||
parsed_scripts: Optional[List[List[Tuple[int, str]]]] = None, # <-- ADDED
|
||||
voice_samples: Optional[List[List[Optional[Union[str, np.ndarray]]]]] = None,
|
||||
speaker_ids_for_prompt: Optional[List[List[int]]] = None,
|
||||
padding: Union[bool, str, PaddingStrategy] = True,
|
||||
truncation: Union[bool, str, TruncationStrategy] = False,
|
||||
max_length: Optional[int] = None,
|
||||
@@ -189,31 +191,26 @@ class VibeVoiceProcessor:
|
||||
- **speech_masks** -- Speech masks (if voice_samples provided)
|
||||
- **speech_input_mask** -- Boolean masks indicating speech token positions
|
||||
"""
|
||||
# Handle single vs batch input
|
||||
if isinstance(text, str) or (isinstance(text, list) and len(text) > 0 and not isinstance(text[0], str)):
|
||||
# Single input
|
||||
texts = [text]
|
||||
is_batched = False
|
||||
else:
|
||||
# Batch input
|
||||
texts = text
|
||||
is_batched = True
|
||||
|
||||
# Handle voice samples
|
||||
if voice_samples is not None:
|
||||
if not is_batched or (isinstance(voice_samples[0], (str, np.ndarray))):
|
||||
# Single set of voice samples
|
||||
voice_samples_list = [voice_samples]
|
||||
else:
|
||||
# Batch of voice samples
|
||||
voice_samples_list = voice_samples
|
||||
else:
|
||||
voice_samples_list = [None] * len(texts)
|
||||
|
||||
# Process each input
|
||||
if parsed_scripts is None:
|
||||
if text is None:
|
||||
raise ValueError("Either 'text' or 'parsed_scripts' must be provided.")
|
||||
# Fallback for raw text input (though the node won't use this path)
|
||||
from ..modules.utils import parse_script_1_based
|
||||
parsed_scripts = [parse_script_1_based(t)[0] for t in text]
|
||||
|
||||
num_scripts = len(parsed_scripts)
|
||||
voice_samples_list = voice_samples if voice_samples is not None else [[] for _ in range(num_scripts)]
|
||||
speaker_ids_list = speaker_ids_for_prompt if speaker_ids_for_prompt is not None else [[] for _ in range(num_scripts)]
|
||||
|
||||
all_encodings = []
|
||||
for text_input, voice_input in zip(texts, voice_samples_list):
|
||||
encoding = self._process_single(text_input, voice_input)
|
||||
for i in range(num_scripts):
|
||||
# Pass all three corresponding items to _process_single
|
||||
encoding = self._process_single(
|
||||
parsed_scripts[i],
|
||||
voice_samples_list[i],
|
||||
speaker_ids_list[i]
|
||||
)
|
||||
all_encodings.append(encoding)
|
||||
|
||||
# Combine batch
|
||||
@@ -230,62 +227,38 @@ class VibeVoiceProcessor:
|
||||
|
||||
def _process_single(
|
||||
self,
|
||||
text: Union[str, TextInput],
|
||||
voice_samples: Optional[List[Union[str, np.ndarray]]] = None,
|
||||
parsed_script: List[Tuple[int, str]],
|
||||
voice_samples: List[Optional[Union[str, np.ndarray]]],
|
||||
speaker_ids: List[int],
|
||||
) -> Dict[str, Any]:
|
||||
"""Process a single podcast script."""
|
||||
# Determine if text is a file path or direct script
|
||||
script = None
|
||||
if isinstance(text, str):
|
||||
# Check if it's a file path
|
||||
if text.endswith('.json') and os.path.exists(text):
|
||||
script = self._convert_json_to_script(text)
|
||||
elif text.endswith('.txt') and os.path.exists(text):
|
||||
script = self._convert_text_to_script(text)
|
||||
else:
|
||||
# Assume it's the script content directly
|
||||
script = text
|
||||
|
||||
if script is None:
|
||||
raise ValueError(f"Could not process input text: {text}")
|
||||
|
||||
# Parse the script
|
||||
parsed_lines = self._parse_script(script)
|
||||
all_speakers = list(set(speaker_id for speaker_id, _ in parsed_lines))
|
||||
|
||||
# Create system prompt
|
||||
# system_tokens = self.tokenizer.encode(self.system_prompt, add_special_tokens=False)
|
||||
|
||||
system_tokens = self.tokenizer.encode(self.system_prompt)
|
||||
|
||||
# Process voice samples if provided
|
||||
if voice_samples:
|
||||
voice_tokens, voice_speech_inputs, voice_speech_masks = self._create_voice_prompt(voice_samples[:len(all_speakers)])
|
||||
else:
|
||||
voice_tokens, voice_speech_inputs, voice_speech_masks = [], [], []
|
||||
|
||||
# Build full token sequence
|
||||
voice_tokens, voice_speech_inputs, voice_speech_masks = self._create_voice_prompt(
|
||||
voice_samples, speaker_ids
|
||||
)
|
||||
|
||||
full_tokens = system_tokens + voice_tokens
|
||||
speech_input_mask = [False] * len(system_tokens) + voice_speech_masks
|
||||
|
||||
# Add text input section
|
||||
full_tokens += self.tokenizer.encode(' Text input:\n', add_special_tokens=False)
|
||||
speech_input_mask += [False] * len(self.tokenizer.encode(' Text input:\n', add_special_tokens=False))
|
||||
|
||||
for speaker_id, speaker_text in parsed_lines:
|
||||
speaker_text_tokens = self.tokenizer.encode(f" Speaker {speaker_id}:{speaker_text}\n", add_special_tokens=False)
|
||||
full_tokens += speaker_text_tokens
|
||||
speech_input_mask += [False] * len(speaker_text_tokens)
|
||||
|
||||
# Add speech output section
|
||||
full_tokens += self.tokenizer.encode(' Speech output:\n', add_special_tokens=False) + [self.tokenizer.speech_start_id]
|
||||
speech_input_mask += [False] * (len(self.tokenizer.encode(' Speech output:\n', add_special_tokens=False)) + 1)
|
||||
dialogue_lines = []
|
||||
for speaker_id_0_based, text_chunk in parsed_script:
|
||||
speaker_id_1_based = speaker_id_0_based + 1
|
||||
dialogue_lines.append(f"Speaker {speaker_id_1_based}: : {text_chunk}")
|
||||
|
||||
full_dialogue_script = "\n".join(dialogue_lines)
|
||||
|
||||
final_prompt_text = f" Text input:\n{full_dialogue_script}\n Speech output:\n"
|
||||
|
||||
prompt_tokens = self.tokenizer.encode(final_prompt_text, add_special_tokens=False)
|
||||
|
||||
full_tokens += prompt_tokens + [self.tokenizer.speech_start_id]
|
||||
speech_input_mask += [False] * (len(prompt_tokens) + 1)
|
||||
|
||||
return {
|
||||
"input_ids": full_tokens,
|
||||
"speech_inputs": voice_speech_inputs if voice_speech_inputs else None,
|
||||
"speech_input_mask": speech_input_mask,
|
||||
"parsed_script": parsed_lines,
|
||||
"all_speakers": all_speakers,
|
||||
}
|
||||
|
||||
def _batch_encode(
|
||||
@@ -298,11 +271,9 @@ class VibeVoiceProcessor:
|
||||
return_attention_mask: bool = True,
|
||||
) -> BatchEncoding:
|
||||
"""Combine multiple encodings into a batch with padding."""
|
||||
# Extract input_ids and create attention_mask
|
||||
input_ids_list = [enc["input_ids"] for enc in encodings]
|
||||
speech_input_masks_list = [enc["speech_input_mask"] for enc in encodings]
|
||||
|
||||
# Determine padding strategy
|
||||
if isinstance(padding, bool):
|
||||
padding_strategy = PaddingStrategy.LONGEST if padding else PaddingStrategy.DO_NOT_PAD
|
||||
elif isinstance(padding, str):
|
||||
@@ -347,15 +318,11 @@ class VibeVoiceProcessor:
|
||||
# No padding, just create attention masks
|
||||
attention_masks = [[1] * len(ids) for ids in input_ids_list] if return_attention_mask else None
|
||||
|
||||
# Process speech inputs
|
||||
all_speech_inputs = []
|
||||
has_speech = False
|
||||
for enc in encodings:
|
||||
if enc["speech_inputs"] is not None:
|
||||
if enc.get("speech_inputs"):
|
||||
all_speech_inputs.extend(enc["speech_inputs"])
|
||||
has_speech = True
|
||||
|
||||
# Prepare batch encoding
|
||||
|
||||
batch_encoding = BatchEncoding()
|
||||
|
||||
# Handle tensor conversion
|
||||
@@ -370,79 +337,79 @@ class VibeVoiceProcessor:
|
||||
batch_encoding["attention_mask"] = attention_masks
|
||||
batch_encoding["speech_input_mask"] = speech_input_masks_list
|
||||
|
||||
# Process speech tensors if present
|
||||
if has_speech:
|
||||
speech_dict = self.prepare_speech_inputs(
|
||||
all_speech_inputs,
|
||||
return_tensors=return_tensors,
|
||||
)
|
||||
if all_speech_inputs:
|
||||
speech_dict = self.prepare_speech_inputs(all_speech_inputs, return_tensors=return_tensors)
|
||||
batch_encoding["speech_tensors"] = speech_dict["padded_speeches"]
|
||||
batch_encoding["speech_masks"] = speech_dict["speech_masks"]
|
||||
else:
|
||||
batch_encoding["speech_tensors"] = None
|
||||
batch_encoding["speech_masks"] = None
|
||||
|
||||
# Add metadata
|
||||
batch_encoding["parsed_scripts"] = [enc["parsed_script"] for enc in encodings]
|
||||
batch_encoding["all_speakers_list"] = [enc["all_speakers"] for enc in encodings]
|
||||
|
||||
return batch_encoding
|
||||
|
||||
def _create_voice_prompt(
|
||||
self,
|
||||
speaker_samples: List[Union[str, np.ndarray]]
|
||||
speaker_samples: List[Optional[Union[str, np.ndarray]]],
|
||||
speaker_ids: List[int]
|
||||
) -> Tuple[List[int], List[np.ndarray], List[bool]]:
|
||||
"""
|
||||
Create voice prompt tokens and process audio samples.
|
||||
This function now handles `None` in the speaker_samples list for zero-shot speakers.
|
||||
|
||||
Returns:
|
||||
tuple: (voice_tokens, voice_speech_inputs, voice_speech_masks)
|
||||
"""
|
||||
if not any(s is not None for s in speaker_samples):
|
||||
return [], [], []
|
||||
|
||||
vae_token_id = self.tokenizer.speech_diffusion_id
|
||||
|
||||
voice_full_tokens = self.tokenizer.encode(' Voice input:\n', add_special_tokens=False)
|
||||
voice_speech_inputs = []
|
||||
voice_speech_masks = [False] * len(voice_full_tokens)
|
||||
|
||||
for speaker_id, speaker_audio in enumerate(speaker_samples):
|
||||
prefix_tokens = self.tokenizer.encode(f" Speaker {speaker_id}:", add_special_tokens=False)
|
||||
for speaker_id, speaker_audio in zip(speaker_ids, speaker_samples):
|
||||
|
||||
# Process audio
|
||||
if isinstance(speaker_audio, str):
|
||||
# Load audio from file
|
||||
wav = self.audio_processor._load_audio_from_path(speaker_audio)
|
||||
if speaker_audio is not None:
|
||||
logger.info(f"Creating voice prompt for Speaker {speaker_id} from reference audio.")
|
||||
prefix_tokens = self.tokenizer.encode(f" Speaker {speaker_id}:", add_special_tokens=False)
|
||||
newline_tokens = self.tokenizer.encode('\n', add_special_tokens=False)
|
||||
|
||||
if isinstance(speaker_audio, str):
|
||||
wav = self.audio_processor._load_audio_from_path(speaker_audio)
|
||||
else:
|
||||
wav = np.array(speaker_audio, dtype=np.float32)
|
||||
|
||||
if self.db_normalize and self.audio_normalizer:
|
||||
wav = self.audio_normalizer(wav)
|
||||
|
||||
|
||||
vae_tok_len = math.ceil(wav.shape[0] / self.speech_tok_compress_ratio)
|
||||
speaker_tokens = (
|
||||
prefix_tokens +
|
||||
[self.tokenizer.speech_start_id] +
|
||||
[vae_token_id] * vae_tok_len +
|
||||
[self.tokenizer.speech_end_id] +
|
||||
newline_tokens
|
||||
)
|
||||
|
||||
vae_input_mask = (
|
||||
[False] * len(prefix_tokens) +
|
||||
[False] + # for speech_start_id
|
||||
[True] * vae_tok_len +
|
||||
[False] + # for speech_end_id
|
||||
[False] * len(newline_tokens)
|
||||
)
|
||||
voice_speech_inputs.append(wav)
|
||||
voice_full_tokens.extend(speaker_tokens)
|
||||
voice_speech_masks.extend(vae_input_mask)
|
||||
else:
|
||||
wav = np.array(speaker_audio, dtype=np.float32)
|
||||
|
||||
# Apply normalization if needed
|
||||
if self.db_normalize and self.audio_normalizer:
|
||||
wav = self.audio_normalizer(wav)
|
||||
|
||||
# Calculate token length based on compression ratio
|
||||
# if speaker_audio.endswith('.pt') or speaker_audio.endswith('.npy'):
|
||||
# vae_tok_len = wav.shape[0]
|
||||
# else:
|
||||
vae_tok_len = math.ceil(wav.shape[0] / self.speech_tok_compress_ratio)
|
||||
|
||||
# Build tokens and masks
|
||||
speaker_tokens = (prefix_tokens +
|
||||
[self.tokenizer.speech_start_id] +
|
||||
[vae_token_id] * vae_tok_len +
|
||||
[self.tokenizer.speech_end_id] +
|
||||
self.tokenizer.encode('\n', add_special_tokens=False))
|
||||
|
||||
vae_input_mask = ([False] * len(prefix_tokens) +
|
||||
[False] +
|
||||
[True] * vae_tok_len +
|
||||
[False] +
|
||||
[False])
|
||||
|
||||
voice_full_tokens.extend(speaker_tokens)
|
||||
voice_speech_masks.extend(vae_input_mask)
|
||||
voice_speech_inputs.append(wav)
|
||||
|
||||
logger.info(f"Skipping voice prompt for Speaker {speaker_id} (zero-shot).")
|
||||
|
||||
|
||||
return voice_full_tokens, voice_speech_inputs, voice_speech_masks
|
||||
|
||||
|
||||
def prepare_speech_inputs(
|
||||
self,
|
||||
speech_inputs: List[np.ndarray],
|
||||
@@ -481,10 +448,7 @@ class VibeVoiceProcessor:
|
||||
padded_speeches[i, :len(speech)] = speech
|
||||
speech_masks[i, :vae_tok_length] = True
|
||||
|
||||
result = {
|
||||
"padded_speeches": padded_speeches,
|
||||
"speech_masks": speech_masks,
|
||||
}
|
||||
result = {"padded_speeches": padded_speeches, "speech_masks": speech_masks}
|
||||
|
||||
# Convert to tensors if requested
|
||||
if return_tensors == "pt":
|
||||
@@ -584,12 +548,10 @@ class VibeVoiceProcessor:
|
||||
parsed_lines = []
|
||||
speaker_ids = []
|
||||
|
||||
# First pass: parse all lines and collect speaker IDs
|
||||
for line in lines:
|
||||
if not line.strip():
|
||||
continue
|
||||
|
||||
# Use regex to handle edge cases like multiple colons
|
||||
match = re.match(r'^Speaker\s+(\d+)\s*:\s*(.*)$', line.strip(), re.IGNORECASE)
|
||||
|
||||
if match:
|
||||
|
||||
@@ -27,8 +27,8 @@ class VibeVoiceTTSNode:
|
||||
}),
|
||||
"text": ("STRING", {
|
||||
"multiline": True,
|
||||
"default": "Speaker 1: Hello from ComfyUI!\nSpeaker 2: VibeVoice sounds amazing.",
|
||||
"tooltip": "The script for the conversation. Use 'Speaker 1:', 'Speaker 2:', etc. to assign lines to different voices. Each speaker line should be on a new line."
|
||||
"default": "[1] Hello, this is a cloned voice.\n[2] And this is a generated voice, how cool is that?",
|
||||
"tooltip": "The script for generation. Use '[1]' or 'Speaker 1:' for speakers. If a speaker in the script lacks a reference voice, it will be generated via zero-shot TTS."
|
||||
}),
|
||||
"quantize_llm_4bit": ("BOOLEAN", {
|
||||
"default": False, "label_on": "Q4 (LLM only)", "label_off": "Full precision",
|
||||
@@ -72,10 +72,10 @@ class VibeVoiceTTSNode:
|
||||
}),
|
||||
},
|
||||
"optional": {
|
||||
"speaker_1_voice": ("AUDIO", {"tooltip": "Reference audio for 'Speaker 1' in the script."}),
|
||||
"speaker_2_voice": ("AUDIO", {"tooltip": "Reference audio for 'Speaker 2' in the script."}),
|
||||
"speaker_3_voice": ("AUDIO", {"tooltip": "Reference audio for 'Speaker 3' in the script."}),
|
||||
"speaker_4_voice": ("AUDIO", {"tooltip": "Reference audio for 'Speaker 4' in the script."}),
|
||||
"speaker_1_voice": ("AUDIO", {"tooltip": "Reference audio for 'Speaker 1' or '[1]' in the script."}),
|
||||
"speaker_2_voice": ("AUDIO", {"tooltip": "Reference audio for 'Speaker 2' or '[2]' in the script."}),
|
||||
"speaker_3_voice": ("AUDIO", {"tooltip": "Reference audio for 'Speaker 3' or '[3]' in the script."}),
|
||||
"speaker_4_voice": ("AUDIO", {"tooltip": "Reference audio for 'Speaker 4' or '[4]' in the script."}),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -113,23 +113,23 @@ class VibeVoiceTTSNode:
|
||||
|
||||
parsed_lines_0_based, speaker_ids_1_based = parse_script_1_based(text)
|
||||
if not parsed_lines_0_based:
|
||||
raise ValueError("Script is empty or invalid. Use 'Speaker 1:', 'Speaker 2:', etc. format.")
|
||||
raise ValueError("Script is empty or invalid. Please provide text to generate.")
|
||||
|
||||
full_script = "\n".join([f"Speaker {spk+1}: {txt}" for spk, txt in parsed_lines_0_based])
|
||||
# full_script = "\n".join([f"Speaker {spk+1}: {txt}" for spk, txt in parsed_lines_0_based]) # <-- REMOVED: This was the cause of the bug.
|
||||
|
||||
speaker_inputs = {i: kwargs.get(f"speaker_{i}_voice") for i in range(1, 5)}
|
||||
voice_samples_np = [preprocess_comfy_audio(speaker_inputs[sid]) for sid in speaker_ids_1_based]
|
||||
|
||||
if any(v is None for v in voice_samples_np):
|
||||
missing_ids = [sid for sid, v in zip(speaker_ids_1_based, voice_samples_np) if v is None]
|
||||
raise ValueError(f"Script requires voices for Speakers {missing_ids}, but they were not provided.")
|
||||
|
||||
voice_samples_np = [preprocess_comfy_audio(speaker_inputs.get(sid)) for sid in speaker_ids_1_based]
|
||||
|
||||
set_vibevoice_seed(seed)
|
||||
|
||||
try:
|
||||
inputs = processor(
|
||||
text=[full_script], voice_samples=[voice_samples_np], padding=True,
|
||||
return_tensors="pt", return_attention_mask=True
|
||||
parsed_scripts=[parsed_lines_0_based],
|
||||
voice_samples=[voice_samples_np],
|
||||
speaker_ids_for_prompt=[speaker_ids_1_based],
|
||||
padding=True,
|
||||
return_tensors="pt",
|
||||
return_attention_mask=True
|
||||
)
|
||||
|
||||
for key, value in inputs.items():
|
||||
@@ -155,7 +155,7 @@ class VibeVoiceTTSNode:
|
||||
def progress_callback(step, total_steps):
|
||||
pbar.update(1)
|
||||
if model_management.interrupt_current_processing:
|
||||
raise comfy.model_management.InterruptProcessingException()
|
||||
raise model_management.InterruptProcessingException()
|
||||
|
||||
try:
|
||||
outputs = model.generate(
|
||||
@@ -172,13 +172,13 @@ class VibeVoiceTTSNode:
|
||||
logger.error("This might be due to invalid input data, GPU memory issues, or incompatible attention mode.")
|
||||
logger.error("Try restarting ComfyUI, using different audio files, or switching to 'eager' attention mode.")
|
||||
raise e
|
||||
except comfy.model_management.InterruptProcessingException:
|
||||
except model_management.InterruptProcessingException:
|
||||
logger.info("VibeVoice generation interrupted by user")
|
||||
raise
|
||||
finally:
|
||||
pbar.update_absolute(inference_steps)
|
||||
|
||||
except comfy.model_management.InterruptProcessingException:
|
||||
except model_management.InterruptProcessingException:
|
||||
logger.info("VibeVoice TTS generation was cancelled")
|
||||
return ({"waveform": torch.zeros((1, 1, 24000), dtype=torch.float32), "sample_rate": 24000},)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user