Fix voice bleeding; improve audio quality; add [N] speaker tags and zero-shot voices

2026-04-30 11:41:35 +00:00 · 2025-09-24 17:42:30 +03:00
parent d04665d073
commit 696ef69152
6 changed files with 260 additions and 247 deletions
--- a/example_workflows/VibeVoice_example.json
+++ b/example_workflows/VibeVoice_example.json
@@ -1,9 +1,71 @@
 {
  "id": "b91265e5-1b03-4b63-8dc3-4abd9a030e08",
  "revision": 0,
-  "last_node_id": 11,
-  "last_link_id": 29,
+  "last_node_id": 14,
+  "last_link_id": 44,
  "nodes": [
+    {
+      "id": 3,
+      "type": "SaveAudio",
+      "pos": [
+        -1040,
+        -1130
+      ],
+      "size": [
+        270,
+        112
+      ],
+      "flags": {},
+      "order": 6,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "audio",
+          "type": "AUDIO",
+          "link": 27
+        }
+      ],
+      "outputs": [],
+      "properties": {
+        "Node name for S&R": "SaveAudio",
+        "cnr_id": "comfy-core",
+        "ver": "0.3.52",
+        "ue_properties": {
+          "widget_ue_connectable": {
+            "filename_prefix": true,
+            "audioUI": true
+          },
+          "version": "7.0.1"
+        }
+      },
+      "widgets_values": [
+        "audio/VibeVoice"
+      ]
+    },
+    {
+      "id": 13,
+      "type": "MarkdownNote",
+      "pos": [
+        -1898.1748046875,
+        -1409.22314453125
+      ],
+      "size": [
+        1035.619873046875,
+        211.96694946289062
+      ],
+      "flags": {},
+      "order": 0,
+      "mode": 0,
+      "inputs": [],
+      "outputs": [],
+      "title": "Note",
+      "properties": {},
+      "widgets_values": [
+        "# ComfyUI-VibeVoice\n\nVibeVoice is a novel framework by Microsoft for generating expressive, long-form, multi-speaker conversational audio. It excels at creating natural-sounding dialogue, podcasts, and more, with consistent voices for up to 4 speakers.\n\n**✨ Key Features:**\n*   **Multi-Speaker TTS:** Generate conversations with up to 4 distinct voices in a single audio output.\n*   **High-Fidelity Voice Cloning:** Use any audio file (`.wav`, `.mp3`) as a reference for a speaker's voice.\n*   **Hybrid Generation Mode:** Mix and match cloned voices with high-quality, zero-shot generated voices in the same script.\n*   **Flexible Scripting:** Use simple `[1]` tags or the classic `Speaker 1:` format to write your dialogue.\n*   **Advanced Attention Mechanisms:** Choose between `eager`, `sdpa`, `flash_attention_2`, and the high-performance `sage` attention for fine-tuned control over speed and compatibility.\n*   **Robust 4-Bit Quantization:** Run the large language model component in 4-bit mode to significantly reduce VRAM usage.\n*   **Automatic Model Management:** Models are downloaded automatically and managed efficiently by ComfyUI to save VRAM."
+      ],
+      "color": "#233",
+      "bgcolor": "#355"
+    },
    {
      "id": 4,
      "type": "LoadAudio",
@@ -12,26 +74,24 @@
        -1130
      ],
      "size": [
-        274.080078125,
+        272.9800720214844,
        136
      ],
      "flags": {},
-      "order": 0,
+      "order": 1,
      "mode": 0,
      "inputs": [],
      "outputs": [
        {
          "name": "AUDIO",
          "type": "AUDIO",
-          "links": [
-            28
-          ]
+          "links": []
        }
      ],
      "properties": {
+        "Node name for S&R": "LoadAudio",
        "cnr_id": "comfy-core",
        "ver": "0.3.52",
-        "Node name for S&R": "LoadAudio",
        "ue_properties": {
          "widget_ue_connectable": {
            "audio": true,
@@ -51,30 +111,28 @@
      "id": 8,
      "type": "LoadAudio",
      "pos": [
-        -1900,
-        -940
+        -1901.10009765625,
+        -948.7998046875
      ],
      "size": [
        274.080078125,
        136
      ],
      "flags": {},
-      "order": 1,
+      "order": 2,
      "mode": 0,
      "inputs": [],
      "outputs": [
        {
          "name": "AUDIO",
          "type": "AUDIO",
-          "links": [
-            29
-          ]
+          "links": []
        }
      ],
      "properties": {
+        "Node name for S&R": "LoadAudio",
        "cnr_id": "comfy-core",
        "ver": "0.3.52",
-        "Node name for S&R": "LoadAudio",
        "ue_properties": {
          "widget_ue_connectable": {
            "audio": true,
@@ -91,71 +149,52 @@
      ]
    },
    {
-      "id": 10,
+      "id": 12,
      "type": "MarkdownNote",
      "pos": [
-        -1030,
-        -960
+        -1915.701904296875,
+        -762.380126953125
      ],
      "size": [
-        420,
-        210
+        312.85455322265625,
+        292.8734130859375
      ],
      "flags": {},
-      "order": 2,
+      "order": 3,
      "mode": 0,
      "inputs": [],
      "outputs": [],
-      "title": "Notes",
-      "properties": {
-        "ue_properties": {
-          "widget_ue_connectable": {},
-          "version": "7.0.1"
-        }
-      },
+      "title": "Note",
+      "properties": {},
      "widgets_values": [
-        "## Models\n\nWill be downloaded on the first run, or download them manually and place them into the directory: /models/tts/VibeVoice\n\n| Model | Context Length | Generation Length |  Weight |\n|-------|----------------|----------|----------|\n| VibeVoice-0.5B-Streaming | - | - | On the way |\n| VibeVoice-1.5B | 64K | ~90 min | [HF link](https://huggingface.co/microsoft/VibeVoice-1.5B) |\n| VibeVoice-Large| 32K | ~45 min | [HF link](https://huggingface.co/microsoft/VibeVoice-Large) |"
+        "### Scripting and Voice Modes\n\n#### Speaker Tagging\nYou can assign lines to speakers in two ways. Both are treated identically.\n\n*   **Modern Format (Recommended):** `[1] This is the first speaker.`\n*   **Classic Format:** `Speaker 1: This is the first speaker.`\n\nYou can also add an optional colon to the modern format (e.g., `[1]: ...`). The node handles all variations consistently.\n\n#### Hybrid Voice Generation\nThis is a powerful feature that lets you mix cloned voices and generated (zero-shot) voices.\n\n*   **To Clone a Voice:** Connect a `Load Audio` node to the speaker's input (e.g., `speaker_1_voice`).\n*   **To Generate a Voice:** Leave the speaker's input empty. The model will create a unique, high-quality voice for that speaker."
      ],
-      "color": "#432",
-      "bgcolor": "#653"
+      "color": "#233",
+      "bgcolor": "#355"
    },
    {
-      "id": 3,
-      "type": "SaveAudio",
+      "id": 14,
+      "type": "MarkdownNote",
      "pos": [
-        -1040,
-        -1130
+        -1048.3660888671875,
+        -960.8771362304688
      ],
      "size": [
-        270,
-        112
+        280.797607421875,
+        487.02728271484375
      ],
      "flags": {},
      "order": 4,
      "mode": 0,
-      "inputs": [
-        {
-          "name": "audio",
-          "type": "AUDIO",
-          "link": 27
-        }
-      ],
+      "inputs": [],
      "outputs": [],
-      "properties": {
-        "cnr_id": "comfy-core",
-        "ver": "0.3.52",
-        "Node name for S&R": "SaveAudio",
-        "ue_properties": {
-          "widget_ue_connectable": {
-            "filename_prefix": true,
-            "audioUI": true
-          },
-          "version": "7.0.1"
-        }
-      },
+      "title": "Note",
+      "properties": {},
      "widgets_values": [
-        "audio/VibeVoice"
-      ]
+        "## Models\n\nWill be downloaded on the first run, or download them manually and place them into the directory: /models/tts/VibeVoice\n\n| Model | Context Length | Generation Length |  Weight |\n|-------|----------------|----------|----------|\n| VibeVoice-1.5B | 64K | ~90 min | [HF link](https://huggingface.co/microsoft/VibeVoice-1.5B) |\n| VibeVoice-Large| 32K | ~45 min | [HF link](https://huggingface.co/microsoft/VibeVoice-Large) |\n\n## Support \n\n- Don't know how to update PyTorch?\n- Need help with ComfyUI?\n- Need technical support?\n\n### Or do you just have questions? Then join the [@TokenDiffusion Hub](https://t.me/TokenDiff_hub) group\n\n### AI news [TokenDiffusion](https://t.me/TokenDiff)"
+      ],
+      "color": "#233",
+      "bgcolor": "#355"
    },
    {
      "id": 11,
@@ -165,24 +204,24 @@
        -1130
      ],
      "size": [
-        460,
-        510
+        475.3999938964844,
+        662.9000244140625
      ],
      "flags": {},
-      "order": 3,
+      "order": 5,
      "mode": 0,
      "inputs": [
        {
          "name": "speaker_1_voice",
          "shape": 7,
          "type": "AUDIO",
-          "link": 28
+          "link": null
        },
        {
          "name": "speaker_2_voice",
          "shape": 7,
          "type": "AUDIO",
-          "link": 29
+          "link": null
        },
        {
          "name": "speaker_3_voice",
@@ -207,9 +246,9 @@
        }
      ],
      "properties": {
+        "Node name for S&R": "VibeVoiceTTS",
        "cnr_id": "ComfyUI-VibeVoice",
        "ver": "37803a884fb8f9b43c38286f6d654c7f97181a73",
-        "Node name for S&R": "VibeVoiceTTS",
        "ue_properties": {
          "widget_ue_connectable": {
            "model_name": true,
@@ -229,12 +268,12 @@
      },
      "widgets_values": [
        "VibeVoice-1.5B",
-        "Speaker 1: I can't believe you did it again. I waited for two hours. Two hours! Not a single call, not a text. Do you have any idea how embarrassing that was, just sitting there alone?\nSpeaker 2: Look, I know, I'm sorry, alright? Work was a complete nightmare. My boss dropped a critical deadline on me at the last minute. I didn't even have a second to breathe, let alone check my phone.\nSpeaker 1: A nightmare? That's the same excuse you used last time. I'm starting to think you just don't care. It's easier to say 'work was crazy' than to just admit that I'm not a priority for you anymore.",
+        "[1] I can't believe you did it again. I waited for two hours. Two hours! Not a single call, not a text. Do you have any idea how embarrassing that was, just sitting there alone?\n[2] Look, I know, I'm sorry, alright? Work was a complete nightmare. My boss dropped a critical deadline on me at the last minute. I didn't even have a second to breathe, let alone check my phone.\n",
        false,
        "flash_attention_2",
        1.3,
        10,
-        1,
+        471935335072093,
        "fixed",
        true,
        0.95,
@@ -254,37 +293,21 @@
      3,
      0,
      "AUDIO"
-    ],
-    [
-      28,
-      4,
-      0,
-      11,
-      0,
-      "AUDIO"
-    ],
-    [
-      29,
-      8,
-      0,
-      11,
-      1,
-      "AUDIO"
    ]
  ],
  "groups": [],
  "config": {},
  "extra": {
-    "ue_links": [],
-    "links_added_by_ue": [],
    "ds": {
-      "scale": 1.2100000000000002,
+      "scale": 0.8264462809917354,
      "offset": [
-        2000,
-        1230
+        2015.701904296875,
+        1509.22314453125
      ]
    },
-    "frontendVersion": "1.25.11",
+    "ue_links": [],
+    "links_added_by_ue": [],
+    "frontendVersion": "1.26.11",
    "VHS_latentpreview": false,
    "VHS_latentpreviewrate": 0,
    "VHS_MetadataImage": true,
--- a/example_workflows/VibeVoice_example.png
+++ b/example_workflows/VibeVoice_example.png
--- a/modules/utils.py
+++ b/modules/utils.py
@@ -34,27 +34,52 @@ def parse_script_1_based(script: str) -> tuple[list[tuple[int, str]], list[int]]
    Parses a 1-based speaker script into a list of (speaker_id, text) tuples
    and a list of unique speaker IDs in the order of their first appearance.
    Internally, it converts speaker IDs to 0-based for the model.
+
+    Supports two formats:
+    1. Speaker 1: Some text...
+    2. [1] Some text...  (an optional colon after the tag, "[1]: Some text...", is also accepted)
+
+    If no speaker markers are found, the entire script is assigned to Speaker 1.
    """
    parsed_lines = []
    speaker_ids_in_script = [] # This will store the 1-based IDs from the script
+    
+    line_format_regex = re.compile(r'^(?:Speaker\s+(\d+)\s*:|\[(\d+)\])\s*(.*)$', re.IGNORECASE)
+
    for line in script.strip().split("\n"):
        if not (line := line.strip()): continue
-        match = re.match(r'^Speaker\s+(\d+)\s*:\s*(.*)$', line, re.IGNORECASE)
+        
+        match = line_format_regex.match(line)
        if match:
-            speaker_id = int(match.group(1))
+            speaker_id_str = match.group(1) or match.group(2)
+            speaker_id = int(speaker_id_str)
+            text_content = match.group(3)
+
+            if match.group(1) is None and text_content.lstrip().startswith(':'):
+                colon_index = text_content.find(':')
+                text_content = text_content[colon_index + 1:]
+
            if speaker_id < 1:
                logger.warning(f"Speaker ID must be 1 or greater. Skipping line: '{line}'")
                continue
-            text = ' ' + match.group(2).strip()
-            # Internally, the model expects 0-based indexing for speakers
+
+            text = text_content.strip() # stored without a leading space; the prompt builder supplies the separator after the speaker label
            internal_speaker_id = speaker_id - 1
            parsed_lines.append((internal_speaker_id, text))
+            
            if speaker_id not in speaker_ids_in_script:
                speaker_ids_in_script.append(speaker_id)
        else:
-            logger.warning(f"Could not parse line, skipping: '{line}'")
+            logger.warning(f"Could not parse speaker marker, treating as part of previous line if any, or ignoring: '{line}'")
+
+    if not parsed_lines and script.strip():
+        logger.info("No speaker markers found. Treating entire text as a single utterance for Speaker 1.")
+        parsed_lines.append((0, ' ' + script.strip()))
+        speaker_ids_in_script.append(1)
+
    return parsed_lines, sorted(list(set(speaker_ids_in_script)))

+
 def preprocess_comfy_audio(audio_dict: dict, target_sr: int = 24000) -> np.ndarray:
    """
    Converts a ComfyUI AUDIO dict to a mono NumPy array, resampling if necessary.
--- a/vibevoice/modular/modeling_vibevoice_inference.py
+++ b/vibevoice/modular/modeling_vibevoice_inference.py
@@ -480,11 +480,14 @@ class VibeVoiceForConditionalGenerationInference(VibeVoicePreTrainedModel, Gener
            model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
            if is_prefill:
                # we process the speech inputs only during the first generation step
-                prefill_inputs = {
-                    "speech_tensors": speech_tensors.to(device=device),
-                    "speech_masks": speech_masks.to(device),
-                    "speech_input_mask": speech_input_mask.to(device),
-                }
+                # Conditionally add speech tensors to prefill_inputs only if they exist.
+                prefill_inputs = {}
+                if speech_tensors is not None:
+                    prefill_inputs["speech_tensors"] = speech_tensors.to(device=device)
+                if speech_masks is not None:
+                    prefill_inputs["speech_masks"] = speech_masks.to(device)
+                if speech_input_mask is not None:
+                    prefill_inputs["speech_input_mask"] = speech_input_mask.to(device)
                is_prefill = False
            else:
                _ = model_inputs.pop('inputs_embeds', None)
--- a/vibevoice/processor/vibevoice_processor.py
+++ b/vibevoice/processor/vibevoice_processor.py
@@ -147,8 +147,10 @@ class VibeVoiceProcessor:
    
    def __call__(
        self,
-        text: Optional[Union[str, List[str], TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None,
-        voice_samples: Optional[Union[List[Union[str, np.ndarray]], List[List[Union[str, np.ndarray]]]]] = None,
+        text: Optional[List[str]] = None,
+        parsed_scripts: Optional[List[List[Tuple[int, str]]]] = None, # one list of (0-based speaker id, text) tuples per script; when given, `text` is not parsed
+        voice_samples: Optional[List[List[Optional[Union[str, np.ndarray]]]]] = None,
+        speaker_ids_for_prompt: Optional[List[List[int]]] = None,
        padding: Union[bool, str, PaddingStrategy] = True,
        truncation: Union[bool, str, TruncationStrategy] = False,
        max_length: Optional[int] = None,
@@ -189,31 +191,26 @@ class VibeVoiceProcessor:
                - **speech_masks** -- Speech masks (if voice_samples provided)
                - **speech_input_mask** -- Boolean masks indicating speech token positions
        """
-        # Handle single vs batch input
-        if isinstance(text, str) or (isinstance(text, list) and len(text) > 0 and not isinstance(text[0], str)):
-            # Single input
-            texts = [text]
-            is_batched = False
-        else:
-            # Batch input
-            texts = text
-            is_batched = True
-            
-        # Handle voice samples
-        if voice_samples is not None:
-            if not is_batched or (isinstance(voice_samples[0], (str, np.ndarray))):
-                # Single set of voice samples
-                voice_samples_list = [voice_samples]
-            else:
-                # Batch of voice samples
-                voice_samples_list = voice_samples
-        else:
-            voice_samples_list = [None] * len(texts)
        
-        # Process each input
+        if parsed_scripts is None:
+            if text is None:
+                raise ValueError("Either 'text' or 'parsed_scripts' must be provided.")
+            # Fallback for raw text input (though the node won't use this path)
+            from ..modules.utils import parse_script_1_based
+            parsed_scripts = [parse_script_1_based(t)[0] for t in text]
+
+        num_scripts = len(parsed_scripts)
+        voice_samples_list = voice_samples if voice_samples is not None else [[] for _ in range(num_scripts)]
+        speaker_ids_list = speaker_ids_for_prompt if speaker_ids_for_prompt is not None else [[] for _ in range(num_scripts)]
+        
        all_encodings = []
-        for text_input, voice_input in zip(texts, voice_samples_list):
-            encoding = self._process_single(text_input, voice_input)
+        for i in range(num_scripts):
+            # Pass all three corresponding items to _process_single
+            encoding = self._process_single(
+                parsed_scripts[i],
+                voice_samples_list[i],
+                speaker_ids_list[i]
+            )
            all_encodings.append(encoding)
            
        # Combine batch
@@ -230,62 +227,38 @@ class VibeVoiceProcessor:
    
    def _process_single(
        self,
-        text: Union[str, TextInput],
-        voice_samples: Optional[List[Union[str, np.ndarray]]] = None,
+        parsed_script: List[Tuple[int, str]],
+        voice_samples: List[Optional[Union[str, np.ndarray]]],
+        speaker_ids: List[int],
    ) -> Dict[str, Any]:
-        """Process a single podcast script."""
-        # Determine if text is a file path or direct script
-        script = None
-        if isinstance(text, str):
-            # Check if it's a file path
-            if text.endswith('.json') and os.path.exists(text):
-                script = self._convert_json_to_script(text)
-            elif text.endswith('.txt') and os.path.exists(text):
-                script = self._convert_text_to_script(text)
-            else:
-                # Assume it's the script content directly
-                script = text
-        
-        if script is None:
-            raise ValueError(f"Could not process input text: {text}")
-        
-        # Parse the script
-        parsed_lines = self._parse_script(script)
-        all_speakers = list(set(speaker_id for speaker_id, _ in parsed_lines))
-        
-        # Create system prompt
-        # system_tokens = self.tokenizer.encode(self.system_prompt, add_special_tokens=False)
+
        system_tokens = self.tokenizer.encode(self.system_prompt)
        
-        # Process voice samples if provided
-        if voice_samples:
-            voice_tokens, voice_speech_inputs, voice_speech_masks = self._create_voice_prompt(voice_samples[:len(all_speakers)])
-        else:
-            voice_tokens, voice_speech_inputs, voice_speech_masks = [], [], []
-        
-        # Build full token sequence
+        voice_tokens, voice_speech_inputs, voice_speech_masks = self._create_voice_prompt(
+            voice_samples, speaker_ids
+        )
+
        full_tokens = system_tokens + voice_tokens
        speech_input_mask = [False] * len(system_tokens) + voice_speech_masks
        
-        # Add text input section
-        full_tokens += self.tokenizer.encode(' Text input:\n', add_special_tokens=False)
-        speech_input_mask += [False] * len(self.tokenizer.encode(' Text input:\n', add_special_tokens=False))
-        
-        for speaker_id, speaker_text in parsed_lines:
-            speaker_text_tokens = self.tokenizer.encode(f" Speaker {speaker_id}:{speaker_text}\n", add_special_tokens=False)
-            full_tokens += speaker_text_tokens
-            speech_input_mask += [False] * len(speaker_text_tokens)
-        
-        # Add speech output section
-        full_tokens += self.tokenizer.encode(' Speech output:\n', add_special_tokens=False) + [self.tokenizer.speech_start_id]
-        speech_input_mask += [False] * (len(self.tokenizer.encode(' Speech output:\n', add_special_tokens=False)) + 1)
+        dialogue_lines = []
+        for speaker_id_0_based, text_chunk in parsed_script:
+            speaker_id_1_based = speaker_id_0_based + 1
+            dialogue_lines.append(f"Speaker {speaker_id_1_based}: : {text_chunk}")
        
+        full_dialogue_script = "\n".join(dialogue_lines)
+
+        final_prompt_text = f" Text input:\n{full_dialogue_script}\n Speech output:\n"
+
+        prompt_tokens = self.tokenizer.encode(final_prompt_text, add_special_tokens=False)
+
+        full_tokens += prompt_tokens + [self.tokenizer.speech_start_id]
+        speech_input_mask += [False] * (len(prompt_tokens) + 1)
+
        return {
            "input_ids": full_tokens,
            "speech_inputs": voice_speech_inputs if voice_speech_inputs else None,
            "speech_input_mask": speech_input_mask,
-            "parsed_script": parsed_lines,
-            "all_speakers": all_speakers,
        }
    
    def _batch_encode(
@@ -298,11 +271,9 @@ class VibeVoiceProcessor:
        return_attention_mask: bool = True,
    ) -> BatchEncoding:
        """Combine multiple encodings into a batch with padding."""
-        # Extract input_ids and create attention_mask
        input_ids_list = [enc["input_ids"] for enc in encodings]
        speech_input_masks_list = [enc["speech_input_mask"] for enc in encodings]
        
-        # Determine padding strategy
        if isinstance(padding, bool):
            padding_strategy = PaddingStrategy.LONGEST if padding else PaddingStrategy.DO_NOT_PAD
        elif isinstance(padding, str):
@@ -347,15 +318,11 @@ class VibeVoiceProcessor:
            # No padding, just create attention masks
            attention_masks = [[1] * len(ids) for ids in input_ids_list] if return_attention_mask else None
            
-        # Process speech inputs
        all_speech_inputs = []
-        has_speech = False
        for enc in encodings:
-            if enc["speech_inputs"] is not None:
+            if enc.get("speech_inputs"):
                all_speech_inputs.extend(enc["speech_inputs"])
-                has_speech = True
-                
-        # Prepare batch encoding
+        
        batch_encoding = BatchEncoding()
        
        # Handle tensor conversion
@@ -370,79 +337,79 @@ class VibeVoiceProcessor:
                batch_encoding["attention_mask"] = attention_masks
            batch_encoding["speech_input_mask"] = speech_input_masks_list
            
-        # Process speech tensors if present
-        if has_speech:
-            speech_dict = self.prepare_speech_inputs(
-                all_speech_inputs,
-                return_tensors=return_tensors,
-            )
+        if all_speech_inputs:
+            speech_dict = self.prepare_speech_inputs(all_speech_inputs, return_tensors=return_tensors)
            batch_encoding["speech_tensors"] = speech_dict["padded_speeches"]
            batch_encoding["speech_masks"] = speech_dict["speech_masks"]
        else:
            batch_encoding["speech_tensors"] = None
            batch_encoding["speech_masks"] = None
            
-        # Add metadata
-        batch_encoding["parsed_scripts"] = [enc["parsed_script"] for enc in encodings]
-        batch_encoding["all_speakers_list"] = [enc["all_speakers"] for enc in encodings]
-        
        return batch_encoding

    def _create_voice_prompt(
        self, 
-        speaker_samples: List[Union[str, np.ndarray]]
+        speaker_samples: List[Optional[Union[str, np.ndarray]]],
+        speaker_ids: List[int]
    ) -> Tuple[List[int], List[np.ndarray], List[bool]]:
        """
        Create voice prompt tokens and process audio samples.
+        A `None` entry in speaker_samples marks a zero-shot speaker and is skipped; if every entry is `None`, empty lists are returned.
        
        Returns:
            tuple: (voice_tokens, voice_speech_inputs, voice_speech_masks)
        """
+        if not any(s is not None for s in speaker_samples):
+            return [], [], []
+
        vae_token_id = self.tokenizer.speech_diffusion_id
        
        voice_full_tokens = self.tokenizer.encode(' Voice input:\n', add_special_tokens=False)
        voice_speech_inputs = []
        voice_speech_masks = [False] * len(voice_full_tokens)
        
-        for speaker_id, speaker_audio in enumerate(speaker_samples):
-            prefix_tokens = self.tokenizer.encode(f" Speaker {speaker_id}:", add_special_tokens=False)
+        for speaker_id, speaker_audio in zip(speaker_ids, speaker_samples):
            
-            # Process audio
-            if isinstance(speaker_audio, str):
-                # Load audio from file
-                wav = self.audio_processor._load_audio_from_path(speaker_audio)
+            if speaker_audio is not None:
+                logger.info(f"Creating voice prompt for Speaker {speaker_id} from reference audio.")
+                prefix_tokens = self.tokenizer.encode(f" Speaker {speaker_id}:", add_special_tokens=False)
+                newline_tokens = self.tokenizer.encode('\n', add_special_tokens=False)
+
+                if isinstance(speaker_audio, str):
+                    wav = self.audio_processor._load_audio_from_path(speaker_audio)
+                else:
+                    wav = np.array(speaker_audio, dtype=np.float32)
+                
+                if self.db_normalize and self.audio_normalizer:
+                    wav = self.audio_normalizer(wav)
+
+            
+                vae_tok_len = math.ceil(wav.shape[0] / self.speech_tok_compress_ratio)
+                speaker_tokens = (
+                    prefix_tokens + 
+                    [self.tokenizer.speech_start_id] + 
+                    [vae_token_id] * vae_tok_len + 
+                    [self.tokenizer.speech_end_id] + 
+                    newline_tokens
+                )
+                
+                vae_input_mask = (
+                    [False] * len(prefix_tokens) + 
+                    [False] + # for speech_start_id
+                    [True] * vae_tok_len + 
+                    [False] + # for speech_end_id
+                    [False] * len(newline_tokens)
+                )
+                voice_speech_inputs.append(wav)
+                voice_full_tokens.extend(speaker_tokens)
+                voice_speech_masks.extend(vae_input_mask)
            else:
-                wav = np.array(speaker_audio, dtype=np.float32)
-            
-            # Apply normalization if needed
-            if self.db_normalize and self.audio_normalizer:
-                wav = self.audio_normalizer(wav)
-            
-            # Calculate token length based on compression ratio
-            # if speaker_audio.endswith('.pt') or speaker_audio.endswith('.npy'):
-            #     vae_tok_len = wav.shape[0]
-            # else:
-            vae_tok_len = math.ceil(wav.shape[0] / self.speech_tok_compress_ratio)
-            
-            # Build tokens and masks
-            speaker_tokens = (prefix_tokens + 
-                            [self.tokenizer.speech_start_id] + 
-                            [vae_token_id] * vae_tok_len + 
-                            [self.tokenizer.speech_end_id] + 
-                            self.tokenizer.encode('\n', add_special_tokens=False))
-            
-            vae_input_mask = ([False] * len(prefix_tokens) + 
-                            [False] + 
-                            [True] * vae_tok_len + 
-                            [False] + 
-                            [False])
-            
-            voice_full_tokens.extend(speaker_tokens)
-            voice_speech_masks.extend(vae_input_mask)
-            voice_speech_inputs.append(wav)
-            
+                logger.info(f"Skipping voice prompt for Speaker {speaker_id} (zero-shot).")
+        
+
        return voice_full_tokens, voice_speech_inputs, voice_speech_masks

+
    def prepare_speech_inputs(
        self,
        speech_inputs: List[np.ndarray],
@@ -481,10 +448,7 @@ class VibeVoiceProcessor:
            padded_speeches[i, :len(speech)] = speech
            speech_masks[i, :vae_tok_length] = True
        
-        result = {
-            "padded_speeches": padded_speeches,
-            "speech_masks": speech_masks,
-        }
+        result = {"padded_speeches": padded_speeches, "speech_masks": speech_masks}
        
        # Convert to tensors if requested
        if return_tensors == "pt":
@@ -584,12 +548,10 @@ class VibeVoiceProcessor:
        parsed_lines = []
        speaker_ids = []
                
-        # First pass: parse all lines and collect speaker IDs
        for line in lines:
            if not line.strip():
                continue
                
-            # Use regex to handle edge cases like multiple colons
            match = re.match(r'^Speaker\s+(\d+)\s*:\s*(.*)$', line.strip(), re.IGNORECASE)
            
            if match:
--- a/vibevoice_nodes.py
+++ b/vibevoice_nodes.py
@@ -27,8 +27,8 @@ class VibeVoiceTTSNode:
                }),
                "text": ("STRING", {
                    "multiline": True, 
-                    "default": "Speaker 1: Hello from ComfyUI!\nSpeaker 2: VibeVoice sounds amazing.",
-                    "tooltip": "The script for the conversation. Use 'Speaker 1:', 'Speaker 2:', etc. to assign lines to different voices. Each speaker line should be on a new line."
+                    "default": "[1] Hello, this is a cloned voice.\n[2] And this is a generated voice, how cool is that?",
+                    "tooltip": "The script for generation. Use '[1]' or 'Speaker 1:' for speakers. If a speaker in the script lacks a reference voice, it will be generated via zero-shot TTS."
                }),
                "quantize_llm_4bit": ("BOOLEAN", {
                    "default": False, "label_on": "Q4 (LLM only)", "label_off": "Full precision",
@@ -72,10 +72,10 @@ class VibeVoiceTTSNode:
                }),
            },
            "optional": {
-                "speaker_1_voice": ("AUDIO", {"tooltip": "Reference audio for 'Speaker 1' in the script."}),
-                "speaker_2_voice": ("AUDIO", {"tooltip": "Reference audio for 'Speaker 2' in the script."}),
-                "speaker_3_voice": ("AUDIO", {"tooltip": "Reference audio for 'Speaker 3' in the script."}),
-                "speaker_4_voice": ("AUDIO", {"tooltip": "Reference audio for 'Speaker 4' in the script."}),
+                "speaker_1_voice": ("AUDIO", {"tooltip": "Reference audio for 'Speaker 1' or '[1]' in the script."}),
+                "speaker_2_voice": ("AUDIO", {"tooltip": "Reference audio for 'Speaker 2' or '[2]' in the script."}),
+                "speaker_3_voice": ("AUDIO", {"tooltip": "Reference audio for 'Speaker 3' or '[3]' in the script."}),
+                "speaker_4_voice": ("AUDIO", {"tooltip": "Reference audio for 'Speaker 4' or '[4]' in the script."}),
            }
        }
    
@@ -113,23 +113,23 @@ class VibeVoiceTTSNode:
        
        parsed_lines_0_based, speaker_ids_1_based = parse_script_1_based(text)
        if not parsed_lines_0_based:
-            raise ValueError("Script is empty or invalid. Use 'Speaker 1:', 'Speaker 2:', etc. format.")
+            raise ValueError("Script is empty or invalid. Please provide text to generate.")
            
-        full_script = "\n".join([f"Speaker {spk+1}: {txt}" for spk, txt in parsed_lines_0_based])
+        # The parsed (speaker, text) tuples are passed to the processor as-is; re-joining them into a "Speaker N: ..." script here was the cause of the bug.
        
        speaker_inputs = {i: kwargs.get(f"speaker_{i}_voice") for i in range(1, 5)}
-        voice_samples_np = [preprocess_comfy_audio(speaker_inputs[sid]) for sid in speaker_ids_1_based]
-        
-        if any(v is None for v in voice_samples_np):
-            missing_ids = [sid for sid, v in zip(speaker_ids_1_based, voice_samples_np) if v is None]
-            raise ValueError(f"Script requires voices for Speakers {missing_ids}, but they were not provided.")
-        
+        voice_samples_np = [preprocess_comfy_audio(speaker_inputs.get(sid)) for sid in speaker_ids_1_based]
+
        set_vibevoice_seed(seed)
        
        try:
            inputs = processor(
-                text=[full_script], voice_samples=[voice_samples_np], padding=True,
-                return_tensors="pt", return_attention_mask=True
+                parsed_scripts=[parsed_lines_0_based],
+                voice_samples=[voice_samples_np], 
+                speaker_ids_for_prompt=[speaker_ids_1_based],
+                padding=True,
+                return_tensors="pt", 
+                return_attention_mask=True
            )
            
            for key, value in inputs.items():
@@ -155,7 +155,7 @@ class VibeVoiceTTSNode:
                def progress_callback(step, total_steps):
                    pbar.update(1)
                    if model_management.interrupt_current_processing:
-                        raise comfy.model_management.InterruptProcessingException()
+                        raise model_management.InterruptProcessingException()

                try:
                    outputs = model.generate(
@@ -172,13 +172,13 @@ class VibeVoiceTTSNode:
                        logger.error("This might be due to invalid input data, GPU memory issues, or incompatible attention mode.")
                        logger.error("Try restarting ComfyUI, using different audio files, or switching to 'eager' attention mode.")
                    raise e
-                except comfy.model_management.InterruptProcessingException:
+                except model_management.InterruptProcessingException:
                    logger.info("VibeVoice generation interrupted by user")
                    raise
                finally:
                    pbar.update_absolute(inference_steps)

-        except comfy.model_management.InterruptProcessingException:
+        except model_management.InterruptProcessingException:
            logger.info("VibeVoice TTS generation was cancelled")
            return ({"waveform": torch.zeros((1, 1, 24000), dtype=torch.float32), "sample_rate": 24000},)