diff --git a/__init__.py b/__init__.py index d9526d4..d48be41 100644 --- a/__init__.py +++ b/__init__.py @@ -11,14 +11,15 @@ import folder_paths from .vibevoice_nodes import NODE_CLASS_MAPPINGS, NODE_DISPLAY_NAME_MAPPINGS -# logger +# Configure a logger for the entire custom node package logger = logging.getLogger(__name__) +logger.setLevel(logging.WARNING) + if not logger.hasHandlers(): handler = logging.StreamHandler(sys.stdout) formatter = logging.Formatter(f"[ComfyUI-VibeVoice] %(message)s") handler.setFormatter(formatter) logger.addHandler(handler) - logger.setLevel(logging.INFO) VIBEVOICE_MODEL_SUBDIR = os.path.join("tts", "VibeVoice") diff --git a/example_workflows/VibeVoice_example.json b/example_workflows/VibeVoice_example.json index 1a496bc..ed12a3a 100644 --- a/example_workflows/VibeVoice_example.json +++ b/example_workflows/VibeVoice_example.json @@ -1,11 +1,11 @@ { "id": "b91265e5-1b03-4b63-8dc3-4abd9a030e08", "revision": 0, - "last_node_id": 5, - "last_link_id": 16, + "last_node_id": 10, + "last_link_id": 24, "nodes": [ { - "id": 2, + "id": 4, "type": "LoadAudio", "pos": [ -1900, @@ -24,14 +24,14 @@ "name": "AUDIO", "type": "AUDIO", "links": [ - 15 + 21 ] } ], "properties": { + "Node name for S&R": "LoadAudio", "cnr_id": "comfy-core", "ver": "0.3.52", - "Node name for S&R": "LoadAudio", "ue_properties": { "widget_ue_connectable": { "audio": true, @@ -42,13 +42,13 @@ } }, "widgets_values": [ - "male_petergriffin.wav", + "male_rickmorty.mp3", null, null ] }, { - "id": 4, + "id": 8, "type": "LoadAudio", "pos": [ -1900, @@ -67,14 +67,14 @@ "name": "AUDIO", "type": "AUDIO", "links": [ - 16 + 24 ] } ], "properties": { + "Node name for S&R": "LoadAudio", "cnr_id": "comfy-core", "ver": "0.3.52", - "Node name for S&R": "LoadAudio", "ue_properties": { "widget_ue_connectable": { "audio": true, @@ -85,7 +85,7 @@ } }, "widgets_values": [ - "male_rickmorty.mp3", + "male_stewie.mp3", null, null ] @@ -102,20 +102,20 @@ 112 ], "flags": {}, - "order": 3, + "order": 4, "mode": 0, "inputs": [ { "name": "audio", "type": "AUDIO", - "link": 13 + "link": 23 } ], "outputs": [], "properties": { + "Node name for S&R": "SaveAudio", "cnr_id": "comfy-core", "ver": "0.3.52", - "Node name for S&R": "SaveAudio", "ue_properties": { "widget_ue_connectable": { "filename_prefix": true, @@ -129,31 +129,55 @@ ] }, { - "id": 5, + "id": 10, + "type": "MarkdownNote", + "pos": [ + -1030, + -960 + ], + "size": [ + 420, + 210 + ], + "flags": {}, + "order": 2, + "mode": 0, + "inputs": [], + "outputs": [], + "title": "Notes", + "properties": {}, + "widgets_values": [ + "## Models\n\nWill be downloaded on the first run, or download them manually and place them into the directory: /models/tts/VibeVoice\n\n| Model | Context Length | Generation Length | Weight |\n|-------|----------------|----------|----------|\n| VibeVoice-0.5B-Streaming | - | - | On the way |\n| VibeVoice-1.5B | 64K | ~90 min | [HF link](https://huggingface.co/microsoft/VibeVoice-1.5B) |\n| VibeVoice-7B-Preview| 32K | ~45 min | [HF link](https://huggingface.co/WestZhang/VibeVoice-Large-pt) |" + ], + "color": "#432", + "bgcolor": "#653" + }, + { + "id": 9, "type": "VibeVoiceTTS", "pos": [ -1570, -1130 ], "size": [ - 460, - 460 + 480, + 490 ], "flags": {}, - "order": 2, + "order": 3, "mode": 0, "inputs": [ { "name": "speaker_1_voice", "shape": 7, "type": "AUDIO", - "link": 15 + "link": 24 }, { "name": "speaker_2_voice", "shape": 7, "type": "AUDIO", - "link": 16 + "link": 21 }, { "name": "speaker_3_voice", @@ -173,7 +197,7 @@ "name": "AUDIO", "type": "AUDIO", "links": [ - 13 + 23 ] } ], @@ -182,11 +206,12 @@ }, "widgets_values": [ "VibeVoice-1.5B", - "Speaker 1: Hey, remember \"See You Again\"?\nSpeaker 2: Yeah… from Furious 7, right? That song always hits deep.\nSpeaker 1: Let me try to sing a part of it for you. \"It's been a long day… without you, my friend. And I'll tell you all about it when I see you again…\"\nSpeaker 2: Wow… that line. Every time.\nSpeaker 1: Yeah, and then this part always makes me think of the people I've lost. \"We've come a long way… from where we began. Oh, I'll tell you all about it when I see you again…\"\nSpeaker 2: It's beautiful, really. It's not just sad—it's like… hopeful.\nSpeaker 1: Right? Like no matter how far apart we are, there's still that promise.", + "Speaker 1: I can't believe you did it again. I waited for two hours. Two hours! Not a single call, not a text. Do you have any idea how embarrassing that was, just sitting there alone?\nSpeaker 2: Look, I know, I'm sorry, alright? Work was a complete nightmare. My boss dropped a critical deadline on me at the last minute. I didn't even have a second to breathe, let alone check my phone.\nSpeaker 1: A nightmare? That's the same excuse you used last time. I'm starting to think you just don't care. It's easier to say 'work was crazy' than to just admit that I'm not a priority for you anymore.", + "flash_attention_2", 1.3, - 50, - 42, - "fixed", + 30, + 309317081412002, + "randomize", true, 0.95, 0.95, @@ -198,28 +223,28 @@ ], "links": [ [ - 13, - 5, + 21, + 4, + 0, + 9, + 1, + "AUDIO" + ], + [ + 23, + 9, 0, 3, 0, "AUDIO" ], [ - 15, - 2, + 24, + 8, 0, - 5, + 9, 0, "AUDIO" - ], - [ - 16, - 4, - 0, - 5, - 1, - "AUDIO" ] ], "groups": [], @@ -228,10 +253,10 @@ "ue_links": [], "links_added_by_ue": [], "ds": { - "scale": 1.310999419150025, + "scale": 1.0834705943388634, "offset": [ - 2000, - 1230 + 2057.223518869778, + 1246.6132796718712 ] }, "frontendVersion": "1.25.10", diff --git a/example_workflows/VibeVoice_example.png b/example_workflows/VibeVoice_example.png index 5f353db..bec95db 100644 Binary files a/example_workflows/VibeVoice_example.png and b/example_workflows/VibeVoice_example.png differ diff --git a/vibevoice_nodes.py b/vibevoice_nodes.py index 29cbbb3..1b75e2d 100644 --- a/vibevoice_nodes.py +++ b/vibevoice_nodes.py @@ -3,22 +3,30 @@ import re import torch import numpy as np import random -from huggingface_hub import snapshot_download +from huggingface_hub import hf_hub_download, snapshot_download import logging -import librosa + import gc import folder_paths import comfy.model_management as model_management import comfy.model_patcher from comfy.utils import ProgressBar +from comfy.model_management import throw_exception_if_processing_interrupted - -from transformers import set_seed +from transformers import set_seed, AutoTokenizer from .vibevoice.modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference from .vibevoice.processor.vibevoice_processor import VibeVoiceProcessor +from .vibevoice.processor.vibevoice_tokenizer_processor import VibeVoiceTokenizerProcessor +from .vibevoice.modular.modular_vibevoice_text_tokenizer import VibeVoiceTextTokenizerFast -logger = logging.getLogger("comfyui_vibevoice") +try: + import librosa +except ImportError: + print("VibeVoice Node: `librosa` is not installed. Resampling of reference audio will not be available.") + librosa = None + +logger = logging.getLogger(__name__) LOADED_MODELS = {} VIBEVOICE_PATCHER_CACHE = {} @@ -27,10 +35,12 @@ MODEL_CONFIGS = { "VibeVoice-1.5B": { "repo_id": "microsoft/VibeVoice-1.5B", "size_gb": 3.0, + "tokenizer_repo": "Qwen/Qwen2.5-1.5B" }, "VibeVoice-Large-pt": { "repo_id": "WestZhang/VibeVoice-Large-pt", "size_gb": 14.0, + "tokenizer_repo": "Qwen/Qwen2.5-7B" } } @@ -80,7 +90,7 @@ class VibeVoiceModelHandler(torch.nn.Module): self.size = int(MODEL_CONFIGS[model_pack_name].get("size_gb", 4.0) * (1024**3)) def load_model(self, device, attention_mode="eager"): - self.model, self.processor = VibeVoiceLoader.load_model(self.model_pack_name, attention_mode) + self.model, self.processor = VibeVoiceLoader.load_model(self.model_pack_name, device , attention_mode) self.model.to(device) class VibeVoicePatcher(comfy.model_patcher.ModelPatcher): @@ -170,7 +180,7 @@ class VibeVoiceLoader: return attention_mode @staticmethod - def load_model(model_name: str, attention_mode: str = "eager"): + def load_model(model_name: str, device, attention_mode: str = "eager"): # Validate attention mode if attention_mode not in ATTENTION_MODES: logger.warning(f"Unknown attention mode '{attention_mode}', falling back to eager") @@ -185,10 +195,19 @@ class VibeVoiceLoader: model_path = VibeVoiceLoader.get_model_path(model_name) - print(f"Loading VibeVoice model components from: {model_path}") - processor = VibeVoiceProcessor.from_pretrained(model_path) + logger.info(f"Loading VibeVoice model components from: {model_path}") + - torch_dtype = model_management.text_encoder_dtype(model_management.get_torch_device()) + tokenizer_repo = MODEL_CONFIGS[model_name].get("tokenizer_repo") + try: + tokenizer_file_path = hf_hub_download(repo_id=tokenizer_repo, filename="tokenizer.json") + except Exception as e: + raise RuntimeError(f"Could not download tokenizer.json for {tokenizer_repo}. Error: {e}") + + vibevoice_tokenizer = VibeVoiceTextTokenizerFast(tokenizer_file=tokenizer_file_path) + audio_processor = VibeVoiceTokenizerProcessor() + processor = VibeVoiceProcessor(tokenizer=vibevoice_tokenizer, audio_processor=audio_processor) + torch_dtype = model_management.text_encoder_dtype(device) device_name = torch.cuda.get_device_name() if torch.cuda.is_available() else "" # Check compatibility and potentially fall back to safer mode @@ -196,15 +215,15 @@ class VibeVoiceLoader: attention_mode, torch_dtype, device_name ) - print(f"Requested attention mode: {attention_mode}") + logger.info(f"Requested attention mode: {attention_mode}") if final_attention_mode != attention_mode: - print(f"Using attention mode: {final_attention_mode} (automatic fallback)") + logger.info(f"Using attention mode: {final_attention_mode} (automatic fallback)") # Update cache key to reflect actual mode used cache_key = f"{model_name}_attn_{final_attention_mode}" if cache_key in LOADED_MODELS: return LOADED_MODELS[cache_key] else: - print(f"Using attention mode: {final_attention_mode}") + logger.info(f"Using attention mode: {final_attention_mode}") logger.info(f"Final attention implementation: {final_attention_mode}") @@ -236,6 +255,7 @@ class VibeVoiceLoader: model_path, torch_dtype=torch_dtype, attn_implementation=final_attention_mode, + device_map=device ) model.eval() @@ -329,6 +349,8 @@ def preprocess_comfy_audio(audio_dict: dict, target_sr: int = 24000) -> np.ndarr waveform = waveform / max_val if original_sr != target_sr: + if librosa is None: + raise ImportError("`librosa` package is required for audio resampling. Please install it with `pip install librosa`.") logger.warning(f"Resampling reference audio from {original_sr}Hz to {target_sr}Hz.") waveform = librosa.resample(y=waveform, orig_sr=original_sr, target_sr=target_sr) @@ -339,6 +361,12 @@ def preprocess_comfy_audio(audio_dict: dict, target_sr: int = 24000) -> np.ndarr return waveform.astype(np.float32) +def check_for_interrupt(): + try: + throw_exception_if_processing_interrupted() + return False + except: + return True class VibeVoiceTTSNode: @classmethod @@ -508,7 +536,7 @@ class VibeVoiceTTSNode: outputs = model.generate( **inputs, max_new_tokens=None, cfg_scale=cfg_scale, tokenizer=processor.tokenizer, generation_config=generation_config, - verbose=False + verbose=False, stop_check_fn=check_for_interrupt ) # Note: The model.generate method doesn't support progress callbacks in the current VibeVoice implementation # But we check for interruption at the start and end of generation