small fixes

This commit is contained in:
WildAi
2025-08-28 15:35:04 +03:00
parent 4da796065a
commit 48541d816d
4 changed files with 110 additions and 56 deletions

View File

@@ -11,14 +11,15 @@ import folder_paths
from .vibevoice_nodes import NODE_CLASS_MAPPINGS, NODE_DISPLAY_NAME_MAPPINGS from .vibevoice_nodes import NODE_CLASS_MAPPINGS, NODE_DISPLAY_NAME_MAPPINGS
# logger # Configure a logger for the entire custom node package
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
logger.setLevel(logging.WARNING)
if not logger.hasHandlers(): if not logger.hasHandlers():
handler = logging.StreamHandler(sys.stdout) handler = logging.StreamHandler(sys.stdout)
formatter = logging.Formatter(f"[ComfyUI-VibeVoice] %(message)s") formatter = logging.Formatter(f"[ComfyUI-VibeVoice] %(message)s")
handler.setFormatter(formatter) handler.setFormatter(formatter)
logger.addHandler(handler) logger.addHandler(handler)
logger.setLevel(logging.INFO)
VIBEVOICE_MODEL_SUBDIR = os.path.join("tts", "VibeVoice") VIBEVOICE_MODEL_SUBDIR = os.path.join("tts", "VibeVoice")

View File

@@ -1,11 +1,11 @@
{ {
"id": "b91265e5-1b03-4b63-8dc3-4abd9a030e08", "id": "b91265e5-1b03-4b63-8dc3-4abd9a030e08",
"revision": 0, "revision": 0,
"last_node_id": 5, "last_node_id": 10,
"last_link_id": 16, "last_link_id": 24,
"nodes": [ "nodes": [
{ {
"id": 2, "id": 4,
"type": "LoadAudio", "type": "LoadAudio",
"pos": [ "pos": [
-1900, -1900,
@@ -24,14 +24,14 @@
"name": "AUDIO", "name": "AUDIO",
"type": "AUDIO", "type": "AUDIO",
"links": [ "links": [
15 21
] ]
} }
], ],
"properties": { "properties": {
"Node name for S&R": "LoadAudio",
"cnr_id": "comfy-core", "cnr_id": "comfy-core",
"ver": "0.3.52", "ver": "0.3.52",
"Node name for S&R": "LoadAudio",
"ue_properties": { "ue_properties": {
"widget_ue_connectable": { "widget_ue_connectable": {
"audio": true, "audio": true,
@@ -42,13 +42,13 @@
} }
}, },
"widgets_values": [ "widgets_values": [
"male_petergriffin.wav", "male_rickmorty.mp3",
null, null,
null null
] ]
}, },
{ {
"id": 4, "id": 8,
"type": "LoadAudio", "type": "LoadAudio",
"pos": [ "pos": [
-1900, -1900,
@@ -67,14 +67,14 @@
"name": "AUDIO", "name": "AUDIO",
"type": "AUDIO", "type": "AUDIO",
"links": [ "links": [
16 24
] ]
} }
], ],
"properties": { "properties": {
"Node name for S&R": "LoadAudio",
"cnr_id": "comfy-core", "cnr_id": "comfy-core",
"ver": "0.3.52", "ver": "0.3.52",
"Node name for S&R": "LoadAudio",
"ue_properties": { "ue_properties": {
"widget_ue_connectable": { "widget_ue_connectable": {
"audio": true, "audio": true,
@@ -85,7 +85,7 @@
} }
}, },
"widgets_values": [ "widgets_values": [
"male_rickmorty.mp3", "male_stewie.mp3",
null, null,
null null
] ]
@@ -102,20 +102,20 @@
112 112
], ],
"flags": {}, "flags": {},
"order": 3, "order": 4,
"mode": 0, "mode": 0,
"inputs": [ "inputs": [
{ {
"name": "audio", "name": "audio",
"type": "AUDIO", "type": "AUDIO",
"link": 13 "link": 23
} }
], ],
"outputs": [], "outputs": [],
"properties": { "properties": {
"Node name for S&R": "SaveAudio",
"cnr_id": "comfy-core", "cnr_id": "comfy-core",
"ver": "0.3.52", "ver": "0.3.52",
"Node name for S&R": "SaveAudio",
"ue_properties": { "ue_properties": {
"widget_ue_connectable": { "widget_ue_connectable": {
"filename_prefix": true, "filename_prefix": true,
@@ -129,31 +129,55 @@
] ]
}, },
{ {
"id": 5, "id": 10,
"type": "MarkdownNote",
"pos": [
-1030,
-960
],
"size": [
420,
210
],
"flags": {},
"order": 2,
"mode": 0,
"inputs": [],
"outputs": [],
"title": "Notes",
"properties": {},
"widgets_values": [
"## Models\n\nWill be downloaded on the first run, or download them manually and place them into the directory: /models/tts/VibeVoice\n\n| Model | Context Length | Generation Length | Weight |\n|-------|----------------|----------|----------|\n| VibeVoice-0.5B-Streaming | - | - | On the way |\n| VibeVoice-1.5B | 64K | ~90 min | [HF link](https://huggingface.co/microsoft/VibeVoice-1.5B) |\n| VibeVoice-7B-Preview| 32K | ~45 min | [HF link](https://huggingface.co/WestZhang/VibeVoice-Large-pt) |"
],
"color": "#432",
"bgcolor": "#653"
},
{
"id": 9,
"type": "VibeVoiceTTS", "type": "VibeVoiceTTS",
"pos": [ "pos": [
-1570, -1570,
-1130 -1130
], ],
"size": [ "size": [
460, 480,
460 490
], ],
"flags": {}, "flags": {},
"order": 2, "order": 3,
"mode": 0, "mode": 0,
"inputs": [ "inputs": [
{ {
"name": "speaker_1_voice", "name": "speaker_1_voice",
"shape": 7, "shape": 7,
"type": "AUDIO", "type": "AUDIO",
"link": 15 "link": 24
}, },
{ {
"name": "speaker_2_voice", "name": "speaker_2_voice",
"shape": 7, "shape": 7,
"type": "AUDIO", "type": "AUDIO",
"link": 16 "link": 21
}, },
{ {
"name": "speaker_3_voice", "name": "speaker_3_voice",
@@ -173,7 +197,7 @@
"name": "AUDIO", "name": "AUDIO",
"type": "AUDIO", "type": "AUDIO",
"links": [ "links": [
13 23
] ]
} }
], ],
@@ -182,11 +206,12 @@
}, },
"widgets_values": [ "widgets_values": [
"VibeVoice-1.5B", "VibeVoice-1.5B",
"Speaker 1: Hey, remember \"See You Again\"?\nSpeaker 2: Yeah… from Furious 7, right? That song always hits deep.\nSpeaker 1: Let me try to sing a part of it for you. \"It's been a long day… without you, my friend. And I'll tell you all about it when I see you again…\"\nSpeaker 2: Wow… that line. Every time.\nSpeaker 1: Yeah, and then this part always makes me think of the people I've lost. \"We've come a long way… from where we began. Oh, I'll tell you all about it when I see you again…\"\nSpeaker 2: It's beautiful, really. It's not just sad—it's like… hopeful.\nSpeaker 1: Right? Like no matter how far apart we are, there's still that promise.", "Speaker 1: I can't believe you did it again. I waited for two hours. Two hours! Not a single call, not a text. Do you have any idea how embarrassing that was, just sitting there alone?\nSpeaker 2: Look, I know, I'm sorry, alright? Work was a complete nightmare. My boss dropped a critical deadline on me at the last minute. I didn't even have a second to breathe, let alone check my phone.\nSpeaker 1: A nightmare? That's the same excuse you used last time. I'm starting to think you just don't care. It's easier to say 'work was crazy' than to just admit that I'm not a priority for you anymore.",
"flash_attention_2",
1.3, 1.3,
50, 30,
42, 309317081412002,
"fixed", "randomize",
true, true,
0.95, 0.95,
0.95, 0.95,
@@ -198,28 +223,28 @@
], ],
"links": [ "links": [
[ [
13, 21,
5, 4,
0,
9,
1,
"AUDIO"
],
[
23,
9,
0, 0,
3, 3,
0, 0,
"AUDIO" "AUDIO"
], ],
[ [
15, 24,
2, 8,
0, 0,
5, 9,
0, 0,
"AUDIO" "AUDIO"
],
[
16,
4,
0,
5,
1,
"AUDIO"
] ]
], ],
"groups": [], "groups": [],
@@ -228,10 +253,10 @@
"ue_links": [], "ue_links": [],
"links_added_by_ue": [], "links_added_by_ue": [],
"ds": { "ds": {
"scale": 1.310999419150025, "scale": 1.0834705943388634,
"offset": [ "offset": [
2000, 2057.223518869778,
1230 1246.6132796718712
] ]
}, },
"frontendVersion": "1.25.10", "frontendVersion": "1.25.10",

Binary file not shown.

Before

Width:  |  Height:  |  Size: 128 KiB

After

Width:  |  Height:  |  Size: 134 KiB

View File

@@ -3,22 +3,30 @@ import re
import torch import torch
import numpy as np import numpy as np
import random import random
from huggingface_hub import snapshot_download from huggingface_hub import hf_hub_download, snapshot_download
import logging import logging
import librosa
import gc import gc
import folder_paths import folder_paths
import comfy.model_management as model_management import comfy.model_management as model_management
import comfy.model_patcher import comfy.model_patcher
from comfy.utils import ProgressBar from comfy.utils import ProgressBar
from comfy.model_management import throw_exception_if_processing_interrupted
from transformers import set_seed, AutoTokenizer
from transformers import set_seed
from .vibevoice.modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference from .vibevoice.modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference
from .vibevoice.processor.vibevoice_processor import VibeVoiceProcessor from .vibevoice.processor.vibevoice_processor import VibeVoiceProcessor
from .vibevoice.processor.vibevoice_tokenizer_processor import VibeVoiceTokenizerProcessor
from .vibevoice.modular.modular_vibevoice_text_tokenizer import VibeVoiceTextTokenizerFast
logger = logging.getLogger("comfyui_vibevoice") try:
import librosa
except ImportError:
print("VibeVoice Node: `librosa` is not installed. Resampling of reference audio will not be available.")
librosa = None
logger = logging.getLogger(__name__)
LOADED_MODELS = {} LOADED_MODELS = {}
VIBEVOICE_PATCHER_CACHE = {} VIBEVOICE_PATCHER_CACHE = {}
@@ -27,10 +35,12 @@ MODEL_CONFIGS = {
"VibeVoice-1.5B": { "VibeVoice-1.5B": {
"repo_id": "microsoft/VibeVoice-1.5B", "repo_id": "microsoft/VibeVoice-1.5B",
"size_gb": 3.0, "size_gb": 3.0,
"tokenizer_repo": "Qwen/Qwen2.5-1.5B"
}, },
"VibeVoice-Large-pt": { "VibeVoice-Large-pt": {
"repo_id": "WestZhang/VibeVoice-Large-pt", "repo_id": "WestZhang/VibeVoice-Large-pt",
"size_gb": 14.0, "size_gb": 14.0,
"tokenizer_repo": "Qwen/Qwen2.5-7B"
} }
} }
@@ -80,7 +90,7 @@ class VibeVoiceModelHandler(torch.nn.Module):
self.size = int(MODEL_CONFIGS[model_pack_name].get("size_gb", 4.0) * (1024**3)) self.size = int(MODEL_CONFIGS[model_pack_name].get("size_gb", 4.0) * (1024**3))
def load_model(self, device, attention_mode="eager"): def load_model(self, device, attention_mode="eager"):
self.model, self.processor = VibeVoiceLoader.load_model(self.model_pack_name, attention_mode) self.model, self.processor = VibeVoiceLoader.load_model(self.model_pack_name, device , attention_mode)
self.model.to(device) self.model.to(device)
class VibeVoicePatcher(comfy.model_patcher.ModelPatcher): class VibeVoicePatcher(comfy.model_patcher.ModelPatcher):
@@ -170,7 +180,7 @@ class VibeVoiceLoader:
return attention_mode return attention_mode
@staticmethod @staticmethod
def load_model(model_name: str, attention_mode: str = "eager"): def load_model(model_name: str, device, attention_mode: str = "eager"):
# Validate attention mode # Validate attention mode
if attention_mode not in ATTENTION_MODES: if attention_mode not in ATTENTION_MODES:
logger.warning(f"Unknown attention mode '{attention_mode}', falling back to eager") logger.warning(f"Unknown attention mode '{attention_mode}', falling back to eager")
@@ -185,10 +195,19 @@ class VibeVoiceLoader:
model_path = VibeVoiceLoader.get_model_path(model_name) model_path = VibeVoiceLoader.get_model_path(model_name)
print(f"Loading VibeVoice model components from: {model_path}") logger.info(f"Loading VibeVoice model components from: {model_path}")
processor = VibeVoiceProcessor.from_pretrained(model_path)
torch_dtype = model_management.text_encoder_dtype(model_management.get_torch_device()) tokenizer_repo = MODEL_CONFIGS[model_name].get("tokenizer_repo")
try:
tokenizer_file_path = hf_hub_download(repo_id=tokenizer_repo, filename="tokenizer.json")
except Exception as e:
raise RuntimeError(f"Could not download tokenizer.json for {tokenizer_repo}. Error: {e}")
vibevoice_tokenizer = VibeVoiceTextTokenizerFast(tokenizer_file=tokenizer_file_path)
audio_processor = VibeVoiceTokenizerProcessor()
processor = VibeVoiceProcessor(tokenizer=vibevoice_tokenizer, audio_processor=audio_processor)
torch_dtype = model_management.text_encoder_dtype(device)
device_name = torch.cuda.get_device_name() if torch.cuda.is_available() else "" device_name = torch.cuda.get_device_name() if torch.cuda.is_available() else ""
# Check compatibility and potentially fall back to safer mode # Check compatibility and potentially fall back to safer mode
@@ -196,15 +215,15 @@ class VibeVoiceLoader:
attention_mode, torch_dtype, device_name attention_mode, torch_dtype, device_name
) )
print(f"Requested attention mode: {attention_mode}") logger.info(f"Requested attention mode: {attention_mode}")
if final_attention_mode != attention_mode: if final_attention_mode != attention_mode:
print(f"Using attention mode: {final_attention_mode} (automatic fallback)") logger.info(f"Using attention mode: {final_attention_mode} (automatic fallback)")
# Update cache key to reflect actual mode used # Update cache key to reflect actual mode used
cache_key = f"{model_name}_attn_{final_attention_mode}" cache_key = f"{model_name}_attn_{final_attention_mode}"
if cache_key in LOADED_MODELS: if cache_key in LOADED_MODELS:
return LOADED_MODELS[cache_key] return LOADED_MODELS[cache_key]
else: else:
print(f"Using attention mode: {final_attention_mode}") logger.info(f"Using attention mode: {final_attention_mode}")
logger.info(f"Final attention implementation: {final_attention_mode}") logger.info(f"Final attention implementation: {final_attention_mode}")
@@ -236,6 +255,7 @@ class VibeVoiceLoader:
model_path, model_path,
torch_dtype=torch_dtype, torch_dtype=torch_dtype,
attn_implementation=final_attention_mode, attn_implementation=final_attention_mode,
device_map=device
) )
model.eval() model.eval()
@@ -329,6 +349,8 @@ def preprocess_comfy_audio(audio_dict: dict, target_sr: int = 24000) -> np.ndarr
waveform = waveform / max_val waveform = waveform / max_val
if original_sr != target_sr: if original_sr != target_sr:
if librosa is None:
raise ImportError("`librosa` package is required for audio resampling. Please install it with `pip install librosa`.")
logger.warning(f"Resampling reference audio from {original_sr}Hz to {target_sr}Hz.") logger.warning(f"Resampling reference audio from {original_sr}Hz to {target_sr}Hz.")
waveform = librosa.resample(y=waveform, orig_sr=original_sr, target_sr=target_sr) waveform = librosa.resample(y=waveform, orig_sr=original_sr, target_sr=target_sr)
@@ -339,6 +361,12 @@ def preprocess_comfy_audio(audio_dict: dict, target_sr: int = 24000) -> np.ndarr
return waveform.astype(np.float32) return waveform.astype(np.float32)
def check_for_interrupt() -> bool:
    """Report whether ComfyUI has requested that processing stop.

    Intended as a polling callback (it is passed to ``model.generate`` as
    ``stop_check_fn``), so it translates ComfyUI's exception-based interrupt
    signal into a plain boolean instead of letting the exception propagate.

    Returns:
        True if an interrupt was signalled, False otherwise.
    """
    try:
        throw_exception_if_processing_interrupted()
    except model_management.InterruptProcessingException:
        # Catch only the interrupt signal: a bare ``except`` would also
        # swallow KeyboardInterrupt/SystemExit and would misreport any
        # unrelated failure as a user-requested stop.
        # NOTE(review): ComfyUI appears to clear its interrupt flag when this
        # exception is raised, so a later check may no longer see the
        # interrupt - confirm the caller handles the stop after generation.
        return True
    return False
class VibeVoiceTTSNode: class VibeVoiceTTSNode:
@classmethod @classmethod
@@ -508,7 +536,7 @@ class VibeVoiceTTSNode:
outputs = model.generate( outputs = model.generate(
**inputs, max_new_tokens=None, cfg_scale=cfg_scale, **inputs, max_new_tokens=None, cfg_scale=cfg_scale,
tokenizer=processor.tokenizer, generation_config=generation_config, tokenizer=processor.tokenizer, generation_config=generation_config,
verbose=False verbose=False, stop_check_fn=check_for_interrupt
) )
# Note: The model.generate method doesn't support progress callbacks in the current VibeVoice implementation # Note: The model.generate method doesn't support progress callbacks in the current VibeVoice implementation
# But we check for interruption at the start and end of generation # But we check for interruption at the start and end of generation