mirror of
https://github.com/wildminder/ComfyUI-VibeVoice.git
synced 2026-05-01 12:11:24 +00:00
small fixes
This commit is contained in:
@@ -11,14 +11,15 @@ import folder_paths
|
|||||||
|
|
||||||
from .vibevoice_nodes import NODE_CLASS_MAPPINGS, NODE_DISPLAY_NAME_MAPPINGS
|
from .vibevoice_nodes import NODE_CLASS_MAPPINGS, NODE_DISPLAY_NAME_MAPPINGS
|
||||||
|
|
||||||
# logger
|
# Configure a logger for the entire custom node package
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
logger.setLevel(logging.WARNING)
|
||||||
|
|
||||||
if not logger.hasHandlers():
|
if not logger.hasHandlers():
|
||||||
handler = logging.StreamHandler(sys.stdout)
|
handler = logging.StreamHandler(sys.stdout)
|
||||||
formatter = logging.Formatter(f"[ComfyUI-VibeVoice] %(message)s")
|
formatter = logging.Formatter(f"[ComfyUI-VibeVoice] %(message)s")
|
||||||
handler.setFormatter(formatter)
|
handler.setFormatter(formatter)
|
||||||
logger.addHandler(handler)
|
logger.addHandler(handler)
|
||||||
logger.setLevel(logging.INFO)
|
|
||||||
|
|
||||||
|
|
||||||
VIBEVOICE_MODEL_SUBDIR = os.path.join("tts", "VibeVoice")
|
VIBEVOICE_MODEL_SUBDIR = os.path.join("tts", "VibeVoice")
|
||||||
|
|||||||
@@ -1,11 +1,11 @@
|
|||||||
{
|
{
|
||||||
"id": "b91265e5-1b03-4b63-8dc3-4abd9a030e08",
|
"id": "b91265e5-1b03-4b63-8dc3-4abd9a030e08",
|
||||||
"revision": 0,
|
"revision": 0,
|
||||||
"last_node_id": 5,
|
"last_node_id": 10,
|
||||||
"last_link_id": 16,
|
"last_link_id": 24,
|
||||||
"nodes": [
|
"nodes": [
|
||||||
{
|
{
|
||||||
"id": 2,
|
"id": 4,
|
||||||
"type": "LoadAudio",
|
"type": "LoadAudio",
|
||||||
"pos": [
|
"pos": [
|
||||||
-1900,
|
-1900,
|
||||||
@@ -24,14 +24,14 @@
|
|||||||
"name": "AUDIO",
|
"name": "AUDIO",
|
||||||
"type": "AUDIO",
|
"type": "AUDIO",
|
||||||
"links": [
|
"links": [
|
||||||
15
|
21
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"properties": {
|
"properties": {
|
||||||
|
"Node name for S&R": "LoadAudio",
|
||||||
"cnr_id": "comfy-core",
|
"cnr_id": "comfy-core",
|
||||||
"ver": "0.3.52",
|
"ver": "0.3.52",
|
||||||
"Node name for S&R": "LoadAudio",
|
|
||||||
"ue_properties": {
|
"ue_properties": {
|
||||||
"widget_ue_connectable": {
|
"widget_ue_connectable": {
|
||||||
"audio": true,
|
"audio": true,
|
||||||
@@ -42,13 +42,13 @@
|
|||||||
}
|
}
|
||||||
},
|
},
|
||||||
"widgets_values": [
|
"widgets_values": [
|
||||||
"male_petergriffin.wav",
|
"male_rickmorty.mp3",
|
||||||
null,
|
null,
|
||||||
null
|
null
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 4,
|
"id": 8,
|
||||||
"type": "LoadAudio",
|
"type": "LoadAudio",
|
||||||
"pos": [
|
"pos": [
|
||||||
-1900,
|
-1900,
|
||||||
@@ -67,14 +67,14 @@
|
|||||||
"name": "AUDIO",
|
"name": "AUDIO",
|
||||||
"type": "AUDIO",
|
"type": "AUDIO",
|
||||||
"links": [
|
"links": [
|
||||||
16
|
24
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"properties": {
|
"properties": {
|
||||||
|
"Node name for S&R": "LoadAudio",
|
||||||
"cnr_id": "comfy-core",
|
"cnr_id": "comfy-core",
|
||||||
"ver": "0.3.52",
|
"ver": "0.3.52",
|
||||||
"Node name for S&R": "LoadAudio",
|
|
||||||
"ue_properties": {
|
"ue_properties": {
|
||||||
"widget_ue_connectable": {
|
"widget_ue_connectable": {
|
||||||
"audio": true,
|
"audio": true,
|
||||||
@@ -85,7 +85,7 @@
|
|||||||
}
|
}
|
||||||
},
|
},
|
||||||
"widgets_values": [
|
"widgets_values": [
|
||||||
"male_rickmorty.mp3",
|
"male_stewie.mp3",
|
||||||
null,
|
null,
|
||||||
null
|
null
|
||||||
]
|
]
|
||||||
@@ -102,20 +102,20 @@
|
|||||||
112
|
112
|
||||||
],
|
],
|
||||||
"flags": {},
|
"flags": {},
|
||||||
"order": 3,
|
"order": 4,
|
||||||
"mode": 0,
|
"mode": 0,
|
||||||
"inputs": [
|
"inputs": [
|
||||||
{
|
{
|
||||||
"name": "audio",
|
"name": "audio",
|
||||||
"type": "AUDIO",
|
"type": "AUDIO",
|
||||||
"link": 13
|
"link": 23
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"properties": {
|
"properties": {
|
||||||
|
"Node name for S&R": "SaveAudio",
|
||||||
"cnr_id": "comfy-core",
|
"cnr_id": "comfy-core",
|
||||||
"ver": "0.3.52",
|
"ver": "0.3.52",
|
||||||
"Node name for S&R": "SaveAudio",
|
|
||||||
"ue_properties": {
|
"ue_properties": {
|
||||||
"widget_ue_connectable": {
|
"widget_ue_connectable": {
|
||||||
"filename_prefix": true,
|
"filename_prefix": true,
|
||||||
@@ -129,31 +129,55 @@
|
|||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": 5,
|
"id": 10,
|
||||||
|
"type": "MarkdownNote",
|
||||||
|
"pos": [
|
||||||
|
-1030,
|
||||||
|
-960
|
||||||
|
],
|
||||||
|
"size": [
|
||||||
|
420,
|
||||||
|
210
|
||||||
|
],
|
||||||
|
"flags": {},
|
||||||
|
"order": 2,
|
||||||
|
"mode": 0,
|
||||||
|
"inputs": [],
|
||||||
|
"outputs": [],
|
||||||
|
"title": "Notes",
|
||||||
|
"properties": {},
|
||||||
|
"widgets_values": [
|
||||||
|
"## Models\n\nWill be downloaded on the first run, or download them manually and place them into the directory: /models/tts/VibeVoice\n\n| Model | Context Length | Generation Length | Weight |\n|-------|----------------|----------|----------|\n| VibeVoice-0.5B-Streaming | - | - | On the way |\n| VibeVoice-1.5B | 64K | ~90 min | [HF link](https://huggingface.co/microsoft/VibeVoice-1.5B) |\n| VibeVoice-7B-Preview| 32K | ~45 min | [HF link](https://huggingface.co/WestZhang/VibeVoice-Large-pt) |"
|
||||||
|
],
|
||||||
|
"color": "#432",
|
||||||
|
"bgcolor": "#653"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 9,
|
||||||
"type": "VibeVoiceTTS",
|
"type": "VibeVoiceTTS",
|
||||||
"pos": [
|
"pos": [
|
||||||
-1570,
|
-1570,
|
||||||
-1130
|
-1130
|
||||||
],
|
],
|
||||||
"size": [
|
"size": [
|
||||||
460,
|
480,
|
||||||
460
|
490
|
||||||
],
|
],
|
||||||
"flags": {},
|
"flags": {},
|
||||||
"order": 2,
|
"order": 3,
|
||||||
"mode": 0,
|
"mode": 0,
|
||||||
"inputs": [
|
"inputs": [
|
||||||
{
|
{
|
||||||
"name": "speaker_1_voice",
|
"name": "speaker_1_voice",
|
||||||
"shape": 7,
|
"shape": 7,
|
||||||
"type": "AUDIO",
|
"type": "AUDIO",
|
||||||
"link": 15
|
"link": 24
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "speaker_2_voice",
|
"name": "speaker_2_voice",
|
||||||
"shape": 7,
|
"shape": 7,
|
||||||
"type": "AUDIO",
|
"type": "AUDIO",
|
||||||
"link": 16
|
"link": 21
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "speaker_3_voice",
|
"name": "speaker_3_voice",
|
||||||
@@ -173,7 +197,7 @@
|
|||||||
"name": "AUDIO",
|
"name": "AUDIO",
|
||||||
"type": "AUDIO",
|
"type": "AUDIO",
|
||||||
"links": [
|
"links": [
|
||||||
13
|
23
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@@ -182,11 +206,12 @@
|
|||||||
},
|
},
|
||||||
"widgets_values": [
|
"widgets_values": [
|
||||||
"VibeVoice-1.5B",
|
"VibeVoice-1.5B",
|
||||||
"Speaker 1: Hey, remember \"See You Again\"?\nSpeaker 2: Yeah… from Furious 7, right? That song always hits deep.\nSpeaker 1: Let me try to sing a part of it for you. \"It's been a long day… without you, my friend. And I'll tell you all about it when I see you again…\"\nSpeaker 2: Wow… that line. Every time.\nSpeaker 1: Yeah, and then this part always makes me think of the people I've lost. \"We've come a long way… from where we began. Oh, I'll tell you all about it when I see you again…\"\nSpeaker 2: It's beautiful, really. It's not just sad—it's like… hopeful.\nSpeaker 1: Right? Like no matter how far apart we are, there's still that promise.",
|
"Speaker 1: I can't believe you did it again. I waited for two hours. Two hours! Not a single call, not a text. Do you have any idea how embarrassing that was, just sitting there alone?\nSpeaker 2: Look, I know, I'm sorry, alright? Work was a complete nightmare. My boss dropped a critical deadline on me at the last minute. I didn't even have a second to breathe, let alone check my phone.\nSpeaker 1: A nightmare? That's the same excuse you used last time. I'm starting to think you just don't care. It's easier to say 'work was crazy' than to just admit that I'm not a priority for you anymore.",
|
||||||
|
"flash_attention_2",
|
||||||
1.3,
|
1.3,
|
||||||
50,
|
30,
|
||||||
42,
|
309317081412002,
|
||||||
"fixed",
|
"randomize",
|
||||||
true,
|
true,
|
||||||
0.95,
|
0.95,
|
||||||
0.95,
|
0.95,
|
||||||
@@ -198,28 +223,28 @@
|
|||||||
],
|
],
|
||||||
"links": [
|
"links": [
|
||||||
[
|
[
|
||||||
13,
|
21,
|
||||||
5,
|
4,
|
||||||
|
0,
|
||||||
|
9,
|
||||||
|
1,
|
||||||
|
"AUDIO"
|
||||||
|
],
|
||||||
|
[
|
||||||
|
23,
|
||||||
|
9,
|
||||||
0,
|
0,
|
||||||
3,
|
3,
|
||||||
0,
|
0,
|
||||||
"AUDIO"
|
"AUDIO"
|
||||||
],
|
],
|
||||||
[
|
[
|
||||||
15,
|
24,
|
||||||
2,
|
8,
|
||||||
0,
|
0,
|
||||||
5,
|
9,
|
||||||
0,
|
0,
|
||||||
"AUDIO"
|
"AUDIO"
|
||||||
],
|
|
||||||
[
|
|
||||||
16,
|
|
||||||
4,
|
|
||||||
0,
|
|
||||||
5,
|
|
||||||
1,
|
|
||||||
"AUDIO"
|
|
||||||
]
|
]
|
||||||
],
|
],
|
||||||
"groups": [],
|
"groups": [],
|
||||||
@@ -228,10 +253,10 @@
|
|||||||
"ue_links": [],
|
"ue_links": [],
|
||||||
"links_added_by_ue": [],
|
"links_added_by_ue": [],
|
||||||
"ds": {
|
"ds": {
|
||||||
"scale": 1.310999419150025,
|
"scale": 1.0834705943388634,
|
||||||
"offset": [
|
"offset": [
|
||||||
2000,
|
2057.223518869778,
|
||||||
1230
|
1246.6132796718712
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"frontendVersion": "1.25.10",
|
"frontendVersion": "1.25.10",
|
||||||
|
|||||||
Binary file not shown.
|
Before Width: | Height: | Size: 128 KiB After Width: | Height: | Size: 134 KiB |
@@ -3,22 +3,30 @@ import re
|
|||||||
import torch
|
import torch
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import random
|
import random
|
||||||
from huggingface_hub import snapshot_download
|
from huggingface_hub import hf_hub_download, snapshot_download
|
||||||
import logging
|
import logging
|
||||||
import librosa
|
|
||||||
import gc
|
import gc
|
||||||
|
|
||||||
import folder_paths
|
import folder_paths
|
||||||
import comfy.model_management as model_management
|
import comfy.model_management as model_management
|
||||||
import comfy.model_patcher
|
import comfy.model_patcher
|
||||||
from comfy.utils import ProgressBar
|
from comfy.utils import ProgressBar
|
||||||
|
from comfy.model_management import throw_exception_if_processing_interrupted
|
||||||
|
|
||||||
|
from transformers import set_seed, AutoTokenizer
|
||||||
from transformers import set_seed
|
|
||||||
from .vibevoice.modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference
|
from .vibevoice.modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference
|
||||||
from .vibevoice.processor.vibevoice_processor import VibeVoiceProcessor
|
from .vibevoice.processor.vibevoice_processor import VibeVoiceProcessor
|
||||||
|
from .vibevoice.processor.vibevoice_tokenizer_processor import VibeVoiceTokenizerProcessor
|
||||||
|
from .vibevoice.modular.modular_vibevoice_text_tokenizer import VibeVoiceTextTokenizerFast
|
||||||
|
|
||||||
logger = logging.getLogger("comfyui_vibevoice")
|
try:
|
||||||
|
import librosa
|
||||||
|
except ImportError:
|
||||||
|
print("VibeVoice Node: `librosa` is not installed. Resampling of reference audio will not be available.")
|
||||||
|
librosa = None
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
LOADED_MODELS = {}
|
LOADED_MODELS = {}
|
||||||
VIBEVOICE_PATCHER_CACHE = {}
|
VIBEVOICE_PATCHER_CACHE = {}
|
||||||
@@ -27,10 +35,12 @@ MODEL_CONFIGS = {
|
|||||||
"VibeVoice-1.5B": {
|
"VibeVoice-1.5B": {
|
||||||
"repo_id": "microsoft/VibeVoice-1.5B",
|
"repo_id": "microsoft/VibeVoice-1.5B",
|
||||||
"size_gb": 3.0,
|
"size_gb": 3.0,
|
||||||
|
"tokenizer_repo": "Qwen/Qwen2.5-1.5B"
|
||||||
},
|
},
|
||||||
"VibeVoice-Large-pt": {
|
"VibeVoice-Large-pt": {
|
||||||
"repo_id": "WestZhang/VibeVoice-Large-pt",
|
"repo_id": "WestZhang/VibeVoice-Large-pt",
|
||||||
"size_gb": 14.0,
|
"size_gb": 14.0,
|
||||||
|
"tokenizer_repo": "Qwen/Qwen2.5-7B"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -80,7 +90,7 @@ class VibeVoiceModelHandler(torch.nn.Module):
|
|||||||
self.size = int(MODEL_CONFIGS[model_pack_name].get("size_gb", 4.0) * (1024**3))
|
self.size = int(MODEL_CONFIGS[model_pack_name].get("size_gb", 4.0) * (1024**3))
|
||||||
|
|
||||||
def load_model(self, device, attention_mode="eager"):
|
def load_model(self, device, attention_mode="eager"):
|
||||||
self.model, self.processor = VibeVoiceLoader.load_model(self.model_pack_name, attention_mode)
|
self.model, self.processor = VibeVoiceLoader.load_model(self.model_pack_name, device , attention_mode)
|
||||||
self.model.to(device)
|
self.model.to(device)
|
||||||
|
|
||||||
class VibeVoicePatcher(comfy.model_patcher.ModelPatcher):
|
class VibeVoicePatcher(comfy.model_patcher.ModelPatcher):
|
||||||
@@ -170,7 +180,7 @@ class VibeVoiceLoader:
|
|||||||
return attention_mode
|
return attention_mode
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def load_model(model_name: str, attention_mode: str = "eager"):
|
def load_model(model_name: str, device, attention_mode: str = "eager"):
|
||||||
# Validate attention mode
|
# Validate attention mode
|
||||||
if attention_mode not in ATTENTION_MODES:
|
if attention_mode not in ATTENTION_MODES:
|
||||||
logger.warning(f"Unknown attention mode '{attention_mode}', falling back to eager")
|
logger.warning(f"Unknown attention mode '{attention_mode}', falling back to eager")
|
||||||
@@ -185,10 +195,19 @@ class VibeVoiceLoader:
|
|||||||
|
|
||||||
model_path = VibeVoiceLoader.get_model_path(model_name)
|
model_path = VibeVoiceLoader.get_model_path(model_name)
|
||||||
|
|
||||||
print(f"Loading VibeVoice model components from: {model_path}")
|
logger.info(f"Loading VibeVoice model components from: {model_path}")
|
||||||
processor = VibeVoiceProcessor.from_pretrained(model_path)
|
|
||||||
|
|
||||||
torch_dtype = model_management.text_encoder_dtype(model_management.get_torch_device())
|
tokenizer_repo = MODEL_CONFIGS[model_name].get("tokenizer_repo")
|
||||||
|
try:
|
||||||
|
tokenizer_file_path = hf_hub_download(repo_id=tokenizer_repo, filename="tokenizer.json")
|
||||||
|
except Exception as e:
|
||||||
|
raise RuntimeError(f"Could not download tokenizer.json for {tokenizer_repo}. Error: {e}")
|
||||||
|
|
||||||
|
vibevoice_tokenizer = VibeVoiceTextTokenizerFast(tokenizer_file=tokenizer_file_path)
|
||||||
|
audio_processor = VibeVoiceTokenizerProcessor()
|
||||||
|
processor = VibeVoiceProcessor(tokenizer=vibevoice_tokenizer, audio_processor=audio_processor)
|
||||||
|
torch_dtype = model_management.text_encoder_dtype(device)
|
||||||
device_name = torch.cuda.get_device_name() if torch.cuda.is_available() else ""
|
device_name = torch.cuda.get_device_name() if torch.cuda.is_available() else ""
|
||||||
|
|
||||||
# Check compatibility and potentially fall back to safer mode
|
# Check compatibility and potentially fall back to safer mode
|
||||||
@@ -196,15 +215,15 @@ class VibeVoiceLoader:
|
|||||||
attention_mode, torch_dtype, device_name
|
attention_mode, torch_dtype, device_name
|
||||||
)
|
)
|
||||||
|
|
||||||
print(f"Requested attention mode: {attention_mode}")
|
logger.info(f"Requested attention mode: {attention_mode}")
|
||||||
if final_attention_mode != attention_mode:
|
if final_attention_mode != attention_mode:
|
||||||
print(f"Using attention mode: {final_attention_mode} (automatic fallback)")
|
logger.info(f"Using attention mode: {final_attention_mode} (automatic fallback)")
|
||||||
# Update cache key to reflect actual mode used
|
# Update cache key to reflect actual mode used
|
||||||
cache_key = f"{model_name}_attn_{final_attention_mode}"
|
cache_key = f"{model_name}_attn_{final_attention_mode}"
|
||||||
if cache_key in LOADED_MODELS:
|
if cache_key in LOADED_MODELS:
|
||||||
return LOADED_MODELS[cache_key]
|
return LOADED_MODELS[cache_key]
|
||||||
else:
|
else:
|
||||||
print(f"Using attention mode: {final_attention_mode}")
|
logger.info(f"Using attention mode: {final_attention_mode}")
|
||||||
|
|
||||||
logger.info(f"Final attention implementation: {final_attention_mode}")
|
logger.info(f"Final attention implementation: {final_attention_mode}")
|
||||||
|
|
||||||
@@ -236,6 +255,7 @@ class VibeVoiceLoader:
|
|||||||
model_path,
|
model_path,
|
||||||
torch_dtype=torch_dtype,
|
torch_dtype=torch_dtype,
|
||||||
attn_implementation=final_attention_mode,
|
attn_implementation=final_attention_mode,
|
||||||
|
device_map=device
|
||||||
)
|
)
|
||||||
model.eval()
|
model.eval()
|
||||||
|
|
||||||
@@ -329,6 +349,8 @@ def preprocess_comfy_audio(audio_dict: dict, target_sr: int = 24000) -> np.ndarr
|
|||||||
waveform = waveform / max_val
|
waveform = waveform / max_val
|
||||||
|
|
||||||
if original_sr != target_sr:
|
if original_sr != target_sr:
|
||||||
|
if librosa is None:
|
||||||
|
raise ImportError("`librosa` package is required for audio resampling. Please install it with `pip install librosa`.")
|
||||||
logger.warning(f"Resampling reference audio from {original_sr}Hz to {target_sr}Hz.")
|
logger.warning(f"Resampling reference audio from {original_sr}Hz to {target_sr}Hz.")
|
||||||
waveform = librosa.resample(y=waveform, orig_sr=original_sr, target_sr=target_sr)
|
waveform = librosa.resample(y=waveform, orig_sr=original_sr, target_sr=target_sr)
|
||||||
|
|
||||||
@@ -339,6 +361,12 @@ def preprocess_comfy_audio(audio_dict: dict, target_sr: int = 24000) -> np.ndarr
|
|||||||
|
|
||||||
return waveform.astype(np.float32)
|
return waveform.astype(np.float32)
|
||||||
|
|
||||||
|
def check_for_interrupt():
|
||||||
|
try:
|
||||||
|
throw_exception_if_processing_interrupted()
|
||||||
|
return False
|
||||||
|
except:
|
||||||
|
return True
|
||||||
|
|
||||||
class VibeVoiceTTSNode:
|
class VibeVoiceTTSNode:
|
||||||
@classmethod
|
@classmethod
|
||||||
@@ -508,7 +536,7 @@ class VibeVoiceTTSNode:
|
|||||||
outputs = model.generate(
|
outputs = model.generate(
|
||||||
**inputs, max_new_tokens=None, cfg_scale=cfg_scale,
|
**inputs, max_new_tokens=None, cfg_scale=cfg_scale,
|
||||||
tokenizer=processor.tokenizer, generation_config=generation_config,
|
tokenizer=processor.tokenizer, generation_config=generation_config,
|
||||||
verbose=False
|
verbose=False, stop_check_fn=check_for_interrupt
|
||||||
)
|
)
|
||||||
# Note: The model.generate method doesn't support progress callbacks in the current VibeVoice implementation
|
# Note: The model.generate method doesn't support progress callbacks in the current VibeVoice implementation
|
||||||
# But we check for interruption at the start and end of generation
|
# But we check for interruption at the start and end of generation
|
||||||
|
|||||||
Reference in New Issue
Block a user