mirror of
https://github.com/wildminder/ComfyUI-VibeVoice.git
synced 2026-04-30 11:41:35 +00:00
voice bleeding fix, audio quality, input speakers tags, zero-shot voices
This commit is contained in:
@@ -1,9 +1,71 @@
|
||||
{
|
||||
"id": "b91265e5-1b03-4b63-8dc3-4abd9a030e08",
|
||||
"revision": 0,
|
||||
"last_node_id": 11,
|
||||
"last_link_id": 29,
|
||||
"last_node_id": 14,
|
||||
"last_link_id": 44,
|
||||
"nodes": [
|
||||
{
|
||||
"id": 3,
|
||||
"type": "SaveAudio",
|
||||
"pos": [
|
||||
-1040,
|
||||
-1130
|
||||
],
|
||||
"size": [
|
||||
270,
|
||||
112
|
||||
],
|
||||
"flags": {},
|
||||
"order": 6,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"name": "audio",
|
||||
"type": "AUDIO",
|
||||
"link": 27
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"properties": {
|
||||
"Node name for S&R": "SaveAudio",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.3.52",
|
||||
"ue_properties": {
|
||||
"widget_ue_connectable": {
|
||||
"filename_prefix": true,
|
||||
"audioUI": true
|
||||
},
|
||||
"version": "7.0.1"
|
||||
}
|
||||
},
|
||||
"widgets_values": [
|
||||
"audio/VibeVoice"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 13,
|
||||
"type": "MarkdownNote",
|
||||
"pos": [
|
||||
-1898.1748046875,
|
||||
-1409.22314453125
|
||||
],
|
||||
"size": [
|
||||
1035.619873046875,
|
||||
211.96694946289062
|
||||
],
|
||||
"flags": {},
|
||||
"order": 0,
|
||||
"mode": 0,
|
||||
"inputs": [],
|
||||
"outputs": [],
|
||||
"title": "Note",
|
||||
"properties": {},
|
||||
"widgets_values": [
|
||||
"# ComfyUI-VibeVoice\n\nVibeVoice is a novel framework by Microsoft for generating expressive, long-form, multi-speaker conversational audio. It excels at creating natural-sounding dialogue, podcasts, and more, with consistent voices for up to 4 speakers.\n\n**✨ Key Features:**\n* **Multi-Speaker TTS:** Generate conversations with up to 4 distinct voices in a single audio output.\n* **High-Fidelity Voice Cloning:** Use any audio file (`.wav`, `.mp3`) as a reference for a speaker's voice.\n* **Hybrid Generation Mode:** Mix and match cloned voices with high-quality, zero-shot generated voices in the same script.\n* **Flexible Scripting:** Use simple `[1]` tags or the classic `Speaker 1:` format to write your dialogue.\n* **Advanced Attention Mechanisms:** Choose between `eager`, `sdpa`, `flash_attention_2`, and the high-performance `sage` attention for fine-tuned control over speed and compatibility.\n* **Robust 4-Bit Quantization:** Run the large language model component in 4-bit mode to significantly reduce VRAM usage.\n* **Automatic Model Management:** Models are downloaded automatically and managed efficiently by ComfyUI to save VRAM."
|
||||
],
|
||||
"color": "#233",
|
||||
"bgcolor": "#355"
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"type": "LoadAudio",
|
||||
@@ -12,26 +74,24 @@
|
||||
-1130
|
||||
],
|
||||
"size": [
|
||||
274.080078125,
|
||||
272.9800720214844,
|
||||
136
|
||||
],
|
||||
"flags": {},
|
||||
"order": 0,
|
||||
"order": 1,
|
||||
"mode": 0,
|
||||
"inputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "AUDIO",
|
||||
"type": "AUDIO",
|
||||
"links": [
|
||||
28
|
||||
]
|
||||
"links": []
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "LoadAudio",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.3.52",
|
||||
"Node name for S&R": "LoadAudio",
|
||||
"ue_properties": {
|
||||
"widget_ue_connectable": {
|
||||
"audio": true,
|
||||
@@ -51,30 +111,28 @@
|
||||
"id": 8,
|
||||
"type": "LoadAudio",
|
||||
"pos": [
|
||||
-1900,
|
||||
-940
|
||||
-1901.10009765625,
|
||||
-948.7998046875
|
||||
],
|
||||
"size": [
|
||||
274.080078125,
|
||||
136
|
||||
],
|
||||
"flags": {},
|
||||
"order": 1,
|
||||
"order": 2,
|
||||
"mode": 0,
|
||||
"inputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "AUDIO",
|
||||
"type": "AUDIO",
|
||||
"links": [
|
||||
29
|
||||
]
|
||||
"links": []
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "LoadAudio",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.3.52",
|
||||
"Node name for S&R": "LoadAudio",
|
||||
"ue_properties": {
|
||||
"widget_ue_connectable": {
|
||||
"audio": true,
|
||||
@@ -91,71 +149,52 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 10,
|
||||
"id": 12,
|
||||
"type": "MarkdownNote",
|
||||
"pos": [
|
||||
-1030,
|
||||
-960
|
||||
-1915.701904296875,
|
||||
-762.380126953125
|
||||
],
|
||||
"size": [
|
||||
420,
|
||||
210
|
||||
312.85455322265625,
|
||||
292.8734130859375
|
||||
],
|
||||
"flags": {},
|
||||
"order": 2,
|
||||
"order": 3,
|
||||
"mode": 0,
|
||||
"inputs": [],
|
||||
"outputs": [],
|
||||
"title": "Notes",
|
||||
"properties": {
|
||||
"ue_properties": {
|
||||
"widget_ue_connectable": {},
|
||||
"version": "7.0.1"
|
||||
}
|
||||
},
|
||||
"title": "Note",
|
||||
"properties": {},
|
||||
"widgets_values": [
|
||||
"## Models\n\nWill be downloaded on the first run, or download them manually and place them into the directory: /models/tts/VibeVoice\n\n| Model | Context Length | Generation Length | Weight |\n|-------|----------------|----------|----------|\n| VibeVoice-0.5B-Streaming | - | - | On the way |\n| VibeVoice-1.5B | 64K | ~90 min | [HF link](https://huggingface.co/microsoft/VibeVoice-1.5B) |\n| VibeVoice-Large| 32K | ~45 min | [HF link](https://huggingface.co/microsoft/VibeVoice-Large) |"
|
||||
"### Scripting and Voice Modes\n\n#### Speaker Tagging\nYou can assign lines to speakers in two ways. Both are treated identically.\n\n* **Modern Format (Recommended):** `[1] This is the first speaker.`\n* **Classic Format:** `Speaker 1: This is the first speaker.`\n\nYou can also add an optional colon to the modern format (e.g., `[1]: ...`). The node handles all variations consistently.\n\n#### Hybrid Voice Generation\nThis is a powerful feature that lets you mix cloned voices and generated (zero-shot) voices.\n\n* **To Clone a Voice:** Connect a `Load Audio` node to the speaker's input (e.g., `speaker_1_voice`).\n* **To Generate a Voice:** Leave the speaker's input empty. The model will create a unique, high-quality voice for that speaker."
|
||||
],
|
||||
"color": "#432",
|
||||
"bgcolor": "#653"
|
||||
"color": "#233",
|
||||
"bgcolor": "#355"
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"type": "SaveAudio",
|
||||
"id": 14,
|
||||
"type": "MarkdownNote",
|
||||
"pos": [
|
||||
-1040,
|
||||
-1130
|
||||
-1048.3660888671875,
|
||||
-960.8771362304688
|
||||
],
|
||||
"size": [
|
||||
270,
|
||||
112
|
||||
280.797607421875,
|
||||
487.02728271484375
|
||||
],
|
||||
"flags": {},
|
||||
"order": 4,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"name": "audio",
|
||||
"type": "AUDIO",
|
||||
"link": 27
|
||||
}
|
||||
],
|
||||
"inputs": [],
|
||||
"outputs": [],
|
||||
"properties": {
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.3.52",
|
||||
"Node name for S&R": "SaveAudio",
|
||||
"ue_properties": {
|
||||
"widget_ue_connectable": {
|
||||
"filename_prefix": true,
|
||||
"audioUI": true
|
||||
},
|
||||
"version": "7.0.1"
|
||||
}
|
||||
},
|
||||
"title": "Note",
|
||||
"properties": {},
|
||||
"widgets_values": [
|
||||
"audio/VibeVoice"
|
||||
]
|
||||
"## Models\n\nWill be downloaded on the first run, or download them manually and place them into the directory: /models/tts/VibeVoice\n\n| Model | Context Length | Generation Length | Weight |\n|-------|----------------|----------|----------|\n| VibeVoice-1.5B | 64K | ~90 min | [HF link](https://huggingface.co/microsoft/VibeVoice-1.5B) |\n| VibeVoice-Large| 32K | ~45 min | [HF link](https://huggingface.co/microsoft/VibeVoice-Large) |\n\n## Support \n\n- Don't know how to update PyTorch?\n- Need help with ComfyUI?\n- Need technical support?\n\n### Or do you just have questions? Then join the [@TokenDiffusion Hub](https://t.me/TokenDiff_hub) group\n\n### AI news [TokenDiffusion](https://t.me/TokenDiff)"
|
||||
],
|
||||
"color": "#233",
|
||||
"bgcolor": "#355"
|
||||
},
|
||||
{
|
||||
"id": 11,
|
||||
@@ -165,24 +204,24 @@
|
||||
-1130
|
||||
],
|
||||
"size": [
|
||||
460,
|
||||
510
|
||||
475.3999938964844,
|
||||
662.9000244140625
|
||||
],
|
||||
"flags": {},
|
||||
"order": 3,
|
||||
"order": 5,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"name": "speaker_1_voice",
|
||||
"shape": 7,
|
||||
"type": "AUDIO",
|
||||
"link": 28
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "speaker_2_voice",
|
||||
"shape": 7,
|
||||
"type": "AUDIO",
|
||||
"link": 29
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "speaker_3_voice",
|
||||
@@ -207,9 +246,9 @@
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "VibeVoiceTTS",
|
||||
"cnr_id": "ComfyUI-VibeVoice",
|
||||
"ver": "37803a884fb8f9b43c38286f6d654c7f97181a73",
|
||||
"Node name for S&R": "VibeVoiceTTS",
|
||||
"ue_properties": {
|
||||
"widget_ue_connectable": {
|
||||
"model_name": true,
|
||||
@@ -229,12 +268,12 @@
|
||||
},
|
||||
"widgets_values": [
|
||||
"VibeVoice-1.5B",
|
||||
"Speaker 1: I can't believe you did it again. I waited for two hours. Two hours! Not a single call, not a text. Do you have any idea how embarrassing that was, just sitting there alone?\nSpeaker 2: Look, I know, I'm sorry, alright? Work was a complete nightmare. My boss dropped a critical deadline on me at the last minute. I didn't even have a second to breathe, let alone check my phone.\nSpeaker 1: A nightmare? That's the same excuse you used last time. I'm starting to think you just don't care. It's easier to say 'work was crazy' than to just admit that I'm not a priority for you anymore.",
|
||||
"[1] I can't believe you did it again. I waited for two hours. Two hours! Not a single call, not a text. Do you have any idea how embarrassing that was, just sitting there alone?\n[2] Look, I know, I'm sorry, alright? Work was a complete nightmare. My boss dropped a critical deadline on me at the last minute. I didn't even have a second to breathe, let alone check my phone.\n",
|
||||
false,
|
||||
"flash_attention_2",
|
||||
1.3,
|
||||
10,
|
||||
1,
|
||||
471935335072093,
|
||||
"fixed",
|
||||
true,
|
||||
0.95,
|
||||
@@ -254,37 +293,21 @@
|
||||
3,
|
||||
0,
|
||||
"AUDIO"
|
||||
],
|
||||
[
|
||||
28,
|
||||
4,
|
||||
0,
|
||||
11,
|
||||
0,
|
||||
"AUDIO"
|
||||
],
|
||||
[
|
||||
29,
|
||||
8,
|
||||
0,
|
||||
11,
|
||||
1,
|
||||
"AUDIO"
|
||||
]
|
||||
],
|
||||
"groups": [],
|
||||
"config": {},
|
||||
"extra": {
|
||||
"ue_links": [],
|
||||
"links_added_by_ue": [],
|
||||
"ds": {
|
||||
"scale": 1.2100000000000002,
|
||||
"scale": 0.8264462809917354,
|
||||
"offset": [
|
||||
2000,
|
||||
1230
|
||||
2015.701904296875,
|
||||
1509.22314453125
|
||||
]
|
||||
},
|
||||
"frontendVersion": "1.25.11",
|
||||
"ue_links": [],
|
||||
"links_added_by_ue": [],
|
||||
"frontendVersion": "1.26.11",
|
||||
"VHS_latentpreview": false,
|
||||
"VHS_latentpreviewrate": 0,
|
||||
"VHS_MetadataImage": true,
|
||||
|
||||
Binary file not shown.
|
Before Width: | Height: | Size: 145 KiB After Width: | Height: | Size: 154 KiB |
Reference in New Issue
Block a user