mirror of
https://github.com/comfyanonymous/ComfyUI.git
synced 2026-04-17 05:01:23 +00:00
Compare commits
50 Commits
fix/color-
...
feature/de
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
574b117f8c | ||
|
|
e7fbb3c2db | ||
|
|
e9a2d1e4cc | ||
|
|
1de83f91c3 | ||
|
|
8f374716ee | ||
|
|
cb0bbde402 | ||
|
|
7ce3f64c78 | ||
|
|
c5569e8627 | ||
|
|
c16db7fd69 | ||
|
|
fed4ac031a | ||
|
|
35dfcbbb28 | ||
|
|
722bc73319 | ||
|
|
402ff1cdb7 | ||
|
|
acd718598e | ||
|
|
559501e4b8 | ||
|
|
ee2db7488d | ||
|
|
c2657d5fb9 | ||
|
|
971932346a | ||
|
|
31283d2892 | ||
|
|
55ebd287ee | ||
|
|
a2840e7552 | ||
|
|
a134423890 | ||
|
|
b920bdd77d | ||
|
|
5410ed34f5 | ||
|
|
e6be419a30 | ||
|
|
3d4aca8084 | ||
|
|
2d861fb146 | ||
|
|
b615af1c65 | ||
|
|
40862c0776 | ||
|
|
50076f3439 | ||
|
|
61c2387436 | ||
|
|
7083484a48 | ||
|
|
4b1444fc7a | ||
|
|
8cbbea8f6a | ||
|
|
13917b3880 | ||
|
|
f21f6b2212 | ||
|
|
eb0686bbb6 | ||
|
|
5de94e70ec | ||
|
|
76b75f3ad7 | ||
|
|
0c63b4f6e3 | ||
|
|
7d437687c2 | ||
|
|
e2ddf28d78 | ||
|
|
076639fed9 | ||
|
|
55e6478526 | ||
|
|
537c10d231 | ||
|
|
8d723d2caa | ||
|
|
d113d1cc32 | ||
|
|
a500f1edac | ||
|
|
3f77450ef1 | ||
|
|
fc1fdf3389 |
2
.ci/windows_intel_base_files/run_intel_gpu.bat
Executable file
2
.ci/windows_intel_base_files/run_intel_gpu.bat
Executable file
@@ -0,0 +1,2 @@
|
||||
.\python_embeded\python.exe -s ComfyUI\main.py --windows-standalone-build
|
||||
pause
|
||||
36
.github/workflows/release-stable-all.yml
vendored
36
.github/workflows/release-stable-all.yml
vendored
@@ -20,29 +20,12 @@ jobs:
|
||||
git_tag: ${{ inputs.git_tag }}
|
||||
cache_tag: "cu130"
|
||||
python_minor: "13"
|
||||
python_patch: "11"
|
||||
python_patch: "12"
|
||||
rel_name: "nvidia"
|
||||
rel_extra_name: ""
|
||||
test_release: true
|
||||
secrets: inherit
|
||||
|
||||
release_nvidia_cu128:
|
||||
permissions:
|
||||
contents: "write"
|
||||
packages: "write"
|
||||
pull-requests: "read"
|
||||
name: "Release NVIDIA cu128"
|
||||
uses: ./.github/workflows/stable-release.yml
|
||||
with:
|
||||
git_tag: ${{ inputs.git_tag }}
|
||||
cache_tag: "cu128"
|
||||
python_minor: "12"
|
||||
python_patch: "10"
|
||||
rel_name: "nvidia"
|
||||
rel_extra_name: "_cu128"
|
||||
test_release: true
|
||||
secrets: inherit
|
||||
|
||||
release_nvidia_cu126:
|
||||
permissions:
|
||||
contents: "write"
|
||||
@@ -76,3 +59,20 @@ jobs:
|
||||
rel_extra_name: ""
|
||||
test_release: false
|
||||
secrets: inherit
|
||||
|
||||
release_xpu:
|
||||
permissions:
|
||||
contents: "write"
|
||||
packages: "write"
|
||||
pull-requests: "read"
|
||||
name: "Release Intel XPU"
|
||||
uses: ./.github/workflows/stable-release.yml
|
||||
with:
|
||||
git_tag: ${{ inputs.git_tag }}
|
||||
cache_tag: "xpu"
|
||||
python_minor: "13"
|
||||
python_patch: "12"
|
||||
rel_name: "intel"
|
||||
rel_extra_name: ""
|
||||
test_release: true
|
||||
secrets: inherit
|
||||
|
||||
1
.gitignore
vendored
1
.gitignore
vendored
@@ -24,3 +24,4 @@ web_custom_versions/
|
||||
openapi.yaml
|
||||
filtered-openapi.yaml
|
||||
uv.lock
|
||||
.comfy_environment
|
||||
|
||||
@@ -139,9 +139,9 @@ Example:
|
||||
"_quantization_metadata": {
|
||||
"format_version": "1.0",
|
||||
"layers": {
|
||||
"model.layers.0.mlp.up_proj": "float8_e4m3fn",
|
||||
"model.layers.0.mlp.down_proj": "float8_e4m3fn",
|
||||
"model.layers.1.mlp.up_proj": "float8_e4m3fn"
|
||||
"model.layers.0.mlp.up_proj": {"format": "float8_e4m3fn"},
|
||||
"model.layers.0.mlp.down_proj": {"format": "float8_e4m3fn"},
|
||||
"model.layers.1.mlp.up_proj": {"format": "float8_e4m3fn"}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -165,4 +165,4 @@ Activation quantization (e.g., for FP8 Tensor Core operations) requires `input_s
|
||||
3. **Compute scales**: Derive `input_scale` from collected statistics
|
||||
4. **Store in checkpoint**: Save `input_scale` parameters alongside weights
|
||||
|
||||
The calibration dataset should be representative of your target use case. For diffusion models, this typically means a diverse set of prompts and generation parameters.
|
||||
The calibration dataset should be representative of your target use case. For diffusion models, this typically means a diverse set of prompts and generation parameters.
|
||||
|
||||
@@ -61,6 +61,7 @@ See what ComfyUI can do with the [newer template workflows](https://comfy.org/wo
|
||||
|
||||
## Features
|
||||
- Nodes/graph/flowchart interface to experiment and create complex Stable Diffusion workflows without needing to code anything.
|
||||
- NOTE: There are many more models supported than the list below, if you want to see what is supported see our templates list inside ComfyUI.
|
||||
- Image Models
|
||||
- SD1.x, SD2.x ([unCLIP](https://comfyanonymous.github.io/ComfyUI_examples/unclip/))
|
||||
- [SDXL](https://comfyanonymous.github.io/ComfyUI_examples/sdxl/), [SDXL Turbo](https://comfyanonymous.github.io/ComfyUI_examples/sdturbo/)
|
||||
@@ -136,7 +137,7 @@ ComfyUI follows a weekly release cycle targeting Monday but this regularly chang
|
||||
- Builds a new release using the latest stable core version
|
||||
|
||||
3. **[ComfyUI Frontend](https://github.com/Comfy-Org/ComfyUI_frontend)**
|
||||
- Weekly frontend updates are merged into the core repository
|
||||
- Every 2+ weeks frontend updates are merged into the core repository
|
||||
- Features are frozen for the upcoming core release
|
||||
- Development continues for the next release cycle
|
||||
|
||||
@@ -232,7 +233,7 @@ Put your VAE in: models/vae
|
||||
|
||||
AMD users can install rocm and pytorch with pip if you don't have it already installed, this is the command to install the stable version:
|
||||
|
||||
```pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm7.1```
|
||||
```pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm7.2```
|
||||
|
||||
This is the command to install the nightly with ROCm 7.2 which might have some performance improvements:
|
||||
|
||||
@@ -275,7 +276,7 @@ Nvidia users should install stable pytorch using this command:
|
||||
|
||||
This is the command to install pytorch nightly instead which might have performance improvements.
|
||||
|
||||
```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu130```
|
||||
```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu132```
|
||||
|
||||
#### Troubleshooting
|
||||
|
||||
|
||||
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -1 +1,322 @@
|
||||
{"revision": 0, "last_node_id": 29, "last_link_id": 0, "nodes": [{"id": 29, "type": "4c9d6ea4-b912-40e5-8766-6793a9758c53", "pos": [1970, -230], "size": [180, 86], "flags": {}, "order": 5, "mode": 0, "inputs": [{"label": "image", "localized_name": "images.image0", "name": "images.image0", "type": "IMAGE", "link": null}], "outputs": [{"label": "R", "localized_name": "IMAGE0", "name": "IMAGE0", "type": "IMAGE", "links": []}, {"label": "G", "localized_name": "IMAGE1", "name": "IMAGE1", "type": "IMAGE", "links": []}, {"label": "B", "localized_name": "IMAGE2", "name": "IMAGE2", "type": "IMAGE", "links": []}, {"label": "A", "localized_name": "IMAGE3", "name": "IMAGE3", "type": "IMAGE", "links": []}], "title": "Image Channels", "properties": {"proxyWidgets": []}, "widgets_values": []}], "links": [], "version": 0.4, "definitions": {"subgraphs": [{"id": "4c9d6ea4-b912-40e5-8766-6793a9758c53", "version": 1, "state": {"lastGroupId": 0, "lastNodeId": 28, "lastLinkId": 39, "lastRerouteId": 0}, "revision": 0, "config": {}, "name": "Image Channels", "inputNode": {"id": -10, "bounding": [1820, -185, 120, 60]}, "outputNode": {"id": -20, "bounding": [2460, -215, 120, 120]}, "inputs": [{"id": "3522932b-2d86-4a1f-a02a-cb29f3a9d7fe", "name": "images.image0", "type": "IMAGE", "linkIds": [39], "localized_name": "images.image0", "label": "image", "pos": [1920, -165]}], "outputs": [{"id": "605cb9c3-b065-4d9b-81d2-3ec331889b2b", "name": "IMAGE0", "type": "IMAGE", "linkIds": [26], "localized_name": "IMAGE0", "label": "R", "pos": [2480, -195]}, {"id": "fb44a77e-0522-43e9-9527-82e7465b3596", "name": "IMAGE1", "type": "IMAGE", "linkIds": [27], "localized_name": "IMAGE1", "label": "G", "pos": [2480, -175]}, {"id": "81460ee6-0131-402a-874f-6bf3001fc4ff", "name": "IMAGE2", "type": "IMAGE", "linkIds": [28], "localized_name": "IMAGE2", "label": "B", "pos": [2480, -155]}, {"id": "ae690246-80d4-4951-b1d9-9306d8a77417", "name": "IMAGE3", "type": "IMAGE", "linkIds": [29], "localized_name": "IMAGE3", 
"label": "A", "pos": [2480, -135]}], "widgets": [], "nodes": [{"id": 23, "type": "GLSLShader", "pos": [2000, -330], "size": [400, 172], "flags": {}, "order": 0, "mode": 0, "inputs": [{"label": "image", "localized_name": "images.image0", "name": "images.image0", "type": "IMAGE", "link": 39}, {"localized_name": "fragment_shader", "name": "fragment_shader", "type": "STRING", "widget": {"name": "fragment_shader"}, "link": null}, {"localized_name": "size_mode", "name": "size_mode", "type": "COMFY_DYNAMICCOMBO_V3", "widget": {"name": "size_mode"}, "link": null}, {"label": "image1", "localized_name": "images.image1", "name": "images.image1", "shape": 7, "type": "IMAGE", "link": null}], "outputs": [{"label": "R", "localized_name": "IMAGE0", "name": "IMAGE0", "type": "IMAGE", "links": [26]}, {"label": "G", "localized_name": "IMAGE1", "name": "IMAGE1", "type": "IMAGE", "links": [27]}, {"label": "B", "localized_name": "IMAGE2", "name": "IMAGE2", "type": "IMAGE", "links": [28]}, {"label": "A", "localized_name": "IMAGE3", "name": "IMAGE3", "type": "IMAGE", "links": [29]}], "properties": {"Node name for S&R": "GLSLShader"}, "widgets_values": ["#version 300 es\nprecision highp float;\n\nuniform sampler2D u_image0;\n\nin vec2 v_texCoord;\nlayout(location = 0) out vec4 fragColor0;\nlayout(location = 1) out vec4 fragColor1;\nlayout(location = 2) out vec4 fragColor2;\nlayout(location = 3) out vec4 fragColor3;\n\nvoid main() {\n vec4 color = texture(u_image0, v_texCoord);\n // Output each channel as grayscale to separate render targets\n fragColor0 = vec4(vec3(color.r), 1.0); // Red channel\n fragColor1 = vec4(vec3(color.g), 1.0); // Green channel\n fragColor2 = vec4(vec3(color.b), 1.0); // Blue channel\n fragColor3 = vec4(vec3(color.a), 1.0); // Alpha channel\n}\n", "from_input"]}], "groups": [], "links": [{"id": 39, "origin_id": -10, "origin_slot": 0, "target_id": 23, "target_slot": 0, "type": "IMAGE"}, {"id": 26, "origin_id": 23, "origin_slot": 0, "target_id": -20, "target_slot": 
0, "type": "IMAGE"}, {"id": 27, "origin_id": 23, "origin_slot": 1, "target_id": -20, "target_slot": 1, "type": "IMAGE"}, {"id": 28, "origin_id": 23, "origin_slot": 2, "target_id": -20, "target_slot": 2, "type": "IMAGE"}, {"id": 29, "origin_id": 23, "origin_slot": 3, "target_id": -20, "target_slot": 3, "type": "IMAGE"}], "extra": {"workflowRendererVersion": "LG"}, "category": "Image Tools/Color adjust"}]}}
|
||||
{
|
||||
"revision": 0,
|
||||
"last_node_id": 29,
|
||||
"last_link_id": 0,
|
||||
"nodes": [
|
||||
{
|
||||
"id": 29,
|
||||
"type": "4c9d6ea4-b912-40e5-8766-6793a9758c53",
|
||||
"pos": [
|
||||
1970,
|
||||
-230
|
||||
],
|
||||
"size": [
|
||||
180,
|
||||
86
|
||||
],
|
||||
"flags": {},
|
||||
"order": 5,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"label": "image",
|
||||
"localized_name": "images.image0",
|
||||
"name": "images.image0",
|
||||
"type": "IMAGE",
|
||||
"link": null
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"label": "R",
|
||||
"localized_name": "IMAGE0",
|
||||
"name": "IMAGE0",
|
||||
"type": "IMAGE",
|
||||
"links": []
|
||||
},
|
||||
{
|
||||
"label": "G",
|
||||
"localized_name": "IMAGE1",
|
||||
"name": "IMAGE1",
|
||||
"type": "IMAGE",
|
||||
"links": []
|
||||
},
|
||||
{
|
||||
"label": "B",
|
||||
"localized_name": "IMAGE2",
|
||||
"name": "IMAGE2",
|
||||
"type": "IMAGE",
|
||||
"links": []
|
||||
},
|
||||
{
|
||||
"label": "A",
|
||||
"localized_name": "IMAGE3",
|
||||
"name": "IMAGE3",
|
||||
"type": "IMAGE",
|
||||
"links": []
|
||||
}
|
||||
],
|
||||
"title": "Image Channels",
|
||||
"properties": {
|
||||
"proxyWidgets": []
|
||||
},
|
||||
"widgets_values": []
|
||||
}
|
||||
],
|
||||
"links": [],
|
||||
"version": 0.4,
|
||||
"definitions": {
|
||||
"subgraphs": [
|
||||
{
|
||||
"id": "4c9d6ea4-b912-40e5-8766-6793a9758c53",
|
||||
"version": 1,
|
||||
"state": {
|
||||
"lastGroupId": 0,
|
||||
"lastNodeId": 28,
|
||||
"lastLinkId": 39,
|
||||
"lastRerouteId": 0
|
||||
},
|
||||
"revision": 0,
|
||||
"config": {},
|
||||
"name": "Image Channels",
|
||||
"inputNode": {
|
||||
"id": -10,
|
||||
"bounding": [
|
||||
1820,
|
||||
-185,
|
||||
120,
|
||||
60
|
||||
]
|
||||
},
|
||||
"outputNode": {
|
||||
"id": -20,
|
||||
"bounding": [
|
||||
2460,
|
||||
-215,
|
||||
120,
|
||||
120
|
||||
]
|
||||
},
|
||||
"inputs": [
|
||||
{
|
||||
"id": "3522932b-2d86-4a1f-a02a-cb29f3a9d7fe",
|
||||
"name": "images.image0",
|
||||
"type": "IMAGE",
|
||||
"linkIds": [
|
||||
39
|
||||
],
|
||||
"localized_name": "images.image0",
|
||||
"label": "image",
|
||||
"pos": [
|
||||
1920,
|
||||
-165
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"id": "605cb9c3-b065-4d9b-81d2-3ec331889b2b",
|
||||
"name": "IMAGE0",
|
||||
"type": "IMAGE",
|
||||
"linkIds": [
|
||||
26
|
||||
],
|
||||
"localized_name": "IMAGE0",
|
||||
"label": "R",
|
||||
"pos": [
|
||||
2480,
|
||||
-195
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "fb44a77e-0522-43e9-9527-82e7465b3596",
|
||||
"name": "IMAGE1",
|
||||
"type": "IMAGE",
|
||||
"linkIds": [
|
||||
27
|
||||
],
|
||||
"localized_name": "IMAGE1",
|
||||
"label": "G",
|
||||
"pos": [
|
||||
2480,
|
||||
-175
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "81460ee6-0131-402a-874f-6bf3001fc4ff",
|
||||
"name": "IMAGE2",
|
||||
"type": "IMAGE",
|
||||
"linkIds": [
|
||||
28
|
||||
],
|
||||
"localized_name": "IMAGE2",
|
||||
"label": "B",
|
||||
"pos": [
|
||||
2480,
|
||||
-155
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "ae690246-80d4-4951-b1d9-9306d8a77417",
|
||||
"name": "IMAGE3",
|
||||
"type": "IMAGE",
|
||||
"linkIds": [
|
||||
29
|
||||
],
|
||||
"localized_name": "IMAGE3",
|
||||
"label": "A",
|
||||
"pos": [
|
||||
2480,
|
||||
-135
|
||||
]
|
||||
}
|
||||
],
|
||||
"widgets": [],
|
||||
"nodes": [
|
||||
{
|
||||
"id": 23,
|
||||
"type": "GLSLShader",
|
||||
"pos": [
|
||||
2000,
|
||||
-330
|
||||
],
|
||||
"size": [
|
||||
400,
|
||||
172
|
||||
],
|
||||
"flags": {},
|
||||
"order": 0,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"label": "image",
|
||||
"localized_name": "images.image0",
|
||||
"name": "images.image0",
|
||||
"type": "IMAGE",
|
||||
"link": 39
|
||||
},
|
||||
{
|
||||
"localized_name": "fragment_shader",
|
||||
"name": "fragment_shader",
|
||||
"type": "STRING",
|
||||
"widget": {
|
||||
"name": "fragment_shader"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"localized_name": "size_mode",
|
||||
"name": "size_mode",
|
||||
"type": "COMFY_DYNAMICCOMBO_V3",
|
||||
"widget": {
|
||||
"name": "size_mode"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"label": "image1",
|
||||
"localized_name": "images.image1",
|
||||
"name": "images.image1",
|
||||
"shape": 7,
|
||||
"type": "IMAGE",
|
||||
"link": null
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"label": "R",
|
||||
"localized_name": "IMAGE0",
|
||||
"name": "IMAGE0",
|
||||
"type": "IMAGE",
|
||||
"links": [
|
||||
26
|
||||
]
|
||||
},
|
||||
{
|
||||
"label": "G",
|
||||
"localized_name": "IMAGE1",
|
||||
"name": "IMAGE1",
|
||||
"type": "IMAGE",
|
||||
"links": [
|
||||
27
|
||||
]
|
||||
},
|
||||
{
|
||||
"label": "B",
|
||||
"localized_name": "IMAGE2",
|
||||
"name": "IMAGE2",
|
||||
"type": "IMAGE",
|
||||
"links": [
|
||||
28
|
||||
]
|
||||
},
|
||||
{
|
||||
"label": "A",
|
||||
"localized_name": "IMAGE3",
|
||||
"name": "IMAGE3",
|
||||
"type": "IMAGE",
|
||||
"links": [
|
||||
29
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "GLSLShader"
|
||||
},
|
||||
"widgets_values": [
|
||||
"#version 300 es\nprecision highp float;\n\nuniform sampler2D u_image0;\n\nin vec2 v_texCoord;\nlayout(location = 0) out vec4 fragColor0;\nlayout(location = 1) out vec4 fragColor1;\nlayout(location = 2) out vec4 fragColor2;\nlayout(location = 3) out vec4 fragColor3;\n\nvoid main() {\n vec4 color = texture(u_image0, v_texCoord);\n // Output each channel as grayscale to separate render targets\n fragColor0 = vec4(vec3(color.r), 1.0); // Red channel\n fragColor1 = vec4(vec3(color.g), 1.0); // Green channel\n fragColor2 = vec4(vec3(color.b), 1.0); // Blue channel\n fragColor3 = vec4(vec3(color.a), 1.0); // Alpha channel\n}\n",
|
||||
"from_input"
|
||||
]
|
||||
}
|
||||
],
|
||||
"groups": [],
|
||||
"links": [
|
||||
{
|
||||
"id": 39,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 0,
|
||||
"target_id": 23,
|
||||
"target_slot": 0,
|
||||
"type": "IMAGE"
|
||||
},
|
||||
{
|
||||
"id": 26,
|
||||
"origin_id": 23,
|
||||
"origin_slot": 0,
|
||||
"target_id": -20,
|
||||
"target_slot": 0,
|
||||
"type": "IMAGE"
|
||||
},
|
||||
{
|
||||
"id": 27,
|
||||
"origin_id": 23,
|
||||
"origin_slot": 1,
|
||||
"target_id": -20,
|
||||
"target_slot": 1,
|
||||
"type": "IMAGE"
|
||||
},
|
||||
{
|
||||
"id": 28,
|
||||
"origin_id": 23,
|
||||
"origin_slot": 2,
|
||||
"target_id": -20,
|
||||
"target_slot": 2,
|
||||
"type": "IMAGE"
|
||||
},
|
||||
{
|
||||
"id": 29,
|
||||
"origin_id": 23,
|
||||
"origin_slot": 3,
|
||||
"target_id": -20,
|
||||
"target_slot": 3,
|
||||
"type": "IMAGE"
|
||||
}
|
||||
],
|
||||
"extra": {
|
||||
"workflowRendererVersion": "LG"
|
||||
},
|
||||
"category": "Image Tools/Color adjust"
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
|
||||
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -1 +1,278 @@
|
||||
{"revision": 0, "last_node_id": 15, "last_link_id": 0, "nodes": [{"id": 15, "type": "24d8bbfd-39d4-4774-bff0-3de40cc7a471", "pos": [-1490, 2040], "size": [400, 260], "flags": {}, "order": 0, "mode": 0, "inputs": [{"name": "prompt", "type": "STRING", "widget": {"name": "prompt"}, "link": null}, {"label": "reference images", "name": "images", "type": "IMAGE", "link": null}], "outputs": [{"name": "STRING", "type": "STRING", "links": null}], "title": "Prompt Enhance", "properties": {"proxyWidgets": [["-1", "prompt"]], "cnr_id": "comfy-core", "ver": "0.14.1"}, "widgets_values": [""]}], "links": [], "version": 0.4, "definitions": {"subgraphs": [{"id": "24d8bbfd-39d4-4774-bff0-3de40cc7a471", "version": 1, "state": {"lastGroupId": 0, "lastNodeId": 15, "lastLinkId": 14, "lastRerouteId": 0}, "revision": 0, "config": {}, "name": "Prompt Enhance", "inputNode": {"id": -10, "bounding": [-2170, 2110, 138.876953125, 80]}, "outputNode": {"id": -20, "bounding": [-640, 2110, 120, 60]}, "inputs": [{"id": "aeab7216-00e0-4528-a09b-bba50845c5a6", "name": "prompt", "type": "STRING", "linkIds": [11], "pos": [-2051.123046875, 2130]}, {"id": "7b73fd36-aa31-4771-9066-f6c83879994b", "name": "images", "type": "IMAGE", "linkIds": [14], "label": "reference images", "pos": [-2051.123046875, 2150]}], "outputs": [{"id": "c7b0d930-68a1-48d1-b496-0519e5837064", "name": "STRING", "type": "STRING", "linkIds": [13], "pos": [-620, 2130]}], "widgets": [], "nodes": [{"id": 11, "type": "GeminiNode", "pos": [-1560, 1990], "size": [470, 470], "flags": {}, "order": 0, "mode": 0, "inputs": [{"localized_name": "images", "name": "images", "shape": 7, "type": "IMAGE", "link": 14}, {"localized_name": "audio", "name": "audio", "shape": 7, "type": "AUDIO", "link": null}, {"localized_name": "video", "name": "video", "shape": 7, "type": "VIDEO", "link": null}, {"localized_name": "files", "name": "files", "shape": 7, "type": "GEMINI_INPUT_FILES", "link": null}, {"localized_name": "prompt", "name": "prompt", "type": 
"STRING", "widget": {"name": "prompt"}, "link": 11}, {"localized_name": "model", "name": "model", "type": "COMBO", "widget": {"name": "model"}, "link": null}, {"localized_name": "seed", "name": "seed", "type": "INT", "widget": {"name": "seed"}, "link": null}, {"localized_name": "system_prompt", "name": "system_prompt", "shape": 7, "type": "STRING", "widget": {"name": "system_prompt"}, "link": null}], "outputs": [{"localized_name": "STRING", "name": "STRING", "type": "STRING", "links": [13]}], "properties": {"cnr_id": "comfy-core", "ver": "0.14.1", "Node name for S&R": "GeminiNode"}, "widgets_values": ["", "gemini-3-pro-preview", 42, "randomize", "You are an expert in prompt writing.\nBased on the input, rewrite the user's input into a detailed prompt.\nincluding camera settings, lighting, composition, and style.\nReturn the prompt only"], "color": "#432", "bgcolor": "#653"}], "groups": [], "links": [{"id": 11, "origin_id": -10, "origin_slot": 0, "target_id": 11, "target_slot": 4, "type": "STRING"}, {"id": 13, "origin_id": 11, "origin_slot": 0, "target_id": -20, "target_slot": 0, "type": "STRING"}, {"id": 14, "origin_id": -10, "origin_slot": 1, "target_id": 11, "target_slot": 0, "type": "IMAGE"}], "extra": {"workflowRendererVersion": "LG"}, "category": "Text generation/Prompt enhance"}]}, "extra": {}}
|
||||
{
|
||||
"revision": 0,
|
||||
"last_node_id": 15,
|
||||
"last_link_id": 0,
|
||||
"nodes": [
|
||||
{
|
||||
"id": 15,
|
||||
"type": "24d8bbfd-39d4-4774-bff0-3de40cc7a471",
|
||||
"pos": [
|
||||
-1490,
|
||||
2040
|
||||
],
|
||||
"size": [
|
||||
400,
|
||||
260
|
||||
],
|
||||
"flags": {},
|
||||
"order": 0,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"name": "prompt",
|
||||
"type": "STRING",
|
||||
"widget": {
|
||||
"name": "prompt"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"label": "reference images",
|
||||
"name": "images",
|
||||
"type": "IMAGE",
|
||||
"link": null
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "STRING",
|
||||
"type": "STRING",
|
||||
"links": null
|
||||
}
|
||||
],
|
||||
"title": "Prompt Enhance",
|
||||
"properties": {
|
||||
"proxyWidgets": [
|
||||
[
|
||||
"-1",
|
||||
"prompt"
|
||||
]
|
||||
],
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.14.1"
|
||||
},
|
||||
"widgets_values": [
|
||||
""
|
||||
]
|
||||
}
|
||||
],
|
||||
"links": [],
|
||||
"version": 0.4,
|
||||
"definitions": {
|
||||
"subgraphs": [
|
||||
{
|
||||
"id": "24d8bbfd-39d4-4774-bff0-3de40cc7a471",
|
||||
"version": 1,
|
||||
"state": {
|
||||
"lastGroupId": 0,
|
||||
"lastNodeId": 15,
|
||||
"lastLinkId": 14,
|
||||
"lastRerouteId": 0
|
||||
},
|
||||
"revision": 0,
|
||||
"config": {},
|
||||
"name": "Prompt Enhance",
|
||||
"inputNode": {
|
||||
"id": -10,
|
||||
"bounding": [
|
||||
-2170,
|
||||
2110,
|
||||
138.876953125,
|
||||
80
|
||||
]
|
||||
},
|
||||
"outputNode": {
|
||||
"id": -20,
|
||||
"bounding": [
|
||||
-640,
|
||||
2110,
|
||||
120,
|
||||
60
|
||||
]
|
||||
},
|
||||
"inputs": [
|
||||
{
|
||||
"id": "aeab7216-00e0-4528-a09b-bba50845c5a6",
|
||||
"name": "prompt",
|
||||
"type": "STRING",
|
||||
"linkIds": [
|
||||
11
|
||||
],
|
||||
"pos": [
|
||||
-2051.123046875,
|
||||
2130
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "7b73fd36-aa31-4771-9066-f6c83879994b",
|
||||
"name": "images",
|
||||
"type": "IMAGE",
|
||||
"linkIds": [
|
||||
14
|
||||
],
|
||||
"label": "reference images",
|
||||
"pos": [
|
||||
-2051.123046875,
|
||||
2150
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"id": "c7b0d930-68a1-48d1-b496-0519e5837064",
|
||||
"name": "STRING",
|
||||
"type": "STRING",
|
||||
"linkIds": [
|
||||
13
|
||||
],
|
||||
"pos": [
|
||||
-620,
|
||||
2130
|
||||
]
|
||||
}
|
||||
],
|
||||
"widgets": [],
|
||||
"nodes": [
|
||||
{
|
||||
"id": 11,
|
||||
"type": "GeminiNode",
|
||||
"pos": [
|
||||
-1560,
|
||||
1990
|
||||
],
|
||||
"size": [
|
||||
470,
|
||||
470
|
||||
],
|
||||
"flags": {},
|
||||
"order": 0,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "images",
|
||||
"name": "images",
|
||||
"shape": 7,
|
||||
"type": "IMAGE",
|
||||
"link": 14
|
||||
},
|
||||
{
|
||||
"localized_name": "audio",
|
||||
"name": "audio",
|
||||
"shape": 7,
|
||||
"type": "AUDIO",
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"localized_name": "video",
|
||||
"name": "video",
|
||||
"shape": 7,
|
||||
"type": "VIDEO",
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"localized_name": "files",
|
||||
"name": "files",
|
||||
"shape": 7,
|
||||
"type": "GEMINI_INPUT_FILES",
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"localized_name": "prompt",
|
||||
"name": "prompt",
|
||||
"type": "STRING",
|
||||
"widget": {
|
||||
"name": "prompt"
|
||||
},
|
||||
"link": 11
|
||||
},
|
||||
{
|
||||
"localized_name": "model",
|
||||
"name": "model",
|
||||
"type": "COMBO",
|
||||
"widget": {
|
||||
"name": "model"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"localized_name": "seed",
|
||||
"name": "seed",
|
||||
"type": "INT",
|
||||
"widget": {
|
||||
"name": "seed"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"localized_name": "system_prompt",
|
||||
"name": "system_prompt",
|
||||
"shape": 7,
|
||||
"type": "STRING",
|
||||
"widget": {
|
||||
"name": "system_prompt"
|
||||
},
|
||||
"link": null
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "STRING",
|
||||
"name": "STRING",
|
||||
"type": "STRING",
|
||||
"links": [
|
||||
13
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.14.1",
|
||||
"Node name for S&R": "GeminiNode"
|
||||
},
|
||||
"widgets_values": [
|
||||
"",
|
||||
"gemini-3-pro-preview",
|
||||
42,
|
||||
"randomize",
|
||||
"You are an expert in prompt writing.\nBased on the input, rewrite the user's input into a detailed prompt.\nincluding camera settings, lighting, composition, and style.\nReturn the prompt only"
|
||||
],
|
||||
"color": "#432",
|
||||
"bgcolor": "#653"
|
||||
}
|
||||
],
|
||||
"groups": [],
|
||||
"links": [
|
||||
{
|
||||
"id": 11,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 0,
|
||||
"target_id": 11,
|
||||
"target_slot": 4,
|
||||
"type": "STRING"
|
||||
},
|
||||
{
|
||||
"id": 13,
|
||||
"origin_id": 11,
|
||||
"origin_slot": 0,
|
||||
"target_id": -20,
|
||||
"target_slot": 0,
|
||||
"type": "STRING"
|
||||
},
|
||||
{
|
||||
"id": 14,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 1,
|
||||
"target_id": 11,
|
||||
"target_slot": 0,
|
||||
"type": "IMAGE"
|
||||
}
|
||||
],
|
||||
"extra": {
|
||||
"workflowRendererVersion": "LG"
|
||||
},
|
||||
"category": "Text generation/Prompt enhance"
|
||||
}
|
||||
]
|
||||
},
|
||||
"extra": {}
|
||||
}
|
||||
|
||||
@@ -1 +1,309 @@
|
||||
{"revision": 0, "last_node_id": 25, "last_link_id": 0, "nodes": [{"id": 25, "type": "621ba4e2-22a8-482d-a369-023753198b7b", "pos": [4610, -790], "size": [230, 58], "flags": {}, "order": 4, "mode": 0, "inputs": [{"label": "image", "localized_name": "images.image0", "name": "images.image0", "type": "IMAGE", "link": null}], "outputs": [{"label": "IMAGE", "localized_name": "IMAGE0", "name": "IMAGE0", "type": "IMAGE", "links": []}], "title": "Sharpen", "properties": {"proxyWidgets": [["24", "value"]]}, "widgets_values": []}], "links": [], "version": 0.4, "definitions": {"subgraphs": [{"id": "621ba4e2-22a8-482d-a369-023753198b7b", "version": 1, "state": {"lastGroupId": 0, "lastNodeId": 24, "lastLinkId": 36, "lastRerouteId": 0}, "revision": 0, "config": {}, "name": "Sharpen", "inputNode": {"id": -10, "bounding": [4090, -825, 120, 60]}, "outputNode": {"id": -20, "bounding": [5150, -825, 120, 60]}, "inputs": [{"id": "37011fb7-14b7-4e0e-b1a0-6a02e8da1fd7", "name": "images.image0", "type": "IMAGE", "linkIds": [34], "localized_name": "images.image0", "label": "image", "pos": [4190, -805]}], "outputs": [{"id": "e9182b3f-635c-4cd4-a152-4b4be17ae4b9", "name": "IMAGE0", "type": "IMAGE", "linkIds": [35], "localized_name": "IMAGE0", "label": "IMAGE", "pos": [5170, -805]}], "widgets": [], "nodes": [{"id": 24, "type": "PrimitiveFloat", "pos": [4280, -1240], "size": [270, 58], "flags": {}, "order": 0, "mode": 0, "inputs": [{"label": "strength", "localized_name": "value", "name": "value", "type": "FLOAT", "widget": {"name": "value"}, "link": null}], "outputs": [{"localized_name": "FLOAT", "name": "FLOAT", "type": "FLOAT", "links": [36]}], "properties": {"Node name for S&R": "PrimitiveFloat", "min": 0, "max": 3, "precision": 2, "step": 0.05}, "widgets_values": [0.5]}, {"id": 23, "type": "GLSLShader", "pos": [4570, -1240], "size": [370, 192], "flags": {}, "order": 1, "mode": 0, "inputs": [{"label": "image0", "localized_name": "images.image0", "name": "images.image0", "type": "IMAGE", 
"link": 34}, {"label": "image1", "localized_name": "images.image1", "name": "images.image1", "shape": 7, "type": "IMAGE", "link": null}, {"label": "u_float0", "localized_name": "floats.u_float0", "name": "floats.u_float0", "shape": 7, "type": "FLOAT", "link": 36}, {"label": "u_float1", "localized_name": "floats.u_float1", "name": "floats.u_float1", "shape": 7, "type": "FLOAT", "link": null}, {"label": "u_int0", "localized_name": "ints.u_int0", "name": "ints.u_int0", "shape": 7, "type": "INT", "link": null}, {"localized_name": "fragment_shader", "name": "fragment_shader", "type": "STRING", "widget": {"name": "fragment_shader"}, "link": null}, {"localized_name": "size_mode", "name": "size_mode", "type": "COMFY_DYNAMICCOMBO_V3", "widget": {"name": "size_mode"}, "link": null}], "outputs": [{"localized_name": "IMAGE0", "name": "IMAGE0", "type": "IMAGE", "links": [35]}, {"localized_name": "IMAGE1", "name": "IMAGE1", "type": "IMAGE", "links": null}, {"localized_name": "IMAGE2", "name": "IMAGE2", "type": "IMAGE", "links": null}, {"localized_name": "IMAGE3", "name": "IMAGE3", "type": "IMAGE", "links": null}], "properties": {"Node name for S&R": "GLSLShader"}, "widgets_values": ["#version 300 es\nprecision highp float;\n\nuniform sampler2D u_image0;\nuniform vec2 u_resolution;\nuniform float u_float0; // strength [0.0 – 2.0] typical: 0.3–1.0\n\nin vec2 v_texCoord;\nlayout(location = 0) out vec4 fragColor0;\n\nvoid main() {\n vec2 texel = 1.0 / u_resolution;\n \n // Sample center and neighbors\n vec4 center = texture(u_image0, v_texCoord);\n vec4 top = texture(u_image0, v_texCoord + vec2( 0.0, -texel.y));\n vec4 bottom = texture(u_image0, v_texCoord + vec2( 0.0, texel.y));\n vec4 left = texture(u_image0, v_texCoord + vec2(-texel.x, 0.0));\n vec4 right = texture(u_image0, v_texCoord + vec2( texel.x, 0.0));\n \n // Edge enhancement (Laplacian)\n vec4 edges = center * 4.0 - top - bottom - left - right;\n \n // Add edges back scaled by strength\n vec4 sharpened = center + edges * 
u_float0;\n \n fragColor0 = vec4(clamp(sharpened.rgb, 0.0, 1.0), center.a);\n}", "from_input"]}], "groups": [], "links": [{"id": 36, "origin_id": 24, "origin_slot": 0, "target_id": 23, "target_slot": 2, "type": "FLOAT"}, {"id": 34, "origin_id": -10, "origin_slot": 0, "target_id": 23, "target_slot": 0, "type": "IMAGE"}, {"id": 35, "origin_id": 23, "origin_slot": 0, "target_id": -20, "target_slot": 0, "type": "IMAGE"}], "extra": {"workflowRendererVersion": "LG"}, "category": "Image Tools/Sharpen"}]}}
|
||||
{
|
||||
"revision": 0,
|
||||
"last_node_id": 25,
|
||||
"last_link_id": 0,
|
||||
"nodes": [
|
||||
{
|
||||
"id": 25,
|
||||
"type": "621ba4e2-22a8-482d-a369-023753198b7b",
|
||||
"pos": [
|
||||
4610,
|
||||
-790
|
||||
],
|
||||
"size": [
|
||||
230,
|
||||
58
|
||||
],
|
||||
"flags": {},
|
||||
"order": 4,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"label": "image",
|
||||
"localized_name": "images.image0",
|
||||
"name": "images.image0",
|
||||
"type": "IMAGE",
|
||||
"link": null
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"label": "IMAGE",
|
||||
"localized_name": "IMAGE0",
|
||||
"name": "IMAGE0",
|
||||
"type": "IMAGE",
|
||||
"links": []
|
||||
}
|
||||
],
|
||||
"title": "Sharpen",
|
||||
"properties": {
|
||||
"proxyWidgets": [
|
||||
[
|
||||
"24",
|
||||
"value"
|
||||
]
|
||||
]
|
||||
},
|
||||
"widgets_values": []
|
||||
}
|
||||
],
|
||||
"links": [],
|
||||
"version": 0.4,
|
||||
"definitions": {
|
||||
"subgraphs": [
|
||||
{
|
||||
"id": "621ba4e2-22a8-482d-a369-023753198b7b",
|
||||
"version": 1,
|
||||
"state": {
|
||||
"lastGroupId": 0,
|
||||
"lastNodeId": 24,
|
||||
"lastLinkId": 36,
|
||||
"lastRerouteId": 0
|
||||
},
|
||||
"revision": 0,
|
||||
"config": {},
|
||||
"name": "Sharpen",
|
||||
"inputNode": {
|
||||
"id": -10,
|
||||
"bounding": [
|
||||
4090,
|
||||
-825,
|
||||
120,
|
||||
60
|
||||
]
|
||||
},
|
||||
"outputNode": {
|
||||
"id": -20,
|
||||
"bounding": [
|
||||
5150,
|
||||
-825,
|
||||
120,
|
||||
60
|
||||
]
|
||||
},
|
||||
"inputs": [
|
||||
{
|
||||
"id": "37011fb7-14b7-4e0e-b1a0-6a02e8da1fd7",
|
||||
"name": "images.image0",
|
||||
"type": "IMAGE",
|
||||
"linkIds": [
|
||||
34
|
||||
],
|
||||
"localized_name": "images.image0",
|
||||
"label": "image",
|
||||
"pos": [
|
||||
4190,
|
||||
-805
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"id": "e9182b3f-635c-4cd4-a152-4b4be17ae4b9",
|
||||
"name": "IMAGE0",
|
||||
"type": "IMAGE",
|
||||
"linkIds": [
|
||||
35
|
||||
],
|
||||
"localized_name": "IMAGE0",
|
||||
"label": "IMAGE",
|
||||
"pos": [
|
||||
5170,
|
||||
-805
|
||||
]
|
||||
}
|
||||
],
|
||||
"widgets": [],
|
||||
"nodes": [
|
||||
{
|
||||
"id": 24,
|
||||
"type": "PrimitiveFloat",
|
||||
"pos": [
|
||||
4280,
|
||||
-1240
|
||||
],
|
||||
"size": [
|
||||
270,
|
||||
58
|
||||
],
|
||||
"flags": {},
|
||||
"order": 0,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"label": "strength",
|
||||
"localized_name": "value",
|
||||
"name": "value",
|
||||
"type": "FLOAT",
|
||||
"widget": {
|
||||
"name": "value"
|
||||
},
|
||||
"link": null
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "FLOAT",
|
||||
"name": "FLOAT",
|
||||
"type": "FLOAT",
|
||||
"links": [
|
||||
36
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "PrimitiveFloat",
|
||||
"min": 0,
|
||||
"max": 3,
|
||||
"precision": 2,
|
||||
"step": 0.05
|
||||
},
|
||||
"widgets_values": [
|
||||
0.5
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 23,
|
||||
"type": "GLSLShader",
|
||||
"pos": [
|
||||
4570,
|
||||
-1240
|
||||
],
|
||||
"size": [
|
||||
370,
|
||||
192
|
||||
],
|
||||
"flags": {},
|
||||
"order": 1,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"label": "image0",
|
||||
"localized_name": "images.image0",
|
||||
"name": "images.image0",
|
||||
"type": "IMAGE",
|
||||
"link": 34
|
||||
},
|
||||
{
|
||||
"label": "image1",
|
||||
"localized_name": "images.image1",
|
||||
"name": "images.image1",
|
||||
"shape": 7,
|
||||
"type": "IMAGE",
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"label": "u_float0",
|
||||
"localized_name": "floats.u_float0",
|
||||
"name": "floats.u_float0",
|
||||
"shape": 7,
|
||||
"type": "FLOAT",
|
||||
"link": 36
|
||||
},
|
||||
{
|
||||
"label": "u_float1",
|
||||
"localized_name": "floats.u_float1",
|
||||
"name": "floats.u_float1",
|
||||
"shape": 7,
|
||||
"type": "FLOAT",
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"label": "u_int0",
|
||||
"localized_name": "ints.u_int0",
|
||||
"name": "ints.u_int0",
|
||||
"shape": 7,
|
||||
"type": "INT",
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"localized_name": "fragment_shader",
|
||||
"name": "fragment_shader",
|
||||
"type": "STRING",
|
||||
"widget": {
|
||||
"name": "fragment_shader"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"localized_name": "size_mode",
|
||||
"name": "size_mode",
|
||||
"type": "COMFY_DYNAMICCOMBO_V3",
|
||||
"widget": {
|
||||
"name": "size_mode"
|
||||
},
|
||||
"link": null
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "IMAGE0",
|
||||
"name": "IMAGE0",
|
||||
"type": "IMAGE",
|
||||
"links": [
|
||||
35
|
||||
]
|
||||
},
|
||||
{
|
||||
"localized_name": "IMAGE1",
|
||||
"name": "IMAGE1",
|
||||
"type": "IMAGE",
|
||||
"links": null
|
||||
},
|
||||
{
|
||||
"localized_name": "IMAGE2",
|
||||
"name": "IMAGE2",
|
||||
"type": "IMAGE",
|
||||
"links": null
|
||||
},
|
||||
{
|
||||
"localized_name": "IMAGE3",
|
||||
"name": "IMAGE3",
|
||||
"type": "IMAGE",
|
||||
"links": null
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "GLSLShader"
|
||||
},
|
||||
"widgets_values": [
|
||||
"#version 300 es\nprecision highp float;\n\nuniform sampler2D u_image0;\nuniform vec2 u_resolution;\nuniform float u_float0; // strength [0.0 – 2.0] typical: 0.3–1.0\n\nin vec2 v_texCoord;\nlayout(location = 0) out vec4 fragColor0;\n\nvoid main() {\n vec2 texel = 1.0 / u_resolution;\n \n // Sample center and neighbors\n vec4 center = texture(u_image0, v_texCoord);\n vec4 top = texture(u_image0, v_texCoord + vec2( 0.0, -texel.y));\n vec4 bottom = texture(u_image0, v_texCoord + vec2( 0.0, texel.y));\n vec4 left = texture(u_image0, v_texCoord + vec2(-texel.x, 0.0));\n vec4 right = texture(u_image0, v_texCoord + vec2( texel.x, 0.0));\n \n // Edge enhancement (Laplacian)\n vec4 edges = center * 4.0 - top - bottom - left - right;\n \n // Add edges back scaled by strength\n vec4 sharpened = center + edges * u_float0;\n \n fragColor0 = vec4(clamp(sharpened.rgb, 0.0, 1.0), center.a);\n}",
|
||||
"from_input"
|
||||
]
|
||||
}
|
||||
],
|
||||
"groups": [],
|
||||
"links": [
|
||||
{
|
||||
"id": 36,
|
||||
"origin_id": 24,
|
||||
"origin_slot": 0,
|
||||
"target_id": 23,
|
||||
"target_slot": 2,
|
||||
"type": "FLOAT"
|
||||
},
|
||||
{
|
||||
"id": 34,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 0,
|
||||
"target_id": 23,
|
||||
"target_slot": 0,
|
||||
"type": "IMAGE"
|
||||
},
|
||||
{
|
||||
"id": 35,
|
||||
"origin_id": 23,
|
||||
"origin_slot": 0,
|
||||
"target_id": -20,
|
||||
"target_slot": 0,
|
||||
"type": "IMAGE"
|
||||
}
|
||||
],
|
||||
"extra": {
|
||||
"workflowRendererVersion": "LG"
|
||||
},
|
||||
"category": "Image Tools/Sharpen"
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
|
||||
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -1 +1,420 @@
|
||||
{"revision": 0, "last_node_id": 13, "last_link_id": 0, "nodes": [{"id": 13, "type": "cf95b747-3e17-46cb-8097-cac60ff9b2e1", "pos": [1120, 330], "size": [240, 58], "flags": {}, "order": 3, "mode": 0, "inputs": [{"localized_name": "video", "name": "video", "type": "VIDEO", "link": null}, {"name": "model_name", "type": "COMBO", "widget": {"name": "model_name"}, "link": null}], "outputs": [{"localized_name": "VIDEO", "name": "VIDEO", "type": "VIDEO", "links": []}], "title": "Video Upscale(GAN x4)", "properties": {"proxyWidgets": [["-1", "model_name"]], "cnr_id": "comfy-core", "ver": "0.14.1"}, "widgets_values": ["RealESRGAN_x4plus.safetensors"]}], "links": [], "version": 0.4, "definitions": {"subgraphs": [{"id": "cf95b747-3e17-46cb-8097-cac60ff9b2e1", "version": 1, "state": {"lastGroupId": 0, "lastNodeId": 13, "lastLinkId": 19, "lastRerouteId": 0}, "revision": 0, "config": {}, "name": "Video Upscale(GAN x4)", "inputNode": {"id": -10, "bounding": [550, 460, 120, 80]}, "outputNode": {"id": -20, "bounding": [1490, 460, 120, 60]}, "inputs": [{"id": "666d633e-93e7-42dc-8d11-2b7b99b0f2a6", "name": "video", "type": "VIDEO", "linkIds": [10], "localized_name": "video", "pos": [650, 480]}, {"id": "2e23a087-caa8-4d65-99e6-662761aa905a", "name": "model_name", "type": "COMBO", "linkIds": [19], "pos": [650, 500]}], "outputs": [{"id": "0c1768ea-3ec2-412f-9af6-8e0fa36dae70", "name": "VIDEO", "type": "VIDEO", "linkIds": [15], "localized_name": "VIDEO", "pos": [1510, 480]}], "widgets": [], "nodes": [{"id": 2, "type": "ImageUpscaleWithModel", "pos": [1110, 450], "size": [320, 46], "flags": {}, "order": 1, "mode": 0, "inputs": [{"localized_name": "upscale_model", "name": "upscale_model", "type": "UPSCALE_MODEL", "link": 1}, {"localized_name": "image", "name": "image", "type": "IMAGE", "link": 14}], "outputs": [{"localized_name": "IMAGE", "name": "IMAGE", "type": "IMAGE", "links": [13]}], "properties": {"cnr_id": "comfy-core", "ver": "0.10.0", "Node name for S&R": 
"ImageUpscaleWithModel"}}, {"id": 11, "type": "CreateVideo", "pos": [1110, 550], "size": [320, 78], "flags": {}, "order": 3, "mode": 0, "inputs": [{"localized_name": "images", "name": "images", "type": "IMAGE", "link": 13}, {"localized_name": "audio", "name": "audio", "shape": 7, "type": "AUDIO", "link": 16}, {"localized_name": "fps", "name": "fps", "type": "FLOAT", "widget": {"name": "fps"}, "link": 12}], "outputs": [{"localized_name": "VIDEO", "name": "VIDEO", "type": "VIDEO", "links": [15]}], "properties": {"cnr_id": "comfy-core", "ver": "0.10.0", "Node name for S&R": "CreateVideo"}, "widgets_values": [30]}, {"id": 10, "type": "GetVideoComponents", "pos": [1110, 330], "size": [320, 70], "flags": {}, "order": 2, "mode": 0, "inputs": [{"localized_name": "video", "name": "video", "type": "VIDEO", "link": 10}], "outputs": [{"localized_name": "images", "name": "images", "type": "IMAGE", "links": [14]}, {"localized_name": "audio", "name": "audio", "type": "AUDIO", "links": [16]}, {"localized_name": "fps", "name": "fps", "type": "FLOAT", "links": [12]}], "properties": {"cnr_id": "comfy-core", "ver": "0.10.0", "Node name for S&R": "GetVideoComponents"}}, {"id": 1, "type": "UpscaleModelLoader", "pos": [750, 450], "size": [280, 60], "flags": {}, "order": 0, "mode": 0, "inputs": [{"localized_name": "model_name", "name": "model_name", "type": "COMBO", "widget": {"name": "model_name"}, "link": 19}], "outputs": [{"localized_name": "UPSCALE_MODEL", "name": "UPSCALE_MODEL", "type": "UPSCALE_MODEL", "links": [1]}], "properties": {"cnr_id": "comfy-core", "ver": "0.10.0", "Node name for S&R": "UpscaleModelLoader", "models": [{"name": "RealESRGAN_x4plus.safetensors", "url": "https://huggingface.co/Comfy-Org/Real-ESRGAN_repackaged/resolve/main/RealESRGAN_x4plus.safetensors", "directory": "upscale_models"}]}, "widgets_values": ["RealESRGAN_x4plus.safetensors"]}], "groups": [], "links": [{"id": 1, "origin_id": 1, "origin_slot": 0, "target_id": 2, "target_slot": 0, "type": 
"UPSCALE_MODEL"}, {"id": 14, "origin_id": 10, "origin_slot": 0, "target_id": 2, "target_slot": 1, "type": "IMAGE"}, {"id": 13, "origin_id": 2, "origin_slot": 0, "target_id": 11, "target_slot": 0, "type": "IMAGE"}, {"id": 16, "origin_id": 10, "origin_slot": 1, "target_id": 11, "target_slot": 1, "type": "AUDIO"}, {"id": 12, "origin_id": 10, "origin_slot": 2, "target_id": 11, "target_slot": 2, "type": "FLOAT"}, {"id": 10, "origin_id": -10, "origin_slot": 0, "target_id": 10, "target_slot": 0, "type": "VIDEO"}, {"id": 15, "origin_id": 11, "origin_slot": 0, "target_id": -20, "target_slot": 0, "type": "VIDEO"}, {"id": 19, "origin_id": -10, "origin_slot": 1, "target_id": 1, "target_slot": 0, "type": "COMBO"}], "extra": {"workflowRendererVersion": "LG"}, "category": "Video generation and editing/Enhance video"}]}, "extra": {}}
|
||||
{
|
||||
"revision": 0,
|
||||
"last_node_id": 13,
|
||||
"last_link_id": 0,
|
||||
"nodes": [
|
||||
{
|
||||
"id": 13,
|
||||
"type": "cf95b747-3e17-46cb-8097-cac60ff9b2e1",
|
||||
"pos": [
|
||||
1120,
|
||||
330
|
||||
],
|
||||
"size": [
|
||||
240,
|
||||
58
|
||||
],
|
||||
"flags": {},
|
||||
"order": 3,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "video",
|
||||
"name": "video",
|
||||
"type": "VIDEO",
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "model_name",
|
||||
"type": "COMBO",
|
||||
"widget": {
|
||||
"name": "model_name"
|
||||
},
|
||||
"link": null
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "VIDEO",
|
||||
"name": "VIDEO",
|
||||
"type": "VIDEO",
|
||||
"links": []
|
||||
}
|
||||
],
|
||||
"title": "Video Upscale(GAN x4)",
|
||||
"properties": {
|
||||
"proxyWidgets": [
|
||||
[
|
||||
"-1",
|
||||
"model_name"
|
||||
]
|
||||
],
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.14.1"
|
||||
},
|
||||
"widgets_values": [
|
||||
"RealESRGAN_x4plus.safetensors"
|
||||
]
|
||||
}
|
||||
],
|
||||
"links": [],
|
||||
"version": 0.4,
|
||||
"definitions": {
|
||||
"subgraphs": [
|
||||
{
|
||||
"id": "cf95b747-3e17-46cb-8097-cac60ff9b2e1",
|
||||
"version": 1,
|
||||
"state": {
|
||||
"lastGroupId": 0,
|
||||
"lastNodeId": 13,
|
||||
"lastLinkId": 19,
|
||||
"lastRerouteId": 0
|
||||
},
|
||||
"revision": 0,
|
||||
"config": {},
|
||||
"name": "Video Upscale(GAN x4)",
|
||||
"inputNode": {
|
||||
"id": -10,
|
||||
"bounding": [
|
||||
550,
|
||||
460,
|
||||
120,
|
||||
80
|
||||
]
|
||||
},
|
||||
"outputNode": {
|
||||
"id": -20,
|
||||
"bounding": [
|
||||
1490,
|
||||
460,
|
||||
120,
|
||||
60
|
||||
]
|
||||
},
|
||||
"inputs": [
|
||||
{
|
||||
"id": "666d633e-93e7-42dc-8d11-2b7b99b0f2a6",
|
||||
"name": "video",
|
||||
"type": "VIDEO",
|
||||
"linkIds": [
|
||||
10
|
||||
],
|
||||
"localized_name": "video",
|
||||
"pos": [
|
||||
650,
|
||||
480
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "2e23a087-caa8-4d65-99e6-662761aa905a",
|
||||
"name": "model_name",
|
||||
"type": "COMBO",
|
||||
"linkIds": [
|
||||
19
|
||||
],
|
||||
"pos": [
|
||||
650,
|
||||
500
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"id": "0c1768ea-3ec2-412f-9af6-8e0fa36dae70",
|
||||
"name": "VIDEO",
|
||||
"type": "VIDEO",
|
||||
"linkIds": [
|
||||
15
|
||||
],
|
||||
"localized_name": "VIDEO",
|
||||
"pos": [
|
||||
1510,
|
||||
480
|
||||
]
|
||||
}
|
||||
],
|
||||
"widgets": [],
|
||||
"nodes": [
|
||||
{
|
||||
"id": 2,
|
||||
"type": "ImageUpscaleWithModel",
|
||||
"pos": [
|
||||
1110,
|
||||
450
|
||||
],
|
||||
"size": [
|
||||
320,
|
||||
46
|
||||
],
|
||||
"flags": {},
|
||||
"order": 1,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "upscale_model",
|
||||
"name": "upscale_model",
|
||||
"type": "UPSCALE_MODEL",
|
||||
"link": 1
|
||||
},
|
||||
{
|
||||
"localized_name": "image",
|
||||
"name": "image",
|
||||
"type": "IMAGE",
|
||||
"link": 14
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "IMAGE",
|
||||
"name": "IMAGE",
|
||||
"type": "IMAGE",
|
||||
"links": [
|
||||
13
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.10.0",
|
||||
"Node name for S&R": "ImageUpscaleWithModel"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 11,
|
||||
"type": "CreateVideo",
|
||||
"pos": [
|
||||
1110,
|
||||
550
|
||||
],
|
||||
"size": [
|
||||
320,
|
||||
78
|
||||
],
|
||||
"flags": {},
|
||||
"order": 3,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "images",
|
||||
"name": "images",
|
||||
"type": "IMAGE",
|
||||
"link": 13
|
||||
},
|
||||
{
|
||||
"localized_name": "audio",
|
||||
"name": "audio",
|
||||
"shape": 7,
|
||||
"type": "AUDIO",
|
||||
"link": 16
|
||||
},
|
||||
{
|
||||
"localized_name": "fps",
|
||||
"name": "fps",
|
||||
"type": "FLOAT",
|
||||
"widget": {
|
||||
"name": "fps"
|
||||
},
|
||||
"link": 12
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "VIDEO",
|
||||
"name": "VIDEO",
|
||||
"type": "VIDEO",
|
||||
"links": [
|
||||
15
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.10.0",
|
||||
"Node name for S&R": "CreateVideo"
|
||||
},
|
||||
"widgets_values": [
|
||||
30
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 10,
|
||||
"type": "GetVideoComponents",
|
||||
"pos": [
|
||||
1110,
|
||||
330
|
||||
],
|
||||
"size": [
|
||||
320,
|
||||
70
|
||||
],
|
||||
"flags": {},
|
||||
"order": 2,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "video",
|
||||
"name": "video",
|
||||
"type": "VIDEO",
|
||||
"link": 10
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "images",
|
||||
"name": "images",
|
||||
"type": "IMAGE",
|
||||
"links": [
|
||||
14
|
||||
]
|
||||
},
|
||||
{
|
||||
"localized_name": "audio",
|
||||
"name": "audio",
|
||||
"type": "AUDIO",
|
||||
"links": [
|
||||
16
|
||||
]
|
||||
},
|
||||
{
|
||||
"localized_name": "fps",
|
||||
"name": "fps",
|
||||
"type": "FLOAT",
|
||||
"links": [
|
||||
12
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.10.0",
|
||||
"Node name for S&R": "GetVideoComponents"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 1,
|
||||
"type": "UpscaleModelLoader",
|
||||
"pos": [
|
||||
750,
|
||||
450
|
||||
],
|
||||
"size": [
|
||||
280,
|
||||
60
|
||||
],
|
||||
"flags": {},
|
||||
"order": 0,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "model_name",
|
||||
"name": "model_name",
|
||||
"type": "COMBO",
|
||||
"widget": {
|
||||
"name": "model_name"
|
||||
},
|
||||
"link": 19
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "UPSCALE_MODEL",
|
||||
"name": "UPSCALE_MODEL",
|
||||
"type": "UPSCALE_MODEL",
|
||||
"links": [
|
||||
1
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.10.0",
|
||||
"Node name for S&R": "UpscaleModelLoader",
|
||||
"models": [
|
||||
{
|
||||
"name": "RealESRGAN_x4plus.safetensors",
|
||||
"url": "https://huggingface.co/Comfy-Org/Real-ESRGAN_repackaged/resolve/main/RealESRGAN_x4plus.safetensors",
|
||||
"directory": "upscale_models"
|
||||
}
|
||||
]
|
||||
},
|
||||
"widgets_values": [
|
||||
"RealESRGAN_x4plus.safetensors"
|
||||
]
|
||||
}
|
||||
],
|
||||
"groups": [],
|
||||
"links": [
|
||||
{
|
||||
"id": 1,
|
||||
"origin_id": 1,
|
||||
"origin_slot": 0,
|
||||
"target_id": 2,
|
||||
"target_slot": 0,
|
||||
"type": "UPSCALE_MODEL"
|
||||
},
|
||||
{
|
||||
"id": 14,
|
||||
"origin_id": 10,
|
||||
"origin_slot": 0,
|
||||
"target_id": 2,
|
||||
"target_slot": 1,
|
||||
"type": "IMAGE"
|
||||
},
|
||||
{
|
||||
"id": 13,
|
||||
"origin_id": 2,
|
||||
"origin_slot": 0,
|
||||
"target_id": 11,
|
||||
"target_slot": 0,
|
||||
"type": "IMAGE"
|
||||
},
|
||||
{
|
||||
"id": 16,
|
||||
"origin_id": 10,
|
||||
"origin_slot": 1,
|
||||
"target_id": 11,
|
||||
"target_slot": 1,
|
||||
"type": "AUDIO"
|
||||
},
|
||||
{
|
||||
"id": 12,
|
||||
"origin_id": 10,
|
||||
"origin_slot": 2,
|
||||
"target_id": 11,
|
||||
"target_slot": 2,
|
||||
"type": "FLOAT"
|
||||
},
|
||||
{
|
||||
"id": 10,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 0,
|
||||
"target_id": 10,
|
||||
"target_slot": 0,
|
||||
"type": "VIDEO"
|
||||
},
|
||||
{
|
||||
"id": 15,
|
||||
"origin_id": 11,
|
||||
"origin_slot": 0,
|
||||
"target_id": -20,
|
||||
"target_slot": 0,
|
||||
"type": "VIDEO"
|
||||
},
|
||||
{
|
||||
"id": 19,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 1,
|
||||
"target_id": 1,
|
||||
"target_slot": 0,
|
||||
"type": "COMBO"
|
||||
}
|
||||
],
|
||||
"extra": {
|
||||
"workflowRendererVersion": "LG"
|
||||
},
|
||||
"category": "Video generation and editing/Enhance video"
|
||||
}
|
||||
]
|
||||
},
|
||||
"extra": {}
|
||||
}
|
||||
|
||||
33
comfy/deploy_environment.py
Normal file
33
comfy/deploy_environment.py
Normal file
@@ -0,0 +1,33 @@
|
||||
import logging
|
||||
import os
|
||||
|
||||
import folder_paths
|
||||
|
||||
logger = logging.getLogger(__name__)

# Value returned when no usable marker file is found.
_DEFAULT_DEPLOY_ENV = "local_git"
# Name of the marker file looked up directly under ``folder_paths.base_path``.
_ENV_FILENAME = ".comfy_environment"

# Memo of the resolved value; stays ``None`` until the first lookup completes.
_cached_value: str | None = None


def get_deploy_environment() -> str:
    """Return the deployment environment name, resolving it at most once.

    The first line of the marker file under ``folder_paths.base_path`` is
    read, stripped of surrounding whitespace and reduced to printable ASCII
    characters (code points 32..126). If the file is missing, cannot be read,
    or yields an empty string, ``_DEFAULT_DEPLOY_ENV`` is used instead. The
    result is stored in the module-level cache and returned on later calls.
    """
    global _cached_value
    if _cached_value is not None:
        return _cached_value

    resolved = _DEFAULT_DEPLOY_ENV
    env_file = os.path.join(folder_paths.base_path, _ENV_FILENAME)
    try:
        with open(env_file, encoding="utf-8") as handle:
            raw_line = handle.readline().strip()
        # Keep only printable ASCII; everything else is dropped silently.
        cleaned = "".join(filter(lambda ch: 32 <= ord(ch) < 127, raw_line))
        if cleaned:
            resolved = cleaned
    except FileNotFoundError:
        # A missing marker file is the normal case; fall back without noise.
        pass
    except Exception as e:
        logger.warning("Failed to read %s: %s", env_file, e)

    _cached_value = resolved
    return _cached_value
|
||||
@@ -611,6 +611,7 @@ class AceStepDiTModel(nn.Module):
|
||||
intermediate_size,
|
||||
patch_size,
|
||||
audio_acoustic_hidden_dim,
|
||||
condition_dim=None,
|
||||
layer_types=None,
|
||||
sliding_window=128,
|
||||
rms_norm_eps=1e-6,
|
||||
@@ -640,7 +641,7 @@ class AceStepDiTModel(nn.Module):
|
||||
|
||||
self.time_embed = TimestepEmbedding(256, hidden_size, dtype=dtype, device=device, operations=operations)
|
||||
self.time_embed_r = TimestepEmbedding(256, hidden_size, dtype=dtype, device=device, operations=operations)
|
||||
self.condition_embedder = Linear(hidden_size, hidden_size, dtype=dtype, device=device)
|
||||
self.condition_embedder = Linear(condition_dim, hidden_size, dtype=dtype, device=device)
|
||||
|
||||
if layer_types is None:
|
||||
layer_types = ["full_attention"] * num_layers
|
||||
@@ -1035,6 +1036,9 @@ class AceStepConditionGenerationModel(nn.Module):
|
||||
fsq_dim=2048,
|
||||
fsq_levels=[8, 8, 8, 5, 5, 5],
|
||||
fsq_input_num_quantizers=1,
|
||||
encoder_hidden_size=2048,
|
||||
encoder_intermediate_size=6144,
|
||||
encoder_num_heads=16,
|
||||
audio_model=None,
|
||||
dtype=None,
|
||||
device=None,
|
||||
@@ -1054,24 +1058,24 @@ class AceStepConditionGenerationModel(nn.Module):
|
||||
|
||||
self.decoder = AceStepDiTModel(
|
||||
in_channels, hidden_size, num_dit_layers, num_heads, num_kv_heads, head_dim,
|
||||
intermediate_size, patch_size, audio_acoustic_hidden_dim,
|
||||
intermediate_size, patch_size, audio_acoustic_hidden_dim, condition_dim=encoder_hidden_size,
|
||||
layer_types=layer_types, sliding_window=sliding_window, rms_norm_eps=rms_norm_eps,
|
||||
dtype=dtype, device=device, operations=operations
|
||||
)
|
||||
self.encoder = AceStepConditionEncoder(
|
||||
text_hidden_dim, timbre_hidden_dim, hidden_size, num_lyric_layers, num_timbre_layers,
|
||||
num_heads, num_kv_heads, head_dim, intermediate_size, rms_norm_eps,
|
||||
text_hidden_dim, timbre_hidden_dim, encoder_hidden_size, num_lyric_layers, num_timbre_layers,
|
||||
encoder_num_heads, num_kv_heads, head_dim, encoder_intermediate_size, rms_norm_eps,
|
||||
dtype=dtype, device=device, operations=operations
|
||||
)
|
||||
self.tokenizer = AceStepAudioTokenizer(
|
||||
audio_acoustic_hidden_dim, hidden_size, pool_window_size, fsq_dim=fsq_dim, fsq_levels=fsq_levels, fsq_input_num_quantizers=fsq_input_num_quantizers, num_layers=num_tokenizer_layers, head_dim=head_dim, rms_norm_eps=rms_norm_eps,
|
||||
audio_acoustic_hidden_dim, encoder_hidden_size, pool_window_size, fsq_dim=fsq_dim, fsq_levels=fsq_levels, fsq_input_num_quantizers=fsq_input_num_quantizers, num_layers=num_tokenizer_layers, head_dim=head_dim, rms_norm_eps=rms_norm_eps,
|
||||
dtype=dtype, device=device, operations=operations
|
||||
)
|
||||
self.detokenizer = AudioTokenDetokenizer(
|
||||
hidden_size, pool_window_size, audio_acoustic_hidden_dim, num_layers=2, head_dim=head_dim,
|
||||
encoder_hidden_size, pool_window_size, audio_acoustic_hidden_dim, num_layers=2, head_dim=head_dim,
|
||||
dtype=dtype, device=device, operations=operations
|
||||
)
|
||||
self.null_condition_emb = nn.Parameter(torch.empty(1, 1, hidden_size, dtype=dtype, device=device))
|
||||
self.null_condition_emb = nn.Parameter(torch.empty(1, 1, encoder_hidden_size, dtype=dtype, device=device))
|
||||
|
||||
def prepare_condition(
|
||||
self,
|
||||
|
||||
303
comfy/ldm/ernie/model.py
Normal file
303
comfy/ldm/ernie/model.py
Normal file
@@ -0,0 +1,303 @@
|
||||
import math
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
from comfy.ldm.modules.attention import optimized_attention
|
||||
import comfy.model_management
|
||||
|
||||
def rope(pos: torch.Tensor, dim: int, theta: int) -> torch.Tensor:
    """Build stacked cosine/sine rotary tables for the given positions.

    Args:
        pos: Position values; every element gets ``dim // 2`` angles.
        dim: Number of channels covered by this axis; must be even.
        theta: Base of the geometric frequency progression.

    Returns:
        A float32 tensor on ``pos.device`` of shape ``[2, *pos.shape, dim // 2]``
        where index 0 along the first axis holds cosines and index 1 holds sines.
    """
    assert dim % 2 == 0
    target_device = pos.device
    # The table is computed in float64; when ``supports_fp64`` reports False
    # for the device, the computation is done on the CPU and moved back below.
    if comfy.model_management.supports_fp64(target_device):
        compute_device = target_device
    else:
        compute_device = torch.device("cpu")

    exponents = torch.arange(0, dim, 2, dtype=torch.float64, device=compute_device) / dim
    inv_freq = 1.0 / (theta**exponents)
    angles = torch.einsum("...n,d->...nd", pos.to(compute_device), inv_freq)
    cos_sin = torch.stack([torch.cos(angles), torch.sin(angles)], dim=0)
    return cos_sin.to(dtype=torch.float32, device=target_device)
|
||||
|
||||
def apply_rotary_emb(x_in: torch.Tensor, freqs_cis: torch.Tensor) -> torch.Tensor:
    """Rotate the leading channels of ``x_in`` using a cos/sin table.

    Args:
        x_in: Input tensor; its last axis holds the channels.
        freqs_cis: Table whose first axis has cosines at index 0 and sines at
            index 1. The size of its last axis decides how many leading
            channels of ``x_in`` are rotated.

    Returns:
        A tensor with the same last-axis size as ``x_in``: the rotated leading
        channels followed by the remaining channels unchanged.
    """
    cos_ = freqs_cis[0]
    sin_ = freqs_cis[1]
    rot_dim = freqs_cis.shape[-1]

    rotary_part = x_in[..., :rot_dim]
    passthrough = x_in[..., rot_dim:]

    # "Rotate half": swap the two halves of the rotary channels, negating the second.
    first_half, second_half = rotary_part.chunk(2, dim=-1)
    swapped = torch.cat((-second_half, first_half), dim=-1)

    rotated = rotary_part * cos_ + swapped * sin_
    return torch.cat((rotated, passthrough), dim=-1)
|
||||
|
||||
class ErnieImageEmbedND3(nn.Module):
    """Three-axis rotary position embedding built from per-axis ``rope`` tables."""

    def __init__(self, dim: int, theta: int, axes_dim: tuple):
        """Store the configuration.

        Args:
            dim: Stored as ``self.dim``; not read by ``forward``.
            theta: Frequency base passed to ``rope`` for every axis.
            axes_dim: Channel count per axis; the first three entries are used.
        """
        super().__init__()
        self.dim = dim
        self.theta = theta
        self.axes_dim = list(axes_dim)

    def forward(self, ids: torch.Tensor) -> torch.Tensor:
        """Return the cos/sin table for ``ids``.

        The last axis of ``ids`` indexes the three position axes. Assuming
        ``ids`` is ``[B, S, 3]``, the result is ``[2, B, S, 1, sum(axes_dim)]``
        with every frequency repeated twice in adjacent channels.
        """
        per_axis = [rope(ids[..., axis], self.axes_dim[axis], self.theta) for axis in range(3)]
        # Concatenate the axes along channels, then add a singleton axis at
        # position 3: [2, B, S, 1, sum(axes_dim) // 2] for [B, S, 3] ids.
        table = torch.cat(per_axis, dim=-1).unsqueeze(3)
        # Pair each value with a copy of itself and flatten the pair into the
        # channel axis, doubling its length.
        paired = torch.stack([table, table], dim=-1)
        return paired.reshape(*table.shape[:-1], -1)
|
||||
|
||||
class ErnieImagePatchEmbedDynamic(nn.Module):
    """Convert an image-like tensor into a sequence of patch embeddings."""

    def __init__(self, in_channels: int, embed_dim: int, patch_size: int, operations, device=None, dtype=None):
        """Create the patch projection.

        Args:
            in_channels: Channels of the input tensor.
            embed_dim: Channels produced per patch.
            patch_size: Used as both kernel size and stride of the convolution.
            operations: Namespace providing the ``Conv2d`` layer class.
            device: Forwarded to the convolution constructor.
            dtype: Forwarded to the convolution constructor.
        """
        super().__init__()
        self.patch_size = patch_size
        self.proj = operations.Conv2d(
            in_channels,
            embed_dim,
            kernel_size=patch_size,
            stride=patch_size,
            bias=True,
            device=device,
            dtype=dtype,
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Project ``x`` and return a contiguous ``[batch, height * width, dim]`` tensor."""
        projected = self.proj(x)
        # Unpacking enforces that the projection output is 4-dimensional.
        _batch, _dim, _height, _width = projected.shape
        # Merge the two spatial axes into one, then move channels last.
        tokens = projected.flatten(2).transpose(1, 2)
        return tokens.contiguous()
|
||||
|
||||
class Timesteps(nn.Module):
    """Sinusoidal embedding of scalar timesteps."""

    def __init__(self, num_channels: int, flip_sin_to_cos: bool = False):
        """Store the configuration.

        Args:
            num_channels: Target embedding width; ``num_channels // 2``
                frequencies are used, so the output has ``2 * (num_channels // 2)``
                channels.
            flip_sin_to_cos: When true the cosine half comes first, otherwise
                the sine half comes first.
        """
        super().__init__()
        self.num_channels = num_channels
        self.flip_sin_to_cos = flip_sin_to_cos

    def forward(self, timesteps: torch.Tensor) -> torch.Tensor:
        """Embed a 1-D tensor of timesteps into ``[len(timesteps), 2 * half_dim]``."""
        half_dim = self.num_channels // 2
        steps = torch.arange(half_dim, dtype=torch.float32, device=timesteps.device)
        # Frequencies fall geometrically from 1 towards 1/10000.
        freqs = torch.exp(-math.log(10000) * steps / half_dim)
        angles = timesteps[:, None].float() * freqs[None, :]

        sin_part = torch.sin(angles)
        cos_part = torch.cos(angles)
        halves = [cos_part, sin_part] if self.flip_sin_to_cos else [sin_part, cos_part]
        return torch.cat(halves, dim=-1)
|
||||
|
||||
class TimestepEmbedding(nn.Module):
    """Two-layer MLP (Linear -> SiLU -> Linear) applied to a timestep embedding."""

    def __init__(self, in_channels: int, time_embed_dim: int, operations, device=None, dtype=None):
        super().__init__()
        factory = {"device": device, "dtype": dtype}
        self.linear_1 = operations.Linear(in_channels, time_embed_dim, bias=True, **factory)
        self.act = nn.SiLU()
        self.linear_2 = operations.Linear(time_embed_dim, time_embed_dim, bias=True, **factory)

    def forward(self, sample: torch.Tensor) -> torch.Tensor:
        """Project ``sample`` (last dim ``in_channels``) to ``time_embed_dim`` features."""
        return self.linear_2(self.act(self.linear_1(sample)))
|
||||
|
||||
class ErnieImageAttention(nn.Module):
|
||||
def __init__(self, query_dim: int, heads: int, dim_head: int, eps: float = 1e-6, operations=None, device=None, dtype=None):
|
||||
super().__init__()
|
||||
self.heads = heads
|
||||
self.head_dim = dim_head
|
||||
self.inner_dim = heads * dim_head
|
||||
|
||||
Linear = operations.Linear
|
||||
RMSNorm = operations.RMSNorm
|
||||
|
||||
self.to_q = Linear(query_dim, self.inner_dim, bias=False, device=device, dtype=dtype)
|
||||
self.to_k = Linear(query_dim, self.inner_dim, bias=False, device=device, dtype=dtype)
|
||||
self.to_v = Linear(query_dim, self.inner_dim, bias=False, device=device, dtype=dtype)
|
||||
|
||||
self.norm_q = RMSNorm(dim_head, eps=eps, elementwise_affine=True, device=device, dtype=dtype)
|
||||
self.norm_k = RMSNorm(dim_head, eps=eps, elementwise_affine=True, device=device, dtype=dtype)
|
||||
|
||||
self.to_out = nn.ModuleList([Linear(self.inner_dim, query_dim, bias=False, device=device, dtype=dtype)])
|
||||
|
||||
def forward(self, x: torch.Tensor, attention_mask: torch.Tensor = None, image_rotary_emb: torch.Tensor = None) -> torch.Tensor:
|
||||
B, S, _ = x.shape
|
||||
|
||||
q_flat = self.to_q(x)
|
||||
k_flat = self.to_k(x)
|
||||
v_flat = self.to_v(x)
|
||||
|
||||
query = q_flat.view(B, S, self.heads, self.head_dim)
|
||||
key = k_flat.view(B, S, self.heads, self.head_dim)
|
||||
|
||||
query = self.norm_q(query)
|
||||
key = self.norm_k(key)
|
||||
|
||||
if image_rotary_emb is not None:
|
||||
query = apply_rotary_emb(query, image_rotary_emb)
|
||||
key = apply_rotary_emb(key, image_rotary_emb)
|
||||
|
||||
query, key = query.to(x.dtype), key.to(x.dtype)
|
||||
|
||||
q_flat = query.reshape(B, S, -1)
|
||||
k_flat = key.reshape(B, S, -1)
|
||||
|
||||
hidden_states = optimized_attention(q_flat, k_flat, v_flat, self.heads, mask=attention_mask)
|
||||
|
||||
return self.to_out[0](hidden_states)
|
||||
|
||||
class ErnieImageFeedForward(nn.Module):
    """Gated feed-forward block: ``fc2(up(x) * gelu(gate(x)))``, all bias-free."""

    def __init__(self, hidden_size: int, ffn_hidden_size: int, operations, device=None, dtype=None):
        super().__init__()
        factory = {"device": device, "dtype": dtype}
        self.gate_proj = operations.Linear(hidden_size, ffn_hidden_size, bias=False, **factory)
        self.up_proj = operations.Linear(hidden_size, ffn_hidden_size, bias=False, **factory)
        self.linear_fc2 = operations.Linear(ffn_hidden_size, hidden_size, bias=False, **factory)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the gated MLP along the last dimension of ``x``."""
        gate = F.gelu(self.gate_proj(x))
        gated = self.up_proj(x) * gate
        return self.linear_fc2(gated)
|
||||
|
||||
class ErnieImageSharedAdaLNBlock(nn.Module):
    """Transformer block whose AdaLN modulation is supplied by the caller.

    The block owns only the two RMSNorms, the self-attention and the MLP; the
    six shift/scale/gate tensors arrive through ``temb`` on every call.
    """

    def __init__(self, hidden_size: int, num_heads: int, ffn_hidden_size: int, eps: float = 1e-6, operations=None, device=None, dtype=None):
        super().__init__()
        factory = {"device": device, "dtype": dtype}

        self.adaLN_sa_ln = operations.RMSNorm(hidden_size, eps=eps, **factory)
        self.self_attention = ErnieImageAttention(
            query_dim=hidden_size,
            dim_head=hidden_size // num_heads,
            heads=num_heads,
            eps=eps,
            operations=operations,
            **factory,
        )
        self.adaLN_mlp_ln = operations.RMSNorm(hidden_size, eps=eps, **factory)
        self.mlp = ErnieImageFeedForward(hidden_size, ffn_hidden_size, operations=operations, **factory)

    def forward(self, x, rotary_pos_emb, temb, attention_mask=None):
        """Run attention and MLP sub-layers with gated residual connections.

        ``temb`` must unpack into (shift_msa, scale_msa, gate_msa, shift_mlp,
        scale_mlp, gate_mlp). Modulation and gating are computed in float32
        and cast back to the dtype of ``x``.
        """
        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = temb
        out_dtype = x.dtype

        # --- self-attention sub-layer ---
        attn_in = self.adaLN_sa_ln(x).float() * (1 + scale_msa.float()) + shift_msa.float()
        attn_out = self.self_attention(
            attn_in.to(out_dtype),
            attention_mask=attention_mask,
            image_rotary_emb=rotary_pos_emb,
        )
        x = x + (gate_msa.float() * attn_out.float()).to(out_dtype)

        # --- feed-forward sub-layer ---
        mlp_in = self.adaLN_mlp_ln(x).float() * (1 + scale_mlp.float()) + shift_mlp.float()
        mlp_out = self.mlp(mlp_in.to(out_dtype))
        return x + (gate_mlp.float() * mlp_out.float()).to(out_dtype)
|
||||
|
||||
class ErnieImageAdaLNContinuous(nn.Module):
    """Final adaptive LayerNorm: scale and shift are predicted from a conditioning vector."""

    def __init__(self, hidden_size: int, eps: float = 1e-6, operations=None, device=None, dtype=None):
        super().__init__()
        factory = {"device": device, "dtype": dtype}
        # Affine-free norm: the learned modulation comes from `linear` instead.
        self.norm = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=eps, **factory)
        self.linear = operations.Linear(hidden_size, hidden_size * 2, **factory)

    def forward(self, x: torch.Tensor, conditioning: torch.Tensor) -> torch.Tensor:
        """Normalise ``x`` and modulate it with (scale, shift) derived from ``conditioning``.

        The modulation is unsqueezed at dim 1 so it broadcasts over that
        dimension of ``x``.
        """
        modulation = self.linear(conditioning)
        scale, shift = torch.chunk(modulation, 2, dim=-1)
        normed = self.norm(x)
        return normed * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
|
||||
|
||||
class ErnieImageModel(nn.Module):
    """ERNIE image diffusion transformer.

    Image latents are patch-embedded, concatenated with (optionally projected)
    text tokens, run through a stack of blocks that all share one AdaLN
    modulation derived from the timestep, and projected back to latent
    patches. Only the image tokens are returned.

    ``qk_layernorm`` and extra ``**kwargs`` are accepted but not read by the
    code in this class.
    """

    def __init__(
        self,
        hidden_size: int = 4096,
        num_attention_heads: int = 32,
        num_layers: int = 36,
        ffn_hidden_size: int = 12288,
        in_channels: int = 128,
        out_channels: int = 128,
        patch_size: int = 1,
        text_in_dim: int = 3072,
        rope_theta: int = 256,
        rope_axes_dim: tuple = (32, 48, 48),
        eps: float = 1e-6,
        qk_layernorm: bool = True,
        device=None,
        dtype=None,
        operations=None,
        **kwargs
    ):
        super().__init__()
        self.dtype = dtype
        self.hidden_size = hidden_size
        self.num_heads = num_attention_heads
        self.head_dim = hidden_size // num_attention_heads
        self.patch_size = patch_size
        self.out_channels = out_channels

        Linear = operations.Linear

        self.x_embedder = ErnieImagePatchEmbedDynamic(in_channels, hidden_size, patch_size, operations, device, dtype)
        # No projection is needed when the text encoder width already matches.
        self.text_proj = Linear(text_in_dim, hidden_size, bias=False, device=device, dtype=dtype) if text_in_dim != hidden_size else None

        self.time_proj = Timesteps(hidden_size, flip_sin_to_cos=False)
        self.time_embedding = TimestepEmbedding(hidden_size, hidden_size, operations, device, dtype)

        self.pos_embed = ErnieImageEmbedND3(dim=self.head_dim, theta=rope_theta, axes_dim=rope_axes_dim)

        # One modulation head shared by every layer: 6 chunks of hidden_size.
        self.adaLN_modulation = nn.Sequential(
            nn.SiLU(),
            Linear(hidden_size, 6 * hidden_size, device=device, dtype=dtype)
        )

        self.layers = nn.ModuleList([
            ErnieImageSharedAdaLNBlock(hidden_size, num_attention_heads, ffn_hidden_size, eps, operations, device, dtype)
            for _ in range(num_layers)
        ])

        self.final_norm = ErnieImageAdaLNContinuous(hidden_size, eps, operations, device, dtype)
        self.final_linear = Linear(hidden_size, patch_size * patch_size * out_channels, device=device, dtype=dtype)

    def forward(self, x, timesteps, context, **kwargs):
        """Predict the model output for latents ``x``.

        Args:
            x: latents of shape (B, C, H, W).
            timesteps: 1-D tensor of timesteps, one per batch element.
            context: text tokens of shape (B, T, text_in_dim); T may be 0.
            **kwargs: ``transformer_options["rope_options"]`` may carry
                ``scale_x``/``scale_y``/``shift_x``/``shift_y``/``shift_t``
                to rescale or offset the image position ids.

        Returns:
            Tensor of shape (B, out_channels, H, W).
        """
        device, dtype = x.device, x.dtype
        B, _, H, W = x.shape
        p, Hp, Wp = self.patch_size, H // self.patch_size, W // self.patch_size
        N_img = Hp * Wp

        img_bsh = self.x_embedder(x)

        text_bth = context
        if self.text_proj is not None:
            if text_bth.numel() > 0:
                text_bth = self.text_proj(text_bth)
            else:
                # An empty context still has width text_in_dim; give it the
                # hidden width so the concat with image tokens is well-formed.
                text_bth = text_bth.new_zeros((*text_bth.shape[:-1], self.hidden_size))
        Tmax = text_bth.shape[1]

        # Sequence layout is [image tokens | text tokens].
        hidden_states = torch.cat([img_bsh, text_bth], dim=1)

        # Text tokens: axis 0 counts 0..Tmax-1, the two spatial axes stay 0.
        text_ids = torch.zeros((B, Tmax, 3), device=device, dtype=torch.float32)
        text_ids[:, :, 0] = torch.linspace(0, Tmax - 1, steps=Tmax, device=x.device, dtype=torch.float32)
        index = float(Tmax)

        transformer_options = kwargs.get("transformer_options", {})
        rope_options = transformer_options.get("rope_options", None)

        h_len, w_len = float(Hp), float(Wp)
        h_offset, w_offset = 0.0, 0.0

        if rope_options is not None:
            h_len = (h_len - 1.0) * rope_options.get("scale_y", 1.0) + 1.0
            w_len = (w_len - 1.0) * rope_options.get("scale_x", 1.0) + 1.0
            index += rope_options.get("shift_t", 0.0)
            h_offset += rope_options.get("shift_y", 0.0)
            w_offset += rope_options.get("shift_x", 0.0)

        # Image tokens: axis 0 is the constant `index` (placed after the text
        # positions), axes 1/2 are the row/column coordinates.
        image_ids = torch.zeros((Hp, Wp, 3), device=device, dtype=torch.float32)
        image_ids[:, :, 0] = index
        image_ids[:, :, 1] = torch.linspace(h_offset, h_len - 1 + h_offset, steps=Hp, device=device, dtype=torch.float32).unsqueeze(1)
        image_ids[:, :, 2] = torch.linspace(w_offset, w_len - 1 + w_offset, steps=Wp, device=device, dtype=torch.float32).unsqueeze(0)

        image_ids = image_ids.view(1, N_img, 3).expand(B, -1, -1)

        rotary_pos_emb = self.pos_embed(torch.cat([image_ids, text_ids], dim=1)).to(x.dtype)
        del image_ids, text_ids

        sample = self.time_proj(timesteps).to(dtype)
        c = self.time_embedding(sample)

        # Order matches what each block unpacks: shift/scale/gate for
        # attention, then shift/scale/gate for the MLP.
        temb = [
            t.unsqueeze(1).contiguous() for t in self.adaLN_modulation(c).chunk(6, dim=-1)
        ]

        for layer in self.layers:
            hidden_states = layer(hidden_states, rotary_pos_emb, temb)

        hidden_states = self.final_norm(hidden_states, c).type_as(hidden_states)

        # Drop the text tokens; only image tokens are un-patchified.
        patches = self.final_linear(hidden_states)[:, :N_img, :]
        output = (
            patches.view(B, Hp, Wp, p, p, self.out_channels)
            .permute(0, 5, 1, 3, 2, 4)
            .contiguous()
            .view(B, self.out_channels, H, W)
        )

        return output
|
||||
@@ -16,7 +16,7 @@ def attention(q: Tensor, k: Tensor, v: Tensor, pe: Tensor, mask=None, transforme
|
||||
|
||||
def rope(pos: Tensor, dim: int, theta: int) -> Tensor:
|
||||
assert dim % 2 == 0
|
||||
if comfy.model_management.is_device_mps(pos.device) or comfy.model_management.is_intel_xpu() or comfy.model_management.is_directml_enabled():
|
||||
if not comfy.model_management.supports_fp64(pos.device):
|
||||
device = torch.device("cpu")
|
||||
else:
|
||||
device = pos.device
|
||||
|
||||
@@ -155,6 +155,7 @@ class AutoencodingEngineLegacy(AutoencodingEngine):
|
||||
def __init__(self, embed_dim: int, **kwargs):
|
||||
self.max_batch_size = kwargs.pop("max_batch_size", None)
|
||||
ddconfig = kwargs.pop("ddconfig")
|
||||
decoder_ddconfig = kwargs.pop("decoder_ddconfig", ddconfig)
|
||||
super().__init__(
|
||||
encoder_config={
|
||||
"target": "comfy.ldm.modules.diffusionmodules.model.Encoder",
|
||||
@@ -162,7 +163,7 @@ class AutoencodingEngineLegacy(AutoencodingEngine):
|
||||
},
|
||||
decoder_config={
|
||||
"target": "comfy.ldm.modules.diffusionmodules.model.Decoder",
|
||||
"params": ddconfig,
|
||||
"params": decoder_ddconfig,
|
||||
},
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
@@ -3,12 +3,9 @@ from ..diffusionmodules.openaimodel import Timestep
|
||||
import torch
|
||||
|
||||
class CLIPEmbeddingNoiseAugmentation(ImageConcatWithNoiseAugmentation):
|
||||
def __init__(self, *args, clip_stats_path=None, timestep_dim=256, **kwargs):
|
||||
def __init__(self, *args, timestep_dim=256, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
if clip_stats_path is None:
|
||||
clip_mean, clip_std = torch.zeros(timestep_dim), torch.ones(timestep_dim)
|
||||
else:
|
||||
clip_mean, clip_std = torch.load(clip_stats_path, map_location="cpu")
|
||||
clip_mean, clip_std = torch.zeros(timestep_dim), torch.ones(timestep_dim)
|
||||
self.register_buffer("data_mean", clip_mean[None, :], persistent=False)
|
||||
self.register_buffer("data_std", clip_std[None, :], persistent=False)
|
||||
self.time_embed = Timestep(timestep_dim)
|
||||
|
||||
@@ -90,7 +90,7 @@ class HeatmapHead(torch.nn.Module):
|
||||
origin_max = np.max(hm[k])
|
||||
dr = np.zeros((H + 2 * border, W + 2 * border), dtype=np.float32)
|
||||
dr[border:-border, border:-border] = hm[k].copy()
|
||||
dr = gaussian_filter(dr, sigma=2.0)
|
||||
dr = gaussian_filter(dr, sigma=2.0, truncate=2.5)
|
||||
hm[k] = dr[border:-border, border:-border].copy()
|
||||
cur_max = np.max(hm[k])
|
||||
if cur_max > 0:
|
||||
|
||||
725
comfy/ldm/rt_detr/rtdetr_v4.py
Normal file
725
comfy/ldm/rt_detr/rtdetr_v4.py
Normal file
@@ -0,0 +1,725 @@
|
||||
from collections import OrderedDict
|
||||
from typing import List
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
import torchvision
|
||||
import comfy.model_management
|
||||
from comfy.ldm.modules.attention import optimized_attention_for_device
|
||||
|
||||
# COCO category names; the list position is used as the class index.
COCO_CLASSES = [
    'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck',
    'boat', 'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench',
    'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra',
    'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee',
    'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove',
    'skateboard', 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup',
    'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange',
    'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
    'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse',
    'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink',
    'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear',
    'hair drier', 'toothbrush',
]
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# HGNetv2 backbone
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class ConvBNAct(nn.Module):
    """Bias-free Conv2d -> BatchNorm2d -> optional ReLU.

    Padding is symmetric, ``(k - 1) // 2``; for even kernels (e.g. ``k=2``)
    that is 0, so callers needing extra padding apply it themselves.
    """

    def __init__(self, ic, oc, k=3, s=1, groups=1, use_act=True, device=None, dtype=None, operations=None):
        super().__init__()
        pad = (k - 1) // 2
        self.conv = operations.Conv2d(ic, oc, k, s, pad, groups=groups, bias=False, device=device, dtype=dtype)
        self.bn = nn.BatchNorm2d(oc, device=device, dtype=dtype)
        if use_act:
            self.act = nn.ReLU()
        else:
            self.act = nn.Identity()

    def forward(self, x):
        y = self.conv(x)
        y = self.bn(y)
        return self.act(y)
|
||||
|
||||
class LightConvBNAct(nn.Module):
    """Pointwise ConvBNAct (no activation) followed by a depthwise k x k ConvBNAct."""

    def __init__(self, ic, oc, k, device=None, dtype=None, operations=None):
        super().__init__()
        factory = dict(device=device, dtype=dtype, operations=operations)
        # 1x1 channel mixing, linear.
        self.conv1 = ConvBNAct(ic, oc, 1, use_act=False, **factory)
        # groups == channels makes this a depthwise convolution.
        self.conv2 = ConvBNAct(oc, oc, k, groups=oc, use_act=True, **factory)

    def forward(self, x):
        pointwise = self.conv1(x)
        return self.conv2(pointwise)
|
||||
|
||||
class _StemBlock(nn.Module):
    """HGNetv2 stem: strided conv, then a two-branch (conv / max-pool) stage that is merged and reduced."""

    def __init__(self, ic, mc, oc, device=None, dtype=None, operations=None):
        super().__init__()
        factory = dict(device=device, dtype=dtype, operations=operations)
        self.stem1 = ConvBNAct(ic, mc, 3, 2, **factory)
        # stem2a / stem2b are 2x2, stride-1 convs with no built-in padding;
        # forward() pads manually (per the original author: this mirrors the
        # PaddlePaddle implementation).
        self.stem2a = ConvBNAct(mc, mc // 2, 2, 1, **factory)
        self.stem2b = ConvBNAct(mc // 2, mc, 2, 1, **factory)
        self.stem3 = ConvBNAct(mc * 2, mc, 3, 2, **factory)
        self.stem4 = ConvBNAct(mc, oc, 1, **factory)
        self.pool = nn.MaxPool2d(2, 1, ceil_mode=True)

    def forward(self, x):
        x = self.stem1(x)
        # One extra row/column on the bottom/right, shared by both branches.
        padded = F.pad(x, (0, 1, 0, 1))

        conv_branch = self.stem2a(padded)
        conv_branch = F.pad(conv_branch, (0, 1, 0, 1))  # pad again before stem2b
        conv_branch = self.stem2b(conv_branch)

        pool_branch = self.pool(padded)

        merged = torch.cat([pool_branch, conv_branch], 1)
        return self.stem4(self.stem3(merged))
|
||||
|
||||
|
||||
class _HG_Block(nn.Module):
    """Chain of conv layers whose input and every intermediate output are concatenated and fused.

    The fusion ("aggregation") is two 1x1 ConvBNAct layers. With
    ``residual=True`` the block input is added to the fused output.
    """

    def __init__(self, ic, mc, oc, layer_num, k=3, residual=False, light=False, device=None, dtype=None, operations=None):
        super().__init__()
        self.residual = residual
        factory = dict(device=device, dtype=dtype, operations=operations)

        layer_cls = LightConvBNAct if light else ConvBNAct
        stack = []
        for i in range(layer_num):
            in_ch = ic if i == 0 else mc
            stack.append(layer_cls(in_ch, mc, k, **factory))
        self.layers = nn.ModuleList(stack)

        # Channels seen by the aggregation: the input plus every layer output.
        total = ic + layer_num * mc
        self.aggregation = nn.Sequential(
            ConvBNAct(total, oc // 2, 1, **factory),
            ConvBNAct(oc // 2, oc, 1, **factory),
        )

    def forward(self, x):
        collected = [x]
        current = x
        for layer in self.layers:
            current = layer(current)
            collected.append(current)
        fused = self.aggregation(torch.cat(collected, 1))
        if self.residual:
            return fused + x
        return fused
|
||||
|
||||
|
||||
class _HG_Stage(nn.Module):
    """One HGNetv2 stage: optional stride-2 depthwise downsample, then ``num_blocks`` _HG_Blocks.

    Positional argument order matches the stage config rows:
    ic, mc, oc, num_blocks, downsample, light, k, layer_num.
    """

    def __init__(self, ic, mc, oc, num_blocks, downsample=True, light=False, k=3, layer_num=6, device=None, dtype=None, operations=None):
        super().__init__()
        factory = dict(device=device, dtype=dtype, operations=operations)

        if not downsample:
            self.downsample = nn.Identity()
        else:
            self.downsample = ConvBNAct(ic, ic, 3, 2, groups=ic, use_act=False, **factory)

        blocks = []
        for i in range(num_blocks):
            first = i == 0
            # Only the first block changes the channel count, so only the
            # later ones can carry a residual connection.
            blocks.append(_HG_Block(ic if first else oc, mc, oc, layer_num,
                                    k=k, residual=not first, light=light, **factory))
        self.blocks = nn.Sequential(*blocks)

    def forward(self, x):
        reduced = self.downsample(x)
        return self.blocks(reduced)
|
||||
|
||||
|
||||
class HGNetv2(nn.Module):
    """HGNetv2 backbone: a stem followed by four stages; selected stage outputs are returned."""

    # Per the original author this is the "B5" configuration with stem (3, 32, 64).
    # Row layout: ic, mc, oc, num_blocks, downsample, light, k, layer_num
    _STAGE_CFGS = [[64, 64, 128, 1, False, False, 3, 6],
                   [128, 128, 512, 2, True, False, 3, 6],
                   [512, 256, 1024, 5, True, True, 5, 6],
                   [1024, 512, 2048, 2, True, True, 5, 6]]

    def __init__(self, return_idx=(1, 2, 3), device=None, dtype=None, operations=None):
        super().__init__()
        factory = dict(device=device, dtype=dtype, operations=operations)
        self.stem = _StemBlock(3, 32, 64, **factory)
        self.stages = nn.ModuleList([_HG_Stage(*cfg, **factory) for cfg in self._STAGE_CFGS])
        self.return_idx = list(return_idx)
        # Column 2 of each config row is that stage's output channel count.
        self.out_channels = [self._STAGE_CFGS[i][2] for i in return_idx]

    def forward(self, x: torch.Tensor) -> List[torch.Tensor]:
        """Return the feature maps of the stages listed in ``return_idx``, in stage order."""
        feat = self.stem(x)
        selected = []
        for stage_no, stage in enumerate(self.stages):
            feat = stage(feat)
            if stage_no in self.return_idx:
                selected.append(feat)
        return selected
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Encoder — HybridEncoder (dfine version: RepNCSPELAN4 + SCDown PAN)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class ConvNormLayer(nn.Module):
    """Biased Conv2d followed by SiLU (``act='silu'``) or identity.

    There is no norm layer here; per the original author the checkpoint is
    expected to hold weights with BatchNorm already fused into the conv.
    """

    def __init__(self, ic, oc, k, s, g=1, padding=None, act=None, device=None, dtype=None, operations=None):
        super().__init__()
        if padding is None:
            padding = (k - 1) // 2
        self.conv = operations.Conv2d(ic, oc, k, s, padding, groups=g, bias=True, device=device, dtype=dtype)
        if act == 'silu':
            self.act = nn.SiLU()
        else:
            self.act = nn.Identity()

    def forward(self, x):
        y = self.conv(x)
        return self.act(y)
|
||||
|
||||
|
||||
class VGGBlock(nn.Module):
    """Single 3x3 biased conv + SiLU.

    Per the original author this is a Rep-VGG block in its re-parameterised
    form, i.e. the checkpoint is expected to carry already-fused weights.
    """

    def __init__(self, ic, oc, device=None, dtype=None, operations=None):
        super().__init__()
        self.conv = operations.Conv2d(ic, oc, 3, 1, padding=1, bias=True, device=device, dtype=dtype)
        self.act = nn.SiLU()

    def forward(self, x):
        y = self.conv(x)
        return self.act(y)
|
||||
|
||||
|
||||
class CSPLayer(nn.Module):
    """Two-branch CSP layer: (1x1 conv -> VGGBlock stack) + (1x1 conv), then an optional 1x1 output conv."""

    def __init__(self, ic, oc, num_blocks=3, expansion=1.0, act='silu', device=None, dtype=None, operations=None):
        super().__init__()
        factory = dict(device=device, dtype=dtype, operations=operations)
        h = int(oc * expansion)
        self.conv1 = ConvNormLayer(ic, h, 1, 1, act=act, **factory)
        self.conv2 = ConvNormLayer(ic, h, 1, 1, act=act, **factory)
        self.bottlenecks = nn.Sequential(*[VGGBlock(h, h, **factory) for _ in range(num_blocks)])
        # The output projection is only needed when the hidden width differs.
        if h != oc:
            self.conv3 = ConvNormLayer(h, oc, 1, 1, act=act, **factory)
        else:
            self.conv3 = nn.Identity()

    def forward(self, x):
        deep = self.bottlenecks(self.conv1(x))
        shortcut = self.conv2(x)
        return self.conv3(deep + shortcut)
|
||||
|
||||
|
||||
class RepNCSPELAN4(nn.Module):
    """CSP-ELAN block used for the FPN and PAN stages of HybridEncoder.

    ``cv1`` output is split into two halves; the second half feeds ``cv2``,
    whose output feeds ``cv3``. Both halves and both branch outputs are
    concatenated and fused by ``cv4``.
    """

    def __init__(self, c1, c2, c3, c4, n=3, act='silu', device=None, dtype=None, operations=None):
        super().__init__()
        factory = dict(device=device, dtype=dtype, operations=operations)
        self.c = c3 // 2
        self.cv1 = ConvNormLayer(c1, c3, 1, 1, act=act, **factory)
        self.cv2 = nn.Sequential(
            CSPLayer(c3 // 2, c4, n, 1.0, act=act, **factory),
            ConvNormLayer(c4, c4, 3, 1, act=act, **factory),
        )
        self.cv3 = nn.Sequential(
            CSPLayer(c4, c4, n, 1.0, act=act, **factory),
            ConvNormLayer(c4, c4, 3, 1, act=act, **factory),
        )
        self.cv4 = ConvNormLayer(c3 + 2 * c4, c2, 1, 1, act=act, **factory)

    def forward(self, x):
        first_half, second_half = self.cv1(x).split((self.c, self.c), 1)
        # The branches are chained: cv3 consumes cv2's output, not the split.
        branch_a = self.cv2(second_half)
        branch_b = self.cv3(branch_a)
        return self.cv4(torch.cat([first_half, second_half, branch_a, branch_b], 1))
|
||||
|
||||
|
||||
class SCDown(nn.Module):
    """Separable downsample: 1x1 conv, then a depthwise k x k conv with stride ``s`` (no activations)."""

    def __init__(self, ic, oc, k, s, device=None, dtype=None, operations=None):
        super().__init__()
        factory = dict(device=device, dtype=dtype, operations=operations)
        self.cv1 = ConvNormLayer(ic, oc, 1, 1, **factory)
        # g == oc makes the strided conv depthwise.
        self.cv2 = ConvNormLayer(oc, oc, k, s, g=oc, **factory)

    def forward(self, x):
        mixed = self.cv1(x)
        return self.cv2(mixed)
|
||||
|
||||
|
||||
class SelfAttention(nn.Module):
    """Multi-head attention with separate q/k/v/out linear projections."""

    def __init__(self, embed_dim, num_heads, device=None, dtype=None, operations=None):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        factory = {"device": device, "dtype": dtype}
        self.q_proj = operations.Linear(embed_dim, embed_dim, **factory)
        self.k_proj = operations.Linear(embed_dim, embed_dim, **factory)
        self.v_proj = operations.Linear(embed_dim, embed_dim, **factory)
        self.out_proj = operations.Linear(embed_dim, embed_dim, **factory)

    def forward(self, query, key, value, attn_mask=None):
        """Project the inputs, run the device-specific attention kernel, project the result."""
        # The kernel is picked per call from the device the query lives on.
        attend = optimized_attention_for_device(query.device, False, small_input=True)
        q = self.q_proj(query)
        k = self.k_proj(key)
        v = self.v_proj(value)
        mixed = attend(q, k, v, heads=self.num_heads, mask=attn_mask)
        return self.out_proj(mixed)
|
||||
|
||||
|
||||
class _TransformerEncoderLayer(nn.Module):
    """Single AIFI encoder layer: post-norm self-attention and a GELU feed-forward.

    The positional embedding, when given, is added to queries and keys only;
    values use the raw input.
    """

    def __init__(self, d_model, nhead, dim_feedforward, device=None, dtype=None, operations=None):
        super().__init__()
        factory = {"device": device, "dtype": dtype}
        self.self_attn = SelfAttention(d_model, nhead, operations=operations, **factory)
        self.linear1 = operations.Linear(d_model, dim_feedforward, **factory)
        self.linear2 = operations.Linear(dim_feedforward, d_model, **factory)
        self.norm1 = operations.LayerNorm(d_model, **factory)
        self.norm2 = operations.LayerNorm(d_model, **factory)
        self.activation = nn.GELU()

    def forward(self, src, src_mask=None, pos_embed=None):
        if pos_embed is None:
            qk = src
        else:
            qk = src + pos_embed

        attn_out = self.self_attn(qk, qk, value=src, attn_mask=src_mask)
        src = self.norm1(src + attn_out)

        ff_out = self.linear2(self.activation(self.linear1(src)))
        return self.norm2(src + ff_out)
|
||||
|
||||
|
||||
class _TransformerEncoder(nn.Module):
    """Stack of _TransformerEncoderLayer held in ``layers``.

    Per the original author, the wrapper exists so that state-dict keys come
    out as ``encoder.0.layers.N.*``.
    """

    def __init__(self, num_layers, d_model, nhead, dim_feedforward, device=None, dtype=None, operations=None):
        super().__init__()
        factory = dict(device=device, dtype=dtype, operations=operations)
        stack = []
        for _ in range(num_layers):
            stack.append(_TransformerEncoderLayer(d_model, nhead, dim_feedforward, **factory))
        self.layers = nn.ModuleList(stack)

    def forward(self, src, src_mask=None, pos_embed=None):
        out = src
        for layer in self.layers:
            out = layer(out, src_mask=src_mask, pos_embed=pos_embed)
        return out
|
||||
|
||||
|
||||
class HybridEncoder(nn.Module):
    """Hybrid encoder: per-level 1x1 projection, transformer on selected levels, top-down FPN, bottom-up PAN.

    Positional embeddings for the transformer levels are pre-computed for
    ``eval_spatial_size``. When no cache exists (``eval_spatial_size`` falsy)
    or the incoming feature map has a different size, the embedding is built
    on the fly instead of failing.
    """

    def __init__(self, in_channels=(512, 1024, 2048), feat_strides=(8, 16, 32), hidden_dim=256, nhead=8, dim_feedforward=2048, use_encoder_idx=(2,), num_encoder_layers=1,
                 pe_temperature=10000, expansion=1.0, depth_mult=1.0, act='silu', eval_spatial_size=(640, 640), device=None, dtype=None, operations=None):
        super().__init__()
        self.in_channels = list(in_channels)
        self.feat_strides = list(feat_strides)
        self.hidden_dim = hidden_dim
        self.use_encoder_idx = list(use_encoder_idx)
        self.pe_temperature = pe_temperature
        self.eval_spatial_size = eval_spatial_size
        self.out_channels = [hidden_dim] * len(in_channels)
        self.out_strides = list(feat_strides)

        # channel projection (expects pre-fused weights)
        self.input_proj = nn.ModuleList([
            nn.Sequential(OrderedDict([('conv', operations.Conv2d(ch, hidden_dim, 1, bias=True, device=device, dtype=dtype))]))
            for ch in in_channels
        ])

        # AIFI transformer -- use _TransformerEncoder so keys are encoder.0.layers.N.*
        self.encoder = nn.ModuleList([
            _TransformerEncoder(num_encoder_layers, hidden_dim, nhead, dim_feedforward, device=device, dtype=dtype, operations=operations)
            for _ in range(len(use_encoder_idx))
        ])

        nb = round(3 * depth_mult)
        exp = expansion

        # top-down FPN (dfine: lateral conv has no act)
        self.lateral_convs = nn.ModuleList(
            [ConvNormLayer(hidden_dim, hidden_dim, 1, 1, device=device, dtype=dtype, operations=operations)
             for _ in range(len(in_channels) - 1)])
        self.fpn_blocks = nn.ModuleList(
            [RepNCSPELAN4(hidden_dim * 2, hidden_dim, hidden_dim * 2, round(exp * hidden_dim // 2), nb, act=act, device=device, dtype=dtype, operations=operations)
             for _ in range(len(in_channels) - 1)])

        # bottom-up PAN (dfine: nn.Sequential(SCDown) -- keeps checkpoint key .0.cv1/.0.cv2)
        self.downsample_convs = nn.ModuleList(
            [nn.Sequential(SCDown(hidden_dim, hidden_dim, 3, 2, device=device, dtype=dtype, operations=operations))
             for _ in range(len(in_channels) - 1)])
        self.pan_blocks = nn.ModuleList(
            [RepNCSPELAN4(hidden_dim * 2, hidden_dim, hidden_dim * 2, round(exp * hidden_dim // 2), nb, act=act, device=device, dtype=dtype, operations=operations)
             for _ in range(len(in_channels) - 1)])

        # Cache positional embeddings for the fixed spatial size, remembering
        # the (h, w) each one was built for so forward() can detect a mismatch.
        self._pe_hw = {}
        if eval_spatial_size:
            for idx in self.use_encoder_idx:
                stride = self.feat_strides[idx]
                pe_h = eval_spatial_size[0] // stride
                pe_w = eval_spatial_size[1] // stride
                setattr(self, f'pos_embed{idx}', self._build_pe(pe_w, pe_h, hidden_dim, pe_temperature))
                self._pe_hw[idx] = (pe_h, pe_w)

    @staticmethod
    def _build_pe(w, h, dim=256, temp=10000.):
        """Return a (1, w*h, dim) 2-D sin/cos positional embedding.

        The last dimension is laid out as [sin(w), cos(w), sin(h), cos(h)],
        each ``dim // 4`` wide. Raises AssertionError if ``dim`` is not a
        multiple of 4.
        """
        assert dim % 4 == 0
        gw = torch.arange(w, dtype=torch.float32)
        gh = torch.arange(h, dtype=torch.float32)
        gw, gh = torch.meshgrid(gw, gh, indexing='ij')
        pdim = dim // 4
        omega = 1. / (temp ** (torch.arange(pdim, dtype=torch.float32) / pdim))
        ow = gw.flatten()[:, None] @ omega[None]
        oh = gh.flatten()[:, None] @ omega[None]
        return torch.cat([ow.sin(), ow.cos(), oh.sin(), oh.cos()], 1)[None]

    def _pos_embed_for(self, enc_idx, h, w):
        """Return the positional embedding for level ``enc_idx`` at size (h, w).

        Uses the cached tensor when it was built for exactly this size,
        otherwise builds a fresh one (not cached).
        """
        cached = getattr(self, f'pos_embed{enc_idx}', None)
        cached_hw = getattr(self, '_pe_hw', {}).get(enc_idx)
        if cached is not None and cached_hw == (h, w):
            return cached
        return self._build_pe(w, h, self.hidden_dim, self.pe_temperature)

    def forward(self, feats: List[torch.Tensor]) -> List[torch.Tensor]:
        """Encode backbone features; returns one ``hidden_dim``-channel map per input level."""
        proj = [self.input_proj[i](f) for i, f in enumerate(feats)]

        # Transformer on the selected levels, over flattened spatial tokens.
        for i, enc_idx in enumerate(self.use_encoder_idx):
            h, w = proj[enc_idx].shape[2:]
            src = proj[enc_idx].flatten(2).permute(0, 2, 1)
            pe = self._pos_embed_for(enc_idx, h, w).to(device=src.device, dtype=src.dtype)
            for layer in self.encoder[i].layers:
                src = layer(src, pos_embed=pe)
            proj[enc_idx] = src.permute(0, 2, 1).reshape(-1, self.hidden_dim, h, w).contiguous()

        # Top-down path: start at the last level, upsample x2 and fuse with
        # the next earlier one; `inner` ends up ordered first level -> last.
        n = len(self.in_channels)
        inner = [proj[-1]]
        for k in range(n - 1, 0, -1):
            j = n - 1 - k
            top = self.lateral_convs[j](inner[0])
            inner[0] = top
            up = F.interpolate(top, scale_factor=2., mode='nearest')
            inner.insert(0, self.fpn_blocks[j](torch.cat([up, proj[k - 1]], 1)))

        # Bottom-up path: downsample the running output and fuse with the
        # matching top-down feature.
        outs = [inner[0]]
        for k in range(n - 1):
            outs.append(self.pan_blocks[k](
                torch.cat([self.downsample_convs[k](outs[-1]), inner[k + 1]], 1)))
        return outs
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Decoder — DFINETransformer
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _deformable_attn_v2(value: list, spatial_shapes, sampling_locations: torch.Tensor, attention_weights: torch.Tensor, num_points_list: List[int]) -> torch.Tensor:
    """Multi-scale deformable attention via bilinear ``grid_sample``.

    Args:
        value: list of per-level tensors, each reshapeable to
            [bs*n_head, c, h_l, w_l].
        spatial_shapes: iterable of (h_l, w_l), one per level.
        sampling_locations: [bs, Lq, n_head, sum(pts), 2]; per the original
            author these are normalised to [0, 1].
        attention_weights: [bs, Lq, n_head, sum(pts)].
        num_points_list: number of sampling points per level.

    Returns:
        Tensor of shape [bs, Lq, n_head * c].
    """
    c = value[0].shape[1]
    bs, Lq, n_head, _, _ = sampling_locations.shape

    # grid_sample expects coordinates in [-1, 1]; fold heads into the batch.
    grid = 2 * sampling_locations - 1
    grid = grid.permute(0, 2, 1, 3, 4).flatten(0, 1)        # [bs*n_head, Lq, sum_pts, 2]
    level_grids = grid.split(num_points_list, dim=2)        # per level: [bs*n_head, Lq, pts_l, 2]

    sampled = [
        F.grid_sample(
            value[lvl].reshape(bs * n_head, c, h, w),
            level_grids[lvl],
            mode='bilinear',
            padding_mode='zeros',
            align_corners=False,
        )                                                   # [bs*n_head, c, Lq, pts_l]
        for lvl, (h, w) in enumerate(spatial_shapes)
    ]

    weights = attention_weights.permute(0, 2, 1, 3).flatten(0, 1).unsqueeze(1)  # [bs*n_head, 1, Lq, sum_pts]
    weighted = (torch.cat(sampled, -1) * weights).sum(-1)   # [bs*n_head, c, Lq]
    return weighted.reshape(bs, n_head * c, Lq).permute(0, 2, 1)
|
||||
|
||||
|
||||
class MSDeformableAttention(nn.Module):
    """Multi-scale deformable attention: predicts sampling offsets and weights from the query.

    ``num_points`` is either one int (same count on every level) or a
    per-level sequence; both lists and tuples are accepted.
    """

    def __init__(self, embed_dim=256, num_heads=8, num_levels=3, num_points=4, offset_scale=0.5, device=None, dtype=None, operations=None):
        super().__init__()
        self.embed_dim, self.num_heads = embed_dim, num_heads
        self.head_dim = embed_dim // num_heads
        # A tuple used to fall into the scalar branch and break sum(pts).
        if isinstance(num_points, (list, tuple)):
            pts = list(num_points)
        else:
            pts = [num_points] * num_levels
        self.num_points_list = pts
        self.offset_scale = offset_scale
        total = num_heads * sum(pts)
        # One entry per sampling point: 1 / (number of points on its level).
        self.register_buffer('num_points_scale', torch.tensor([1. / n for n in pts for _ in range(n)], dtype=torch.float32))
        self.sampling_offsets = operations.Linear(embed_dim, total * 2, device=device, dtype=dtype)
        self.attention_weights = operations.Linear(embed_dim, total, device=device, dtype=dtype)

    def forward(self, query, ref_pts, value, spatial_shapes):
        """Sample ``value`` around ``ref_pts`` and return the attended features.

        ``ref_pts`` is indexed as ``[:, :, None, :, :2]`` (centre) and
        ``[..., 2:]`` (extent used to scale the offsets), so it must be 4-D
        with at least 4 entries in the last dimension.
        """
        bs, Lq = query.shape[:2]
        n_pts = sum(self.num_points_list)

        offsets = self.sampling_offsets(query).reshape(bs, Lq, self.num_heads, n_pts, 2)
        # Softmax jointly over all points of all levels, per head.
        attn_w = F.softmax(
            self.attention_weights(query).reshape(bs, Lq, self.num_heads, n_pts), -1)

        scale = self.num_points_scale.to(query).unsqueeze(-1)
        offset = offsets * scale * ref_pts[:, :, None, :, 2:] * self.offset_scale
        locs = ref_pts[:, :, None, :, :2] + offset  # [bs, Lq, n_head, sum_pts, 2]
        return _deformable_attn_v2(value, spatial_shapes, locs, attn_w, self.num_points_list)
|
||||
|
||||
|
||||
class Gate(nn.Module):
    """Fuse two feature streams with learned sigmoid gates, then layer-normalise."""

    def __init__(self, d_model, device=None, dtype=None, operations=None):
        super().__init__()
        width = 2 * d_model
        self.gate = operations.Linear(width, width, device=device, dtype=dtype)
        self.norm = operations.LayerNorm(d_model, device=device, dtype=dtype)

    def forward(self, x1, x2):
        """Return ``norm(g1 * x1 + g2 * x2)`` where both gates come from one linear layer."""
        joined = torch.cat([x1, x2], -1)
        gates = torch.sigmoid(self.gate(joined))
        first_gate, second_gate = gates.chunk(2, -1)
        fused = first_gate * x1 + second_gate * x2
        return self.norm(fused)
|
||||
|
||||
|
||||
class MLP(nn.Module):
    """Stack of ``num_layers`` linear layers with SiLU between them (none after the last)."""

    def __init__(self, in_dim, hidden_dim, out_dim, num_layers, device=None, dtype=None, operations=None):
        super().__init__()
        # Layer widths: input, (num_layers - 1) hidden entries, output.
        widths = [in_dim]
        widths += [hidden_dim] * (num_layers - 1)
        widths += [out_dim]
        self.layers = nn.ModuleList(
            operations.Linear(widths[idx], widths[idx + 1], device=device, dtype=dtype)
            for idx in range(num_layers)
        )

    def forward(self, x):
        """Apply every layer in order; SiLU follows all but the final one."""
        final_idx = len(self.layers) - 1
        for idx, linear in enumerate(self.layers):
            x = linear(x)
            if idx < final_idx:
                x = F.silu(x)
        return x
|
||||
|
||||
|
||||
class TransformerDecoderLayer(nn.Module):
    """Decoder layer: self-attention, deformable cross-attention fused by a gate, then a ReLU FFN."""

    def __init__(self, d_model=256, nhead=8, dim_feedforward=1024, num_levels=3, num_points=4, device=None, dtype=None, operations=None):
        super().__init__()
        factory = {"device": device, "dtype": dtype}
        self.self_attn = SelfAttention(d_model, nhead, operations=operations, **factory)
        self.norm1 = operations.LayerNorm(d_model, **factory)
        self.cross_attn = MSDeformableAttention(d_model, nhead, num_levels, num_points, operations=operations, **factory)
        self.gateway = Gate(d_model, operations=operations, **factory)
        self.linear1 = operations.Linear(d_model, dim_feedforward, **factory)
        self.activation = nn.ReLU()
        self.linear2 = operations.Linear(dim_feedforward, d_model, **factory)
        self.norm3 = operations.LayerNorm(d_model, **factory)

    def forward(self, target, ref_pts, value, spatial_shapes, attn_mask=None, query_pos=None):
        """Run one decoder step on ``target`` and return the updated queries."""

        def add_pos(tensor):
            # The positional term is optional; without it the tensor passes through as-is.
            if query_pos is None:
                return tensor
            return tensor + query_pos

        # Self-attention: positions go into q/k only, values stay position-free.
        q = k = add_pos(target)
        self_out = self.self_attn(q, k, value=target, attn_mask=attn_mask)
        target = self.norm1(target + self_out)

        # Deformable cross-attention over the multi-scale memory, merged through the gate.
        cross_out = self.cross_attn(add_pos(target), ref_pts, value, spatial_shapes)
        target = self.gateway(target, cross_out)

        # Feed-forward block; the clamp bounds the residual sum to +/-65504 before the norm.
        ffn_out = self.linear2(self.activation(self.linear1(target)))
        return self.norm3((target + ffn_out).clamp(-65504, 65504))
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# FDR utilities
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def weighting_function(reg_max, up, reg_scale):
    """Build the non-uniform bin values W(n) used by FDR box regression.

    Returns a 1-D tensor with ``reg_max + 1`` entries (for even ``reg_max``),
    symmetric around 0, created with the dtype and device of ``up``.
    """
    bound = (abs(up[0]) * abs(reg_scale)).item()
    outer = bound * 2
    half = reg_max // 2
    # Geometric growth factor for the bins between 0 and the outer bound.
    base = (bound + 1) ** (2 / (reg_max - 2))
    negative = [-(base ** i) + 1 for i in range(half - 1, 0, -1)]
    positive = [(base ** i) - 1 for i in range(1, half)]
    bins = [-outer, *negative, 0, *positive, outer]
    return torch.tensor(bins, dtype=up.dtype, device=up.device)
|
||||
|
||||
|
||||
def distance2bbox(points, distance, reg_scale):
    """Decode edge-distances → cxcywh boxes.

    ``points`` holds reference boxes as (cx, cy, w, h) in the last axis,
    ``distance`` holds four edge distances (left, top, right, bottom) and
    ``reg_scale`` is a tensor whose absolute value scales the distances.
    """
    rs = abs(reg_scale).to(dtype=points.dtype)
    center_x, center_y = points[..., 0], points[..., 1]
    unit_w = points[..., 2] / rs
    unit_h = points[..., 3] / rs
    half = 0.5 * rs

    left = center_x - (half + distance[..., 0]) * unit_w
    top = center_y - (half + distance[..., 1]) * unit_h
    right = center_x + (half + distance[..., 2]) * unit_w
    bottom = center_y + (half + distance[..., 3]) * unit_h

    # Corner form back to centre/size form.
    return torch.stack([(left + right) / 2, (top + bottom) / 2, right - left, bottom - top], -1)
|
||||
|
||||
|
||||
class Integral(nn.Module):
    """Sum Pr(n)·W(n) over the distribution bins."""

    def __init__(self, reg_max=32):
        super().__init__()
        self.reg_max = reg_max

    def forward(self, x, project):
        """Softmax each group of ``reg_max + 1`` logits and take its expectation under ``project``.

        The leading dimensions of ``x`` are kept; the last dimension becomes the
        per-edge expectations (four per box).
        """
        lead_dims = list(x.shape[:-1])
        probs = F.softmax(x.reshape(-1, self.reg_max + 1), 1)
        bin_values = project.to(device=probs.device, dtype=probs.dtype)
        expectation = F.linear(probs, bin_values).reshape(-1, 4)
        return expectation.reshape(lead_dims + [-1])
|
||||
|
||||
|
||||
class LQE(nn.Module):
    """Location Quality Estimator — refines class scores using corner distribution."""

    def __init__(self, k=4, hidden_dim=64, num_layers=2, reg_max=32, device=None, dtype=None, operations=None):
        super().__init__()
        self.k = k
        self.reg_max = reg_max
        # Input: for each of the 4 edges, the top-k probabilities plus their mean.
        self.reg_conf = MLP(4 * (k + 1), hidden_dim, 1, num_layers, device=device, dtype=dtype, operations=operations)

    def forward(self, scores, pred_corners):
        """Add a quality term, derived from the edge distributions, to ``scores``."""
        batch, queries, _ = pred_corners.shape
        edge_logits = pred_corners.reshape(batch, queries, 4, self.reg_max + 1)
        top_probs = F.softmax(edge_logits, -1).topk(self.k, -1)[0]
        features = torch.cat([top_probs, top_probs.mean(-1, keepdim=True)], -1)
        quality = self.reg_conf(features.reshape(batch, queries, -1))
        return scores + quality
|
||||
|
||||
|
||||
class TransformerDecoder(nn.Module):
    """Stack of decoder layers with iterative box refinement.

    Only ``eval_idx + 1`` layers (and LQE heads) are instantiated, and
    ``forward`` stops at layer ``eval_idx``, so a single set of predictions is
    produced.
    """

    def __init__(self, hidden_dim, nhead, dim_feedforward, num_levels, num_points, num_layers, reg_max, reg_scale, up, eval_idx=-1, device=None, dtype=None, operations=None):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.nhead = nhead
        # A negative eval_idx counts from the end of the nominal layer stack.
        self.eval_idx = eval_idx if eval_idx >= 0 else num_layers + eval_idx
        self.up, self.reg_scale, self.reg_max = up, reg_scale, reg_max
        self.layers = nn.ModuleList([
            TransformerDecoderLayer(hidden_dim, nhead, dim_feedforward, num_levels, num_points, device=device, dtype=dtype, operations=operations)
            for _ in range(self.eval_idx + 1)
        ])
        self.lqe_layers = nn.ModuleList([LQE(4, 64, 2, reg_max, device=device, dtype=dtype, operations=operations) for _ in range(self.eval_idx + 1)])
        # Bin values W(n) consumed by `integral` in forward().
        self.register_buffer('project', weighting_function(reg_max, up, reg_scale))

    def _value_op(self, memory, spatial_shapes):
        """Reshape memory to per-level value tensors for deformable attention."""
        c = self.hidden_dim // self.nhead
        split = [h * w for h, w in spatial_shapes]
        val = memory.reshape(memory.shape[0], memory.shape[1], self.nhead, c)  # memory: [bs, sum(h*w), hidden_dim]
        # → [bs, n_head, c, sum_hw]
        val = val.permute(0, 2, 3, 1).flatten(0, 1)  # [bs*n_head, c, sum_hw]
        return val.split(split, dim=-1)  # list of [bs*n_head, c, h_l*w_l]

    def forward(self, target, ref_pts_unact, memory, spatial_shapes, bbox_head, score_head, query_pos_head, pre_bbox_head, integral):
        """Refine the queries layer by layer and return the predictions of layer ``eval_idx``.

        ``bbox_head`` and ``score_head`` are indexed by layer; ``query_pos_head``,
        ``pre_bbox_head`` and ``integral`` are callables supplied by the caller.
        Returns ``(stacked_boxes, stacked_logits)``; each has a leading axis of
        length 1 because only the ``eval_idx`` layer appends an entry.
        """
        val_split_flat = self._value_op(memory, spatial_shapes)  # pre-split value for deformable attention

        # reshape to [bs*n_head, c, h_l, w_l]
        value = []
        for lvl, (h, w) in enumerate(spatial_shapes):
            v = val_split_flat[lvl]  # [bs*n_head, c, h*w]
            value.append(v.reshape(v.shape[0], v.shape[1], h, w))

        ref_pts = F.sigmoid(ref_pts_unact)
        output = target
        # Start at 0 so the first layer's residual additions are no-ops.
        output_detach = pred_corners_undetach = 0

        dec_bboxes, dec_logits = [], []

        for i, layer in enumerate(self.layers):
            ref_input = ref_pts.unsqueeze(2)  # [bs, Lq, 1, 4]
            query_pos = query_pos_head(ref_pts).clamp(-10, 10)
            output = layer(output, ref_input, value, spatial_shapes, query_pos=query_pos)

            if i == 0:
                # Inverse sigmoid of the clamped reference points; the first layer
                # fixes the initial boxes that every later layer decodes against.
                ref_unact = ref_pts.clamp(1e-5, 1 - 1e-5)
                ref_unact = torch.log(ref_unact / (1 - ref_unact))
                pre_bboxes = F.sigmoid(pre_bbox_head(output) + ref_unact)
                ref_pts_initial = pre_bboxes.detach()

            # Corner logits accumulate across layers via pred_corners_undetach.
            pred_corners = bbox_head[i](output + output_detach) + pred_corners_undetach
            inter_ref_bbox = distance2bbox(ref_pts_initial, integral(pred_corners, self.project), self.reg_scale)

            if i == self.eval_idx:
                scores = score_head[i](output)
                scores = self.lqe_layers[i](scores, pred_corners)
                dec_bboxes.append(inter_ref_bbox)
                dec_logits.append(scores)
                break

            # Carry state into the next layer.
            pred_corners_undetach = pred_corners
            ref_pts = inter_ref_bbox.detach()
            output_detach = output.detach()

        return torch.stack(dec_bboxes), torch.stack(dec_logits)
|
||||
|
||||
|
||||
class DFINETransformer(nn.Module):
    """Detection decoder head: projects encoder features, selects the top queries and decodes boxes.

    ``forward`` takes a list of feature maps and returns a dict with
    ``'pred_logits'`` and ``'pred_boxes'``.
    """

    def __init__(self, num_classes=80, hidden_dim=256, num_queries=300, feat_channels=[256, 256, 256], feat_strides=[8, 16, 32],
                 num_levels=3, num_points=[3, 6, 3], nhead=8, num_layers=6, dim_feedforward=1024, eval_idx=-1, eps=1e-2, reg_max=32,
                 reg_scale=8.0, eval_spatial_size=(640, 640), device=None, dtype=None, operations=None):
        super().__init__()
        assert len(feat_strides) == len(feat_channels)
        self.hidden_dim = hidden_dim
        self.num_queries = num_queries
        self.num_levels = num_levels
        self.eps = eps
        self.eval_spatial_size = eval_spatial_size

        # Copy so the (mutable default) argument is never modified; extra levels
        # beyond the provided features double the last stride each time.
        self.feat_strides = list(feat_strides)
        for i in range(num_levels - len(feat_strides)):
            self.feat_strides.append(feat_strides[-1] * 2 ** (i + 1))

        # input projection (expects pre-fused weights)
        self.input_proj = nn.ModuleList()
        for ch in feat_channels:
            if ch == hidden_dim:
                self.input_proj.append(nn.Identity())
            else:
                self.input_proj.append(nn.Sequential(OrderedDict([
                    ('conv', operations.Conv2d(ch, hidden_dim, 1, bias=True, device=device, dtype=dtype))])))
        in_ch = feat_channels[-1]
        # Additional levels are produced by stride-2 3x3 convolutions.
        for i in range(num_levels - len(feat_channels)):
            self.input_proj.append(nn.Sequential(OrderedDict([
                ('conv', operations.Conv2d(in_ch if i == 0 else hidden_dim,
                                           hidden_dim, 3, 2, 1, bias=True, device=device, dtype=dtype))])))
            in_ch = hidden_dim

        # FDR parameters (non-trainable placeholders, set from config)
        self.up = nn.Parameter(torch.tensor([0.5]), requires_grad=False)
        self.reg_scale = nn.Parameter(torch.tensor([reg_scale]), requires_grad=False)

        pts = num_points if isinstance(num_points, (list, tuple)) else [num_points] * num_levels
        self.decoder = TransformerDecoder(hidden_dim, nhead, dim_feedforward, num_levels, pts,
                                          num_layers, reg_max, self.reg_scale, self.up, eval_idx, device=device, dtype=dtype, operations=operations)

        self.query_pos_head = MLP(4, 2 * hidden_dim, hidden_dim, 2, device=device, dtype=dtype, operations=operations)
        self.enc_output = nn.Sequential(OrderedDict([
            ('proj', operations.Linear(hidden_dim, hidden_dim, device=device, dtype=dtype)),
            ('norm', operations.LayerNorm(hidden_dim, device=device, dtype=dtype))]))
        self.enc_score_head = operations.Linear(hidden_dim, num_classes, device=device, dtype=dtype)
        self.enc_bbox_head = MLP(hidden_dim, hidden_dim, 4, 3, device=device, dtype=dtype, operations=operations)

        # Same resolution rule as TransformerDecoder: negative indices count from the end.
        self.eval_idx_ = eval_idx if eval_idx >= 0 else num_layers + eval_idx
        self.dec_score_head = nn.ModuleList(
            [operations.Linear(hidden_dim, num_classes, device=device, dtype=dtype) for _ in range(self.eval_idx_ + 1)])
        self.pre_bbox_head = MLP(hidden_dim, hidden_dim, 4, 3, device=device, dtype=dtype, operations=operations)
        self.dec_bbox_head = nn.ModuleList(
            [MLP(hidden_dim, hidden_dim, 4 * (reg_max + 1), 3, device=device, dtype=dtype, operations=operations)
             for _ in range(self.eval_idx_ + 1)])
        self.integral = Integral(reg_max)

        if eval_spatial_size:
            # Register as buffers so checkpoint values override the freshly-computed defaults
            anchors, valid_mask = self._gen_anchors()
            self.register_buffer('anchors', anchors)
            self.register_buffer('valid_mask', valid_mask)

    def _gen_anchors(self, spatial_shapes=None, grid_size=0.05, dtype=torch.float32, device='cpu'):
        """Build logit-space anchors (cx, cy, w, h) for every cell of every level.

        When ``spatial_shapes`` is None the shapes are derived from
        ``eval_spatial_size`` and ``feat_strides``. Returns ``(anchors,
        valid_mask)``; anchors with any component outside ``(eps, 1 - eps)`` are
        replaced by ``inf``.
        """
        if spatial_shapes is None:
            h0, w0 = self.eval_spatial_size
            spatial_shapes = [[int(h0 / s), int(w0 / s)] for s in self.feat_strides]
        anchors = []
        for lvl, (h, w) in enumerate(spatial_shapes):
            gy, gx = torch.meshgrid(torch.arange(h), torch.arange(w), indexing='ij')
            # Cell centres normalised to [0, 1].
            gxy = (torch.stack([gx, gy], -1).float() + 0.5) / torch.tensor([w, h], dtype=dtype)
            # Anchor size doubles with every level.
            wh = torch.ones_like(gxy) * grid_size * (2. ** lvl)
            anchors.append(torch.cat([gxy, wh], -1).reshape(-1, h * w, 4))
        anchors = torch.cat(anchors, 1).to(device)
        valid_mask = ((anchors > self.eps) & (anchors < 1 - self.eps)).all(-1, keepdim=True)
        # Inverse sigmoid.
        anchors = torch.log(anchors / (1 - anchors))
        anchors = torch.where(valid_mask, anchors, torch.full_like(anchors, float('inf')))
        return anchors, valid_mask

    def _encoder_input(self, feats: List[torch.Tensor]):
        """Project the feature maps and flatten them into one sequence.

        Returns ``(memory, shapes)`` where ``memory`` concatenates the flattened
        levels along dim 1 and ``shapes`` lists ``[h, w]`` per level.
        """
        proj = [self.input_proj[i](f) for i, f in enumerate(feats)]
        # Extra levels: the first is computed from the last input feature,
        # the following ones from the previously projected level.
        for i in range(len(feats), self.num_levels):
            proj.append(self.input_proj[i](feats[-1] if i == len(feats) else proj[-1]))
        flat, shapes = [], []
        for f in proj:
            _, _, h, w = f.shape
            flat.append(f.flatten(2).permute(0, 2, 1))
            shapes.append([h, w])
        return torch.cat(flat, 1), shapes

    def _decoder_input(self, memory: torch.Tensor):
        """Select the ``num_queries`` highest-scoring memory positions as decoder queries.

        Returns detached ``(content, reference_logits)``.
        """
        # NOTE(review): `anchors`/`valid_mask` are only registered when
        # eval_spatial_size is truthy, and are sized for that resolution —
        # confirm callers always feed inputs of the matching size.
        anchors, valid_mask = self.anchors.to(memory), self.valid_mask
        if memory.shape[0] > 1:
            anchors = anchors.repeat(memory.shape[0], 1, 1)

        # Zero out positions whose anchor is invalid.
        mem = valid_mask.to(memory) * memory
        out_mem = self.enc_output(mem)
        logits = self.enc_score_head(out_mem)
        # Rank positions by their best class logit.
        _, idx = torch.topk(logits.max(-1).values, self.num_queries, dim=-1)
        idx_e = idx.unsqueeze(-1)
        topk_mem = out_mem.gather(1, idx_e.expand(-1, -1, out_mem.shape[-1]))
        topk_anc = anchors.gather(1, idx_e.expand(-1, -1, anchors.shape[-1]))
        topk_ref = self.enc_bbox_head(topk_mem) + topk_anc
        return topk_mem.detach(), topk_ref.detach()

    def forward(self, feats: List[torch.Tensor]):
        """Run query selection and the decoder; return the last (only) decoder output."""
        memory, shapes = self._encoder_input(feats)
        content, ref = self._decoder_input(memory)
        out_bboxes, out_logits = self.decoder(
            content, ref, memory, shapes,
            self.dec_bbox_head, self.dec_score_head,
            self.query_pos_head, self.pre_bbox_head, self.integral)
        return {'pred_logits': out_logits[-1], 'pred_boxes': out_bboxes[-1]}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Main model
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class RTv4(nn.Module):
    """Detector assembled from an HGNetv2 backbone, a HybridEncoder and a DFINETransformer decoder."""

    def __init__(self, num_classes=80, num_queries=300, enc_h=256, dec_h=256, enc_ff=2048, dec_ff=1024, feat_strides=[8, 16, 32], device=None, dtype=None, operations=None, **kwargs):
        super().__init__()
        self.device = device
        self.dtype = dtype
        self.operations = operations

        factory = {"device": device, "dtype": dtype, "operations": operations}
        self.backbone = HGNetv2(**factory)
        self.encoder = HybridEncoder(hidden_dim=enc_h, dim_feedforward=enc_ff, **factory)
        self.decoder = DFINETransformer(
            num_classes=num_classes,
            hidden_dim=dec_h,
            num_queries=num_queries,
            feat_channels=[enc_h] * len(feat_strides),
            feat_strides=feat_strides,
            dim_feedforward=dec_ff,
            **factory,
        )

        self.num_classes = num_classes
        self.num_queries = num_queries
        self.load_device = comfy.model_management.get_torch_device()

    def _forward(self, x: torch.Tensor):
        """Backbone → encoder → decoder; returns the decoder's raw output dict."""
        features = self.backbone(x)
        encoded = self.encoder(features)
        return self.decoder(encoded)

    def postprocess(self, outputs, orig_size: tuple = (640, 640)) -> List[dict]:
        """Turn raw outputs into one dict per image with 'labels', 'boxes' and 'scores'.

        Boxes are converted from cxcywh to xyxy and multiplied by ``orig_size``
        repeated twice; the top ``num_queries`` (query, class) pairs are kept.
        """
        logits = outputs['pred_logits']
        xyxy = torchvision.ops.box_convert(outputs['pred_boxes'], 'cxcywh', 'xyxy')
        size_scale = torch.tensor(orig_size, device=xyxy.device, dtype=xyxy.dtype).repeat(1, 2).unsqueeze(1)
        xyxy = xyxy * size_scale

        # Rank over the flattened (query, class) grid.
        probs = F.sigmoid(logits)
        top_scores, flat_idx = torch.topk(probs.flatten(1), self.num_queries, dim=-1)
        labels = flat_idx % self.num_classes
        query_idx = (flat_idx // self.num_classes).unsqueeze(-1).expand(-1, -1, 4)
        picked = xyxy.gather(1, query_idx)

        results = []
        for lbl, b, s in zip(labels, picked, top_scores):
            results.append({'labels': lbl, 'boxes': b, 'scores': s})
        return results

    def forward(self, x: torch.Tensor, orig_size: tuple = (640, 640), **kwargs):
        """Move ``x`` to the load device/dtype, run the model and post-process."""
        prepared = x.to(device=self.load_device, dtype=self.dtype)
        return self.postprocess(self._forward(prepared), orig_size)
|
||||
@@ -52,6 +52,8 @@ import comfy.ldm.qwen_image.model
|
||||
import comfy.ldm.kandinsky5.model
|
||||
import comfy.ldm.anima.model
|
||||
import comfy.ldm.ace.ace_step15
|
||||
import comfy.ldm.rt_detr.rtdetr_v4
|
||||
import comfy.ldm.ernie.model
|
||||
|
||||
import comfy.model_management
|
||||
import comfy.patcher_extension
|
||||
@@ -1957,3 +1959,18 @@ class Kandinsky5Image(Kandinsky5):
|
||||
|
||||
def concat_cond(self, **kwargs):
|
||||
return None
|
||||
|
||||
class RT_DETR_v4(BaseModel):
    """BaseModel wrapper that uses ``RTv4`` as its network."""

    def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
        detector_cls = comfy.ldm.rt_detr.rtdetr_v4.RTv4
        super().__init__(model_config, model_type, device=device, unet_model=detector_cls)
|
||||
|
||||
class ErnieImage(BaseModel):
    """BaseModel wrapper that uses ``ErnieImageModel`` as its network."""

    def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
        network_cls = comfy.ldm.ernie.model.ErnieImageModel
        super().__init__(model_config, model_type, device=device, unet_model=network_cls)

    def extra_conds(self, **kwargs):
        """Extend the parent's conds with 'c_crossattn' when 'cross_attn' is provided."""
        conds = super().extra_conds(**kwargs)
        text_context = kwargs.get("cross_attn", None)
        if text_context is None:
            return conds
        conds['c_crossattn'] = comfy.conds.CONDRegular(text_context)
        return conds
|
||||
|
||||
@@ -696,6 +696,26 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
|
||||
if '{}encoder.lyric_encoder.layers.0.input_layernorm.weight'.format(key_prefix) in state_dict_keys:
|
||||
dit_config = {}
|
||||
dit_config["audio_model"] = "ace1.5"
|
||||
head_dim = 128
|
||||
dit_config["hidden_size"] = state_dict['{}decoder.layers.0.self_attn_norm.weight'.format(key_prefix)].shape[0]
|
||||
dit_config["intermediate_size"] = state_dict['{}decoder.layers.0.mlp.gate_proj.weight'.format(key_prefix)].shape[0]
|
||||
dit_config["num_heads"] = state_dict['{}decoder.layers.0.self_attn.q_proj.weight'.format(key_prefix)].shape[0] // head_dim
|
||||
|
||||
dit_config["encoder_hidden_size"] = state_dict['{}encoder.lyric_encoder.layers.0.input_layernorm.weight'.format(key_prefix)].shape[0]
|
||||
dit_config["encoder_num_heads"] = state_dict['{}encoder.lyric_encoder.layers.0.self_attn.q_proj.weight'.format(key_prefix)].shape[0] // head_dim
|
||||
dit_config["encoder_intermediate_size"] = state_dict['{}encoder.lyric_encoder.layers.0.mlp.gate_proj.weight'.format(key_prefix)].shape[0]
|
||||
dit_config["num_dit_layers"] = count_blocks(state_dict_keys, '{}decoder.layers.'.format(key_prefix) + '{}.')
|
||||
return dit_config
|
||||
|
||||
if '{}encoder.pan_blocks.1.cv4.conv.weight'.format(key_prefix) in state_dict_keys: # RT-DETR_v4
|
||||
dit_config = {}
|
||||
dit_config["image_model"] = "RT_DETR_v4"
|
||||
dit_config["enc_h"] = state_dict['{}encoder.pan_blocks.1.cv4.conv.weight'.format(key_prefix)].shape[0]
|
||||
return dit_config
|
||||
|
||||
if '{}layers.0.mlp.linear_fc2.weight'.format(key_prefix) in state_dict_keys: # Ernie Image
|
||||
dit_config = {}
|
||||
dit_config["image_model"] = "ernie"
|
||||
return dit_config
|
||||
|
||||
if '{}input_blocks.0.0.weight'.format(key_prefix) not in state_dict_keys:
|
||||
|
||||
@@ -1326,9 +1326,9 @@ MAX_PINNED_MEMORY = -1
|
||||
if not args.disable_pinned_memory:
|
||||
if is_nvidia() or is_amd():
|
||||
if WINDOWS:
|
||||
MAX_PINNED_MEMORY = get_total_memory(torch.device("cpu")) * 0.45 # Windows limit is apparently 50%
|
||||
MAX_PINNED_MEMORY = get_total_memory(torch.device("cpu")) * 0.40 # Windows limit is apparently 50%
|
||||
else:
|
||||
MAX_PINNED_MEMORY = get_total_memory(torch.device("cpu")) * 0.95
|
||||
MAX_PINNED_MEMORY = get_total_memory(torch.device("cpu")) * 0.90
|
||||
logging.info("Enabled pinned memory {}".format(MAX_PINNED_MEMORY // (1024 * 1024)))
|
||||
|
||||
PINNING_ALLOWED_TYPES = set(["Tensor", "Parameter", "QuantizedTensor"])
|
||||
@@ -1403,8 +1403,6 @@ def unpin_memory(tensor):
|
||||
|
||||
if torch.cuda.cudart().cudaHostUnregister(ptr) == 0:
|
||||
TOTAL_PINNED_MEMORY -= PINNED_MEMORY.pop(ptr)
|
||||
if len(PINNED_MEMORY) == 0:
|
||||
TOTAL_PINNED_MEMORY = 0
|
||||
return True
|
||||
else:
|
||||
logging.warning("Unpin error.")
|
||||
@@ -1734,6 +1732,21 @@ def supports_mxfp8_compute(device=None):
|
||||
|
||||
return True
|
||||
|
||||
def supports_fp64(device=None):
    """Return False for MPS devices and for Intel XPU, DirectML or ixuca backends; True otherwise."""
    # Checks are chained with `or` so they short-circuit in the original order.
    unsupported = (
        is_device_mps(device)
        or is_intel_xpu()
        or is_directml_enabled()
        or is_ixuca()
    )
    return not unsupported
|
||||
|
||||
def extended_fp16_support():
|
||||
# TODO: check why some models work with fp16 on newer torch versions but not on older
|
||||
if torch_version_numeric < (2, 7):
|
||||
|
||||
@@ -1151,7 +1151,7 @@ def mixed_precision_ops(quant_config={}, compute_dtype=torch.bfloat16, full_prec
|
||||
if param is None:
|
||||
continue
|
||||
p = fn(param)
|
||||
if p.is_inference():
|
||||
if (not torch.is_inference_mode_enabled()) and p.is_inference():
|
||||
p = p.clone()
|
||||
self.register_parameter(key, torch.nn.Parameter(p, requires_grad=False))
|
||||
for key, buf in self._buffers.items():
|
||||
|
||||
28
comfy/sd.py
28
comfy/sd.py
@@ -62,6 +62,7 @@ import comfy.text_encoders.anima
|
||||
import comfy.text_encoders.ace15
|
||||
import comfy.text_encoders.longcat_image
|
||||
import comfy.text_encoders.qwen35
|
||||
import comfy.text_encoders.ernie
|
||||
|
||||
import comfy.model_patcher
|
||||
import comfy.lora
|
||||
@@ -556,12 +557,19 @@ class VAE:
|
||||
old_memory_used_decode = self.memory_used_decode
|
||||
self.memory_used_decode = lambda shape, dtype: old_memory_used_decode(shape, dtype) * 4.0
|
||||
|
||||
decoder_ch = sd['decoder.conv_in.weight'].shape[0] // ddconfig['ch_mult'][-1]
|
||||
if decoder_ch != ddconfig['ch']:
|
||||
decoder_ddconfig = ddconfig.copy()
|
||||
decoder_ddconfig['ch'] = decoder_ch
|
||||
else:
|
||||
decoder_ddconfig = None
|
||||
|
||||
if 'post_quant_conv.weight' in sd:
|
||||
self.first_stage_model = AutoencoderKL(ddconfig=ddconfig, embed_dim=sd['post_quant_conv.weight'].shape[1])
|
||||
self.first_stage_model = AutoencoderKL(ddconfig=ddconfig, embed_dim=sd['post_quant_conv.weight'].shape[1], **({"decoder_ddconfig": decoder_ddconfig} if decoder_ddconfig is not None else {}))
|
||||
else:
|
||||
self.first_stage_model = AutoencodingEngine(regularizer_config={'target': "comfy.ldm.models.autoencoder.DiagonalGaussianRegularizer"},
|
||||
encoder_config={'target': "comfy.ldm.modules.diffusionmodules.model.Encoder", 'params': ddconfig},
|
||||
decoder_config={'target': "comfy.ldm.modules.diffusionmodules.model.Decoder", 'params': ddconfig})
|
||||
decoder_config={'target': "comfy.ldm.modules.diffusionmodules.model.Decoder", 'params': decoder_ddconfig if decoder_ddconfig is not None else ddconfig})
|
||||
elif "decoder.layers.1.layers.0.beta" in sd:
|
||||
config = {}
|
||||
param_key = None
|
||||
@@ -1228,6 +1236,7 @@ class TEModel(Enum):
|
||||
QWEN35_4B = 25
|
||||
QWEN35_9B = 26
|
||||
QWEN35_27B = 27
|
||||
MINISTRAL_3_3B = 28
|
||||
|
||||
|
||||
def detect_te_model(sd):
|
||||
@@ -1294,6 +1303,8 @@ def detect_te_model(sd):
|
||||
return TEModel.MISTRAL3_24B
|
||||
else:
|
||||
return TEModel.MISTRAL3_24B_PRUNED_FLUX2
|
||||
if weight.shape[0] == 3072:
|
||||
return TEModel.MINISTRAL_3_3B
|
||||
|
||||
return TEModel.LLAMA3_8
|
||||
return None
|
||||
@@ -1451,6 +1462,10 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
|
||||
elif te_model == TEModel.QWEN3_06B:
|
||||
clip_target.clip = comfy.text_encoders.anima.te(**llama_detect(clip_data))
|
||||
clip_target.tokenizer = comfy.text_encoders.anima.AnimaTokenizer
|
||||
elif te_model == TEModel.MINISTRAL_3_3B:
|
||||
clip_target.clip = comfy.text_encoders.ernie.te(**llama_detect(clip_data))
|
||||
clip_target.tokenizer = comfy.text_encoders.ernie.ErnieTokenizer
|
||||
tokenizer_data["tekken_model"] = clip_data[0].get("tekken_model", None)
|
||||
else:
|
||||
# clip_l
|
||||
if clip_type == CLIPType.SD3:
|
||||
@@ -1736,15 +1751,18 @@ def load_diffusion_model_state_dict(sd, model_options={}, metadata=None, disable
|
||||
"""
|
||||
dtype = model_options.get("dtype", None)
|
||||
|
||||
custom_operations = model_options.get("custom_operations", None)
|
||||
if custom_operations is None:
|
||||
sd, metadata = comfy.utils.convert_old_quants(sd, "", metadata=metadata)
|
||||
|
||||
#Allow loading unets from checkpoint files
|
||||
diffusion_model_prefix = model_detection.unet_prefix_from_state_dict(sd)
|
||||
temp_sd = comfy.utils.state_dict_prefix_replace(sd, {diffusion_model_prefix: ""}, filter_keys=True)
|
||||
if len(temp_sd) > 0:
|
||||
sd = temp_sd
|
||||
if custom_operations is None:
|
||||
sd, metadata = comfy.utils.convert_old_quants(sd, "", metadata=metadata)
|
||||
|
||||
custom_operations = model_options.get("custom_operations", None)
|
||||
if custom_operations is None:
|
||||
sd, metadata = comfy.utils.convert_old_quants(sd, "", metadata=metadata)
|
||||
parameters = comfy.utils.calculate_parameters(sd)
|
||||
weight_dtype = comfy.utils.weight_dtype(sd)
|
||||
|
||||
|
||||
@@ -26,6 +26,7 @@ import comfy.text_encoders.z_image
|
||||
import comfy.text_encoders.anima
|
||||
import comfy.text_encoders.ace15
|
||||
import comfy.text_encoders.longcat_image
|
||||
import comfy.text_encoders.ernie
|
||||
|
||||
from . import supported_models_base
|
||||
from . import latent_formats
|
||||
@@ -1734,6 +1735,52 @@ class LongCatImage(supported_models_base.BASE):
|
||||
hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen25_7b.transformer.".format(pref))
|
||||
return supported_models_base.ClipTarget(comfy.text_encoders.longcat_image.LongCatImageTokenizer, comfy.text_encoders.longcat_image.te(**hunyuan_detect))
|
||||
|
||||
models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, LongCatImage, FluxSchnell, GenmoMochi, LTXV, LTXAV, HunyuanVideo15_SR_Distilled, HunyuanVideo15, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, ZImagePixelSpace, ZImage, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, WAN22_Animate, WAN21_FlowRVS, WAN21_SCAIL, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, ACEStep15, Omnigen2, QwenImage, Flux2, Kandinsky5Image, Kandinsky5, Anima]
|
||||
|
||||
class RT_DETR_v4(supported_models_base.BASE):
    """Supported-model entry matched by ``image_model == "RT_DETR_v4"``; it has no text encoder."""

    unet_config = {"image_model": "RT_DETR_v4"}

    supported_inference_dtypes = [torch.float16, torch.float32]

    def get_model(self, state_dict, prefix="", device=None):
        """Instantiate the model_base wrapper for this config."""
        return model_base.RT_DETR_v4(self, device=device)

    def clip_target(self, state_dict={}):
        """No text encoder is associated with this model."""
        return None
|
||||
|
||||
|
||||
class ErnieImage(supported_models_base.BASE):
    """Supported-model entry matched by ``image_model == "ernie"``."""

    unet_config = {"image_model": "ernie"}

    sampling_settings = {"multiplier": 1000.0, "shift": 3.0}

    memory_usage_factor = 10.0

    unet_extra_config = {}
    latent_format = latent_formats.Flux2

    supported_inference_dtypes = [torch.bfloat16, torch.float32]

    vae_key_prefix = ["vae."]
    text_encoder_key_prefix = ["text_encoders."]

    def get_model(self, state_dict, prefix="", device=None):
        """Instantiate the model_base wrapper for this config."""
        return model_base.ErnieImage(self, device=device)

    def clip_target(self, state_dict={}):
        """Build the tokenizer/text-encoder pair, detecting encoder settings from ``state_dict``."""
        te_prefix = "{}ministral3_3b.transformer.".format(self.text_encoder_key_prefix[0])
        detected = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, te_prefix)
        tokenizer_cls = comfy.text_encoders.ernie.ErnieTokenizer
        encoder_cls = comfy.text_encoders.ernie.te(**detected)
        return supported_models_base.ClipTarget(tokenizer_cls, encoder_cls)
|
||||
|
||||
|
||||
models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, LongCatImage, FluxSchnell, GenmoMochi, LTXV, LTXAV, HunyuanVideo15_SR_Distilled, HunyuanVideo15, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, ZImagePixelSpace, ZImage, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, WAN22_Animate, WAN21_FlowRVS, WAN21_SCAIL, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, ACEStep15, Omnigen2, QwenImage, Flux2, Kandinsky5Image, Kandinsky5, Anima, RT_DETR_v4, ErnieImage]
|
||||
|
||||
models += [SVD_img2vid]
|
||||
|
||||
38
comfy/text_encoders/ernie.py
Normal file
38
comfy/text_encoders/ernie.py
Normal file
@@ -0,0 +1,38 @@
|
||||
from .flux import Mistral3Tokenizer
|
||||
from comfy import sd1_clip
|
||||
import comfy.text_encoders.llama
|
||||
|
||||
class Ministral3_3BTokenizer(Mistral3Tokenizer):
    """Mistral3Tokenizer preset with the ``ministral3_3b`` embedding key and size defaults."""

    def __init__(self, embedding_directory=None, embedding_size=5120, embedding_key='ministral3_3b', tokenizer_data={}):
        super().__init__(
            embedding_directory=embedding_directory,
            embedding_size=embedding_size,
            embedding_key=embedding_key,
            tokenizer_data=tokenizer_data,
        )
|
||||
|
||||
class ErnieTokenizer(sd1_clip.SD1Tokenizer):
    """SD1Tokenizer wrapper registered under the name ``ministral3_3b``."""

    def __init__(self, embedding_directory=None, tokenizer_data={}):
        super().__init__(
            embedding_directory=embedding_directory,
            tokenizer_data=tokenizer_data,
            name="ministral3_3b",
            tokenizer=Mistral3Tokenizer,
        )

    def tokenize_with_weights(self, text, return_word_ids=False, llama_template=None, **kwargs):
        """Tokenize ``text`` with prompt weighting disabled; ``llama_template`` is accepted but unused."""
        return super().tokenize_with_weights(text, return_word_ids=return_word_ids, disable_weights=True, **kwargs)
|
||||
|
||||
|
||||
class Ministral3_3BModel(sd1_clip.SDClipModel):
    """SDClipModel configured to use ``comfy.text_encoders.llama.Ministral3_3B``."""

    def __init__(self, device="cpu", layer="hidden", layer_idx=-2, dtype=None, attention_mask=True, model_options={}):
        # `attention_mask` controls both enabling the masks and returning them.
        super().__init__(
            device=device,
            layer=layer,
            layer_idx=layer_idx,
            textmodel_json_config={},
            dtype=dtype,
            special_tokens={"start": 1, "pad": 0},
            layer_norm_hidden_state=False,
            model_class=comfy.text_encoders.llama.Ministral3_3B,
            enable_attention_masks=attention_mask,
            return_attention_masks=attention_mask,
            model_options=model_options,
        )
|
||||
|
||||
|
||||
class ErnieTEModel(sd1_clip.SD1ClipModel):
    """SD1ClipModel defaulting to the ``ministral3_3b`` name and Ministral3_3BModel."""

    def __init__(self, device="cpu", dtype=None, model_options={}, name="ministral3_3b", clip_model=Ministral3_3BModel):
        super().__init__(
            device=device,
            dtype=dtype,
            name=name,
            clip_model=clip_model,
            model_options=model_options,
        )
|
||||
|
||||
|
||||
def te(dtype_llama=None, llama_quantization_metadata=None):
    """Build an ErnieTEModel subclass with dtype / quantization overrides baked in.

    Args:
        dtype_llama: when not None, replaces the ``dtype`` given to the
            constructor of the returned class.
        llama_quantization_metadata: when not None, stored under
            ``model_options["quantization_metadata"]`` on a copy of the
            options passed to the constructor.

    Returns:
        The configured subclass (a class, not an instance).
    """
    class ErnieTEModel_(ErnieTEModel):
        def __init__(self, device="cpu", dtype=None, model_options={}):
            if dtype_llama is not None:
                dtype = dtype_llama
            if llama_quantization_metadata is not None:
                # Copy first so neither the caller's dict nor the shared
                # default argument is mutated.
                model_options = model_options.copy()
                model_options["quantization_metadata"] = llama_quantization_metadata
            super().__init__(device=device, dtype=dtype, model_options=model_options)

    # Return the configured subclass. Returning the plain ErnieTEModel (as the
    # previous code did) silently dropped both overrides.
    return ErnieTEModel_
|
||||
@@ -116,9 +116,9 @@ class MistralTokenizerClass:
|
||||
return LlamaTokenizerFast(**kwargs)
|
||||
|
||||
class Mistral3Tokenizer(sd1_clip.SDTokenizer):
|
||||
def __init__(self, embedding_directory=None, tokenizer_data={}):
|
||||
def __init__(self, embedding_directory=None, embedding_size=5120, embedding_key='mistral3_24b', tokenizer_data={}):
|
||||
self.tekken_data = tokenizer_data.get("tekken_model", None)
|
||||
super().__init__("", pad_with_end=False, embedding_directory=embedding_directory, embedding_size=5120, embedding_key='mistral3_24b', tokenizer_class=MistralTokenizerClass, has_end_token=False, pad_to_max_length=False, pad_token=11, start_token=1, max_length=99999999, min_length=1, pad_left=True, tokenizer_args=load_mistral_tokenizer(self.tekken_data), tokenizer_data=tokenizer_data)
|
||||
super().__init__("", pad_with_end=False, embedding_directory=embedding_directory, embedding_size=embedding_size, embedding_key=embedding_key, tokenizer_class=MistralTokenizerClass, has_end_token=False, pad_to_max_length=False, pad_token=11, start_token=1, max_length=99999999, min_length=1, pad_left=True, disable_weights=True, tokenizer_args=load_mistral_tokenizer(self.tekken_data), tokenizer_data=tokenizer_data)
|
||||
|
||||
def state_dict(self):
|
||||
return {"tekken_model": self.tekken_data}
|
||||
|
||||
@@ -60,6 +60,30 @@ class Mistral3Small24BConfig:
|
||||
final_norm: bool = True
|
||||
lm_head: bool = False
|
||||
|
||||
@dataclass
class Ministral3_3BConfig:
    """Configuration values used to build the Ministral3_3B text model."""

    # --- dataclass fields (overridable through the constructor) ---
    vocab_size: int = 131072
    hidden_size: int = 3072
    intermediate_size: int = 9216
    num_hidden_layers: int = 26
    num_attention_heads: int = 32
    num_key_value_heads: int = 8
    max_position_embeddings: int = 262144
    rms_norm_eps: float = 1e-5
    rope_theta: float = 1e6
    transformer_type: str = "llama"
    final_norm: bool = True
    lm_head: bool = False

    # --- plain class attributes ---
    # These carry no annotation on purpose: they are not dataclass fields and
    # therefore cannot be set through the constructor.
    head_dim = 128
    rms_norm_add = False
    mlp_activation = "silu"
    qkv_bias = False
    rope_dims = None
    q_norm = None
    k_norm = None
    rope_scale = None
    stop_tokens = [2]
|
||||
|
||||
@dataclass
|
||||
class Qwen25_3BConfig:
|
||||
vocab_size: int = 151936
|
||||
@@ -946,6 +970,15 @@ class Mistral3Small24B(BaseLlama, torch.nn.Module):
|
||||
self.model = Llama2_(config, device=device, dtype=dtype, ops=operations)
|
||||
self.dtype = dtype
|
||||
|
||||
class Ministral3_3B(BaseLlama, BaseQwen3, BaseGenerate, torch.nn.Module):
    """Text model built from a ``Ministral3_3BConfig`` and a ``Llama2_`` backbone."""

    def __init__(self, config_dict, dtype, device, operations):
        super().__init__()
        # Entries in config_dict override the dataclass defaults.
        cfg = Ministral3_3BConfig(**config_dict)
        self.num_layers = cfg.num_hidden_layers

        self.model = Llama2_(cfg, device=device, dtype=dtype, ops=operations)
        self.dtype = dtype
|
||||
|
||||
class Qwen25_3B(BaseLlama, torch.nn.Module):
|
||||
def __init__(self, config_dict, dtype, device, operations):
|
||||
super().__init__()
|
||||
|
||||
@@ -52,6 +52,26 @@ class TaskImageContent(BaseModel):
|
||||
role: Literal["first_frame", "last_frame", "reference_image"] | None = Field(None)
|
||||
|
||||
|
||||
class TaskVideoContentUrl(BaseModel):
    # Wrapper holding the (required) URL of a video.
    url: str
|
||||
|
||||
|
||||
class TaskVideoContent(BaseModel):
    # Video content item; `type` and `role` default to the reference-video values.
    type: str = "video_url"
    video_url: TaskVideoContentUrl
    role: str = "reference_video"
|
||||
|
||||
|
||||
class TaskAudioContentUrl(BaseModel):
    # Wrapper holding the (required) URL of an audio clip.
    url: str
|
||||
|
||||
|
||||
class TaskAudioContent(BaseModel):
    # Audio content item; `type` and `role` default to the reference-audio values.
    type: str = "audio_url"
    audio_url: TaskAudioContentUrl
    role: str = "reference_audio"
|
||||
|
||||
|
||||
class Text2VideoTaskCreationRequest(BaseModel):
|
||||
model: str = Field(...)
|
||||
content: list[TaskTextContent] = Field(..., min_length=1)
|
||||
@@ -64,6 +84,17 @@ class Image2VideoTaskCreationRequest(BaseModel):
|
||||
generate_audio: bool | None = Field(...)
|
||||
|
||||
|
||||
class Seedance2TaskCreationRequest(BaseModel):
    # Task-creation body for Seedance 2.0.
    # `content` needs at least one item and may mix text, image, video and audio items.
    model: str
    content: list[TaskTextContent | TaskImageContent | TaskVideoContent | TaskAudioContent] = Field(
        ...,
        min_length=1,
    )
    # Every remaining setting is optional and defaults to None.
    generate_audio: bool | None = None
    resolution: str | None = None
    ratio: str | None = None
    duration: int | None = Field(default=None, ge=4, le=15)
    seed: int | None = Field(default=None, ge=0, le=2147483647)
    watermark: bool | None = None
|
||||
|
||||
|
||||
class TaskCreationResponse(BaseModel):
|
||||
id: str = Field(...)
|
||||
|
||||
@@ -77,12 +108,27 @@ class TaskStatusResult(BaseModel):
|
||||
video_url: str = Field(...)
|
||||
|
||||
|
||||
class TaskStatusUsage(BaseModel):
    # Token usage counters; both default to 0 when absent.
    completion_tokens: int = 0
    total_tokens: int = 0
|
||||
|
||||
|
||||
class TaskStatusResponse(BaseModel):
    # Status payload of a generation task.
    id: str
    model: str
    status: Literal["queued", "running", "cancelled", "succeeded", "failed"]
    # The fields below are optional and default to None.
    error: TaskStatusError | None = None
    content: TaskStatusResult | None = None
    usage: TaskStatusUsage | None = None
|
||||
|
||||
|
||||
# Seedance 2.0 pricing table, in dollars per 1K tokens.
# Key: (model_id, has_video_input).
SEEDANCE2_PRICE_PER_1K_TOKENS = {
    # Standard model
    ("dreamina-seedance-2-0-260128", False): 0.007,
    ("dreamina-seedance-2-0-260128", True): 0.0043,
    # Fast model
    ("dreamina-seedance-2-0-fast-260128", False): 0.0056,
    ("dreamina-seedance-2-0-fast-260128", True): 0.0033,
}
|
||||
|
||||
|
||||
RECOMMENDED_PRESETS = [
|
||||
@@ -112,6 +158,12 @@ RECOMMENDED_PRESETS_SEEDREAM_4 = [
|
||||
("Custom", None, None),
|
||||
]
|
||||
|
||||
# Allowed pixel count (width * height) of a Seedance 2.0 reference video,
# keyed by model id. Each entry carries a "min" and a "max" bound.
SEEDANCE2_REF_VIDEO_PIXEL_LIMITS = {
    "dreamina-seedance-2-0-260128": {
        "min": 409_600,
        "max": 927_408,
    },
    "dreamina-seedance-2-0-fast-260128": {
        "min": 409_600,
        "max": 927_408,
    },
}
|
||||
|
||||
# The times in this dictionary are given for a 10-second duration.
|
||||
VIDEO_TASKS_EXECUTION_TIME = {
|
||||
"seedance-1-0-lite-t2v-250428": {
|
||||
|
||||
226
comfy_api_nodes/apis/wan.py
Normal file
226
comfy_api_nodes/apis/wan.py
Normal file
@@ -0,0 +1,226 @@
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
class Text2ImageInputField(BaseModel):
    # Text-to-image input: a required prompt and an optional negative prompt.
    prompt: str
    negative_prompt: str | None = None
|
||||
|
||||
|
||||
class Image2ImageInputField(BaseModel):
    # Image-to-image input: prompt, optional negative prompt, and one or two images.
    prompt: str
    negative_prompt: str | None = None
    images: list[str] = Field(..., min_length=1, max_length=2)
|
||||
|
||||
|
||||
class Text2VideoInputField(BaseModel):
    # Text-to-video input: prompt plus optional negative prompt and audio URL.
    prompt: str
    negative_prompt: str | None = None
    audio_url: str | None = None
|
||||
|
||||
|
||||
class Image2VideoInputField(BaseModel):
    # Image-to-video input: prompt and image URL are required; the rest is optional.
    prompt: str
    negative_prompt: str | None = None
    img_url: str
    audio_url: str | None = None
|
||||
|
||||
|
||||
class Reference2VideoInputField(BaseModel):
    # Reference-to-video input: prompt, optional negative prompt, reference video URLs.
    prompt: str
    negative_prompt: str | None = None
    reference_video_urls: list[str]
|
||||
|
||||
|
||||
class Txt2ImageParametersField(BaseModel):
    # Generation parameters for text-to-image requests.
    size: str
    # we support only value=1
    n: int = Field(default=1, description="Number of images to generate.")
    seed: int = Field(..., ge=0, le=2147483647)
    prompt_extend: bool = True
    watermark: bool = False
|
||||
|
||||
|
||||
class Image2ImageParametersField(BaseModel):
    # Generation parameters for image-to-image requests; `size` is optional here.
    size: str | None = None
    # we support only value=1
    n: int = Field(default=1, description="Number of images to generate.")
    seed: int = Field(..., ge=0, le=2147483647)
    watermark: bool = False
|
||||
|
||||
|
||||
class Text2VideoParametersField(BaseModel):
    # Generation parameters for text-to-video requests.
    size: str
    seed: int = Field(..., ge=0, le=2147483647)
    # Validated to the range 5..15; defaults to 5.
    duration: int = Field(default=5, ge=5, le=15)
    prompt_extend: bool = True
    watermark: bool = False
    audio: bool = Field(default=False, description="Whether to generate audio automatically.")
    shot_type: str = "single"
|
||||
|
||||
|
||||
class Image2VideoParametersField(BaseModel):
    # Generation parameters for image-to-video requests.
    resolution: str
    seed: int = Field(..., ge=0, le=2147483647)
    # Validated to the range 5..15; defaults to 5.
    duration: int = Field(default=5, ge=5, le=15)
    prompt_extend: bool = True
    watermark: bool = False
    audio: bool = Field(default=False, description="Whether to generate audio automatically.")
    shot_type: str = "single"
|
||||
|
||||
|
||||
class Reference2VideoParametersField(BaseModel):
    # Generation parameters for reference-to-video requests.
    size: str
    # Validated to the range 5..15; defaults to 5.
    duration: int = Field(default=5, ge=5, le=15)
    shot_type: str = "single"
    seed: int = Field(..., ge=0, le=2147483647)
    watermark: bool = False
|
||||
|
||||
|
||||
class Text2ImageTaskCreationRequest(BaseModel):
    # Task-creation request for text-to-image: model id, input and parameters.
    model: str
    input: Text2ImageInputField
    parameters: Txt2ImageParametersField
|
||||
|
||||
|
||||
class Image2ImageTaskCreationRequest(BaseModel):
    # Task-creation request for image-to-image: model id, input and parameters.
    model: str
    input: Image2ImageInputField
    parameters: Image2ImageParametersField
|
||||
|
||||
|
||||
class Text2VideoTaskCreationRequest(BaseModel):
    # Task-creation request for text-to-video: model id, input and parameters.
    model: str
    input: Text2VideoInputField
    parameters: Text2VideoParametersField
|
||||
|
||||
|
||||
class Image2VideoTaskCreationRequest(BaseModel):
    # Task-creation request for image-to-video: model id, input and parameters.
    model: str
    input: Image2VideoInputField
    parameters: Image2VideoParametersField
|
||||
|
||||
|
||||
class Reference2VideoTaskCreationRequest(BaseModel):
    # Task-creation request for reference-to-video: model id, input and parameters.
    model: str
    input: Reference2VideoInputField
    parameters: Reference2VideoParametersField
|
||||
|
||||
|
||||
class Wan27MediaItem(BaseModel):
    # One media entry: a type tag and the URL it refers to (both required).
    type: str
    url: str
|
||||
|
||||
|
||||
class Wan27ReferenceVideoInputField(BaseModel):
    # Wan 2.7 reference-video input: prompt, optional negative prompt, media list.
    prompt: str
    negative_prompt: str | None = None
    media: list[Wan27MediaItem]
|
||||
|
||||
|
||||
class Wan27ReferenceVideoParametersField(BaseModel):
    # Generation parameters for Wan 2.7 reference-video requests.
    resolution: str
    ratio: str | None = None
    # Validated to the range 2..10; defaults to 5.
    duration: int = Field(default=5, ge=2, le=10)
    watermark: bool = False
    seed: int = Field(..., ge=0, le=2147483647)
|
||||
|
||||
|
||||
class Wan27ReferenceVideoTaskCreationRequest(BaseModel):
    # Task-creation request for Wan 2.7 reference-video generation.
    model: str
    input: Wan27ReferenceVideoInputField
    parameters: Wan27ReferenceVideoParametersField
|
||||
|
||||
|
||||
class Wan27ImageToVideoInputField(BaseModel):
    # Wan 2.7 image-to-video input; the prompt is optional here, the media list is not.
    prompt: str | None = None
    negative_prompt: str | None = None
    media: list[Wan27MediaItem]
|
||||
|
||||
|
||||
class Wan27ImageToVideoParametersField(BaseModel):
    # Generation parameters for Wan 2.7 image-to-video requests.
    resolution: str
    # Validated to the range 2..15; defaults to 5.
    duration: int = Field(default=5, ge=2, le=15)
    prompt_extend: bool = True
    watermark: bool = False
    seed: int = Field(..., ge=0, le=2147483647)
|
||||
|
||||
|
||||
class Wan27ImageToVideoTaskCreationRequest(BaseModel):
    # Task-creation request for Wan 2.7 image-to-video generation.
    model: str
    input: Wan27ImageToVideoInputField
    parameters: Wan27ImageToVideoParametersField
|
||||
|
||||
|
||||
class Wan27VideoEditInputField(BaseModel):
    # Wan 2.7 video-edit input: a prompt and the media list (both required).
    prompt: str
    media: list[Wan27MediaItem]
|
||||
|
||||
|
||||
class Wan27VideoEditParametersField(BaseModel):
    # Generation parameters for Wan 2.7 video-edit requests.
    resolution: str
    ratio: str | None = None
    # Defaults to 0 and, unlike the other parameter models, has no range constraint.
    duration: int = 0
    audio_setting: str = "auto"
    watermark: bool = False
    seed: int = Field(..., ge=0, le=2147483647)
|
||||
|
||||
|
||||
class Wan27VideoEditTaskCreationRequest(BaseModel):
    # Task-creation request for Wan 2.7 video editing.
    model: str
    input: Wan27VideoEditInputField
    parameters: Wan27VideoEditParametersField
|
||||
|
||||
|
||||
class Wan27Text2VideoParametersField(BaseModel):
    # Generation parameters for Wan 2.7 text-to-video requests.
    resolution: str
    ratio: str | None = None
    # Validated to the range 2..15; defaults to 5.
    duration: int = Field(default=5, ge=2, le=15)
    prompt_extend: bool = True
    watermark: bool = False
    seed: int = Field(..., ge=0, le=2147483647)
|
||||
|
||||
|
||||
class Wan27Text2VideoTaskCreationRequest(BaseModel):
    # Task-creation request for Wan 2.7 text-to-video; reuses Text2VideoInputField.
    model: str
    input: Text2VideoInputField
    parameters: Wan27Text2VideoParametersField
|
||||
|
||||
|
||||
class TaskCreationOutputField(BaseModel):
    # Identifier and status of a task (both required).
    task_id: str
    task_status: str
|
||||
|
||||
|
||||
class TaskCreationResponse(BaseModel):
    # Response to a task-creation call; `code` / `message` describe a failed request.
    output: TaskCreationOutputField | None = None
    request_id: str
    code: str | None = Field(default=None, description="Error code for the failed request.")
    message: str | None = Field(default=None, description="Details about the failed request.")
|
||||
|
||||
|
||||
class TaskResult(BaseModel):
    # One result entry; every field is optional and defaults to None.
    url: str | None = None
    code: str | None = None
    message: str | None = None
|
||||
|
||||
|
||||
class ImageTaskStatusOutputField(TaskCreationOutputField):
    # Image task status: the base task id/status (re-declared here) plus optional results.
    task_id: str
    task_status: str
    results: list[TaskResult] | None = None
|
||||
|
||||
|
||||
class VideoTaskStatusOutputField(TaskCreationOutputField):
    # Video task status: the base task id/status (re-declared here) plus optional
    # video URL, code and message.
    task_id: str
    task_status: str
    video_url: str | None = None
    code: str | None = None
    message: str | None = None
|
||||
|
||||
|
||||
class ImageTaskStatusResponse(BaseModel):
    # Status response for an image task: optional output plus the request id.
    output: ImageTaskStatusOutputField | None = None
    request_id: str
|
||||
|
||||
|
||||
class VideoTaskStatusResponse(BaseModel):
    # Status response for a video task: optional output plus the request id.
    output: VideoTaskStatusOutputField | None = None
    request_id: str
|
||||
@@ -8,16 +8,23 @@ from comfy_api.latest import IO, ComfyExtension, Input
|
||||
from comfy_api_nodes.apis.bytedance import (
|
||||
RECOMMENDED_PRESETS,
|
||||
RECOMMENDED_PRESETS_SEEDREAM_4,
|
||||
SEEDANCE2_PRICE_PER_1K_TOKENS,
|
||||
SEEDANCE2_REF_VIDEO_PIXEL_LIMITS,
|
||||
VIDEO_TASKS_EXECUTION_TIME,
|
||||
Image2VideoTaskCreationRequest,
|
||||
ImageTaskCreationResponse,
|
||||
Seedance2TaskCreationRequest,
|
||||
Seedream4Options,
|
||||
Seedream4TaskCreationRequest,
|
||||
TaskAudioContent,
|
||||
TaskAudioContentUrl,
|
||||
TaskCreationResponse,
|
||||
TaskImageContent,
|
||||
TaskImageContentUrl,
|
||||
TaskStatusResponse,
|
||||
TaskTextContent,
|
||||
TaskVideoContent,
|
||||
TaskVideoContentUrl,
|
||||
Text2ImageTaskCreationRequest,
|
||||
Text2VideoTaskCreationRequest,
|
||||
)
|
||||
@@ -29,7 +36,10 @@ from comfy_api_nodes.util import (
|
||||
image_tensor_pair_to_batch,
|
||||
poll_op,
|
||||
sync_op,
|
||||
upload_audio_to_comfyapi,
|
||||
upload_image_to_comfyapi,
|
||||
upload_images_to_comfyapi,
|
||||
upload_video_to_comfyapi,
|
||||
validate_image_aspect_ratio,
|
||||
validate_image_dimensions,
|
||||
validate_string,
|
||||
@@ -46,12 +56,56 @@ SEEDREAM_MODELS = {
|
||||
# Endpoints for long-running tasks (e.g., video).
BYTEPLUS_TASK_ENDPOINT = "/proxy/byteplus/api/v3/contents/generations/tasks"
# The two status endpoints below are completed with "/{task_id}" by the callers.
BYTEPLUS_TASK_STATUS_ENDPOINT = "/proxy/byteplus/api/v3/contents/generations/tasks"
BYTEPLUS_SEEDANCE2_TASK_STATUS_ENDPOINT = "/proxy/byteplus-seedance2/api/v3/contents/generations/tasks"

# Display name shown in the UI -> Seedance 2.0 model id sent to the API.
SEEDANCE_MODELS = {
    "Seedance 2.0": "dreamina-seedance-2-0-260128",
    "Seedance 2.0 Fast": "dreamina-seedance-2-0-fast-260128",
}

# Model ids that are deprecated.
DEPRECATED_MODELS = {
    "seedance-1-0-lite-t2v-250428",
    "seedance-1-0-lite-i2v-250428",
}

logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _validate_ref_video_pixels(video: Input.Video, model_id: str, index: int) -> None:
    """Check a reference video's pixel count against the limits for *model_id*.

    Nothing is checked when the model has no entry in
    ``SEEDANCE2_REF_VIDEO_PIXEL_LIMITS`` or when the video's dimensions cannot
    be read.

    Raises:
        ValueError: if width * height is below the "min" or above the "max" limit.
    """
    limits = SEEDANCE2_REF_VIDEO_PIXEL_LIMITS.get(model_id)
    if not limits:
        return

    try:
        width, height = video.get_dimensions()
    except Exception:
        # Best effort: skip validation when dimensions are unavailable.
        return

    pixels = width * height
    lower_bound = limits.get("min")
    upper_bound = limits.get("max")

    too_small = bool(lower_bound) and pixels < lower_bound
    if too_small:
        raise ValueError(
            f"Reference video {index} is too small: {width}x{height} = {pixels:,}px. "
            f"Minimum is {lower_bound:,}px for this model."
        )

    too_large = bool(upper_bound) and pixels > upper_bound
    if too_large:
        raise ValueError(
            f"Reference video {index} is too large: {width}x{height} = {pixels:,}px. "
            f"Maximum is {upper_bound:,}px for this model. Try downscaling the video."
        )
|
||||
|
||||
|
||||
def _seedance2_price_extractor(model_id: str, has_video_input: bool):
    """Build the ``price_extractor`` callback passed to ``poll_op`` for Seedance 2.0.

    Returns None when ``SEEDANCE2_PRICE_PER_1K_TOKENS`` has no rate for the
    ``(model_id, has_video_input)`` pair. Otherwise returns a function that
    maps a ``TaskStatusResponse`` to a price, or to None when the response
    carries no usage data.
    """
    price_per_1k = SEEDANCE2_PRICE_PER_1K_TOKENS.get((model_id, has_video_input))
    if price_per_1k is not None:

        def extractor(response: TaskStatusResponse) -> float | None:
            usage = response.usage
            if usage is None:
                return None
            # NOTE(review): 1.43 is an unexplained multiplier on the token rate;
            # the price-badge rates are presumably derived from it -- confirm.
            return usage.total_tokens * 1.43 * price_per_1k / 1_000.0

        return extractor
    return None
|
||||
|
||||
|
||||
def get_image_url_from_response(response: ImageTaskCreationResponse) -> str:
|
||||
if response.error:
|
||||
error_msg = f"ByteDance request failed. Code: {response.error['code']}, message: {response.error['message']}"
|
||||
@@ -335,8 +389,7 @@ class ByteDanceSeedreamNode(IO.ComfyNode):
|
||||
mp_provided = out_num_pixels / 1_000_000.0
|
||||
if ("seedream-4-5" in model or "seedream-5-0" in model) and out_num_pixels < 3686400:
|
||||
raise ValueError(
|
||||
f"Minimum image resolution for the selected model is 3.68MP, "
|
||||
f"but {mp_provided:.2f}MP provided."
|
||||
f"Minimum image resolution for the selected model is 3.68MP, " f"but {mp_provided:.2f}MP provided."
|
||||
)
|
||||
if "seedream-4-0" in model and out_num_pixels < 921600:
|
||||
raise ValueError(
|
||||
@@ -952,33 +1005,6 @@ class ByteDanceImageReferenceNode(IO.ComfyNode):
|
||||
)
|
||||
|
||||
|
||||
async def process_video_task(
    cls: type[IO.ComfyNode],
    payload: Text2VideoTaskCreationRequest | Image2VideoTaskCreationRequest,
    estimated_duration: int | None,
) -> IO.NodeOutput:
    """Create a video task, poll it to completion and return the downloaded video.

    A deprecation warning is logged first when ``payload.model`` is listed in
    ``DEPRECATED_MODELS``.
    """
    model_name = payload.model
    if model_name in DEPRECATED_MODELS:
        logger.warning(
            "Model '%s' is deprecated and will be deactivated on May 13, 2026. "
            "Please switch to a newer model. Recommended: seedance-1-0-pro-fast-251015.",
            model_name,
        )

    # Step 1: submit the task.
    create_endpoint = ApiEndpoint(path=BYTEPLUS_TASK_ENDPOINT, method="POST")
    created_task = await sync_op(
        cls,
        create_endpoint,
        data=payload,
        response_model=TaskCreationResponse,
    )

    # Step 2: poll the status endpoint for that task id.
    status_endpoint = ApiEndpoint(path=f"{BYTEPLUS_TASK_STATUS_ENDPOINT}/{created_task.id}")
    finished_task = await poll_op(
        cls,
        status_endpoint,
        status_extractor=lambda r: r.status,
        estimated_duration=estimated_duration,
        response_model=TaskStatusResponse,
    )

    # Step 3: download the resulting video.
    video = await download_url_to_video_output(finished_task.content.video_url)
    return IO.NodeOutput(video)
|
||||
|
||||
|
||||
def raise_if_text_params(prompt: str, text_params: list[str]) -> None:
|
||||
for i in text_params:
|
||||
if f"--{i} " in prompt:
|
||||
@@ -1040,6 +1066,530 @@ PRICE_BADGE_VIDEO = IO.PriceBadge(
|
||||
)
|
||||
|
||||
|
||||
def _seedance2_text_inputs():
    """Return a new list of the per-model widgets used by the Seedance 2.0 nodes.

    The widgets are, in order: prompt, resolution, ratio, duration and
    generate_audio.
    """
    prompt_widget = IO.String.Input(
        "prompt",
        multiline=True,
        default="",
        tooltip="Text prompt for video generation.",
    )
    resolution_widget = IO.Combo.Input(
        "resolution",
        options=["480p", "720p"],
        tooltip="Resolution of the output video.",
    )
    ratio_widget = IO.Combo.Input(
        "ratio",
        options=["16:9", "4:3", "1:1", "3:4", "9:16", "21:9", "adaptive"],
        tooltip="Aspect ratio of the output video.",
    )
    duration_widget = IO.Int.Input(
        "duration",
        default=7,
        min=4,
        max=15,
        step=1,
        tooltip="Duration of the output video in seconds (4-15).",
        display_mode=IO.NumberDisplay.slider,
    )
    audio_widget = IO.Boolean.Input(
        "generate_audio",
        default=True,
        tooltip="Enable audio generation for the output video.",
    )
    return [prompt_widget, resolution_widget, ratio_widget, duration_widget, audio_widget]
|
||||
|
||||
|
||||
class ByteDance2TextToVideoNode(IO.ComfyNode):
    # API node: text prompt -> video, using the Seedance 2.0 models.

    @classmethod
    def define_schema(cls):
        """Describe the node: inputs, video output, hidden auth fields and price badge."""
        model_input = IO.DynamicCombo.Input(
            "model",
            options=[
                IO.DynamicCombo.Option(label, _seedance2_text_inputs())
                for label in ("Seedance 2.0", "Seedance 2.0 Fast")
            ],
            tooltip="Seedance 2.0 for maximum quality; Seedance 2.0 Fast for speed optimization.",
        )
        seed_input = IO.Int.Input(
            "seed",
            default=0,
            min=0,
            max=2147483647,
            step=1,
            display_mode=IO.NumberDisplay.number,
            control_after_generate=True,
            tooltip="Seed controls whether the node should re-run; "
            "results are non-deterministic regardless of seed.",
        )
        watermark_input = IO.Boolean.Input(
            "watermark",
            default=False,
            tooltip="Whether to add a watermark to the video.",
            advanced=True,
        )
        # The badge expression estimates the cost from model, resolution and duration.
        price_badge = IO.PriceBadge(
            depends_on=IO.PriceBadgeDepends(widgets=["model", "model.resolution", "model.duration"]),
            expr="""
(
$rate480 := 10044;
$rate720 := 21600;
$m := widgets.model;
$pricePer1K := $contains($m, "fast") ? 0.008008 : 0.01001;
$res := $lookup(widgets, "model.resolution");
$dur := $lookup(widgets, "model.duration");
$rate := $res = "720p" ? $rate720 : $rate480;
$cost := $dur * $rate * $pricePer1K / 1000;
{"type": "usd", "usd": $cost, "format": {"approximate": true}}
)
""",
        )
        return IO.Schema(
            node_id="ByteDance2TextToVideoNode",
            display_name="ByteDance Seedance 2.0 Text to Video",
            category="api node/video/ByteDance",
            description="Generate video using Seedance 2.0 models based on a text prompt.",
            inputs=[model_input, seed_input, watermark_input],
            outputs=[IO.Video.Output()],
            hidden=[
                IO.Hidden.auth_token_comfy_org,
                IO.Hidden.api_key_comfy_org,
                IO.Hidden.unique_id,
            ],
            is_api_node=True,
            price_badge=price_badge,
        )

    @classmethod
    async def execute(
        cls,
        model: dict,
        seed: int,
        watermark: bool,
    ) -> IO.NodeOutput:
        """Submit the text-to-video task, poll until it finishes, return the video."""
        prompt = model["prompt"]
        validate_string(prompt, strip_whitespace=True, min_length=1)
        model_id = SEEDANCE_MODELS[model["model"]]

        create_endpoint = ApiEndpoint(path=BYTEPLUS_TASK_ENDPOINT, method="POST")
        request = Seedance2TaskCreationRequest(
            model=model_id,
            content=[TaskTextContent(text=prompt)],
            generate_audio=model["generate_audio"],
            resolution=model["resolution"],
            ratio=model["ratio"],
            duration=model["duration"],
            seed=seed,
            watermark=watermark,
        )
        created_task = await sync_op(
            cls,
            create_endpoint,
            data=request,
            response_model=TaskCreationResponse,
        )

        # Status is polled on the Seedance 2.0 specific status endpoint.
        status_endpoint = ApiEndpoint(path=f"{BYTEPLUS_SEEDANCE2_TASK_STATUS_ENDPOINT}/{created_task.id}")
        finished_task = await poll_op(
            cls,
            status_endpoint,
            response_model=TaskStatusResponse,
            status_extractor=lambda r: r.status,
            price_extractor=_seedance2_price_extractor(model_id, has_video_input=False),
            poll_interval=9,
        )
        video = await download_url_to_video_output(finished_task.content.video_url)
        return IO.NodeOutput(video)
|
||||
|
||||
|
||||
class ByteDance2FirstLastFrameNode(IO.ComfyNode):
    # API node: first frame (and optionally last frame) -> video, using Seedance 2.0.

    @classmethod
    def define_schema(cls):
        """Describe the node: inputs, video output, hidden auth fields and price badge."""
        model_input = IO.DynamicCombo.Input(
            "model",
            options=[
                IO.DynamicCombo.Option(label, _seedance2_text_inputs())
                for label in ("Seedance 2.0", "Seedance 2.0 Fast")
            ],
            tooltip="Seedance 2.0 for maximum quality; Seedance 2.0 Fast for speed optimization.",
        )
        first_frame_input = IO.Image.Input(
            "first_frame",
            tooltip="First frame image for the video.",
        )
        last_frame_input = IO.Image.Input(
            "last_frame",
            tooltip="Last frame image for the video.",
            optional=True,
        )
        seed_input = IO.Int.Input(
            "seed",
            default=0,
            min=0,
            max=2147483647,
            step=1,
            display_mode=IO.NumberDisplay.number,
            control_after_generate=True,
            tooltip="Seed controls whether the node should re-run; "
            "results are non-deterministic regardless of seed.",
        )
        watermark_input = IO.Boolean.Input(
            "watermark",
            default=False,
            tooltip="Whether to add a watermark to the video.",
            advanced=True,
        )
        # The badge expression estimates the cost from model, resolution and duration.
        price_badge = IO.PriceBadge(
            depends_on=IO.PriceBadgeDepends(widgets=["model", "model.resolution", "model.duration"]),
            expr="""
(
$rate480 := 10044;
$rate720 := 21600;
$m := widgets.model;
$pricePer1K := $contains($m, "fast") ? 0.008008 : 0.01001;
$res := $lookup(widgets, "model.resolution");
$dur := $lookup(widgets, "model.duration");
$rate := $res = "720p" ? $rate720 : $rate480;
$cost := $dur * $rate * $pricePer1K / 1000;
{"type": "usd", "usd": $cost, "format": {"approximate": true}}
)
""",
        )
        return IO.Schema(
            node_id="ByteDance2FirstLastFrameNode",
            display_name="ByteDance Seedance 2.0 First-Last-Frame to Video",
            category="api node/video/ByteDance",
            description="Generate video using Seedance 2.0 from a first frame image and optional last frame image.",
            inputs=[model_input, first_frame_input, last_frame_input, seed_input, watermark_input],
            outputs=[IO.Video.Output()],
            hidden=[
                IO.Hidden.auth_token_comfy_org,
                IO.Hidden.api_key_comfy_org,
                IO.Hidden.unique_id,
            ],
            is_api_node=True,
            price_badge=price_badge,
        )

    @classmethod
    async def execute(
        cls,
        model: dict,
        first_frame: Input.Image,
        seed: int,
        watermark: bool,
        last_frame: Input.Image | None = None,
    ) -> IO.NodeOutput:
        """Upload the frame image(s), submit the task, poll it, return the video."""
        prompt = model["prompt"]
        validate_string(prompt, strip_whitespace=True, min_length=1)
        model_id = SEEDANCE_MODELS[model["model"]]

        async def _frame_item(image, role, wait_label):
            # Upload one frame and wrap the resulting URL as an image content item.
            uploaded_url = await upload_image_to_comfyapi(cls, image, wait_label=wait_label)
            return TaskImageContent(image_url=TaskImageContentUrl(url=uploaded_url), role=role)

        content: list[TaskTextContent | TaskImageContent] = [TaskTextContent(text=prompt)]
        content.append(await _frame_item(first_frame, "first_frame", "Uploading first frame."))
        if last_frame is not None:
            content.append(await _frame_item(last_frame, "last_frame", "Uploading last frame."))

        create_endpoint = ApiEndpoint(path=BYTEPLUS_TASK_ENDPOINT, method="POST")
        request = Seedance2TaskCreationRequest(
            model=model_id,
            content=content,
            generate_audio=model["generate_audio"],
            resolution=model["resolution"],
            ratio=model["ratio"],
            duration=model["duration"],
            seed=seed,
            watermark=watermark,
        )
        created_task = await sync_op(
            cls,
            create_endpoint,
            data=request,
            response_model=TaskCreationResponse,
        )

        # Status is polled on the Seedance 2.0 specific status endpoint.
        status_endpoint = ApiEndpoint(path=f"{BYTEPLUS_SEEDANCE2_TASK_STATUS_ENDPOINT}/{created_task.id}")
        finished_task = await poll_op(
            cls,
            status_endpoint,
            response_model=TaskStatusResponse,
            status_extractor=lambda r: r.status,
            price_extractor=_seedance2_price_extractor(model_id, has_video_input=False),
            poll_interval=9,
        )
        video = await download_url_to_video_output(finished_task.content.video_url)
        return IO.NodeOutput(video)
|
||||
|
||||
|
||||
def _seedance2_reference_inputs():
    """Return the text widgets followed by the three reference autogrow inputs.

    The autogrow groups hold up to nine images (image_1..image_9), three
    videos (video_1..video_3) and three audios (audio_1..audio_3); each group
    has ``min=0``.
    """
    widgets = _seedance2_text_inputs()

    image_slots = [f"image_{slot}" for slot in range(1, 10)]
    widgets.append(
        IO.Autogrow.Input(
            "reference_images",
            template=IO.Autogrow.TemplateNames(
                IO.Image.Input("reference_image"),
                names=image_slots,
                min=0,
            ),
        )
    )

    video_slots = [f"video_{slot}" for slot in range(1, 4)]
    widgets.append(
        IO.Autogrow.Input(
            "reference_videos",
            template=IO.Autogrow.TemplateNames(
                IO.Video.Input("reference_video"),
                names=video_slots,
                min=0,
            ),
        )
    )

    audio_slots = [f"audio_{slot}" for slot in range(1, 4)]
    widgets.append(
        IO.Autogrow.Input(
            "reference_audios",
            template=IO.Autogrow.TemplateNames(
                IO.Audio.Input("reference_audio"),
                names=audio_slots,
                min=0,
            ),
        )
    )
    return widgets
|
||||
|
||||
|
||||
class ByteDance2ReferenceNode(IO.ComfyNode):
    """Seedance 2.0 "reference to video" API node.

    Validates and uploads the supplied reference images, videos and audio
    clips, creates a Seedance 2.0 task from them together with the text
    prompt, polls the task and returns the downloaded video.
    """

    @classmethod
    def define_schema(cls):
        """Return the node schema: model-dependent inputs, seed, watermark, auth fields and price badge."""
        return IO.Schema(
            node_id="ByteDance2ReferenceNode",
            display_name="ByteDance Seedance 2.0 Reference to Video",
            category="api node/video/ByteDance",
            description="Generate, edit, or extend video using Seedance 2.0 with reference images, "
            "videos, and audio. Supports multimodal reference, video editing, and video extension.",
            inputs=[
                IO.DynamicCombo.Input(
                    "model",
                    options=[
                        IO.DynamicCombo.Option("Seedance 2.0", _seedance2_reference_inputs()),
                        IO.DynamicCombo.Option("Seedance 2.0 Fast", _seedance2_reference_inputs()),
                    ],
                    tooltip="Seedance 2.0 for maximum quality; Seedance 2.0 Fast for speed optimization.",
                ),
                IO.Int.Input(
                    "seed",
                    default=0,
                    min=0,
                    max=2147483647,
                    step=1,
                    display_mode=IO.NumberDisplay.number,
                    control_after_generate=True,
                    tooltip="Seed controls whether the node should re-run; "
                    "results are non-deterministic regardless of seed.",
                ),
                IO.Boolean.Input(
                    "watermark",
                    default=False,
                    tooltip="Whether to add a watermark to the video.",
                    advanced=True,
                ),
            ],
            outputs=[
                IO.Video.Output(),
            ],
            hidden=[
                IO.Hidden.auth_token_comfy_org,
                IO.Hidden.api_key_comfy_org,
                IO.Hidden.unique_id,
            ],
            is_api_node=True,
            # The badge shows a single price without reference videos and a
            # min/max range when at least one reference video is connected.
            price_badge=IO.PriceBadge(
                depends_on=IO.PriceBadgeDepends(
                    widgets=["model", "model.resolution", "model.duration"],
                    input_groups=["model.reference_videos"],
                ),
                expr="""
                (
                  $rate480 := 10044;
                  $rate720 := 21600;
                  $m := widgets.model;
                  $hasVideo := $lookup(inputGroups, "model.reference_videos") > 0;
                  $noVideoPricePer1K := $contains($m, "fast") ? 0.008008 : 0.01001;
                  $videoPricePer1K := $contains($m, "fast") ? 0.004719 : 0.006149;
                  $res := $lookup(widgets, "model.resolution");
                  $dur := $lookup(widgets, "model.duration");
                  $rate := $res = "720p" ? $rate720 : $rate480;
                  $noVideoCost := $dur * $rate * $noVideoPricePer1K / 1000;
                  $minVideoFactor := $ceil($dur * 5 / 3);
                  $minVideoCost := $minVideoFactor * $rate * $videoPricePer1K / 1000;
                  $maxVideoCost := (15 + $dur) * $rate * $videoPricePer1K / 1000;
                  $hasVideo
                    ? {
                        "type": "range_usd",
                        "min_usd": $minVideoCost,
                        "max_usd": $maxVideoCost,
                        "format": {"approximate": true}
                    }
                    : {
                        "type": "usd",
                        "usd": $noVideoCost,
                        "format": {"approximate": true}
                    }
                )
                """,
            ),
        )

    @classmethod
    async def execute(
        cls,
        model: dict,
        seed: int,
        watermark: bool,
    ) -> IO.NodeOutput:
        """Create a Seedance 2.0 reference task and return the generated video.

        Args:
            model: Values of the selected model option. The keys read here are
                ``"model"``, ``"prompt"``, ``"generate_audio"``,
                ``"resolution"``, ``"ratio"``, ``"duration"`` and the optional
                mappings ``"reference_images"``, ``"reference_videos"`` and
                ``"reference_audios"``.
            seed: Forwarded unchanged to the task creation request.
            watermark: Forwarded unchanged to the task creation request.

        Raises:
            ValueError: If neither a reference image nor a reference video is
                given, if a reference video or audio clip is shorter than
                1.8 seconds, or if the summed video or audio duration exceeds
                15.1 seconds.
        """
        validate_string(model["prompt"], strip_whitespace=True, min_length=1)

        reference_images = model.get("reference_images", {})
        reference_videos = model.get("reference_videos", {})
        reference_audios = model.get("reference_audios", {})

        if not reference_images and not reference_videos:
            raise ValueError("At least one reference image or video is required.")

        model_id = SEEDANCE_MODELS[model["model"]]
        has_video_input = len(reference_videos) > 0

        total_video_duration = 0.0
        for i, key in enumerate(reference_videos, 1):
            video = reference_videos[key]
            _validate_ref_video_pixels(video, model_id, i)
            try:
                dur = video.get_duration()
                if dur < 1.8:
                    raise ValueError(f"Reference video {i} is too short: {dur:.1f}s. Minimum duration is 1.8 seconds.")
                total_video_duration += dur
            except ValueError:
                raise
            except Exception:
                # Best effort: a video whose duration cannot be read is still
                # submitted, but say so instead of silently skipping the limits.
                logger.warning(
                    "Could not determine the duration of reference video %d; skipping its duration checks.",
                    i,
                    exc_info=True,
                )
        if total_video_duration > 15.1:
            raise ValueError(f"Total reference video duration is {total_video_duration:.1f}s. Maximum is 15.1 seconds.")

        total_audio_duration = 0.0
        for i, key in enumerate(reference_audios, 1):
            audio = reference_audios[key]
            # Duration in seconds = number of samples on the last axis / sample rate.
            dur = int(audio["waveform"].shape[-1]) / int(audio["sample_rate"])
            if dur < 1.8:
                raise ValueError(f"Reference audio {i} is too short: {dur:.1f}s. Minimum duration is 1.8 seconds.")
            total_audio_duration += dur
        if total_audio_duration > 15.1:
            raise ValueError(f"Total reference audio duration is {total_audio_duration:.1f}s. Maximum is 15.1 seconds.")

        # Request content order: prompt text, then images, videos and audio clips.
        content: list[TaskTextContent | TaskImageContent | TaskVideoContent | TaskAudioContent] = [
            TaskTextContent(text=model["prompt"]),
        ]
        for i, key in enumerate(reference_images, 1):
            content.append(
                TaskImageContent(
                    image_url=TaskImageContentUrl(
                        url=await upload_image_to_comfyapi(
                            cls,
                            image=reference_images[key],
                            wait_label=f"Uploading image {i}",
                        ),
                    ),
                    role="reference_image",
                ),
            )
        for i, key in enumerate(reference_videos, 1):
            content.append(
                TaskVideoContent(
                    video_url=TaskVideoContentUrl(
                        url=await upload_video_to_comfyapi(
                            cls,
                            reference_videos[key],
                            wait_label=f"Uploading video {i}",
                        ),
                    ),
                ),
            )
        for key in reference_audios:
            content.append(
                TaskAudioContent(
                    audio_url=TaskAudioContentUrl(
                        url=await upload_audio_to_comfyapi(
                            cls,
                            reference_audios[key],
                            container_format="mp3",
                            codec_name="libmp3lame",
                            mime_type="audio/mpeg",
                        ),
                    ),
                ),
            )

        initial_response = await sync_op(
            cls,
            ApiEndpoint(path=BYTEPLUS_TASK_ENDPOINT, method="POST"),
            data=Seedance2TaskCreationRequest(
                model=model_id,
                content=content,
                generate_audio=model["generate_audio"],
                resolution=model["resolution"],
                ratio=model["ratio"],
                duration=model["duration"],
                seed=seed,
                watermark=watermark,
            ),
            response_model=TaskCreationResponse,
        )
        response = await poll_op(
            cls,
            ApiEndpoint(path=f"{BYTEPLUS_SEEDANCE2_TASK_STATUS_ENDPOINT}/{initial_response.id}"),
            response_model=TaskStatusResponse,
            status_extractor=lambda r: r.status,
            price_extractor=_seedance2_price_extractor(model_id, has_video_input=has_video_input),
            poll_interval=9,
        )
        return IO.NodeOutput(await download_url_to_video_output(response.content.video_url))
|
||||
|
||||
|
||||
async def process_video_task(
    cls: type[IO.ComfyNode],
    payload: Text2VideoTaskCreationRequest | Image2VideoTaskCreationRequest,
    estimated_duration: int | None,
) -> IO.NodeOutput:
    """Submit a video task, poll it, and return the downloaded video.

    Logs a deprecation warning first when ``payload.model`` is listed in
    ``DEPRECATED_MODELS``. ``estimated_duration`` is passed through to
    ``poll_op`` unchanged.
    """
    if payload.model in DEPRECATED_MODELS:
        logger.warning(
            "Model '%s' is deprecated and will be deactivated on May 13, 2026. "
            "Please switch to a newer model. Recommended: seedance-1-0-pro-fast-251015.",
            payload.model,
        )

    create_endpoint = ApiEndpoint(path=BYTEPLUS_TASK_ENDPOINT, method="POST")
    created_task = await sync_op(
        cls,
        create_endpoint,
        data=payload,
        response_model=TaskCreationResponse,
    )

    # The status endpoint is addressed by the id returned from task creation.
    status_endpoint = ApiEndpoint(path=f"{BYTEPLUS_TASK_STATUS_ENDPOINT}/{created_task.id}")
    finished_task = await poll_op(
        cls,
        status_endpoint,
        status_extractor=lambda r: r.status,
        estimated_duration=estimated_duration,
        response_model=TaskStatusResponse,
    )

    video_output = await download_url_to_video_output(finished_task.content.video_url)
    return IO.NodeOutput(video_output)
|
||||
|
||||
|
||||
class ByteDanceExtension(ComfyExtension):
|
||||
@override
|
||||
async def get_node_list(self) -> list[type[IO.ComfyNode]]:
|
||||
@@ -1050,6 +1600,9 @@ class ByteDanceExtension(ComfyExtension):
|
||||
ByteDanceImageToVideoNode,
|
||||
ByteDanceFirstLastFrameNode,
|
||||
ByteDanceImageReferenceNode,
|
||||
ByteDance2TextToVideoNode,
|
||||
ByteDance2FirstLastFrameNode,
|
||||
ByteDance2ReferenceNode,
|
||||
]
|
||||
|
||||
|
||||
|
||||
@@ -558,7 +558,7 @@ class GrokVideoReferenceNode(IO.ComfyNode):
|
||||
(
|
||||
$res := $lookup(widgets, "model.resolution");
|
||||
$dur := $lookup(widgets, "model.duration");
|
||||
$refs := inputGroups["model.reference_images"];
|
||||
$refs := $lookup(inputGroups, "model.reference_images");
|
||||
$rate := $res = "720p" ? 0.07 : 0.05;
|
||||
$price := ($rate * $dur + 0.002 * $refs) * 1.43;
|
||||
{"type":"usd","usd": $price}
|
||||
|
||||
@@ -132,7 +132,7 @@ class TencentTextToModelNode(IO.ComfyNode):
|
||||
tooltip="The LowPoly option is unavailable for the `3.1` model.",
|
||||
),
|
||||
IO.String.Input("prompt", multiline=True, default="", tooltip="Supports up to 1024 characters."),
|
||||
IO.Int.Input("face_count", default=500000, min=40000, max=1500000),
|
||||
IO.Int.Input("face_count", default=500000, min=3000, max=1500000),
|
||||
IO.DynamicCombo.Input(
|
||||
"generate_type",
|
||||
options=[
|
||||
@@ -251,7 +251,7 @@ class TencentImageToModelNode(IO.ComfyNode):
|
||||
IO.Image.Input("image_left", optional=True),
|
||||
IO.Image.Input("image_right", optional=True),
|
||||
IO.Image.Input("image_back", optional=True),
|
||||
IO.Int.Input("face_count", default=500000, min=40000, max=1500000),
|
||||
IO.Int.Input("face_count", default=500000, min=3000, max=1500000),
|
||||
IO.DynamicCombo.Input(
|
||||
"generate_type",
|
||||
options=[
|
||||
@@ -422,6 +422,7 @@ class TencentModelTo3DUVNode(IO.ComfyNode):
|
||||
outputs=[
|
||||
IO.File3DOBJ.Output(display_name="OBJ"),
|
||||
IO.File3DFBX.Output(display_name="FBX"),
|
||||
IO.Image.Output(display_name="uv_image"),
|
||||
],
|
||||
hidden=[
|
||||
IO.Hidden.auth_token_comfy_org,
|
||||
@@ -468,9 +469,16 @@ class TencentModelTo3DUVNode(IO.ComfyNode):
|
||||
response_model=To3DProTaskResultResponse,
|
||||
status_extractor=lambda r: r.Status,
|
||||
)
|
||||
uv_image_file = get_file_from_response(result.ResultFile3Ds, "uv_image", raise_if_not_found=False)
|
||||
uv_image = (
|
||||
await download_url_to_image_tensor(uv_image_file.Url)
|
||||
if uv_image_file is not None
|
||||
else torch.zeros(1, 1, 1, 3)
|
||||
)
|
||||
return IO.NodeOutput(
|
||||
await download_url_to_file_3d(get_file_from_response(result.ResultFile3Ds, "obj").Url, "obj"),
|
||||
await download_url_to_file_3d(get_file_from_response(result.ResultFile3Ds, "fbx").Url, "fbx"),
|
||||
uv_image,
|
||||
)
|
||||
|
||||
|
||||
|
||||
287
comfy_api_nodes/nodes_sonilo.py
Normal file
287
comfy_api_nodes/nodes_sonilo.py
Normal file
@@ -0,0 +1,287 @@
|
||||
import base64
|
||||
import json
|
||||
import logging
|
||||
import time
|
||||
from urllib.parse import urljoin
|
||||
|
||||
import aiohttp
|
||||
from typing_extensions import override
|
||||
|
||||
from comfy_api.latest import IO, ComfyExtension, Input
|
||||
from comfy_api_nodes.util import (
|
||||
ApiEndpoint,
|
||||
audio_bytes_to_audio_input,
|
||||
upload_video_to_comfyapi,
|
||||
validate_string,
|
||||
)
|
||||
from comfy_api_nodes.util._helpers import (
|
||||
default_base_url,
|
||||
get_auth_header,
|
||||
get_node_id,
|
||||
is_processing_interrupted,
|
||||
)
|
||||
from comfy_api_nodes.util.common_exceptions import ProcessingInterrupted
|
||||
from server import PromptServer
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class SoniloVideoToMusic(IO.ComfyNode):
    """Generate music from video using Sonilo's AI model."""

    @classmethod
    def define_schema(cls) -> IO.Schema:
        """Declare the node: video, optional prompt and seed inputs; one audio output."""
        video_input = IO.Video.Input(
            "video",
            tooltip="Input video to generate music from. Maximum duration: 6 minutes.",
        )
        prompt_input = IO.String.Input(
            "prompt",
            default="",
            multiline=True,
            tooltip="Optional text prompt to guide music generation. "
            "Leave empty for best quality - the model will fully analyze the video content.",
        )
        seed_input = IO.Int.Input(
            "seed",
            default=0,
            min=0,
            max=0xFFFFFFFFFFFFFFFF,
            control_after_generate=True,
            tooltip="Seed for reproducibility. Currently ignored by the Sonilo "
            "service but kept for graph consistency.",
        )
        return IO.Schema(
            node_id="SoniloVideoToMusic",
            display_name="Sonilo Video to Music",
            category="api node/audio/Sonilo",
            description="Generate music from video content using Sonilo's AI model. "
            "Analyzes the video and creates matching music.",
            inputs=[video_input, prompt_input, seed_input],
            outputs=[IO.Audio.Output()],
            hidden=[
                IO.Hidden.auth_token_comfy_org,
                IO.Hidden.api_key_comfy_org,
                IO.Hidden.unique_id,
            ],
            is_api_node=True,
            price_badge=IO.PriceBadge(
                expr='{"type":"usd","usd":0.009,"format":{"suffix":"/second"}}',
            ),
        )

    @classmethod
    async def execute(
        cls,
        video: Input.Video,
        prompt: str = "",
        seed: int = 0,
    ) -> IO.NodeOutput:
        """Upload ``video``, request music for it and return the decoded audio.

        The prompt is only added to the form when it is non-blank after
        stripping. ``seed`` is not used in this method.
        """
        uploaded_url = await upload_video_to_comfyapi(cls, video, max_duration=360)

        form = aiohttp.FormData()
        form.add_field("video_url", uploaded_url)
        trimmed_prompt = prompt.strip()
        if trimmed_prompt:
            form.add_field("prompt", trimmed_prompt)

        generate_endpoint = ApiEndpoint(path="/proxy/sonilo/v2m/generate", method="POST")
        audio_bytes = await _stream_sonilo_music(cls, generate_endpoint, form)
        return IO.NodeOutput(audio_bytes_to_audio_input(audio_bytes))
|
||||
|
||||
|
||||
class SoniloTextToMusic(IO.ComfyNode):
    """Generate music from a text prompt using Sonilo's AI model."""

    @classmethod
    def define_schema(cls) -> IO.Schema:
        """Declare the node: prompt, duration and seed inputs; one audio output."""
        prompt_input = IO.String.Input(
            "prompt",
            default="",
            multiline=True,
            tooltip="Text prompt describing the music to generate.",
        )
        duration_input = IO.Int.Input(
            "duration",
            default=0,
            min=0,
            max=360,
            tooltip="Target duration in seconds. Set to 0 to let the model "
            "infer the duration from the prompt. Maximum: 6 minutes.",
        )
        seed_input = IO.Int.Input(
            "seed",
            default=0,
            min=0,
            max=0xFFFFFFFFFFFFFFFF,
            control_after_generate=True,
            tooltip="Seed for reproducibility. Currently ignored by the Sonilo "
            "service but kept for graph consistency.",
        )
        # Fixed total when a duration is set, otherwise a per-second price.
        badge = IO.PriceBadge(
            depends_on=IO.PriceBadgeDepends(widgets=["duration"]),
            expr="""
            (
              widgets.duration > 0
                ? {"type":"usd","usd": 0.005 * widgets.duration}
                : {"type":"usd","usd": 0.005, "format":{"suffix":"/second"}}
            )
            """,
        )
        return IO.Schema(
            node_id="SoniloTextToMusic",
            display_name="Sonilo Text to Music",
            category="api node/audio/Sonilo",
            description="Generate music from a text prompt using Sonilo's AI model. "
            "Leave duration at 0 to let the model infer it from the prompt.",
            inputs=[prompt_input, duration_input, seed_input],
            outputs=[IO.Audio.Output()],
            hidden=[
                IO.Hidden.auth_token_comfy_org,
                IO.Hidden.api_key_comfy_org,
                IO.Hidden.unique_id,
            ],
            is_api_node=True,
            price_badge=badge,
        )

    @classmethod
    async def execute(
        cls,
        prompt: str,
        duration: int = 0,
        seed: int = 0,
    ) -> IO.NodeOutput:
        """Request music for ``prompt`` and return the decoded audio.

        ``duration`` is only sent when it is greater than zero. ``seed`` is
        not used in this method.
        """
        validate_string(prompt, strip_whitespace=True, min_length=1)

        form_fields = [("prompt", prompt)]
        if duration > 0:
            form_fields.append(("duration", str(duration)))

        form = aiohttp.FormData()
        for field_name, field_value in form_fields:
            form.add_field(field_name, field_value)

        generate_endpoint = ApiEndpoint(path="/proxy/sonilo/t2m/generate", method="POST")
        audio_bytes = await _stream_sonilo_music(cls, generate_endpoint, form)
        return IO.NodeOutput(audio_bytes_to_audio_input(audio_bytes))
|
||||
|
||||
|
||||
def _parse_sonilo_event(raw_line: bytes) -> dict | None:
    """Decode one NDJSON line into an event dict; return None for blank or unusable lines."""
    try:
        line = raw_line.decode("utf-8").strip()
    except UnicodeDecodeError:
        logger.warning("Sonilo: skipping malformed NDJSON line")
        return None
    if not line:
        return None
    try:
        evt = json.loads(line)
    except json.JSONDecodeError:
        logger.warning("Sonilo: skipping malformed NDJSON line")
        return None
    if not isinstance(evt, dict):
        # Valid JSON that is not an object (e.g. null or a list) carries no event.
        logger.warning("Sonilo: skipping non-object NDJSON event")
        return None
    return evt


async def _stream_sonilo_music(
    cls: type[IO.ComfyNode],
    endpoint: ApiEndpoint,
    form: aiohttp.FormData,
) -> bytes:
    """POST ``form`` to Sonilo, read the NDJSON stream, and return the first stream's audio bytes.

    Events are read line by line until the stream ends or a ``complete``
    event arrives. ``audio_chunk`` events are base64-decoded and collected per
    ``stream_index``; ``duration``, ``title`` and ``titles`` events only
    update the progress text shown for the node.

    Returns:
        The concatenated chunks of stream 0, or of the lowest stream index
        received when stream 0 is absent.

    Raises:
        ProcessingInterrupted: If processing was interrupted between two lines.
        Exception: On an HTTP status >= 400, an ``error`` event, an audio
            chunk without usable base64 data, or when no audio was received.
    """
    url = urljoin(default_base_url().rstrip("/") + "/", endpoint.path.lstrip("/"))

    headers: dict[str, str] = {}
    headers.update(get_auth_header(cls))
    if endpoint.headers:
        headers.update(endpoint.headers)

    node_id = get_node_id(cls)
    start_ts = time.monotonic()
    last_chunk_status_ts = 0.0
    audio_streams: dict[int, list[bytes]] = {}
    title: str | None = None

    timeout = aiohttp.ClientTimeout(total=1200.0, sock_read=300.0)
    async with aiohttp.ClientSession(timeout=timeout) as session:
        PromptServer.instance.send_progress_text("Status: Queued", node_id)
        async with session.post(url, data=form, headers=headers) as resp:
            if resp.status >= 400:
                msg = await _extract_error_message(resp)
                raise Exception(f"Sonilo API error ({resp.status}): {msg}")

            while True:
                if is_processing_interrupted():
                    raise ProcessingInterrupted("Task cancelled")

                raw_line = await resp.content.readline()
                if not raw_line:
                    break  # end of stream

                evt = _parse_sonilo_event(raw_line)
                if evt is None:
                    continue

                evt_type = evt.get("type")
                if evt_type == "error":
                    code = evt.get("code", "UNKNOWN")
                    message = evt.get("message", "Unknown error")
                    raise Exception(f"Sonilo generation error ({code}): {message}")
                if evt_type == "duration":
                    duration_sec = evt.get("duration_sec")
                    # Only numbers can be rendered with the ".1f" format below.
                    if isinstance(duration_sec, (int, float)):
                        PromptServer.instance.send_progress_text(
                            f"Status: Generating\nVideo duration: {duration_sec:.1f}s",
                            node_id,
                        )
                elif evt_type in ("titles", "title"):
                    # v2m sends a "titles" list, t2m sends a scalar "title"
                    if evt_type == "titles":
                        titles = evt.get("titles", [])
                        if titles:
                            title = titles[0]
                    else:
                        title = evt.get("title") or title
                    if title:
                        PromptServer.instance.send_progress_text(
                            f"Status: Generating\nTitle: {title}",
                            node_id,
                        )
                elif evt_type == "audio_chunk":
                    stream_idx = evt.get("stream_index", 0)
                    encoded = evt.get("data")
                    if not isinstance(encoded, str):
                        # Dropping a chunk would corrupt the audio, so fail loudly.
                        raise Exception("Sonilo API sent an audio chunk without data.")
                    try:
                        chunk_data = base64.b64decode(encoded)
                    except ValueError as err:  # binascii.Error derives from ValueError
                        raise Exception("Sonilo API sent an audio chunk that is not valid base64.") from err

                    audio_streams.setdefault(stream_idx, []).append(chunk_data)

                    # Refresh the progress text at most once per second.
                    now = time.monotonic()
                    if now - last_chunk_status_ts >= 1.0:
                        total_chunks = sum(len(chunks) for chunks in audio_streams.values())
                        elapsed = int(now - start_ts)
                        status_lines = ["Status: Receiving audio"]
                        if title:
                            status_lines.append(f"Title: {title}")
                        status_lines.append(f"Chunks received: {total_chunks}")
                        status_lines.append(f"Time elapsed: {elapsed}s")
                        PromptServer.instance.send_progress_text("\n".join(status_lines), node_id)
                        last_chunk_status_ts = now
                elif evt_type == "complete":
                    break

    if not audio_streams:
        raise Exception("Sonilo API returned no audio data.")

    PromptServer.instance.send_progress_text("Status: Completed", node_id)
    selected_stream = 0 if 0 in audio_streams else min(audio_streams)
    return b"".join(audio_streams[selected_stream])
|
||||
|
||||
|
||||
async def _extract_error_message(resp: "aiohttp.ClientResponse") -> str:
    """Extract a human-readable error message from an HTTP error response.

    The JSON body's ``detail`` entry is preferred: for a non-empty dict its
    ``"message"`` value (or the whole dict when that key is missing), for any
    other truthy value its string form. When the body is not JSON, is not a
    JSON object, or has no usable ``detail``, the raw response text is
    returned instead, so the caller never gets a placeholder such as ``"{}"``.

    Returns:
        The message as a string; non-string values are converted with ``str``.
    """
    try:
        error_body = await resp.json()
    except Exception:
        # Not JSON (or wrong content type): fall back to the raw body.
        return await resp.text()

    if isinstance(error_body, dict):
        detail = error_body.get("detail")
        if isinstance(detail, dict) and detail:
            return str(detail.get("message", detail))
        if detail and not isinstance(detail, dict):
            return str(detail)

    return await resp.text()
|
||||
|
||||
|
||||
class SoniloExtension(ComfyExtension):
    """Extension exposing the Sonilo music-generation nodes."""

    @override
    async def get_node_list(self) -> list[type[IO.ComfyNode]]:
        """Return the node classes provided by this extension."""
        node_classes: list[type[IO.ComfyNode]] = [
            SoniloVideoToMusic,
            SoniloTextToMusic,
        ]
        return node_classes
|
||||
|
||||
|
||||
async def comfy_entrypoint() -> SoniloExtension:
    """Create and return a new ``SoniloExtension`` instance."""
    extension = SoniloExtension()
    return extension
|
||||
File diff suppressed because it is too large
Load Diff
@@ -19,6 +19,8 @@ from comfy import utils
|
||||
from comfy_api.latest import IO
|
||||
from server import PromptServer
|
||||
|
||||
from comfy.deploy_environment import get_deploy_environment
|
||||
|
||||
from . import request_logger
|
||||
from ._helpers import (
|
||||
default_base_url,
|
||||
@@ -617,6 +619,7 @@ async def _request_base(cfg: _RequestConfig, expect_binary: bool):
|
||||
payload_headers = {"Accept": "*/*"} if expect_binary else {"Accept": "application/json"}
|
||||
if not parsed_url.scheme and not parsed_url.netloc: # is URL relative?
|
||||
payload_headers.update(get_auth_header(cfg.node_cls))
|
||||
payload_headers["X-Comfy-Env"] = get_deploy_environment()
|
||||
if cfg.endpoint.headers:
|
||||
payload_headers.update(cfg.endpoint.headers)
|
||||
|
||||
|
||||
@@ -80,7 +80,7 @@ class EmptyAceStepLatentAudio(io.ComfyNode):
|
||||
@classmethod
|
||||
def execute(cls, seconds, batch_size) -> io.NodeOutput:
|
||||
length = int(seconds * 44100 / 512 / 8)
|
||||
latent = torch.zeros([batch_size, 8, 16, length], device=comfy.model_management.intermediate_device())
|
||||
latent = torch.zeros([batch_size, 8, 16, length], device=comfy.model_management.intermediate_device(), dtype=comfy.model_management.intermediate_dtype())
|
||||
return io.NodeOutput({"samples": latent, "type": "audio"})
|
||||
|
||||
|
||||
@@ -103,7 +103,7 @@ class EmptyAceStep15LatentAudio(io.ComfyNode):
|
||||
@classmethod
|
||||
def execute(cls, seconds, batch_size) -> io.NodeOutput:
|
||||
length = round((seconds * 48000 / 1920))
|
||||
latent = torch.zeros([batch_size, 64, length], device=comfy.model_management.intermediate_device())
|
||||
latent = torch.zeros([batch_size, 64, length], device=comfy.model_management.intermediate_device(), dtype=comfy.model_management.intermediate_dtype())
|
||||
return io.NodeOutput({"samples": latent, "type": "audio"})
|
||||
|
||||
class ReferenceAudio(io.ComfyNode):
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import numpy as np
|
||||
|
||||
from comfy_api.latest import ComfyExtension, io
|
||||
from comfy_api.input import CurveInput
|
||||
from typing_extensions import override
|
||||
@@ -32,10 +34,58 @@ class CurveEditor(io.ComfyNode):
|
||||
return io.NodeOutput(result, ui=ui) if ui else io.NodeOutput(result)
|
||||
|
||||
|
||||
class ImageHistogram(io.ComfyNode):
    """Compute 256-bin histograms of the first image in a batch.

    Outputs, in order: RGB composite, luminance, red, green and blue.
    """

    @classmethod
    def define_schema(cls):
        """Declare one image input and five histogram outputs."""
        return io.Schema(
            node_id="ImageHistogram",
            display_name="Image Histogram",
            category="utils",
            inputs=[
                io.Image.Input("image"),
            ],
            outputs=[
                io.Histogram.Output("rgb"),
                io.Histogram.Output("luminance"),
                io.Histogram.Output("red"),
                io.Histogram.Output("green"),
                io.Histogram.Output("blue"),
            ],
        )

    @classmethod
    def execute(cls, image) -> io.NodeOutput:
        """Return the five histograms as lists of 256 integer bin counts.

        Only ``image[0]`` is analysed; channels 0, 1 and 2 of its last axis
        are read as red, green and blue.
        NOTE(review): assumes pixel values are in [0, 1] - confirm with callers.
        """
        # Cast to float32 first: Tensor.numpy() rejects bfloat16, and float16
        # would lose precision in the luminance sum below.
        img = image[0].float().cpu().numpy()
        img_uint8 = np.clip(img * 255, 0, 255).astype(np.uint8)

        def bincount(data):
            # minlength pads to 256 bins; uint8 data cannot exceed bin 255.
            return np.bincount(data.ravel(), minlength=256)[:256]

        hist_r = bincount(img_uint8[:, :, 0])
        hist_g = bincount(img_uint8[:, :, 1])
        hist_b = bincount(img_uint8[:, :, 2])

        # Average of R, G, B histograms (same as Photoshop's RGB composite)
        rgb = ((hist_r + hist_g + hist_b) // 3).tolist()

        # ITU-R BT.709-6, Item 3.2 (p.6) — Derivation of luminance signal
        # https://www.itu.int/rec/R-REC-BT.709-6-201506-I/en
        lum = 0.2126 * img[:, :, 0] + 0.7152 * img[:, :, 1] + 0.0722 * img[:, :, 2]
        luminance = bincount(np.clip(lum * 255, 0, 255).astype(np.uint8)).tolist()

        return io.NodeOutput(
            rgb,
            luminance,
            hist_r.tolist(),
            hist_g.tolist(),
            hist_b.tolist(),
        )
|
||||
|
||||
|
||||
class CurveExtension(ComfyExtension):
|
||||
@override
|
||||
async def get_node_list(self):
|
||||
return [CurveEditor]
|
||||
return [CurveEditor, ImageHistogram]
|
||||
|
||||
|
||||
async def comfy_entrypoint():
|
||||
|
||||
@@ -11,7 +11,7 @@ class PreviewAny():
|
||||
"required": {"source": (IO.ANY, {})},
|
||||
}
|
||||
|
||||
RETURN_TYPES = ()
|
||||
RETURN_TYPES = (IO.STRING,)
|
||||
FUNCTION = "main"
|
||||
OUTPUT_NODE = True
|
||||
|
||||
@@ -33,7 +33,7 @@ class PreviewAny():
|
||||
except Exception:
|
||||
value = 'source exists, but could not be serialized.'
|
||||
|
||||
return {"ui": {"text": (value,)}}
|
||||
return {"ui": {"text": (value,)}, "result": (value,)}
|
||||
|
||||
NODE_CLASS_MAPPINGS = {
|
||||
"PreviewAny": PreviewAny,
|
||||
|
||||
156
comfy_extras/nodes_rtdetr.py
Normal file
156
comfy_extras/nodes_rtdetr.py
Normal file
@@ -0,0 +1,156 @@
|
||||
from typing_extensions import override
|
||||
|
||||
import torch
|
||||
from comfy.ldm.rt_detr.rtdetr_v4 import COCO_CLASSES
|
||||
import comfy.model_management
|
||||
import comfy.utils
|
||||
from comfy_api.latest import ComfyExtension, io
|
||||
from torchvision.transforms import ToPILImage, ToTensor
|
||||
from PIL import ImageDraw, ImageFont
|
||||
|
||||
|
||||
class RTDETR_detect(io.ComfyNode):
    """Run an RT-DETR model on a batch of images and output per-image bounding boxes."""

    @classmethod
    def define_schema(cls):
        """Declare model, image, threshold, class filter and detection-limit inputs; one bbox output."""
        node_inputs = [
            io.Model.Input("model", display_name="model"),
            io.Image.Input("image", display_name="image"),
            io.Float.Input("threshold", display_name="threshold", default=0.5),
            io.Combo.Input("class_name", options=["all"] + COCO_CLASSES, default="all", tooltip="Filter detections by class. Set to 'all' to disable filtering."),
            io.Int.Input("max_detections", display_name="max_detections", default=100, tooltip="Maximum number of detections to return per image. In order of descending confidence score."),
        ]
        return io.Schema(
            node_id="RTDETR_detect",
            display_name="RT-DETR Detect",
            category="detection/",
            search_aliases=["bbox", "bounding box", "object detection", "coco"],
            inputs=node_inputs,
            outputs=[io.BoundingBox.Output("bboxes")],
        )

    @classmethod
    def execute(cls, model, image, threshold, class_name, max_detections) -> io.NodeOutput:
        """Detect objects and return one list of bbox dicts per input image.

        Each dict has the keys ``x``, ``y``, ``width``, ``height``, ``label``
        and ``score``. Detections scoring at or below ``threshold`` are
        dropped, the rest are optionally filtered by ``class_name``, sorted by
        descending score and cut to ``max_detections`` entries.
        """
        batch_size, height, width, _channels = image.shape

        comfy.model_management.load_model_gpu(model)

        # Feed the model 32 images at a time, each resized to 640x640.
        chunk_size = 32
        raw_detections = []
        for start in range(0, batch_size, chunk_size):
            frames = image[start:start + chunk_size]
            resized = comfy.utils.common_upscale(frames.movedim(-1, 1), 640, 640, "bilinear", crop="disabled")
            raw_detections.extend(model.model.diffusion_model(resized, (width, height)))

        keep_every_class = class_name == "all"
        per_image_boxes = []
        for det in raw_detections:
            mask = det['scores'] > threshold
            kept_boxes = det['boxes'][mask].cpu()
            kept_labels = det['labels'][mask].cpu()
            kept_scores = det['scores'][mask].cpu()

            found = []
            for box, label, score in zip(kept_boxes, kept_labels, kept_scores):
                label_name = COCO_CLASSES[int(label)]
                if not keep_every_class and label_name != class_name:
                    continue
                found.append(
                    {
                        "x": float(box[0]),
                        "y": float(box[1]),
                        # Boxes arrive as (x1, y1, x2, y2); store width/height instead.
                        "width": float(box[2] - box[0]),
                        "height": float(box[3] - box[1]),
                        "label": label_name,
                        "score": float(score),
                    }
                )
            found.sort(key=lambda d: d["score"], reverse=True)
            per_image_boxes.append(found[:max_detections])

        return io.NodeOutput(per_image_boxes)
|
||||
|
||||
|
||||
class DrawBBoxes(io.ComfyNode):
    """Draw bounding boxes (and COCO labels with scores) onto images or onto blank canvases."""

    @classmethod
    def define_schema(cls):
        """Declare an optional image input, a required bbox input and one image output."""
        return io.Schema(
            node_id="DrawBBoxes",
            display_name="Draw BBoxes",
            category="detection/",
            search_aliases=["bbox", "bounding box", "object detection", "rt_detr", "visualize detections", "coco"],
            inputs=[
                io.Image.Input("image", optional=True),
                io.BoundingBox.Input("bboxes", force_input=True),
            ],
            outputs=[
                io.Image.Output("out_image"),
            ],
        )

    @classmethod
    def execute(cls, bboxes, image=None) -> io.NodeOutput:
        """Render ``bboxes`` and return the resulting image batch.

        ``bboxes`` may be a single dict, a flat list of dicts or a list of
        per-frame lists. Each dict needs ``x``, ``y``, ``width`` and
        ``height``; ``label`` and ``score`` are optional.

        With an ``image`` batch, a single frame of boxes is repeated for every
        image and the frame list is padded or cut to the batch size. Without
        an image, one black canvas is created per frame, sized to the largest
        box extent across all frames (640x640 when there are no boxes).
        """
        # Normalise to list[list[dict]].
        if isinstance(bboxes, dict):
            bboxes = [[bboxes]]
        elif not isinstance(bboxes, list) or not bboxes:
            bboxes = [[]]
        elif isinstance(bboxes[0], dict):
            bboxes = [bboxes]  # flat list → same detections for every image

        if image is None:
            # Size the batch from the boxes before fitting anything, so that
            # frames after the first are not discarded.
            B = len(bboxes)
            max_w = max((int(d["x"] + d["width"]) for frame in bboxes for d in frame), default=640)
            max_h = max((int(d["y"] + d["height"]) for frame in bboxes for d in frame), default=640)
            # A zero-sized canvas cannot be converted to a PIL image.
            image = torch.zeros((B, max(max_h, 1), max(max_w, 1), 3), dtype=torch.float32)
        else:
            # Fit the frame list to the image batch size.
            B = image.shape[0]
            if len(bboxes) == 1:
                bboxes = bboxes * B
            bboxes = (bboxes + [[]] * B)[:B]

        all_out_images = []
        for i in range(B):
            detections = bboxes[i]
            if detections:
                boxes = torch.tensor([[d["x"], d["y"], d["x"] + d["width"], d["y"] + d["height"]] for d in detections])
                # Labels outside COCO_CLASSES become None and are drawn without text.
                labels = [d.get("label") if d.get("label") in COCO_CLASSES else None for d in detections]
                scores = torch.tensor([d.get("score", 1.0) for d in detections])
            else:
                boxes = torch.zeros((0, 4))
                labels = []
                scores = torch.zeros((0,))

            pil_image = image[i].movedim(-1, 0)  # HWC -> CHW for ToPILImage
            img = ToPILImage()(pil_image)
            if detections:
                img = cls.draw_detections(img, boxes, labels, scores)
            all_out_images.append(ToTensor()(img).unsqueeze(0).movedim(1, -1))

        out_images = torch.cat(all_out_images, dim=0).to(comfy.model_management.intermediate_device())
        return io.NodeOutput(out_images)

    @classmethod
    def draw_detections(cls, img, boxes, labels, scores):
        """Draw each box outline, plus ``"<label> <score>"`` text for labelled boxes, and return ``img``.

        Boxes are drawn in ascending score order, so higher-scoring boxes end
        up on top. The colour is chosen from the label's index in
        ``COCO_CLASSES``; unlabelled boxes use the first colour.
        """
        draw = ImageDraw.Draw(img)
        try:
            font = ImageFont.truetype('arial.ttf', 16)
        except Exception:
            # Fall back to PIL's default font when arial.ttf cannot be loaded.
            font = ImageFont.load_default()
        colors = [(255,0,0),(0,200,0),(0,0,255),(255,165,0),(128,0,128),
                  (0,255,255),(255,20,147),(100,149,237)]
        for box, label, score in sorted(zip(boxes, labels, scores), key=lambda x: x[2].item()):
            x1, y1, x2, y2 = box.tolist()
            color_idx = COCO_CLASSES.index(label) if label is not None else 0
            c = colors[color_idx % len(colors)]
            draw.rectangle([x1, y1, x2, y2], outline=c, width=3)
            if label is not None:
                draw.text((x1 + 2, y1 + 2), f'{label} {score:.2f}', fill=c, font=font)
        return img
|
||||
|
||||
|
||||
class RTDETRExtension(ComfyExtension):
    """Extension exposing the RT-DETR detection and bbox drawing nodes."""

    @override
    async def get_node_list(self) -> list[type[io.ComfyNode]]:
        """Return the node classes provided by this extension."""
        node_classes: list[type[io.ComfyNode]] = [RTDETR_detect, DrawBBoxes]
        return node_classes
|
||||
|
||||
|
||||
async def comfy_entrypoint() -> RTDETRExtension:
    """Create and return a new ``RTDETRExtension`` instance."""
    extension = RTDETRExtension()
    return extension
|
||||
@@ -1,5 +1,6 @@
|
||||
import torch
|
||||
import comfy.utils
|
||||
import comfy.model_management
|
||||
import numpy as np
|
||||
import math
|
||||
import colorsys
|
||||
@@ -410,7 +411,9 @@ class SDPoseDrawKeypoints(io.ComfyNode):
|
||||
pose_outputs.append(canvas)
|
||||
|
||||
pose_outputs_np = np.stack(pose_outputs) if len(pose_outputs) > 1 else np.expand_dims(pose_outputs[0], 0)
|
||||
final_pose_output = torch.from_numpy(pose_outputs_np).float() / 255.0
|
||||
final_pose_output = torch.from_numpy(pose_outputs_np).to(
|
||||
device=comfy.model_management.intermediate_device(),
|
||||
dtype=comfy.model_management.intermediate_dtype()) / 255.0
|
||||
return io.NodeOutput(final_pose_output)
|
||||
|
||||
class SDPoseKeypointExtractor(io.ComfyNode):
|
||||
@@ -459,6 +462,27 @@ class SDPoseKeypointExtractor(io.ComfyNode):
|
||||
model_h = int(head.heatmap_size[0]) * 4 # e.g. 192 * 4 = 768
|
||||
model_w = int(head.heatmap_size[1]) * 4 # e.g. 256 * 4 = 1024
|
||||
|
||||
def _resize_to_model(imgs):
    """Aspect-preserving resize + zero-pad BHWC images to (model_h, model_w).

    The images are scaled by the largest factor that still fits inside the
    model input size and then centred on a zero-filled canvas.

    Returns (resized_bhwc, scale, pad_top, pad_left). The last three values
    are the ones needed to map model-space coordinates back onto the input.
    """
    h, w = imgs.shape[-3], imgs.shape[-2]
    scale = min(model_h / h, model_w / w)
    # Clamp to at least one pixel: with an extreme aspect ratio the short
    # side rounds to 0, which would request a zero-sized resize target.
    sh = max(1, int(round(h * scale)))
    sw = max(1, int(round(w * scale)))
    pt, pl = (model_h - sh) // 2, (model_w - sw) // 2
    chw = imgs.permute(0, 3, 1, 2).float()  # BHWC -> BCHW
    scaled = comfy.utils.common_upscale(chw, sw, sh, upscale_method="bilinear", crop="disabled")
    padded = torch.zeros(scaled.shape[0], scaled.shape[1], model_h, model_w, dtype=scaled.dtype, device=scaled.device)
    padded[:, :, pt:pt + sh, pl:pl + sw] = scaled
    return padded.permute(0, 2, 3, 1), scale, pt, pl  # BCHW -> BHWC
|
||||
|
||||
def _remap_keypoints(kp, scale, pad_top, pad_left, offset_x=0, offset_y=0):
    """Remap keypoints from model space back to original image space.

    Removes the padding offset, undoes the resize scale and then adds the
    optional (offset_x, offset_y) translation. The input is never modified;
    a new array is returned.

    Entries whose x coordinate is negative are treated as invalid and are
    set to -1 in the result instead of being remapped.
    """
    if isinstance(kp, np.ndarray) and np.issubdtype(kp.dtype, np.floating):
        kp = kp.copy()
    else:
        # Lists and non-floating arrays are promoted to float32 so the
        # division below is not silently truncated on assignment.
        kp = np.array(kp, dtype=np.float32)
    # Record invalid entries before the x coordinates are overwritten.
    invalid = kp[..., 0] < 0
    kp[..., 0] = (kp[..., 0] - pad_left) / scale + offset_x
    kp[..., 1] = (kp[..., 1] - pad_top) / scale + offset_y
    kp[invalid] = -1
    return kp
|
||||
|
||||
def _run_on_latent(latent_batch):
|
||||
"""Run one forward pass and return (keypoints_list, scores_list) for the batch."""
|
||||
nonlocal captured_feat
|
||||
@@ -504,36 +528,19 @@ class SDPoseKeypointExtractor(io.ComfyNode):
|
||||
if x2 <= x1 or y2 <= y1:
|
||||
continue
|
||||
|
||||
crop_h_px, crop_w_px = y2 - y1, x2 - x1
|
||||
crop = img[:, y1:y2, x1:x2, :] # (1, crop_h, crop_w, C)
|
||||
|
||||
# scale to fit inside (model_h, model_w) while preserving aspect ratio, then pad to exact model size.
|
||||
scale = min(model_h / crop_h_px, model_w / crop_w_px)
|
||||
scaled_h, scaled_w = int(round(crop_h_px * scale)), int(round(crop_w_px * scale))
|
||||
pad_top, pad_left = (model_h - scaled_h) // 2, (model_w - scaled_w) // 2
|
||||
|
||||
crop_chw = crop.permute(0, 3, 1, 2).float() # BHWC → BCHW
|
||||
scaled = comfy.utils.common_upscale(crop_chw, scaled_w, scaled_h, upscale_method="bilinear", crop="disabled")
|
||||
padded = torch.zeros(1, scaled.shape[1], model_h, model_w, dtype=scaled.dtype, device=scaled.device)
|
||||
padded[:, :, pad_top:pad_top + scaled_h, pad_left:pad_left + scaled_w] = scaled
|
||||
crop_resized = padded.permute(0, 2, 3, 1) # BCHW → BHWC
|
||||
crop_resized, scale, pad_top, pad_left = _resize_to_model(crop)
|
||||
|
||||
latent_crop = vae.encode(crop_resized)
|
||||
kp_batch, sc_batch = _run_on_latent(latent_crop)
|
||||
kp, sc = kp_batch[0], sc_batch[0] # (K, 2), coords in model pixel space
|
||||
|
||||
# remove padding offset, undo scale, offset to full-image coordinates.
|
||||
kp = kp.copy() if isinstance(kp, np.ndarray) else np.array(kp, dtype=np.float32)
|
||||
kp[..., 0] = (kp[..., 0] - pad_left) / scale + x1
|
||||
kp[..., 1] = (kp[..., 1] - pad_top) / scale + y1
|
||||
|
||||
kp = _remap_keypoints(kp_batch[0], scale, pad_top, pad_left, x1, y1)
|
||||
img_keypoints.append(kp)
|
||||
img_scores.append(sc)
|
||||
img_scores.append(sc_batch[0])
|
||||
else:
|
||||
# No bboxes for this image – run on the full image
|
||||
latent_img = vae.encode(img)
|
||||
img_resized, scale, pad_top, pad_left = _resize_to_model(img)
|
||||
latent_img = vae.encode(img_resized)
|
||||
kp_batch, sc_batch = _run_on_latent(latent_img)
|
||||
img_keypoints.append(kp_batch[0])
|
||||
img_keypoints.append(_remap_keypoints(kp_batch[0], scale, pad_top, pad_left))
|
||||
img_scores.append(sc_batch[0])
|
||||
|
||||
all_keypoints.append(img_keypoints)
|
||||
@@ -541,19 +548,16 @@ class SDPoseKeypointExtractor(io.ComfyNode):
|
||||
pbar.update(1)
|
||||
|
||||
else: # full-image mode, batched
|
||||
tqdm_pbar = tqdm(total=total_images, desc="Extracting keypoints")
|
||||
for batch_start in range(0, total_images, batch_size):
|
||||
batch_end = min(batch_start + batch_size, total_images)
|
||||
latent_batch = vae.encode(image[batch_start:batch_end])
|
||||
|
||||
for batch_start in tqdm(range(0, total_images, batch_size), desc="Extracting keypoints"):
|
||||
batch_resized, scale, pad_top, pad_left = _resize_to_model(image[batch_start:batch_start + batch_size])
|
||||
latent_batch = vae.encode(batch_resized)
|
||||
kp_batch, sc_batch = _run_on_latent(latent_batch)
|
||||
|
||||
for kp, sc in zip(kp_batch, sc_batch):
|
||||
all_keypoints.append([kp])
|
||||
all_keypoints.append([_remap_keypoints(kp, scale, pad_top, pad_left)])
|
||||
all_scores.append([sc])
|
||||
tqdm_pbar.update(1)
|
||||
|
||||
pbar.update(batch_end - batch_start)
|
||||
pbar.update(len(kp_batch))
|
||||
|
||||
openpose_frames = _to_openpose_frames(all_keypoints, all_scores, height, width)
|
||||
return io.NodeOutput(openpose_frames)
|
||||
@@ -661,6 +665,7 @@ class CropByBBoxes(io.ComfyNode):
|
||||
io.Int.Input("output_width", default=512, min=64, max=4096, step=8, tooltip="Width each crop is resized to."),
|
||||
io.Int.Input("output_height", default=512, min=64, max=4096, step=8, tooltip="Height each crop is resized to."),
|
||||
io.Int.Input("padding", default=0, min=0, max=1024, step=1, tooltip="Extra padding in pixels added on each side of the bbox before cropping."),
|
||||
io.Combo.Input("keep_aspect", options=["stretch", "pad"], default="stretch", tooltip="Whether to stretch the crop to fit the output size, or pad with black pixels to preserve aspect ratio."),
|
||||
],
|
||||
outputs=[
|
||||
io.Image.Output(tooltip="All crops stacked into a single image batch."),
|
||||
@@ -668,7 +673,7 @@ class CropByBBoxes(io.ComfyNode):
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def execute(cls, image, bboxes, output_width, output_height, padding) -> io.NodeOutput:
|
||||
def execute(cls, image, bboxes, output_width, output_height, padding, keep_aspect="stretch") -> io.NodeOutput:
|
||||
total_frames = image.shape[0]
|
||||
img_h = image.shape[1]
|
||||
img_w = image.shape[2]
|
||||
@@ -716,7 +721,19 @@ class CropByBBoxes(io.ComfyNode):
|
||||
x1, y1, x2, y2 = fb_x1, fb_y1, fb_x2, fb_y2
|
||||
|
||||
crop_chw = frame_chw[:, :, y1:y2, x1:x2] # (1, C, crop_h, crop_w)
|
||||
resized = comfy.utils.common_upscale(crop_chw, output_width, output_height, upscale_method="bilinear", crop="disabled")
|
||||
|
||||
if keep_aspect == "pad":
|
||||
crop_h, crop_w = y2 - y1, x2 - x1
|
||||
scale = min(output_width / crop_w, output_height / crop_h)
|
||||
scaled_w = int(round(crop_w * scale))
|
||||
scaled_h = int(round(crop_h * scale))
|
||||
scaled = comfy.utils.common_upscale(crop_chw, scaled_w, scaled_h, upscale_method="bilinear", crop="disabled")
|
||||
pad_left = (output_width - scaled_w) // 2
|
||||
pad_top = (output_height - scaled_h) // 2
|
||||
resized = torch.zeros(1, num_ch, output_height, output_width, dtype=image.dtype, device=image.device)
|
||||
resized[:, :, pad_top:pad_top + scaled_h, pad_left:pad_left + scaled_w] = scaled
|
||||
else: # "stretch"
|
||||
resized = comfy.utils.common_upscale(crop_chw, output_width, output_height, upscale_method="bilinear", crop="disabled")
|
||||
crops.append(resized)
|
||||
|
||||
if not crops:
|
||||
|
||||
@@ -9,9 +9,9 @@ class StringConcatenate(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="StringConcatenate",
|
||||
display_name="Concatenate",
|
||||
display_name="Text Concatenate",
|
||||
category="utils/string",
|
||||
search_aliases=["text concat", "join text", "merge text", "combine strings", "concat", "concatenate", "append text", "combine text", "string"],
|
||||
search_aliases=["Concatenate", "text concat", "join text", "merge text", "combine strings", "concat", "concatenate", "append text", "combine text", "string"],
|
||||
inputs=[
|
||||
io.String.Input("string_a", multiline=True),
|
||||
io.String.Input("string_b", multiline=True),
|
||||
@@ -32,8 +32,8 @@ class StringSubstring(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="StringSubstring",
|
||||
search_aliases=["extract text", "text portion"],
|
||||
display_name="Substring",
|
||||
search_aliases=["Substring", "extract text", "text portion"],
|
||||
display_name="Text Substring",
|
||||
category="utils/string",
|
||||
inputs=[
|
||||
io.String.Input("string", multiline=True),
|
||||
@@ -55,8 +55,8 @@ class StringLength(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="StringLength",
|
||||
search_aliases=["character count", "text size"],
|
||||
display_name="Length",
|
||||
search_aliases=["character count", "text size", "string length"],
|
||||
display_name="Text Length",
|
||||
category="utils/string",
|
||||
inputs=[
|
||||
io.String.Input("string", multiline=True),
|
||||
@@ -76,8 +76,8 @@ class CaseConverter(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="CaseConverter",
|
||||
search_aliases=["text case", "uppercase", "lowercase", "capitalize"],
|
||||
display_name="Case Converter",
|
||||
search_aliases=["Case Converter", "text case", "uppercase", "lowercase", "capitalize"],
|
||||
display_name="Text Case Converter",
|
||||
category="utils/string",
|
||||
inputs=[
|
||||
io.String.Input("string", multiline=True),
|
||||
@@ -109,8 +109,8 @@ class StringTrim(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="StringTrim",
|
||||
search_aliases=["clean whitespace", "remove whitespace"],
|
||||
display_name="Trim",
|
||||
search_aliases=["Trim", "clean whitespace", "remove whitespace", "strip"],
|
||||
display_name="Text Trim",
|
||||
category="utils/string",
|
||||
inputs=[
|
||||
io.String.Input("string", multiline=True),
|
||||
@@ -140,8 +140,8 @@ class StringReplace(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="StringReplace",
|
||||
search_aliases=["find and replace", "substitute", "swap text"],
|
||||
display_name="Replace",
|
||||
search_aliases=["Replace", "find and replace", "substitute", "swap text"],
|
||||
display_name="Text Replace",
|
||||
category="utils/string",
|
||||
inputs=[
|
||||
io.String.Input("string", multiline=True),
|
||||
@@ -163,8 +163,8 @@ class StringContains(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="StringContains",
|
||||
search_aliases=["text includes", "string includes"],
|
||||
display_name="Contains",
|
||||
search_aliases=["Contains", "text includes", "string includes"],
|
||||
display_name="Text Contains",
|
||||
category="utils/string",
|
||||
inputs=[
|
||||
io.String.Input("string", multiline=True),
|
||||
@@ -191,8 +191,8 @@ class StringCompare(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="StringCompare",
|
||||
search_aliases=["text match", "string equals", "starts with", "ends with"],
|
||||
display_name="Compare",
|
||||
search_aliases=["Compare", "text match", "string equals", "starts with", "ends with"],
|
||||
display_name="Text Compare",
|
||||
category="utils/string",
|
||||
inputs=[
|
||||
io.String.Input("string_a", multiline=True),
|
||||
@@ -227,8 +227,8 @@ class RegexMatch(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="RegexMatch",
|
||||
search_aliases=["pattern match", "text contains", "string match"],
|
||||
display_name="Regex Match",
|
||||
search_aliases=["Regex Match", "regex", "pattern match", "text contains", "string match"],
|
||||
display_name="Text Match",
|
||||
category="utils/string",
|
||||
inputs=[
|
||||
io.String.Input("string", multiline=True),
|
||||
@@ -268,8 +268,8 @@ class RegexExtract(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="RegexExtract",
|
||||
search_aliases=["pattern extract", "text parser", "parse text"],
|
||||
display_name="Regex Extract",
|
||||
search_aliases=["Regex Extract", "regex", "pattern extract", "text parser", "parse text"],
|
||||
display_name="Text Extract Substring",
|
||||
category="utils/string",
|
||||
inputs=[
|
||||
io.String.Input("string", multiline=True),
|
||||
@@ -343,8 +343,8 @@ class RegexReplace(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="RegexReplace",
|
||||
search_aliases=["pattern replace", "find and replace", "substitution"],
|
||||
display_name="Regex Replace",
|
||||
search_aliases=["Regex Replace", "regex", "pattern replace", "regex replace", "substitution"],
|
||||
display_name="Text Replace (Regex)",
|
||||
category="utils/string",
|
||||
description="Find and replace text using regex patterns.",
|
||||
inputs=[
|
||||
|
||||
@@ -35,6 +35,7 @@ class TextGenerate(io.ComfyNode):
|
||||
io.Int.Input("max_length", default=256, min=1, max=2048),
|
||||
io.DynamicCombo.Input("sampling_mode", options=sampling_options, display_name="Sampling Mode"),
|
||||
io.Boolean.Input("thinking", optional=True, default=False, tooltip="Operate in thinking mode if the model supports it."),
|
||||
io.Boolean.Input("use_default_template", optional=True, default=True, tooltip="Use the built in system prompt/template if the model has one.", advanced=True),
|
||||
],
|
||||
outputs=[
|
||||
io.String.Output(display_name="generated_text"),
|
||||
@@ -42,9 +43,9 @@ class TextGenerate(io.ComfyNode):
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def execute(cls, clip, prompt, max_length, sampling_mode, image=None, thinking=False) -> io.NodeOutput:
|
||||
def execute(cls, clip, prompt, max_length, sampling_mode, image=None, thinking=False, use_default_template=True) -> io.NodeOutput:
|
||||
|
||||
tokens = clip.tokenize(prompt, image=image, skip_template=False, min_length=1, thinking=thinking)
|
||||
tokens = clip.tokenize(prompt, image=image, skip_template=not use_default_template, min_length=1, thinking=thinking)
|
||||
|
||||
# Get sampling parameters from dynamic combo
|
||||
do_sample = sampling_mode.get("sampling_mode") == "on"
|
||||
|
||||
@@ -6,6 +6,7 @@ import comfy.utils
|
||||
import folder_paths
|
||||
from typing_extensions import override
|
||||
from comfy_api.latest import ComfyExtension, io
|
||||
import comfy.model_management
|
||||
|
||||
try:
|
||||
from spandrel_extra_arches import EXTRA_REGISTRY
|
||||
@@ -78,13 +79,15 @@ class ImageUpscaleWithModel(io.ComfyNode):
|
||||
tile = 512
|
||||
overlap = 32
|
||||
|
||||
output_device = comfy.model_management.intermediate_device()
|
||||
|
||||
oom = True
|
||||
try:
|
||||
while oom:
|
||||
try:
|
||||
steps = in_img.shape[0] * comfy.utils.get_tiled_scale_steps(in_img.shape[3], in_img.shape[2], tile_x=tile, tile_y=tile, overlap=overlap)
|
||||
pbar = comfy.utils.ProgressBar(steps)
|
||||
s = comfy.utils.tiled_scale(in_img, lambda a: upscale_model(a), tile_x=tile, tile_y=tile, overlap=overlap, upscale_amount=upscale_model.scale, pbar=pbar)
|
||||
s = comfy.utils.tiled_scale(in_img, lambda a: upscale_model(a.float()), tile_x=tile, tile_y=tile, overlap=overlap, upscale_amount=upscale_model.scale, pbar=pbar, output_device=output_device)
|
||||
oom = False
|
||||
except Exception as e:
|
||||
model_management.raise_non_oom(e)
|
||||
@@ -94,7 +97,7 @@ class ImageUpscaleWithModel(io.ComfyNode):
|
||||
finally:
|
||||
upscale_model.to("cpu")
|
||||
|
||||
s = torch.clamp(s.movedim(-3,-1), min=0, max=1.0)
|
||||
s = torch.clamp(s.movedim(-3,-1), min=0, max=1.0).to(comfy.model_management.intermediate_dtype())
|
||||
return io.NodeOutput(s)
|
||||
|
||||
upscale = execute # TODO: remove
|
||||
|
||||
@@ -1,3 +1,3 @@
|
||||
# This file is automatically generated by the build process when version is
|
||||
# updated in pyproject.toml.
|
||||
__version__ = "0.18.1"
|
||||
__version__ = "0.19.1"
|
||||
|
||||
1
nodes.py
1
nodes.py
@@ -2457,6 +2457,7 @@ async def init_builtin_extra_nodes():
|
||||
"nodes_number_convert.py",
|
||||
"nodes_painter.py",
|
||||
"nodes_curve.py",
|
||||
"nodes_rtdetr.py"
|
||||
]
|
||||
|
||||
import_failed = []
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[project]
|
||||
name = "ComfyUI"
|
||||
version = "0.18.1"
|
||||
version = "0.19.1"
|
||||
readme = "README.md"
|
||||
license = { file = "LICENSE" }
|
||||
requires-python = ">=3.10"
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
comfyui-frontend-package==1.42.8
|
||||
comfyui-workflow-templates==0.9.39
|
||||
comfyui-frontend-package==1.42.11
|
||||
comfyui-workflow-templates==0.9.54
|
||||
comfyui-embedded-docs==0.4.3
|
||||
torch
|
||||
torchsde
|
||||
|
||||
@@ -146,6 +146,10 @@ def is_loopback(host):
|
||||
def create_origin_only_middleware():
|
||||
@web.middleware
|
||||
async def origin_only_middleware(request: web.Request, handler):
|
||||
if 'Sec-Fetch-Site' in request.headers:
|
||||
sec_fetch_site = request.headers['Sec-Fetch-Site']
|
||||
if sec_fetch_site == 'cross-site':
|
||||
return web.Response(status=403)
|
||||
#this code is used to prevent the case where a random website can queue comfy workflows by making a POST to 127.0.0.1 which browsers don't prevent for some dumb reason.
|
||||
#in that case the Host and Origin hostnames won't match
|
||||
#I know the proper fix would be to add a cookie but this should take care of the problem in the meantime
|
||||
|
||||
Reference in New Issue
Block a user