Update README. (#12931 )

comfy aimdo 0.2.11 + Improved RAM Pressure release strategies - Windows speedups (#12925 )
* Implement seek and read for pins: Sourcing pins from an mmap is bad because it's a CPU->CPU copy that attempts to fully buffer the same data twice. Instead, use seek and read, which avoids the mmap buffering while usually being a faster read in the first place (avoiding mmap faulting etc). * pinned_memory: Use Aimdo pinner: The aimdo pinner bypasses PyTorch's CPU allocator, which can leak Windows commit charge. * ops: bypass init() of weight for embedding layer: This similarly consumes a large commit charge, especially for TEs. It can cause a permanently leaked commit charge, which can destabilize systems close to the commit ceiling and generally confuses the RAM stats. * model_patcher: implement pinned memory counter: Implement a pinned memory counter for better accounting of what volume of memory pins have. * implement touch accounting: Implement accounting of touching mmapped tensors. * mm+mp: add residency mmap getter * utils: use the aimdo mmap to load sft files * model_management: Implement tighter RAM pressure semantics: Implement a pressure release on entire MMAPs, as Windows does perform faster when mmaps are unloaded and model loads free ramp into fully unallocated RAM. Make the concept of freeing for pins a completely separate concept. Now that pins are loadable directly from the original file and don't touch the mmap, tighten the freeing budget to just the current loaded model minus what you have left over. This still over-frees pins, but it's a lot better than before. So after the pins are freed with that algorithm, bounce entire MMAPs to free RAM based on what the model needs, deducting any known resident-in-mmap tensors from the free quota to keep it as tight as possible. * comfy-aimdo 0.2.11: Comfy aimdo 0.2.11 * mm: Implement file_slice path for QT * ruff * ops: put meta-tensors in place to allow custom nodes to check geo
2026-03-14 09:38:05 +00:00 · 2026-03-13 22:33:28 -04:00 · 2026-03-13 22:18:08 -04:00 · 2026-03-13 20:14:27 -04:00
12 changed files with 278 additions and 63 deletions
--- a/README.md
+++ b/README.md
@@ -38,6 +38,8 @@ ComfyUI lets you design and execute advanced stable diffusion pipelines using a
 ## Get Started
 ### Local
 #### [Desktop Application](https://www.comfy.org/download)
 - The easiest way to get started.
 - Available on Windows & macOS.
@@ -49,8 +51,13 @@ ComfyUI lets you design and execute advanced stable diffusion pipelines using a
 #### [Manual Install](#manual-install-windows-linux)
 Supports all operating systems and GPU types (NVIDIA, AMD, Intel, Apple Silicon, Ascend).
-## [Examples](https://comfyanonymous.github.io/ComfyUI_examples/)
+### Cloud
-See what ComfyUI can do with the [example workflows](https://comfyanonymous.github.io/ComfyUI_examples/).
+
 #### [Comfy Cloud](https://www.comfy.org/cloud)
 - Our official paid cloud version for those who can't afford local hardware.
 ## Examples
 See what ComfyUI can do with the [newer template workflows](https://comfy.org/workflows) or old [example workflows](https://comfyanonymous.github.io/ComfyUI_examples/).
 ## Features
 - Nodes/graph/flowchart interface to experiment and create complex Stable Diffusion workflows without needing to code anything.
--- a/comfy/memory_management.py
+++ b/comfy/memory_management.py
@@ -1,9 +1,68 @@
 import math
 import ctypes
 import threading
 import dataclasses
 import torch
 from typing import NamedTuple
 from comfy.quant_ops import QuantizedTensor
 class TensorFileSlice(NamedTuple):
    # Describes where one tensor's raw bytes live inside an open file, so they
    # can be fetched with seek()/readinto() by read_tensor_file_slice_into().
    # File object that seek() and readinto() are called on; None disables the read path.
    file_ref: object
    # threading.get_ident() value; reads are refused from any other thread.
    thread_id: int
    # Position passed to file_ref.seek() before reading.
    offset: int
    # Number of bytes to read starting at offset.
    size: int
 def read_tensor_file_slice_into(tensor, destination):
    """Fill ``destination`` with ``tensor``'s bytes by reading them from its backing file.

    The source location comes from the ``_comfy_tensor_file_slice`` attribute
    (a ``TensorFileSlice``) attached to the tensor's untyped storage. For a
    ``QuantizedTensor`` the quantized payload (``_qdata``) is read this way and
    the quantization params are then copied across, keeping the destination's
    own ``orig_dtype``.

    Args:
        tensor: Source tensor (plain or ``QuantizedTensor``).
        destination: Tensor to overwrite. Must be on the CPU and at least
            ``info.size`` bytes large.

    Returns:
        True if ``destination`` now holds the data. False if this path does
        not apply (no slice info, non-CPU destination, different thread,
        destination too small, mismatched quantized layout) or the file read
        failed. After a failed read ``destination`` may be partially
        overwritten, so the caller is expected to fall back to a regular copy.
    """
    if isinstance(tensor, QuantizedTensor):
        if not isinstance(destination, QuantizedTensor):
            return False
        if tensor._layout_cls != destination._layout_cls:
            return False
        if not read_tensor_file_slice_into(tensor._qdata, destination._qdata):
            return False
        # Take the source's params but preserve the destination's orig_dtype.
        dst_orig_dtype = destination._params.orig_dtype
        destination._params.copy_from(tensor._params, non_blocking=False)
        destination._params = dataclasses.replace(destination._params, orig_dtype=dst_orig_dtype)
        return True

    info = getattr(tensor.untyped_storage(), "_comfy_tensor_file_slice", None)
    if info is None:
        return False

    file_obj = info.file_ref
    if (destination.device.type != "cpu"
            or file_obj is None
            or threading.get_ident() != info.thread_id
            or destination.numel() * destination.element_size() < info.size):
        return False

    if info.size == 0:
        return True

    # Expose the destination's memory as a writable byte buffer for readinto().
    buf_type = ctypes.c_ubyte * info.size
    view = memoryview(buf_type.from_address(destination.data_ptr()))
    try:
        file_obj.seek(info.offset)
        done = 0
        while done < info.size:
            n = file_obj.readinto(view[done:])
            # A raw stream may return None (no data available in non-blocking
            # mode) and 0 signals EOF; neither can complete the read.
            if n is None or n <= 0:
                return False
            done += n
        return True
    except OSError:
        # Covers failures from both seek() and readinto().
        return False
    finally:
        view.release()
 class TensorGeometry(NamedTuple):
    shape: any
    dtype: torch.dtype
--- a/comfy/model_management.py
+++ b/comfy/model_management.py
@@ -505,6 +505,28 @@ def module_size(module):
        module_mem += t.nbytes
    return module_mem
 def module_mmap_residency(module, free=False):
    """Measure how much of ``module``'s state dict is flagged as mmap-touched.

    A tensor counts as touched when its untyped storage (the ``_qdata``
    storage for quantized tensors) has a truthy ``_comfy_tensor_mmap_touched``
    attribute.

    Args:
        module: Object exposing ``state_dict()``.
        free: When True, clear the touched flag on every touched storage and
            call ``bounce()`` once on each distinct backing mmap object
            (the first entry of ``_comfy_tensor_mmap_refs``).

    Returns:
        ``(touched_bytes, total_bytes)`` summed over ``nbytes`` of the state
        dict tensors.
    """
    touched_bytes = 0
    total_bytes = 0
    already_bounced = set()
    for tensor in module.state_dict().values():
        total_bytes += tensor.nbytes
        if isinstance(tensor, comfy.quant_ops.QuantizedTensor):
            storage = tensor._qdata.untyped_storage()
        else:
            storage = tensor.untyped_storage()
        if getattr(storage, "_comfy_tensor_mmap_touched", False):
            touched_bytes += tensor.nbytes
            if free:
                storage._comfy_tensor_mmap_touched = False
                backing_mmap = storage._comfy_tensor_mmap_refs[0]
                # Several tensors share one mmap; bounce each mmap only once.
                if backing_mmap not in already_bounced:
                    backing_mmap.bounce()
                    already_bounced.add(backing_mmap)
    return touched_bytes, total_bytes
 class LoadedModel:
    def __init__(self, model):
        self._set_model(model)
@@ -532,6 +554,9 @@ class LoadedModel:
    def model_memory(self):
        return self.model.model_size()
    def model_mmap_residency(self, free=False):
        """Forward the mmap residency query to the wrapped model.

        Returns whatever ``self.model.model_mmap_residency`` returns for the
        same ``free`` flag.
        """
        residency = self.model.model_mmap_residency(free=free)
        return residency
    def model_loaded_memory(self):
        return self.model.loaded_size()
@@ -633,7 +658,7 @@ def extra_reserved_memory():
 def minimum_inference_memory():
    return (1024 * 1024 * 1024) * 0.8 + extra_reserved_memory()
-def free_memory(memory_required, device, keep_loaded=[], for_dynamic=False, ram_required=0):
+def free_memory(memory_required, device, keep_loaded=[], for_dynamic=False, pins_required=0, ram_required=0):
    cleanup_models_gc()
    unloaded_model = []
    can_unload = []
@@ -646,13 +671,14 @@ def free_memory(memory_required, device, keep_loaded=[], for_dynamic=False, ram_
                can_unload.append((-shift_model.model_offloaded_memory(), sys.getrefcount(shift_model.model), shift_model.model_memory(), i))
                shift_model.currently_used = False
-    for x in sorted(can_unload):
+    can_unload_sorted = sorted(can_unload)
    for x in can_unload_sorted:
        i = x[-1]
        memory_to_free = 1e32
-        ram_to_free = 1e32
+        pins_to_free = 1e32
        if not DISABLE_SMART_MEMORY:
            memory_to_free = memory_required - get_free_memory(device)
-            ram_to_free = ram_required - get_free_ram()
+            pins_to_free = pins_required - get_free_ram()
            if current_loaded_models[i].model.is_dynamic() and for_dynamic:
                #don't actually unload dynamic models for the sake of other dynamic models
                #as that works on-demand.
@@ -661,9 +687,18 @@ def free_memory(memory_required, device, keep_loaded=[], for_dynamic=False, ram_
        if memory_to_free > 0 and current_loaded_models[i].model_unload(memory_to_free):
            logging.debug(f"Unloading {current_loaded_models[i].model.model.__class__.__name__}")
            unloaded_model.append(i)
-        if ram_to_free > 0:
+        if pins_to_free > 0:
            logging.debug(f"PIN Unloading {current_loaded_models[i].model.model.__class__.__name__}")
            current_loaded_models[i].model.partially_unload_ram(pins_to_free)
    for x in can_unload_sorted:
        i = x[-1]
        ram_to_free = ram_required - psutil.virtual_memory().available
        if ram_to_free <= 0 and i not in unloaded_model:
            continue
        resident_memory, _ = current_loaded_models[i].model_mmap_residency(free=True)
        if resident_memory > 0:
            logging.debug(f"RAM Unloading {current_loaded_models[i].model.model.__class__.__name__}")
            current_loaded_models[i].model.partially_unload_ram(ram_to_free)
    for i in sorted(unloaded_model, reverse=True):
        unloaded_models.append(current_loaded_models.pop(i))
@@ -729,17 +764,27 @@ def load_models_gpu(models, memory_required=0, force_patch_weights=False, minimu
    total_memory_required = {}
    total_pins_required = {}
    total_ram_required = {}
    for loaded_model in models_to_load:
-        total_memory_required[loaded_model.device] = total_memory_required.get(loaded_model.device, 0) + loaded_model.model_memory_required(loaded_model.device)
+        device = loaded_model.device
-        #x2, one to make sure the OS can fit the model for loading in disk cache, and for us to do any pinning we
+        total_memory_required[device] = total_memory_required.get(device, 0) + loaded_model.model_memory_required(device)
-        #want to do.
+        resident_memory, model_memory = loaded_model.model_mmap_residency()
-        #FIXME: This should subtract off the to_load current pin consumption.
+        pinned_memory = loaded_model.model.pinned_memory_size()
-        total_ram_required[loaded_model.device] = total_ram_required.get(loaded_model.device, 0) + loaded_model.model_memory() * 2
+        #FIXME: This can over-free the pins as it budgets to pin the entire model. We should
        #make this JIT to keep as much pinned as possible.
        pins_required = model_memory - pinned_memory
        ram_required = model_memory - resident_memory
        total_pins_required[device] = total_pins_required.get(device, 0) + pins_required
        total_ram_required[device] = total_ram_required.get(device, 0) + ram_required
    for device in total_memory_required:
        if device != torch.device("cpu"):
-            free_memory(total_memory_required[device] * 1.1 + extra_mem, device, for_dynamic=free_for_dynamic, ram_required=total_ram_required[device])
+            free_memory(total_memory_required[device] * 1.1 + extra_mem,
                        device,
                        for_dynamic=free_for_dynamic,
                        pins_required=total_pins_required[device],
                        ram_required=total_ram_required[device])
    for device in total_memory_required:
        if device != torch.device("cpu"):
@@ -1225,6 +1270,11 @@ def cast_to_gathered(tensors, r, non_blocking=False, stream=None):
            dest_view = dest_views.pop(0)
            if tensor is None:
                continue
            if comfy.memory_management.read_tensor_file_slice_into(tensor, dest_view):
                continue
            storage = tensor._qdata.untyped_storage() if isinstance(tensor, comfy.quant_ops.QuantizedTensor) else tensor.untyped_storage()
            if hasattr(storage, "_comfy_tensor_mmap_touched"):
                storage._comfy_tensor_mmap_touched = True
            dest_view.copy_(tensor, non_blocking=non_blocking)
--- a/comfy/model_patcher.py
+++ b/comfy/model_patcher.py
@@ -297,6 +297,9 @@ class ModelPatcher:
        self.size = comfy.model_management.module_size(self.model)
        return self.size
    def model_mmap_residency(self, free=False):
        """Report mmap residency for ``self.model``.

        Delegates to ``comfy.model_management.module_mmap_residency`` with the
        same ``free`` flag and returns its result unchanged.
        """
        residency = comfy.model_management.module_mmap_residency(self.model, free=free)
        return residency
    def get_ram_usage(self):
        return self.model_size()
@@ -1063,6 +1066,10 @@ class ModelPatcher:
            return self.model.model_loaded_weight_memory - current_used
    def pinned_memory_size(self):
        """Return the pinned memory size for this patcher, which is always 0 here.

        Pinned memory pressure tracking is only implemented for DynamicVram loading.
        """
        return 0
    def partially_unload_ram(self, ram_to_unload):
        pass
@@ -1653,6 +1660,16 @@ class ModelPatcherDynamic(ModelPatcher):
        return freed
    def pinned_memory_size(self):
        """Sum the byte sizes of the pin buffers attached to this model's modules.

        Walks the dynamic load list and adds ``numel() * element_size()`` for
        every module that ``comfy.pinned_memory.get_pin`` reports a pin for.
        """
        pinned_bytes = 0
        for _, _, _, _, mod, _ in self._load_list(for_dynamic=True):
            buf = comfy.pinned_memory.get_pin(mod)
            if buf is None:
                continue
            pinned_bytes += buf.numel() * buf.element_size()
        return pinned_bytes
    def partially_unload_ram(self, ram_to_unload):
        loading = self._load_list(for_dynamic=True, default_device=self.offload_device)
        for x in loading:
--- a/comfy/ops.py
+++ b/comfy/ops.py
@@ -306,6 +306,33 @@ class CastWeightBiasOp:
    bias_function = []
 class disable_weight_init:
    @staticmethod
    def _lazy_load_from_state_dict(module, state_dict, prefix, local_metadata,
                                   missing_keys, unexpected_keys, weight_shape,
                                   bias_shape=None):
        assign_to_params_buffers = local_metadata.get("assign_to_params_buffers", False)
        prefix_len = len(prefix)
        for k, v in state_dict.items():
            key = k[prefix_len:]
            if key == "weight":
                if not assign_to_params_buffers:
                    v = v.clone()
                module.weight = torch.nn.Parameter(v, requires_grad=False)
            elif bias_shape is not None and key == "bias" and v is not None:
                if not assign_to_params_buffers:
                    v = v.clone()
                module.bias = torch.nn.Parameter(v, requires_grad=False)
            else:
                unexpected_keys.append(k)
        if module.weight is None:
            module.weight = torch.nn.Parameter(torch.zeros(weight_shape), requires_grad=False)
            missing_keys.append(prefix + "weight")
        if bias_shape is not None and module.bias is None and getattr(module, "comfy_need_lazy_init_bias", False):
            module.bias = torch.nn.Parameter(torch.zeros(bias_shape), requires_grad=False)
            missing_keys.append(prefix + "bias")
    class Linear(torch.nn.Linear, CastWeightBiasOp):
        def __init__(self, in_features, out_features, bias=True, device=None, dtype=None):
@@ -333,29 +360,16 @@ class disable_weight_init:
            if not comfy.model_management.WINDOWS or not comfy.memory_management.aimdo_enabled:
                return super()._load_from_state_dict(state_dict, prefix, local_metadata, strict,
                                                     missing_keys, unexpected_keys, error_msgs)
-            assign_to_params_buffers = local_metadata.get("assign_to_params_buffers", False)
+            disable_weight_init._lazy_load_from_state_dict(
-            prefix_len = len(prefix)
+                self,
-            for k,v in state_dict.items():
+                state_dict,
-                if k[prefix_len:] == "weight":
+                prefix,
-                    if not assign_to_params_buffers:
+                local_metadata,
-                        v = v.clone()
+                missing_keys,
-                    self.weight = torch.nn.Parameter(v, requires_grad=False)
+                unexpected_keys,
-                elif k[prefix_len:] == "bias" and v is not None:
+                weight_shape=(self.in_features, self.out_features),
-                    if not assign_to_params_buffers:
+                bias_shape=(self.out_features,),
-                        v = v.clone()
+            )
                    self.bias = torch.nn.Parameter(v, requires_grad=False)
                else:
                    unexpected_keys.append(k)
            #Reconcile default construction of the weight if its missing.
            if self.weight is None:
                v = torch.zeros(self.in_features, self.out_features)
                self.weight = torch.nn.Parameter(v, requires_grad=False)
                missing_keys.append(prefix+"weight")
            if self.bias is None and self.comfy_need_lazy_init_bias:
                v = torch.zeros(self.out_features,)
                self.bias = torch.nn.Parameter(v, requires_grad=False)
                missing_keys.append(prefix+"bias")
        def reset_parameters(self):
@@ -547,6 +561,48 @@ class disable_weight_init:
                return super().forward(*args, **kwargs)
    class Embedding(torch.nn.Embedding, CastWeightBiasOp):
        def __init__(self, num_embeddings, embedding_dim, padding_idx=None, max_norm=None,
                     norm_type=2.0, scale_grad_by_freq=False, sparse=False, _weight=None,
                     _freeze=False, device=None, dtype=None):
            """Build the embedding, skipping weight allocation on the lazy path.

            When not on Windows or aimdo is not enabled, this defers entirely to
            ``torch.nn.Embedding.__init__``. Otherwise the module attributes are
            set by hand and the weight is a meta-device placeholder, so
            shape/dtype stay visible without reserving storage; the real weight
            is installed later by ``_load_from_state_dict``.

            An explicitly supplied ``_weight`` is used as the parameter data on
            the lazy path instead of being silently dropped. ``_freeze`` has no
            effect on the lazy path: the weight never requires grad.
            """
            if not comfy.model_management.WINDOWS or not comfy.memory_management.aimdo_enabled:
                super().__init__(num_embeddings, embedding_dim, padding_idx, max_norm,
                                 norm_type, scale_grad_by_freq, sparse, _weight,
                                 _freeze, device, dtype)
                return

            torch.nn.Module.__init__(self)
            self.num_embeddings = num_embeddings
            self.embedding_dim = embedding_dim
            self.padding_idx = padding_idx
            self.max_norm = max_norm
            self.norm_type = norm_type
            self.scale_grad_by_freq = scale_grad_by_freq
            self.sparse = sparse

            if _weight is not None:
                # Caller already owns the storage; adopting it allocates nothing new.
                weight_data = _weight
            else:
                # Keep shape/dtype visible for module introspection without reserving storage.
                embedding_dtype = dtype if dtype is not None else torch.get_default_dtype()
                weight_data = torch.empty((num_embeddings, embedding_dim), device="meta", dtype=embedding_dtype)
            self.weight = torch.nn.Parameter(weight_data, requires_grad=False)
            self.bias = None

            self.weight_comfy_model_dtype = dtype
        def _load_from_state_dict(self, state_dict, prefix, local_metadata,
                                strict, missing_keys, unexpected_keys, error_msgs):
            """Load the embedding weight, using the lazy helper on Windows with aimdo.

            On every other configuration this defers to the parent class
            implementation and returns its result.

            NOTE(review): on the lazy path ``__init__`` installs a meta-device
            weight, so the helper's ``module.weight is None`` fallback looks
            unreachable when the checkpoint lacks the weight key. Confirm that
            is intended.
            """
            use_lazy_path = comfy.model_management.WINDOWS and comfy.memory_management.aimdo_enabled
            if use_lazy_path:
                disable_weight_init._lazy_load_from_state_dict(
                    self,
                    state_dict,
                    prefix,
                    local_metadata,
                    missing_keys,
                    unexpected_keys,
                    weight_shape=(self.num_embeddings, self.embedding_dim),
                )
                return None
            return super()._load_from_state_dict(state_dict, prefix, local_metadata, strict,
                                                 missing_keys, unexpected_keys, error_msgs)
        def reset_parameters(self):
            """Perform no weight initialisation; only set ``bias`` to None."""
            self.bias = None
--- a/comfy/pinned_memory.py
+++ b/comfy/pinned_memory.py
@@ -1,6 +1,7 @@
 import torch
 import comfy.model_management
 import comfy.memory_management
 import comfy_aimdo.host_buffer
 import comfy_aimdo.torch
 from comfy.cli_args import args
@@ -12,18 +13,31 @@ def pin_memory(module):
        return
    #FIXME: This is a RAM cache trigger event
    size = comfy.memory_management.vram_aligned_size([ module.weight, module.bias ])
-    pin = torch.empty((size,), dtype=torch.uint8)
+
-    if comfy.model_management.pin_memory(pin):
+    if comfy.model_management.MAX_PINNED_MEMORY <= 0 or (comfy.model_management.TOTAL_PINNED_MEMORY + size) > comfy.model_management.MAX_PINNED_MEMORY:
        module._pin = pin
    else:
        module.pin_failed = True
        return False
    try:
        hostbuf = comfy_aimdo.host_buffer.HostBuffer(size)
    except RuntimeError:
        module.pin_failed = True
        return False
    module._pin = comfy_aimdo.torch.hostbuf_to_tensor(hostbuf)
    module._pin_hostbuf = hostbuf
    comfy.model_management.TOTAL_PINNED_MEMORY += size
    return True
 def unpin_memory(module):
    if get_pin(module) is None:
        return 0
    size = module._pin.numel() * module._pin.element_size()
-    comfy.model_management.unpin_memory(module._pin)
+
    comfy.model_management.TOTAL_PINNED_MEMORY -= size
    if comfy.model_management.TOTAL_PINNED_MEMORY < 0:
        comfy.model_management.TOTAL_PINNED_MEMORY = 0
    del module._pin
    del module._pin_hostbuf
    return size
--- a/comfy/utils.py
+++ b/comfy/utils.py
@@ -20,6 +20,8 @@
 import torch
 import math
 import struct
 import ctypes
 import os
 import comfy.memory_management
 import safetensors.torch
 import numpy as np
@@ -32,7 +34,7 @@ from einops import rearrange
 from comfy.cli_args import args
 import json
 import time
-import mmap
+import threading
 import warnings
 MMAP_TORCH_FILES = args.mmap_torch_files
@@ -81,14 +83,17 @@ _TYPES = {
 }
 def load_safetensors(ckpt):
-    f = open(ckpt, "rb")
+    import comfy_aimdo.model_mmap
    mapping = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
    mv = memoryview(mapping)
-    header_size = struct.unpack("<Q", mapping[:8])[0]
+    f = open(ckpt, "rb", buffering=0)
-    header = json.loads(mapping[8:8+header_size].decode("utf-8"))
+    model_mmap = comfy_aimdo.model_mmap.ModelMMAP(ckpt)
    file_size = os.path.getsize(ckpt)
    mv = memoryview((ctypes.c_uint8 * file_size).from_address(model_mmap.get()))
-    mv = mv[8 + header_size:]
+    header_size = struct.unpack("<Q", mv[:8])[0]
    header = json.loads(mv[8:8 + header_size].tobytes().decode("utf-8"))
    mv = mv[(data_base_offset := 8 + header_size):]
    sd = {}
    for name, info in header.items():
@@ -102,7 +107,14 @@ def load_safetensors(ckpt):
            with warnings.catch_warnings():
                #We are working with read-only RAM by design
                warnings.filterwarnings("ignore", message="The given buffer is not writable")
-                sd[name] = torch.frombuffer(mv[start:end], dtype=_TYPES[info["dtype"]]).view(info["shape"])
+                tensor = torch.frombuffer(mv[start:end], dtype=_TYPES[info["dtype"]]).view(info["shape"])
                storage = tensor.untyped_storage()
                setattr(storage,
                        "_comfy_tensor_file_slice",
                        comfy.memory_management.TensorFileSlice(f, threading.get_ident(), data_base_offset + start, end - start))
                setattr(storage, "_comfy_tensor_mmap_refs", (model_mmap, mv))
                setattr(storage, "_comfy_tensor_mmap_touched", False)
                sd[name] = tensor
    return sd, header.get("__metadata__", {}),
--- a/manager_requirements.txt
+++ b/manager_requirements.txt
@@ -1 +1 @@
-comfyui_manager==4.1b2
+comfyui_manager==4.1b4
--- a/middleware/cache_middleware.py
+++ b/middleware/cache_middleware.py
@@ -32,7 +32,7 @@ async def cache_control(
    )
    if request.path.endswith(".js") or request.path.endswith(".css") or is_entry_point:
-        response.headers.setdefault("Cache-Control", "no-store")
+        response.headers.setdefault("Cache-Control", "no-cache")
        return response
    # Early return for non-image files - no cache headers needed
--- a/requirements.txt
+++ b/requirements.txt
@@ -23,7 +23,7 @@ SQLAlchemy
 filelock
 av>=14.2.0
 comfy-kitchen>=0.2.8
-comfy-aimdo>=0.2.10
+comfy-aimdo>=0.2.11
 requests
 simpleeval>=1.0.0
 blake3
--- a/server.py
+++ b/server.py
@@ -310,7 +310,7 @@ class PromptServer():
        @routes.get("/")
        async def get_root(request):
            response = web.FileResponse(os.path.join(self.web_root, "index.html"))
-            response.headers['Cache-Control'] = 'no-store, must-revalidate'
+            response.headers['Cache-Control'] = 'no-cache'
            response.headers["Pragma"] = "no-cache"
            response.headers["Expires"] = "0"
            return response
--- a/tests-unit/server_test/test_cache_control.py
+++ b/tests-unit/server_test/test_cache_control.py
@@ -28,31 +28,31 @@ CACHE_SCENARIOS = [
    },
    # JavaScript/CSS scenarios
    {
-        "name": "js_no_store",
+        "name": "js_no_cache",
        "path": "/script.js",
        "status": 200,
-        "expected_cache": "no-store",
+        "expected_cache": "no-cache",
        "should_have_header": True,
    },
    {
-        "name": "css_no_store",
+        "name": "css_no_cache",
        "path": "/styles.css",
        "status": 200,
-        "expected_cache": "no-store",
+        "expected_cache": "no-cache",
        "should_have_header": True,
    },
    {
-        "name": "index_json_no_store",
+        "name": "index_json_no_cache",
        "path": "/api/index.json",
        "status": 200,
-        "expected_cache": "no-store",
+        "expected_cache": "no-cache",
        "should_have_header": True,
    },
    {
-        "name": "localized_index_json_no_store",
+        "name": "localized_index_json_no_cache",
        "path": "/templates/index.zh.json",
        "status": 200,
-        "expected_cache": "no-store",
+        "expected_cache": "no-cache",
        "should_have_header": True,
    },
    # Non-matching files
		`@@ -1 +1 @@`
			`comfyui_manager==4.1b2`				`comfyui_manager==4.1b4`