diff --git a/python/mscclpp/ext/ep/buffer.py b/python/mscclpp/ext/ep/buffer.py
index 3164048f..c747f750 100644
--- a/python/mscclpp/ext/ep/buffer.py
+++ b/python/mscclpp/ext/ep/buffer.py
@@ -17,8 +17,9 @@ Current status (see ``src/ext/ep/README.md``):
 * Internode HT (MSCCL++ PortChannel + MemoryChannel) dispatch and combine:
   ported and validated on 2 nodes x 8 H100 GPUs with
   ``test/python/ext/ep/test_internode_multirank.py``.
-* Internode low-latency kernels: structural port (NVSHMEM/IBGDA ->
-  MSCCL++ PortChannel), **untested on multi-node H100**.
+* Internode low-latency kernels (NVSHMEM/IBGDA -> MSCCL++ PortChannel):
+  ported and validated on 2 nodes x 8 H100 GPUs with
+  ``test/python/ext/ep/test_low_latency_multirank.py``.
 """
 
 from __future__ import annotations
diff --git a/src/ext/ep/event.hpp b/src/ext/ep/event.hpp
index 1cfc828e..d5a77526 100644
--- a/src/ext/ep/event.hpp
+++ b/src/ext/ep/event.hpp
@@ -1,5 +1,6 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT License.
+#pragma once
 
 #include <ATen/cuda/CUDAContext.h>
 #include <memory>
diff --git a/src/ext/ep/kernels/internode_ll.cu b/src/ext/ep/kernels/internode_ll.cu
index 47b3a279..613f9e63 100644
--- a/src/ext/ep/kernels/internode_ll.cu
+++ b/src/ext/ep/kernels/internode_ll.cu
@@ -22,9 +22,9 @@
 //     position in the connected-peer map. In the recommended 1-GPU-per-node
 //     LL topology, `peer_idx == dst_rank`; see src/ext/ep/README.md.
 //
-// WARNING: This port is untested on multi-node H100; performance will NOT
-// match IBGDA (host-proxy adds latency). Functional correctness needs
-// validation on real hardware.
+// Validated on 2 nodes x 8 H100 GPUs via
+// `test/python/ext/ep/test_low_latency_multirank.py`. Performance does NOT
+// match IBGDA (host-proxy adds latency); see src/ext/ep/README.md for measurements.
 
 #include "configs.cuh"
 #include "exception.cuh"