fix(kt): synchronize INT4 double-buffer slot reuse in fallback prefill

2026-04-20 14:29:32 +00:00 · 2026-03-03 03:45:30 +00:00
parent f1a12b9a93
commit b3356b6c46
1 changed file with 5 additions and 0 deletions
--- a/python/sglang/srt/layers/moe/kt_ep_wrapper.py
+++ b/python/sglang/srt/layers/moe/kt_ep_wrapper.py
@@ -697,6 +697,11 @@ class SharedFullContext:
            if do_write:
                wrapper.sync_write_weight_scale_to_buffer()
                if e + 1 < num_experts:
+                    # Slot (e+1)%2 is the same slot as (e-1)%2, so wait on
+                    # events[e - 1] (the previous copy out of that slot) before
+                    # refilling it; else pinned host memory is overwritten mid-DMA.
+                    if e > 0:
+                        events[e - 1].synchronize()
                    submit_write_expert(e + 1)

            # Barrier to ensure all ranks see the written data