server : catch sampler/grammar exceptions to avoid process abort (#1725) (#1726)

Wrap the two slot-level sample/accept call sites in
try/catch (std::exception). On exception: log the error, send_error to
the task, release the slot, and continue serving. This matches the
existing try/catch around common_sampler_init in the same file.

Without this fix, llama_grammar_accept_token throwing
"Unexpected empty grammar stack after accepting piece: <pad> (0)"
(reproducible on Gemma 4 + json_schema + ctx_shift; see #1725)
unwinds out of update_slots -> queue start_loop -> main, where no
handler exists, so std::terminate is invoked and the whole server
process aborts.
This commit is contained in:
Heath Albritton
2026-05-02 22:21:09 -07:00
committed by GitHub
parent e76700119d
commit bc549da0f7

View File

@@ -3810,7 +3810,22 @@ void server_context::speculative_decoding_accept() {
apply_server_biases(slot);
// the accepted tokens from the speculation
const auto ids = common_sampler_sample_and_accept_n(slot.ctx_sampling, ctx, slot.i_batch_dft, slot.drafted);
std::vector<llama_token> ids;
try {
ids = common_sampler_sample_and_accept_n(slot.ctx_sampling, ctx, slot.i_batch_dft, slot.drafted);
} catch (const std::exception & e) {
LOG_ERROR("speculative sampling failed, releasing slot", {
{"id_slot", slot.id},
{"id_task", slot.id_task},
{"error", e.what()},
});
send_error(slot, std::string("sampling error: ") + e.what(), ERROR_TYPE_SERVER);
slot.release();
slot.i_batch = -1;
slot.i_batch_dft.clear();
slot.drafted.clear();
continue;
}
int32_t mtp_n_past_base = 0;
std::vector<float> mtp_hidden_state_pre;
@@ -4320,9 +4335,21 @@ void server_context::process_batch_tokens(int32_t & n_batch) {
apply_server_biases(slot);
const llama_token id = common_sampler_sample(slot.ctx_sampling, ctx, tok_idx);
common_sampler_accept(slot.ctx_sampling, ctx, id, true);
llama_token id;
try {
id = common_sampler_sample(slot.ctx_sampling, ctx, tok_idx);
common_sampler_accept(slot.ctx_sampling, ctx, id, true);
} catch (const std::exception & e) {
LOG_ERROR("sampling failed, releasing slot", {
{"id_slot", slot.id},
{"id_task", slot.id_task},
{"error", e.what()},
});
send_error(slot, std::string("sampling error: ") + e.what(), ERROR_TYPE_SERVER);
slot.release();
slot.i_batch = -1;
continue;
}
slot.n_decoded += 1;
const int64_t t_current = ggml_time_us();