server: improve speed of speculative decoding (#1119)

* server: improve speed of speculative decoding

Change log:

rpc: add recompute

spec dec fix

* Fix n_batch_size not set to context size for draft model

---------

Co-authored-by: firecoperana <firecoperana>
This commit is contained in:
firecoperana
2026-01-10 00:01:22 -06:00
committed by GitHub
parent 52ad1c6421
commit c03ee1a4d2
7 changed files with 164 additions and 135 deletions

View File

@@ -9,7 +9,7 @@ extern "C" {
#define RPC_PROTO_MAJOR_VERSION 3
#define RPC_PROTO_MINOR_VERSION 5
-#define RPC_PROTO_PATCH_VERSION 1
+#define RPC_PROTO_PATCH_VERSION 2
#define GGML_RPC_MAX_SERVERS 16
// backend API

View File

@@ -1828,6 +1828,16 @@ static void rpc_serve_client(const std::vector<ggml_backend_t>& backends, const
}
break;
}
+case RPC_CMD_GRAPH_RECOMPUTE: {
+rpc_msg_graph_recompute_req request;
+if (!recv_msg(sockfd, &request, sizeof(request))) {
+return;
+}
+if (!server.graph_recompute(request)) {
+return;
+}
+break;
+}
case RPC_CMD_GET_DEVICE_MEMORY: {
rpc_msg_get_device_memory_req request;
if (!recv_msg(sockfd, &request, sizeof(request))) {