RPC: support multiple devices including cpu (#1024)

* RPC support multiple devices * rpc : update documentation (#16441) Update the README file to match the newly added functionality of exposing multiple devices from a single server. Co-authored-by: Diego Devesa <slarengh@gmail.com> # Conflicts: # examples/rpc/README.md * Remove memory settings * rpc : cache and reuse compute graphs (#15405) Store the last computed graph and reuse it when possible. Also do not return a response from GRAPH_COMPUTE and assume it always completes successfully. If this is not the case, the server closes the connection. This saves us a network round trip to the server. * Add -cpu to include cpu backend --------- Co-authored-by: firecoperana <firecoperana> Co-authored-by: Radoslav Gerganov <rgerganov@gmail.com>
2026-04-28 02:11:50 +00:00 · 2025-11-30 11:48:02 -06:00
parent 52adcf1e90
commit e89064e657
8 changed files with 734 additions and 381 deletions
--- a/ggml/include/ggml-rpc.h
+++ b/ggml/include/ggml-rpc.h
@@ -7,22 +7,22 @@
 extern "C" {
 #endif

-#define RPC_PROTO_MAJOR_VERSION    2
-#define RPC_PROTO_MINOR_VERSION    0
+#define RPC_PROTO_MAJOR_VERSION    3
+#define RPC_PROTO_MINOR_VERSION    5
 #define RPC_PROTO_PATCH_VERSION    1
 #define GGML_RPC_MAX_SERVERS       16

 // backend API
-GGML_API GGML_CALL ggml_backend_t ggml_backend_rpc_init(const char * endpoint);
+GGML_API GGML_CALL ggml_backend_t ggml_backend_rpc_init(const char * endpoint, uint32_t device);
 GGML_API GGML_CALL bool ggml_backend_is_rpc(ggml_backend_t backend);

-GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint);
+GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint, uint32_t device);

-GGML_API GGML_CALL void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total);
+GGML_API GGML_CALL uint32_t ggml_backend_rpc_get_device_count(const char* endpoint);

-GGML_API GGML_CALL void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint,
-                                                    const char * cache_dir,
-                                                    size_t free_mem, size_t total_mem);
+GGML_API GGML_CALL void ggml_backend_rpc_get_device_memory(const char * endpoint, uint32_t device, size_t * free, size_t * total);
+
+GGML_API GGML_CALL void ggml_backend_rpc_start_server(const char * endpoint, const char* cache_dir, size_t device, ggml_backend_t * devices, size_t* free_mem, size_t* total_mem);

 #ifdef  __cplusplus
 }