RPC: support multiple devices including cpu (#1024)

* RPC support multiple devices * rpc : update documentation (#16441) Update the README file to match the newly added functionality of exposing multiple devices from a single server. Co-authored-by: Diego Devesa <slarengh@gmail.com> # Conflicts: # examples/rpc/README.md * Remove memory settings * rpc : cache and reuse compute graphs (#15405) Store the last computed graph and reuse it when possible. Also do not return response from GRAPH_COMPUTE and assume it always completes successfully. If this this is not the case, the server closes the connection. This saves us a network round trip to the server. * Add -cpu to include cpu backend --------- Co-authored-by: firecoperana <firecoperana> Co-authored-by: Radoslav Gerganov <rgerganov@gmail.com>
2026-02-20 05:04:11 +00:00 · 2025-11-30 11:48:02 -06:00
parent 52adcf1e90
commit e89064e657
8 changed files with 734 additions and 381 deletions
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -270,6 +270,28 @@ static std::string parse_device_list(const std::string& value) {
    return value;
 }

+static std::string add_rpc_devices(std::string& servers) {
+    std::string rpc_devices;
+    std::vector<std::string> rpc_servers = string_split(servers, ",");
+    if (rpc_servers.empty()) {
+        throw std::invalid_argument("no RPC servers specified");
+    }
+    for (auto& server : rpc_servers) {
+        uint32_t dev_count = ggml_backend_rpc_get_device_count(server.c_str());
+        uint32_t device = 0;
+        for (uint32_t i = 0; i < dev_count; ++i) {
+            const auto buft = ggml_backend_rpc_buffer_type(server.c_str(), device);
+            if (buft != nullptr) {
+                rpc_devices = rpc_devices + server + "|" + std::to_string(device) + ",";
+                ++device;
+            }
+        }
+    }
+    if (!rpc_devices.empty()) {
+        rpc_devices = rpc_devices.substr(0, rpc_devices.size() - 1); // remove trailing comma
+    }
+    return rpc_devices;
+}

 std::pair<long, std::vector<char>> common_remote_get_content(const std::string& url, const common_remote_params&) {
    if (!url.empty()) {
@@ -1296,15 +1318,12 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
    if (arg == "--rpc") {
        CHECK_ARG
 #ifdef GGML_USE_RPC
-        params.rpc_servers = argv[i];
-        std::string servers(params.rpc_servers);
-        size_t pos = 0;
-        while ((pos = servers.find(",")) != std::string::npos) {
-            std::string server = servers.substr(0, pos);
-            ggml_backend_rpc_buffer_type(server.c_str());
-            servers.erase(0, pos + 1);
+        std::string servers(argv[i]);
+        servers = add_rpc_devices(servers);
+        if (servers.empty()) {
+            return false;
        }
-        ggml_backend_rpc_buffer_type(servers.c_str());
+        params.rpc_servers = servers;
 #endif
        return true;
    }
@@ -1319,10 +1338,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
    }
    if (arg == "--override-tensor" || arg == "-ot") {
        CHECK_ARG
-            /*for (auto endpoint : params.rpc_servers.split)
-            {
-
-            }*/
        if (!parse_buft_overrides(std::string{ argv[i] }, params.tensor_buft_overrides)) {
            fprintf(stderr, "error: Invalid tensor buffer type override: %s\n", argv[i]);
            invalid_param = true;