Enable CUDA graphs for MoE models + GPT-OSS support (#689)

* gpt-oss: common * gpt-oss: attention sinks, swiglu_oai * gpt-oss: WIP llama Model loads and runs (CPU only), but PPL is much too high (~1500 for 1st batch vs ~200 in mainline). Is it because of SWA, because of vocab, or did I introduce a bug somewhere? * gpt-oss: CPU seems to be working It was the SWA that was missing in the previous commit. There are issues with EOG tokens, so this still needs to be added. * CUDA: ADD_ID Just a copy from mainline * gpt-oss: Seems to be working on CUDA * gpt-oss: add sinks to the attn-vec kernels * CUDA: add head size of 64 to new mma Haven't turned it on yet, but observe slightly better PP and slightly worse TG performance with that. * gpt-oss: add ability to use -fmoe (only CUDA for now) * Move row sums to the right place * Add sinks to iqk flash attention * gpt_oss: Implement -fmoe on the CPU * Simdify swiglu_oai Turning it off for now as performance becomes more variable, so perhaps I'm running into thermal throttling more often because of making the CPU work too hard. * llama: factor out model loader * Builds successfully * It runs, but mmap does not work * Fix llama_mmap so mmap works * Minor * Fix CUDA after latest changes * Attempt to use CUDA graphs with MoE models - not working * CUDA graphs WIP - still not working * CUDA graphs - seems to be working Likely not all MLA variants are working. I no longer remember why I added the q8_0 cpy that transposes the tensor, but if really needed, this is now missing. Also missing is q6_0. * Make q8_0 cache work for DeepSeek models with CUDA graphs * cuda: cpy for q6_0 * Fix llama_mmap on non-Linux platforms * Adding forgotten file * Iterating on Windows build failures * cuda: re-add q8_0 -> q8_0 transpose so mla = 2 can be used with CUDA graphs and q8_0 cache. * Disable graphs without -fmoe * Minor * Turn graphs on by default --------- Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
2026-04-30 11:21:56 +00:00 · 2025-08-15 09:18:07 +03:00
parent c00335684c
commit 633e0617b0
56 changed files with 8720 additions and 5115 deletions
--- a/common/chat.h
+++ b/common/chat.h
@@ -13,20 +13,20 @@ struct common_chat_templates;
 struct common_string_range {
    size_t begin;
    size_t end;
-    
+
    common_string_range(size_t begin, size_t end) : begin(begin), end(end) {
        if (begin > end) {
            throw std::runtime_error("Invalid range");
        }
    }
-    
+
    // prevent default ctor
    common_string_range() = delete;
-    
+
    bool empty() const {
        return begin == end;
    }
-    
+
    bool operator==(const common_string_range & other) const {
        return begin == other.begin && end == other.end;
    }
@@ -40,7 +40,7 @@ struct common_chat_tool_call {
    bool operator==(const common_chat_tool_call & other) const {
        return name == other.name && arguments == other.arguments && id == other.id;
    }
-    
+
    bool operator!=(const common_chat_tool_call & other) const {
        return !(*this == other);
    }
@@ -65,10 +65,10 @@ struct common_chat_msg {
    std::string tool_call_id;

    bool empty() const {
-        return content.empty() && content_parts.empty() && tool_calls.empty() && 
+        return content.empty() && content_parts.empty() && tool_calls.empty() &&
               reasoning_content.empty() && tool_name.empty() && tool_call_id.empty();
    }
-    
+
    void ensure_tool_call_ids_set(std::vector<std::string> & ids_cache, const std::function<std::string()> & gen_tool_call_id) {
        for (auto i = 0u; i < tool_calls.size(); i++) {
            if (ids_cache.size() <= i) {
@@ -91,7 +91,7 @@ struct common_chat_msg {
            && tool_name == other.tool_name
            && tool_call_id == other.tool_call_id;
    }
-    
+
    bool operator!=(const common_chat_msg & other) const {
        return !(*this == other);
    }
@@ -110,7 +110,7 @@ struct common_chat_msg_diff {
        && tool_call_index == other.tool_call_index
        && tool_call_delta == other.tool_call_delta;
    }
-    
+
    bool operator!=(const common_chat_msg_diff & other) const {
        return !(*this == other);
    }
@@ -132,18 +132,20 @@ enum common_chat_format {
    COMMON_CHAT_FORMAT_CONTENT_ONLY,
    COMMON_CHAT_FORMAT_GENERIC,
    COMMON_CHAT_FORMAT_DEEPSEEK_R1,
+    COMMON_CHAT_FORMAT_GPT_OSS,
    COMMON_CHAT_FORMAT_KIMI_K2,  // Our custom format (keep last for backward compatibility)
 };

 enum common_reasoning_format {
    COMMON_REASONING_FORMAT_NONE,
+    COMMON_REASONING_FORMAT_AUTO,
    COMMON_REASONING_FORMAT_DEEPSEEK,
    COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY,
 };

 struct common_chat_syntax {
    common_chat_format format = COMMON_CHAT_FORMAT_KIMI_K2;
-    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE;
+    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_AUTO; //COMMON_REASONING_FORMAT_NONE;
    // Whether reasoning_content should be inlined in the content (e.g. for reasoning_format=deepseek in stream mode)
    bool reasoning_in_content = false;
    bool thinking_forced_open = false;
@@ -165,11 +167,12 @@ class common_chat_msg_partial_exception : public std::runtime_error {
 // Format detection from chat template
 common_chat_format common_chat_format_detect(const std::string & chat_template);
 const char* common_chat_format_name(common_chat_format format);
+const char* common_reasoning_format_name(common_reasoning_format format);

 // Main parsing function (entry point for original llama.cpp compatibility)
 common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax);

-// Forward declare parser class  
+// Forward declare parser class
 class common_chat_msg_parser;

 // Format-specific parsing functions (accessible from chat-parser)