# File: exllamav3/examples/transformers_integration.py
# (46 lines, 1.9 KiB, Python — last modified 2025-06-12 04:14:52 +02:00)
import sys, os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, Gemma3ForConditionalGeneration
from exllamav3.integration.transformers import patch_transformers
# At the moment, ExLlamaV3 integrates into Transformers by injecting a couple of classes into Transformers' lists
# of recognized quantization formats. Expect this method to change
# NOTE: must run before any AutoModel is constructed so the EXL3 format is recognized at load time
patch_transformers()
@torch.inference_mode()
def main():
    """Load an EXL3-quantized model via the patched Transformers loader and
    generate a short chat response.

    Requires `patch_transformers()` to have been called first, and a local
    directory containing an ExLlamaV3-quantized model.
    """
    # Model ID. Currently, this needs to point to a local directory and models can't be loaded directly from the HF
    # Hub. All models supported by ExLlamaV3 _should_ work here, except for:
    #
    # Models with fused q/k/v or up/gate projections (e.g. Phi4) are currently not handled correctly. ExLlamaV3
    # un-fuses those layers during quantization.
    #
    # Nemotron-Ultra specifically can only be quantized by splitting a couple of extremely wide MLP layers into slices,
    # which breaks compatibility with the model implementation in Transformers
    model_id = "/mnt/str/models/llama3.1-70b-instruct/exl3/1.6bpw_H3/"

    # Create the AutoModel; device_map = "auto" lets Accelerate place layers across available devices
    model = AutoModelForCausalLM.from_pretrained(model_id, device_map = "auto")

    # Format and tokenize a prompt using the model's chat template
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    input_ids = tokenizer.apply_chat_template(
        [
            {"role": "system", "content": "You are a very nice assistant."},
            {"role": "user", "content": "Hello!"},
        ],
        tokenize = True,
        return_tensors = "pt",
        add_generation_prompt = True
    ).to(model.device)

    # Generate a response (sampled, nucleus top-p) and print the full decoded sequence, prompt included
    output_ids = model.generate(input_ids = input_ids, max_new_tokens = 100, do_sample = True, top_p = 0.8)
    output = tokenizer.decode(output_ids[0].tolist())
    print(output)
# Script entry point: run the example only when executed directly, not on import
if __name__ == "__main__":
    main()