{
 "nbformat": 4,
 "nbformat_minor": 0,
 "metadata": {
  "colab": { "provenance": [], "gpuType": "V100" },
  "kernelspec": { "name": "python3", "display_name": "Python 3" },
  "language_info": { "name": "python" },
  "accelerator": "GPU"
 },
 "cells": [
  {
   "cell_type": "markdown",
   "source": [
    "Clone ExLlamaV2 and install dependencies (`%pip` targets the running kernel's environment)"
   ],
   "metadata": { "id": "NTOhV8supsTA" }
  },
  {
   "cell_type": "code",
   "source": [
    "!git clone https://github.com/turboderp/exllamav2\n",
    "%pip install -r exllamav2/requirements.txt"
   ],
   "metadata": { "id": "MkEIIMJdpk_d" },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "Optional: install Flash Attention"
   ],
   "metadata": { "id": "Wv96vR6HpNZF" }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": { "id": "KjOm-GNZpMA0" },
   "outputs": [],
   "source": [
    "%pip install -U flash-attn"
   ]
  },
  {
   "cell_type": "markdown",
   "source": [
    "Download a model. This may take a moment."
   ],
   "metadata": { "id": "MqAu9pcBqAf4" }
  },
  {
   "cell_type": "code",
   "source": [
    "!mkdir -p my_model\n",
    "!huggingface-cli download turboderp/Mistral-7B-instruct-exl2 --revision 4.0bpw --local-dir my_model"
   ],
   "metadata": { "id": "4OotLYL3p7rD" },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "Launch the chatbot example. On the first launch, this will compile ExLlamaV2's C++/CUDA extension, which can take several minutes on Colab."
   ],
   "metadata": { "id": "4DcCw_URrHja" }
  },
  {
   "cell_type": "code",
   "source": [
    "!cd exllamav2 && python examples/chat.py -m ../my_model -mode llama -pt -ncf -ngram"
   ],
   "metadata": { "id": "HbpsCnOoqKzk" },
   "execution_count": null,
   "outputs": []
  }
 ]
}