[feat](kt-kernel): Add automatic deployment workflow (#1719)

2026-04-20 06:18:59 +00:00 · 2025-12-16 15:20:06 +08:00
parent f25e58ad69
commit 1f79f6da92
31 changed files with 3691 additions and 552 deletions
--- a/.github/workflows/docker-image.yml
+++ b/.github/workflows/docker-image.yml
@@ -5,9 +5,24 @@ on:
    types: [published]
  workflow_dispatch:
    inputs:
-      choose:
-        description: 'Will you push the image to DockerHub? 0 for No, 1 for Yes'
+      push_to_dockerhub:
+        description: 'Push image to DockerHub? (true/false)'
        required: true
+        default: false  # real boolean so the default matches type: boolean
+        type: boolean
+      cuda_version:
+        description: 'CUDA version (e.g., 12.8.1)'
+        required: false
+        default: '12.8.1'
+        type: string
+      push_simplified_tag:
+        description: 'Also push simplified tag? (true/false)'
+        required: false
+        default: true  # real boolean so the default matches type: boolean
+        type: boolean
+      ubuntu_mirror:  # kept as a '0'/'1' string; forwarded to the build script's --ubuntu-mirror flag
+        description: 'Use Tsinghua Ubuntu mirror? (0/1)'
+        required: false
        default: '0'
        type: string

@@ -20,79 +35,108 @@ jobs:
  test:
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
      - name: Run tests
        run: |
          if [ -f docker-compose.test.yml ]; then
-            docker-compose --file docker-compose.test.yml build
-            docker-compose --file docker-compose.test.yml run sut
+            docker compose --file docker-compose.test.yml build  # Compose v2 plugin; the v1 docker-compose binary is not on current hosted runners
+            docker compose --file docker-compose.test.yml run sut
          else
-            docker build . --file Dockerfile
+            docker build . --file docker/Dockerfile
          fi

-  docker_task:
+  build-and-push:  # single job gated on `test`; the build itself is delegated to docker/push-to-dockerhub.sh
    needs: test
-    name: ${{ matrix.instruct}}
+    name: Build and Push Multi-Variant Docker Image
    runs-on: ubuntu-latest
-    strategy:
-      fail-fast: false
-      matrix:
-        include:
-        # for amd64
-          - {instruct: "FANCY",   platform: "linux/amd64"}
-          - {instruct: "AVX512",  platform: "linux/amd64"}
-          - {instruct: "AVX2",    platform: "linux/amd64"}   
-          - {instruct: "NATIVE",  platform: "linux/amd64"}
-        # for arm64
-          - {instruct: "NATIVE",  platform: "linux/arm64"}

    steps:
-        - name: Move Docker data directory
-          run: |
-            sudo systemctl stop docker
-            sudo mkdir -p /mnt/docker
-            sudo rsync -avz /var/lib/docker/ /mnt/docker
-            sudo rm -rf /var/lib/docker 
-            sudo ln -s /mnt/docker /var/lib/docker
-            sudo systemctl start docker
+      - name: Checkout repository  # the build step later runs ./push-to-dockerhub.sh from the checked-out docker/ directory
+        uses: actions/checkout@v4

-        -
-          name: Set up QEMU
-          uses: docker/setup-qemu-action@v3
+      - name: Move Docker data directory  # copies /var/lib/docker to /mnt/docker and symlinks it back (presumably for disk space; confirm)
+        run: |
+          sudo systemctl stop docker
+          sudo mkdir -p /mnt/docker
+          sudo rsync -avz /var/lib/docker/ /mnt/docker
+          sudo rm -rf /var/lib/docker
+          sudo ln -s /mnt/docker /var/lib/docker
+          sudo systemctl start docker

-        -
-          name: Set up Docker Buildx
-          uses: docker/setup-buildx-action@v3
+      - name: Set up Docker Buildx  # the QEMU setup step that used to precede this one is removed by this change
+        uses: docker/setup-buildx-action@v3

-        -
-          name: Login to Docker Hub
-          uses: docker/login-action@v3
-          with:
-            username: ${{ secrets.DOCKERHUB_USERNAME }}
-            password: ${{ secrets.DOCKERHUB_TOKEN }}
-        -
-          name: Build and push for amd64
-          if: matrix.platform == 'linux/amd64'
-          uses: docker/build-push-action@v6
-          with:
-            push: true
-            platforms: |
-              linux/amd64
-            tags: |
-              ${{ env.DOCKERHUB_REPO }}:latest-${{ matrix.instruct }}
-              ${{ env.DOCKERHUB_REPO }}:${{ github.event.release.tag_name }}-${{ matrix.instruct }}
-            build-args: |
-              CPU_INSTRUCT=${{ matrix.instruct }}
-        -
-          name: Build and push for arm64
-          if: matrix.platform == 'linux/arm64'
-          uses: docker/build-push-action@v6
-          with:
-            push: true
-            platforms: |
-              linux/arm64
-            tags: |
-              ${{ env.DOCKERHUB_REPO }}:latest-${{ matrix.instruct }}
-              ${{ env.DOCKERHUB_REPO }}:${{ github.event.release.tag_name }}-${{ matrix.instruct }}
-            build-args: |
-              CPU_INSTRUCT=${{ matrix.instruct }}
+      - name: Login to Docker Hub  # has no `if:`, so it also runs when the later build ends up as a dry run
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_TOKEN }}
+
+      - name: Determine build parameters
+        id: params
+        # Inputs are handed over via env so their text is never spliced into the script source.
+        env:
+          EVENT_NAME: ${{ github.event_name }}
+          DISPATCH_PUSH: ${{ inputs.push_to_dockerhub }}
+          DISPATCH_SIMPLIFIED: ${{ inputs.push_simplified_tag }}
+          DISPATCH_CUDA: ${{ inputs.cuda_version }}
+          DISPATCH_MIRROR: ${{ inputs.ubuntu_mirror }}
+        run: |
+          cuda_version=12.8.1  # used unless a manual run supplies a non-empty value
+          ubuntu_mirror=0      # same rule as cuda_version
+          case "$EVENT_NAME" in
+            release) should_push=true; push_simplified=true ;;
+            workflow_dispatch)
+              should_push="$DISPATCH_PUSH"
+              push_simplified="$DISPATCH_SIMPLIFIED"
+              cuda_version="${DISPATCH_CUDA:-$cuda_version}"
+              ubuntu_mirror="${DISPATCH_MIRROR:-$ubuntu_mirror}"
+              ;;
+            *) should_push=false; push_simplified=false ;;
+          esac
+          {
+            echo "should_push=$should_push"
+            echo "push_simplified=$push_simplified"
+            echo "cuda_version=$cuda_version"
+            echo "ubuntu_mirror=$ubuntu_mirror"
+          } >> "$GITHUB_OUTPUT"
+
+      - name: Build and push Docker image
+        # Step outputs and secrets arrive via env (neutral PARAM_* names) instead of being interpolated into the script.
+        env:
+          PARAM_CUDA_VERSION: ${{ steps.params.outputs.cuda_version }}
+          PARAM_UBUNTU_MIRROR: ${{ steps.params.outputs.ubuntu_mirror }}
+          PARAM_PUSH_SIMPLIFIED: ${{ steps.params.outputs.push_simplified }}
+          PARAM_SHOULD_PUSH: ${{ steps.params.outputs.should_push }}
+          PARAM_HTTP_PROXY: ${{ secrets.HTTP_PROXY }}
+          PARAM_HTTPS_PROXY: ${{ secrets.HTTPS_PROXY }}
+        run: |
+          cd docker
+
+          BUILD_ARGS=(
+            --cuda-version "$PARAM_CUDA_VERSION"
+            --ubuntu-mirror "$PARAM_UBUNTU_MIRROR"
+            --repository "${{ env.DOCKERHUB_REPO }}"
+          )
+          # Optional flags: each is added only when its value is set.
+          if [ "$PARAM_PUSH_SIMPLIFIED" = "true" ]; then
+            BUILD_ARGS+=(--also-push-simplified)
+          fi
+          if [ -n "$PARAM_HTTP_PROXY" ]; then
+            BUILD_ARGS+=(--http-proxy "$PARAM_HTTP_PROXY")
+          fi
+          if [ -n "$PARAM_HTTPS_PROXY" ]; then
+            BUILD_ARGS+=(--https-proxy "$PARAM_HTTPS_PROXY")
+          fi
+          # Anything other than an explicit "true" stays a dry run.
+          if [ "$PARAM_SHOULD_PUSH" != "true" ]; then
+            BUILD_ARGS+=(--dry-run)
+          fi
+
+          ./push-to-dockerhub.sh "${BUILD_ARGS[@]}"
+
+      - name: Display image information  # log-only summary: a notice plus a pull-command template with literal $(VERSION)/$(CUDA_SHORT) placeholders
+        if: steps.params.outputs.should_push == 'true'  # skipped whenever the build ran as a dry run
+        run: |
+          echo "::notice title=Docker Image::Image pushed successfully to ${{ env.DOCKERHUB_REPO }}"
+          echo "Pull command: docker pull ${{ env.DOCKERHUB_REPO }}:v\$(VERSION)-cu\$(CUDA_SHORT)"
--- a/.github/workflows/install.yml
+++ b/.github/workflows/install.yml
@@ -1,71 +0,0 @@
-name: Install / Test KTransformers
-run-name: Install / Test KTransformers
-on:
-  workflow_dispatch:
-    inputs:
-      job_to_run:
-        description: "Which job to run?"
-        required: true
-        default: "test"
-        type: choice
-        options:
-          - create-install-test
-          - install-test
-          - test
-jobs:
-  Install-Test-KTransformers:
-    runs-on: self-hosted
-    steps:
-      - run: echo "🎉 The job was automatically triggered by a ${{ github.event_name }} event."
-      - run: echo "🔎 The name of your branch is ${{ github.ref }} and your repository is ${{ github.repository }}."
-      - name: Check out repository code
-        uses: actions/checkout@v4
-      - run: echo "💡 The ${{ github.repository }} repository has been cloned to the runner."
-      - name: Remove old conda environment
-        continue-on-error: true
-        if: contains(inputs.job_to_run, 'create')
-        run: |
-          source /home/qujing3/anaconda3/etc/profile.d/conda.sh
-          conda env remove --name ktransformers-dev -y
-      - name: Create conda environment
-        if: contains(inputs.job_to_run, 'create')
-        run: |
-          source /home/qujing3/anaconda3/etc/profile.d/conda.sh
-          conda create --name ktransformers-dev python=3.11
-          conda activate ktransformers-dev
-          conda install -c conda-forge libstdcxx-ng -y
-      - name: Install dependencies
-        if: contains(inputs.job_to_run, 'create')
-        run: |
-          source /home/qujing3/anaconda3/etc/profile.d/conda.sh
-          conda activate ktransformers-dev
-          pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126
-          pip3 install packaging ninja cpufeature numpy
-          pip install ~/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiTRUE-cp311-cp311-linux_x86_64.whl
-      - name: Install KTransformers
-        if: contains(inputs.job_to_run, 'install')
-        run: |
-          source /home/qujing3/anaconda3/etc/profile.d/conda.sh
-          conda activate ktransformers-dev
-          pip3 uninstall ktransformers -y
-          cd ${{ github.workspace }}
-          git submodule init
-          git submodule update
-          bash install.sh
-      - name: Test Local Chat 1
-        run: |
-          set -e
-          source /home/qujing3/anaconda3/etc/profile.d/conda.sh
-          conda activate ktransformers-dev
-          export PATH=/usr/local/cuda-12.4/bin:$PATH
-          export LD_LIBRARY_PATH=/usr/local/cuda-12.4/lib64:$LD_LIBRARY_PATH
-          export CUDA_HOME=/usr/local/cuda-12.4
-          cd ${{ github.workspace }}
-          echo "Running Local Chat 1 (book.txt) ..."
-          python ktransformers/local_chat_test.py --model_path /home/qujing3/models/DeepSeek-R1-Q4_K_M/config --gguf_path /home/qujing3/models/DeepSeek-R1-Q4_K_M/ --max_new_tokens 256 --cpu_infer 64 --prompt_file /home/qujing3/prompts/book.txt > log1.txt
-          sed -n '/Prompt:/,$p' log1.txt
-          echo "Running Local Chat 2 [force think] (chinese.txt) ..."
-          python ktransformers/local_chat_test.py --model_path /home/qujing3/models/DeepSeek-R1-Q4_K_M/config --gguf_path /home/qujing3/models/DeepSeek-R1-Q4_K_M/ --max_new_tokens 256 --cpu_infer 64 --prompt_file /home/qujing3/prompts/chinese.txt -f > log2.txt
-          sed -n '/Prompt:/,$p' log2.txt
-
-      - run: echo "This job's status is ${{ job.status }}."
--- a/.github/workflows/package_wheel_release.yml
+++ b/.github/workflows/package_wheel_release.yml
@@ -1,231 +0,0 @@
-name: Build Wheels
-on: 
-  workflow_dispatch:
-    inputs:
-      release:
-        description: 'Release? 1 = yes, 0 = no'
-        default: '0'
-        required: true
-        type: string
-jobs:
-  build_wheels:
-    name: ${{ matrix.os }} Python=${{ matrix.pyver }} CUDA=${{ matrix.cuda }} CPU_INSTRUCT=${{ matrix.instruct }} Torch=${{ matrix.torch }}
-    runs-on: ${{ matrix.os }}
-    strategy:
-      fail-fast: false
-      matrix:
-        include:
-        # Ubuntu
-         - { os: ubuntu-20.04, pyver:  '3.12', cuda: '12.5.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '124'}
-         - { os: ubuntu-20.04, pyver:  '3.12', cuda: '12.5.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'}
-         - { os: ubuntu-20.04, pyver:  '3.12', cuda: '12.5.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'}
-         - { os: ubuntu-20.04, pyver:  '3.12', cuda: '12.4.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '124'}
-         - { os: ubuntu-20.04, pyver:  '3.12', cuda: '12.4.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'}
-         - { os: ubuntu-20.04, pyver:  '3.12', cuda: '12.4.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'}
-         - { os: ubuntu-20.04, pyver:  '3.12', cuda: '12.2.2', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '121'}
-         - { os: ubuntu-20.04, pyver:  '3.12', cuda: '12.2.2', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
-         - { os: ubuntu-20.04, pyver:  '3.12', cuda: '12.2.2', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
-         - { os: ubuntu-20.04, pyver:  '3.12', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '121'}
-         - { os: ubuntu-20.04, pyver:  '3.12', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
-         - { os: ubuntu-20.04, pyver:  '3.12', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
-         - { os: ubuntu-20.04, pyver:  '3.12', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '121'}
-         - { os: ubuntu-20.04, pyver:  '3.12', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
-         - { os: ubuntu-20.04, pyver:  '3.12', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
-         - { os: ubuntu-20.04, pyver:  '3.12', cuda: '12.1.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '121'}
-         - { os: ubuntu-20.04, pyver:  '3.12', cuda: '12.1.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
-         - { os: ubuntu-20.04, pyver:  '3.12', cuda: '12.1.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
-         - { os: ubuntu-20.04, pyver:  '3.11', cuda: '12.5.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '124'}
-         - { os: ubuntu-20.04, pyver:  '3.11', cuda: '12.5.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'}
-         - { os: ubuntu-20.04, pyver:  '3.11', cuda: '12.5.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'}
-         - { os: ubuntu-20.04, pyver:  '3.11', cuda: '12.4.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '124'}
-         - { os: ubuntu-20.04, pyver:  '3.11', cuda: '12.4.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'}
-         - { os: ubuntu-20.04, pyver:  '3.11', cuda: '12.4.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'}
-         - { os: ubuntu-20.04, pyver:  '3.11', cuda: '12.2.2', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '121'}
-         - { os: ubuntu-20.04, pyver:  '3.11', cuda: '12.2.2', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
-         - { os: ubuntu-20.04, pyver:  '3.11', cuda: '12.2.2', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
-         - { os: ubuntu-20.04, pyver:  '3.11', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '121'}
-         - { os: ubuntu-20.04, pyver:  '3.11', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
-         - { os: ubuntu-20.04, pyver:  '3.11', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
-         - { os: ubuntu-20.04, pyver:  '3.11', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '121'}
-         - { os: ubuntu-20.04, pyver:  '3.11', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
-         - { os: ubuntu-20.04, pyver:  '3.11', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
-         - { os: ubuntu-20.04, pyver:  '3.11', cuda: '12.1.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '121'}
-         - { os: ubuntu-20.04, pyver:  '3.11', cuda: '12.1.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
-         - { os: ubuntu-20.04, pyver:  '3.11', cuda: '12.1.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
-         - { os: ubuntu-20.04, pyver:  '3.10', cuda: '12.5.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '124'}
-         - { os: ubuntu-20.04, pyver:  '3.10', cuda: '12.5.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'}
-         - { os: ubuntu-20.04, pyver:  '3.10', cuda: '12.5.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'}
-         - { os: ubuntu-20.04, pyver:  '3.10', cuda: '12.4.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '124'}
-         - { os: ubuntu-20.04, pyver:  '3.10', cuda: '12.4.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'}
-         - { os: ubuntu-20.04, pyver:  '3.10', cuda: '12.4.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'}
-         - { os: ubuntu-20.04, pyver:  '3.10', cuda: '12.2.2', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '121'}
-         - { os: ubuntu-20.04, pyver:  '3.10', cuda: '12.2.2', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
-         - { os: ubuntu-20.04, pyver:  '3.10', cuda: '12.2.2', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
-         - { os: ubuntu-20.04, pyver:  '3.10', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '121'}
-         - { os: ubuntu-20.04, pyver:  '3.10', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
-         - { os: ubuntu-20.04, pyver:  '3.10', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
-         - { os: ubuntu-20.04, pyver:  '3.10', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '121'}
-         - { os: ubuntu-20.04, pyver:  '3.10', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
-         - { os: ubuntu-20.04, pyver:  '3.10', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
-         - { os: ubuntu-20.04, pyver:  '3.10', cuda: '12.1.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '121'}
-         - { os: ubuntu-20.04, pyver:  '3.10', cuda: '12.1.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
-         - { os: ubuntu-20.04, pyver:  '3.10', cuda: '12.1.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
-
-         # Windows
-         - { os: windows-2022, pyver:  '3.12', cuda: '12.5.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'}
-         - { os: windows-2022, pyver:  '3.12', cuda: '12.5.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'}
-         - { os: windows-2022, pyver:  '3.12', cuda: '12.4.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'}
-         - { os: windows-2022, pyver:  '3.12', cuda: '12.4.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'}
-         - { os: windows-2022, pyver:  '3.12', cuda: '12.2.2', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
-         - { os: windows-2022, pyver:  '3.12', cuda: '12.2.2', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
-         - { os: windows-2022, pyver:  '3.12', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
-         - { os: windows-2022, pyver:  '3.12', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
-         - { os: windows-2022, pyver:  '3.12', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
-         - { os: windows-2022, pyver:  '3.12', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
-         - { os: windows-2022, pyver:  '3.12', cuda: '12.1.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
-         - { os: windows-2022, pyver:  '3.12', cuda: '12.1.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
-         - { os: windows-2022, pyver:  '3.11', cuda: '12.5.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'}
-         - { os: windows-2022, pyver:  '3.11', cuda: '12.5.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'}
-         - { os: windows-2022, pyver:  '3.11', cuda: '12.4.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'}
-         - { os: windows-2022, pyver:  '3.11', cuda: '12.4.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'}
-         - { os: windows-2022, pyver:  '3.11', cuda: '12.2.2', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
-         - { os: windows-2022, pyver:  '3.11', cuda: '12.2.2', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
-         - { os: windows-2022, pyver:  '3.11', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
-         - { os: windows-2022, pyver:  '3.11', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
-         - { os: windows-2022, pyver:  '3.11', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
-         - { os: windows-2022, pyver:  '3.11', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
-         - { os: windows-2022, pyver:  '3.11', cuda: '12.1.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
-         - { os: windows-2022, pyver:  '3.11', cuda: '12.1.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
-         - { os: windows-2022, pyver:  '3.10', cuda: '12.5.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'}
-         - { os: windows-2022, pyver:  '3.10', cuda: '12.5.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'}
-         - { os: windows-2022, pyver:  '3.10', cuda: '12.4.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'}
-         - { os: windows-2022, pyver:  '3.10', cuda: '12.4.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'}
-         - { os: windows-2022, pyver:  '3.10', cuda: '12.2.2', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
-         - { os: windows-2022, pyver:  '3.10', cuda: '12.2.2', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
-         - { os: windows-2022, pyver:  '3.10', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
-         - { os: windows-2022, pyver:  '3.10', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
-         - { os: windows-2022, pyver:  '3.10', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
-         - { os: windows-2022, pyver:  '3.10', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
-         - { os: windows-2022, pyver:  '3.10', cuda: '12.1.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
-         - { os: windows-2022, pyver:  '3.10', cuda: '12.1.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
-
-    defaults:
-      run:
-        shell: pwsh
-    
-    steps:
-      - uses: actions/checkout@v3
-
-      - name: Free Disk Space
-        uses: jlumbroso/free-disk-space@v1.3.1
-        if: runner.os == 'Linux'
-        with:
-          tool-cache: true
-          android: true
-          dotnet: true
-          haskell: true
-          large-packages: false
-          swap-storage: true
-
-      - uses: actions/setup-python@v4
-        with:
-          python-version: ${{ matrix.pyver }}
-
-      - name: check_space
-        run: |
-          if($IsLinux) {df -h}
-          if($IsWindows) {Get-PSDrive -PSProvider 'FileSystem'}
-
-      - uses: actions/setup-node@v4
-        with:
-          node-version: 20
-
-      - name: Setup Mamba
-        if: matrix.cuda != ''
-        uses: conda-incubator/setup-miniconda@v3
-        with:
-          activate-environment: "ktransformers"
-          python-version: ${{ matrix.pyver }}
-          miniforge-variant: Miniforge3
-          miniforge-version: latest
-          use-mamba: true
-          add-pip-as-python-dependency: true
-          auto-activate-base: false
-
-
-
-      - name: build web
-        run: |
-          cd ktransformers/website/
-          npm install
-          npm run build
-          cd ../../
-
-      - name: build for cuda
-        if: matrix.cuda != ''
-        env:
-          USE_BALANCE_SERVE: "1"
-        run: |
-          git submodule init
-          git submodule update
-          if($IsWindows){
-            $originalPath = Get-Location
-            Import-Module 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
-            Enter-VsDevShell -VsInstallPath 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise' -DevCmdArguments '-arch=x64 -host_arch=x64'
-            $env:DISTUTILS_USE_SDK=1
-            Set-Location $originalPath
-          }
-          $cudaVersion = '${{ matrix.cuda }}'
-          $env:MAMBA_NO_LOW_SPEED_LIMIT = 1
-          mamba install -y -c nvidia/label/cuda-$cudaVersion cuda-toolkit cuda-runtime
-          $env:CUDA_PATH = $env:CONDA_PREFIX
-          $env:CUDA_HOME = $env:CONDA_PREFIX
-          if ($IsLinux) {
-            $env:LD_LIBRARY_PATH = $env:CONDA_PREFIX + '/lib:' + $env:LD_LIBRARY_PATH
-            $env:LD_LIBRARY_PATH = $env:CONDA_PREFIX + '/lib/python${{ matrix.pyver }}/site-packages/nvidia/nvjitlink/lib:' + $env:LD_LIBRARY_PATH
-            if (!(Test-Path $env:CUDA_HOME/lib64)) {
-              New-Item -ItemType SymbolicLink -Path $env:CUDA_HOME/lib64 -Target $env:CUDA_HOME/lib
-            }
-          }
-          if ($IsWindows) {
-            if (Test-Path -Path "$env:CUDA_PATH/Library/bin/nvcc.exe"){
-              $env:CUDA_PATH = "$env:CUDA_PATH/Library"
-              $env:CUDA_HOME = $env:CUDA_PATH
-            }
-            $env:PATH = "$env:CUDA_PATH/bin;" + $env:PATH
-            $directory = "$env:CUDA_PATH/lib/x64/"
-            if (-not (Test-Path -Path $directory)) {
-              New-Item -ItemType Directory -Path $directory
-              Write-Output "Directory '$directory' created."
-            }
-            cp $env:CUDA_PATH/lib/*.lib $env:CUDA_PATH/lib/x64/
-            $env:INCLUDE =$env:CUDA_PATH + "/include/targets/x64;" + $env:INCLUDE
-            $env:INCLUDE =$env:CONDA_PREFIX + "/include;" + $env:INCLUDE
-          }
-          python -m pip install torch==${{ matrix.torch }} torchvision torchaudio --index-url https://download.pytorch.org/whl/cu${{ matrix.torch_cu }}
-          python -m pip install cpufeature build wheel ninja packaging setuptools
-          $env:KTRANSFORMERS_FORCE_BUILD = "TRUE"
-          $env:CPU_INSTRUCT = '${{ matrix.instruct }}'
-          $env:TORCH_CUDA_ARCH_LIST = '${{ matrix.cudaarch }}'
-          python -m build --no-isolation --verbose
-
-
-      - name: create Rlease dir
-        run: |
-          if ($IsWindows) {
-            $env:date = $(Get-Date -Format "yyyy-MM-dd")
-            New-Item -ItemType Directory -Force -Path "$Env:USERPROFILE\.ssh"
-            $Env:SSH_PATH = "$Env:USERPROFILE\.ssh\id_rsa"
-            Set-Content -Path $Env:SSH_PATH -Value "${{ secrets.SSH_PRIVATE_KEY }}"
-            (Get-Content -Path $Env:SSH_PATH).Replace("`r`n","`n") | Set-Content -Path $Env:SSH_PATH
-            chmod 600 $Env:SSH_PATH
-          }
-          if ($IsLinux) {
-            $env:date = $(date +%Y-%m-%d)
-            mkdir -p ~/.ssh/
-            echo "${{ secrets.SSH_PRIVATE_KEY }}" > ~/.ssh/id_rsa
-            chmod 600 ~/.ssh/id_rsa
-          }
-          
-          ssh -p ${{ secrets.SSH_PORT }} -o StrictHostKeyChecking=no root@${{ secrets.SSH_SERVER }} "mkdir -p /mnt/data/release-$env:date"
-          scp -P ${{ secrets.SSH_PORT }} -o StrictHostKeyChecking=no dist/*.whl root@${{ secrets.SSH_SERVER }}:/mnt/data/release-$env:date/
--- a/.github/workflows/package_wheel_test.yml
+++ b/.github/workflows/package_wheel_test.yml
@@ -1,141 +0,0 @@
-name: Build Wheels Tests
-on: 
-  workflow_dispatch:
-    inputs:
-      release:
-        description: 'Release? 1 = yes, 0 = no'
-        default: '0'
-        required: true
-        type: string
-jobs:
-  build_wheels:
-    name: ${{ matrix.os }} Python=${{ matrix.pyver }} CUDA=${{ matrix.cuda }} CPU_INSTRUCT=${{ matrix.instruct }} Torch=${{ matrix.torch }}
-    runs-on: ${{ matrix.os }}
-    strategy:
-      fail-fast: false
-      matrix:
-        include:
-        # Ubuntu
-        - { os: ubuntu-20.04, pyver:  '3.12', cuda: '12.5.1', torch: '2.4.0', cudaarch: '8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '124'}
-        - { os: ubuntu-20.04, pyver:  '3.12', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '121'}
-        - { os: windows-2022, pyver:  '3.11', cuda: '12.5.1', torch: '2.4.0', cudaarch: '8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'}
-        - { os: windows-2022, pyver:  '3.12', cuda: '12.1.1', torch: '2.3.0', cudaarch: '8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
-
-    defaults:
-      run:
-        shell: pwsh
-    
-    steps:
-      - uses: actions/checkout@v3
-
-      - name: Free Disk Space
-        uses: jlumbroso/free-disk-space@v1.3.1
-        if: runner.os == 'Linux'
-        with:
-          tool-cache: true
-          android: true
-          dotnet: true
-          haskell: true
-          large-packages: false
-          swap-storage: true
-
-      - uses: actions/setup-python@v4
-        with:
-          python-version: ${{ matrix.pyver }}
-
-      - name: check_space
-        run: |
-          if($IsLinux) {df -h}
-          if($IsWindows) {Get-PSDrive -PSProvider 'FileSystem'}
-
-      - uses: actions/setup-node@v4
-        with:
-          node-version: 20
-
-      - name: Setup Mamba
-        if: matrix.cuda != ''
-        uses: conda-incubator/setup-miniconda@v3
-        with:
-          activate-environment: "ktransformers"
-          python-version: ${{ matrix.pyver }}
-          miniforge-variant: Miniforge3
-          miniforge-version: latest
-          use-mamba: true
-          add-pip-as-python-dependency: true
-          auto-activate-base: false
-
-
-
-      - name: build web
-        run: |
-          cd ktransformers/website/
-          npm install
-          npm run build
-          cd ../../
-
-      - name: build for cuda
-        if: matrix.cuda != ''
-        run: |
-          git submodule init
-          git submodule update
-          if($IsWindows){
-            $originalPath = Get-Location
-            Import-Module 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
-            Enter-VsDevShell -VsInstallPath 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise' -DevCmdArguments '-arch=x64 -host_arch=x64'
-            $env:DISTUTILS_USE_SDK=1
-            Set-Location $originalPath
-          }
-          $cudaVersion = '${{ matrix.cuda }}'
-          $env:MAMBA_NO_LOW_SPEED_LIMIT = 1
-          mamba install -y -c nvidia/label/cuda-$cudaVersion cuda-toolkit cuda-runtime
-          $env:CUDA_PATH = $env:CONDA_PREFIX
-          $env:CUDA_HOME = $env:CONDA_PREFIX
-          if ($IsLinux) {
-            $env:LD_LIBRARY_PATH = $env:CONDA_PREFIX + '/lib:' + $env:LD_LIBRARY_PATH
-            $env:LD_LIBRARY_PATH = $env:CONDA_PREFIX + '/lib/python${{ matrix.pyver }}/site-packages/nvidia/nvjitlink/lib:' + $env:LD_LIBRARY_PATH
-            if (!(Test-Path $env:CUDA_HOME/lib64)) {
-              New-Item -ItemType SymbolicLink -Path $env:CUDA_HOME/lib64 -Target $env:CUDA_HOME/lib
-            }
-          }
-          if ($IsWindows) {
-            if (Test-Path -Path "$env:CUDA_PATH/Library/bin/nvcc.exe"){
-              $env:CUDA_PATH = "$env:CUDA_PATH/Library"
-              $env:CUDA_HOME = $env:CUDA_PATH
-            }
-            $env:PATH = "$env:CUDA_PATH/bin;" + $env:PATH
-            $directory = "$env:CUDA_PATH/lib/x64/"
-            if (-not (Test-Path -Path $directory)) {
-              New-Item -ItemType Directory -Path $directory
-              Write-Output "Directory '$directory' created."
-            }
-            cp $env:CUDA_PATH/lib/*.lib $env:CUDA_PATH/lib/x64/
-            $env:INCLUDE =$env:CUDA_PATH + "/include/targets/x64;" + $env:INCLUDE
-            $env:INCLUDE =$env:CONDA_PREFIX + "/include;" + $env:INCLUDE
-          }
-          python -m pip install torch==${{ matrix.torch }} torchvision torchaudio --index-url https://download.pytorch.org/whl/cu${{ matrix.torch_cu }}
-          python -m pip install cpufeature build wheel ninja packaging setuptools
-          $env:KTRANSFORMERS_FORCE_BUILD = "TRUE"
-          $env:CPU_INSTRUCT = '${{ matrix.instruct }}'
-          $env:TORCH_CUDA_ARCH_LIST = '${{ matrix.cudaarch }}'
-          python -m build --no-isolation --verbose
-
-
-      - name: create Rlease dir
-        run: |
-          if ($IsWindows) {
-            $env:date = $(Get-Date -Format "yyyy-MM-dd")
-            New-Item -ItemType Directory -Force -Path "$Env:USERPROFILE\.ssh"
-            $Env:SSH_PATH = "$Env:USERPROFILE\.ssh\id_rsa"
-            Set-Content -Path $Env:SSH_PATH -Value "${{ secrets.SSH_PRIVATE_KEY }}"
-            (Get-Content -Path $Env:SSH_PATH).Replace("`r`n","`n") | Set-Content -Path $Env:SSH_PATH
-            chmod 600 $Env:SSH_PATH
-          }
-          if ($IsLinux) {
-            $env:date = $(date +%Y-%m-%d)
-            mkdir -p ~/.ssh/
-            echo "${{ secrets.SSH_PRIVATE_KEY }}" > ~/.ssh/id_rsa
-            chmod 600 ~/.ssh/id_rsa
-          }
-          
-          ssh -p ${{ secrets.SSH_PORT }} -o StrictHostKeyChecking=no root@${{ secrets.SSH_SERVER }} "mkdir -p /mnt/data/release-$env:date"
-          scp -P ${{ secrets.SSH_PORT }} -o StrictHostKeyChecking=no dist/*.whl root@${{ secrets.SSH_SERVER }}:/mnt/data/release-$env:date/
--- a/.github/workflows/release-fake-tag.yml
+++ b/.github/workflows/release-fake-tag.yml
@@ -0,0 +1,56 @@
+name: Release Fake Tag
+
+# Creates and pushes a v<version> git tag when version.py changes on main,
+# or when the workflow is dispatched manually.
+on:
+  push:
+    branches:
+      - main
+    paths:
+      - "version.py"
+  workflow_dispatch:
+
+# Pushing a tag requires write access to repository contents.
+permissions:
+  contents: write
+
+jobs:
+  publish:
+    # Only run when the repository is kvcache-ai/ktransformers (skips forks).
+    if: github.repository == 'kvcache-ai/ktransformers'
+    runs-on: ubuntu-latest
+    environment: 'prod'
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          token: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Get version
+        id: get_version
+        run: |
+          # Read the double-quoted value from the first __version__ line only.
+          version=$(grep -m1 '__version__' version.py | cut -d'"' -f2)
+          # Stop on an empty or malformed value rather than pushing a bogus tag.
+          if [[ ! "$version" =~ ^[0-9A-Za-z][0-9A-Za-z.+_-]*$ ]]; then
+            echo "::error::Could not parse __version__ from version.py (got '$version')"
+            exit 1
+          fi
+          echo "TAG=v$version" >> "$GITHUB_OUTPUT"
+
+      - name: Create and push tag
+        env:
+          # Hand the tag to the script through the environment instead of
+          # expanding the expression inside the shell source.
+          TAG: ${{ steps.get_version.outputs.TAG }}
+        run: |
+          # version.py can change without a version bump, and the workflow can
+          # be re-run by hand; skip when the tag is already on origin.
+          if git ls-remote --exit-code --tags origin "refs/tags/$TAG" >/dev/null; then
+            echo "Tag $TAG already exists on origin; nothing to do."
+            exit 0
+          fi
+          git config user.name "ktransformers-bot"
+          git config user.email "ktransformers-bot@users.noreply.github.com"
+          git tag "$TAG"
+          git push origin "$TAG"
--- a/.github/workflows/release-pypi.yml
+++ b/.github/workflows/release-pypi.yml
@@ -0,0 +1,183 @@
+name: Release to PyPI
+
+# Builds CPU-only kt-kernel wheels and publishes them when version.py changes
+# on main, or on manual dispatch (optionally to TestPyPI).
+on:
+  push:
+    branches:
+      - main
+    paths:
+      - "version.py"
+  workflow_dispatch:
+    inputs:
+      test_pypi:
+        description: 'Publish to TestPyPI instead of PyPI (for testing)'
+        required: false
+        default: 'false'
+        type: choice
+        options:
+          - 'true'
+          - 'false'
+
+permissions:
+  contents: read
+
+jobs:
+  build-kt-kernel:
+    name: Build kt-kernel CPU-only (Python ${{ matrix.python-version }})
+    # The ubuntu-20.04 hosted image has been retired, so a job pinned to it
+    # is never picked up by a runner; use a currently supported image.
+    runs-on: ubuntu-22.04
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: ['3.10', '3.11', '3.12']
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          submodules: recursive
+
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install system dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y cmake libhwloc-dev pkg-config libnuma-dev
+
+      - name: Install Python build tools
+        run: |
+          python -m pip install --upgrade pip
+          pip install build wheel setuptools
+
+      - name: Build kt-kernel wheel (CPU-only, multi-variant)
+        working-directory: kt-kernel
+        env:
+          CPUINFER_BUILD_ALL_VARIANTS: '1'
+          CPUINFER_USE_CUDA: '0'
+          CPUINFER_BUILD_TYPE: 'Release'
+          CPUINFER_PARALLEL: '4'
+          CPUINFER_FORCE_REBUILD: '1'
+        run: |
+          echo "Building kt-kernel CPU-only with all CPU variants (AMX, AVX512, AVX2)"
+          python -m build --wheel --no-isolation -v
+
+      - name: List generated wheels
+        working-directory: kt-kernel
+        run: |
+          echo "Generated wheels:"
+          ls -lh dist/
+
+      - name: Test wheel import
+        working-directory: kt-kernel
+        run: |
+          pip install dist/*.whl
+          python -c "import kt_kernel; print('✓ Import successful'); print(f'CPU variant detected: {kt_kernel.__cpu_variant__}'); print(f'Version: {kt_kernel.__version__}')"
+
+      - name: Verify wheel contains all variants
+        working-directory: kt-kernel
+        run: |
+          echo "Checking wheel contents for CPU variants..."
+          # Fail the job when a variant is missing so an incomplete wheel is
+          # never handed to the publish job.
+          python -m zipfile -l dist/*.whl | grep "_kt_kernel_ext_" || { echo "ERROR: No variant .so files found!"; exit 1; }
+          python -m zipfile -l dist/*.whl | grep "_kt_kernel_ext_amx.cpython" && echo "✓ AMX variant found" || { echo "✗ AMX variant missing"; exit 1; }
+          python -m zipfile -l dist/*.whl | grep "_kt_kernel_ext_avx512.cpython" && echo "✓ AVX512 variant found" || { echo "✗ AVX512 variant missing"; exit 1; }
+          python -m zipfile -l dist/*.whl | grep "_kt_kernel_ext_avx2.cpython" && echo "✓ AVX2 variant found" || { echo "✗ AVX2 variant missing"; exit 1; }
+
+      - name: Upload wheel artifact
+        # v3 of the artifact actions has been shut down; v4 requires a unique
+        # artifact name per job, which the Python-version suffix provides.
+        uses: actions/upload-artifact@v4
+        with:
+          name: kt-kernel-wheels-py${{ matrix.python-version }}
+          path: kt-kernel/dist/*.whl
+          retention-days: 7
+
+  publish-pypi:
+    name: Publish to PyPI
+    needs: build-kt-kernel
+    runs-on: ubuntu-latest
+    if: github.repository == 'kvcache-ai/ktransformers' && github.ref == 'refs/heads/main'
+    environment: prod
+    permissions:
+      id-token: write  # For trusted publishing (OIDC)
+      contents: read
+
+    steps:
+      - name: Download all wheel artifacts
+        # Must use the same major version as the upload-artifact step above.
+        uses: actions/download-artifact@v4
+        with:
+          # Each matching artifact is extracted into its own sub-directory.
+          pattern: kt-kernel-wheels-*
+          path: artifacts/
+
+      - name: Organize wheels into dist/
+        run: |
+          mkdir -p dist/
+          find artifacts/ -name "*.whl" -exec cp {} dist/ \;
+          echo "Wheels to publish:"
+          ls -lh dist/
+
+      - name: Get version from wheel
+        id: get_version
+        run: |
+          # Extract version from first wheel filename
+          wheel_name=$(ls dist/*.whl | head -1 | xargs basename)
+          # Extract version (format: kt_kernel-X.Y.Z-...)
+          version=$(echo "$wheel_name" | sed 's/kt_kernel-\([0-9.]*\)-.*/\1/')
+          echo "VERSION=$version" >> "$GITHUB_OUTPUT"
+          echo "Publishing version: $version"
+
+      - name: Publish to TestPyPI (if requested)
+        if: github.event.inputs.test_pypi == 'true'
+        uses: pypa/gh-action-pypi-publish@release/v1
+        with:
+          repository-url: https://test.pypi.org/legacy/
+          password: ${{ secrets.TEST_PYPI_API_TOKEN }}
+          skip-existing: true
+          print-hash: true
+
+      - name: Publish to PyPI
+        if: github.event.inputs.test_pypi != 'true'
+        uses: pypa/gh-action-pypi-publish@release/v1
+        with:
+          password: ${{ secrets.PYPI_API_TOKEN }}
+          skip-existing: true
+          print-hash: true
+
+      - name: Create release summary
+        env:
+          # Passed via env so the value is not expanded inside the script text.
+          VERSION: ${{ steps.get_version.outputs.VERSION }}
+          # Name and host of the index actually published to, so a TestPyPI
+          # run is not reported as a PyPI release.
+          INDEX_NAME: ${{ github.event.inputs.test_pypi == 'true' && 'TestPyPI' || 'PyPI' }}
+          INDEX_HOST: ${{ github.event.inputs.test_pypi == 'true' && 'test.pypi.org' || 'pypi.org' }}
+        run: |
+          {
+            echo "## 🎉 kt-kernel v$VERSION Published to $INDEX_NAME"
+            echo ""
+            echo "### Installation"
+            echo '```bash'
+            echo "pip install kt-kernel==$VERSION"
+            echo '```'
+            echo ""
+            echo "### Published Wheels"
+            echo "Total: $(ls -1 dist/*.whl | wc -l) wheels (3 Python versions: 3.10, 3.11, 3.12)"
+            echo ""
+            echo "### Features"
+            echo "**CPU-only build with multi-variant support:**"
+            echo "- ✅ AMX (Intel Sapphire Rapids+)"
+            echo "- ✅ AVX512 (Intel Skylake-X/Ice Lake/Cascade Lake)"
+            echo "- ✅ AVX2 (Maximum compatibility)"
+            echo ""
+            echo "**Runtime CPU detection:** Automatically selects the best variant for your CPU"
+            echo ""
+            echo "$INDEX_NAME link: https://$INDEX_HOST/project/kt-kernel/#history"
+          } >> "$GITHUB_STEP_SUMMARY"
--- a/.github/workflows/score.yml
+++ b/.github/workflows/score.yml
@@ -1,24 +0,0 @@
-name: Human Eval Score
-run-name: Human Eval Score
-on: workflow_dispatch
-jobs:
-  Human-Eval-Score:
-    runs-on: self-hosted
-    steps:
-      - run: echo "🎉 The job was automatically triggered by a ${{ github.event_name }} event."
-      - run: echo "🔎 The name of your branch is ${{ github.ref }} and your repository is ${{ github.repository }}."
-      - name: Check out repository code
-        uses: actions/checkout@v4
-      - run: echo "💡 The ${{ github.repository }} repository has been cloned to the runner."
-      - name: Human Eval Run
-        run: |
-          set -e
-          source /home/qujing3/anaconda3/etc/profile.d/conda.sh
-          conda activate ktransformers-dev
-          export PATH=/usr/local/cuda-12.4/bin:$PATH
-          export LD_LIBRARY_PATH=/usr/local/cuda-12.4/lib64:$LD_LIBRARY_PATH
-          export CUDA_HOME=/usr/local/cuda-12.4
-          cd ${{ github.workspace }}
-          python ktransformers/tests/score.py
-
-      - run: echo "This job's status is ${{ job.status }}."