[feat](kt-kernel): Add automatic deployment workflow (#1719)

Author: Jianwei Dong, 2025-12-16 15:20:06 +08:00 (committed by GitHub)
Parent: f25e58ad69
Commit: 1f79f6da92
31 changed files with 3691 additions and 552 deletions


@@ -5,9 +5,24 @@ on:
types: [published]
workflow_dispatch:
inputs:
choose:
description: 'Will you push the image to DockerHub? 0 for No, 1 for Yes'
push_to_dockerhub:
description: 'Push image to DockerHub? (true/false)'
required: true
default: 'false'
type: boolean
cuda_version:
description: 'CUDA version (e.g., 12.8.1)'
required: false
default: '12.8.1'
type: string
push_simplified_tag:
description: 'Also push simplified tag? (true/false)'
required: false
default: 'true'
type: boolean
ubuntu_mirror:
description: 'Use Tsinghua Ubuntu mirror? (0/1)'
required: false
default: '0'
type: string
@@ -20,79 +35,108 @@ jobs:
test:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v4
- name: Run tests
run: |
if [ -f docker-compose.test.yml ]; then
docker-compose --file docker-compose.test.yml build
docker-compose --file docker-compose.test.yml run sut
else
docker build . --file Dockerfile
docker build . --file docker/Dockerfile
fi
docker_task:
build-and-push:
needs: test
name: ${{ matrix.instruct }}
name: Build and Push Multi-Variant Docker Image
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
include:
# for amd64
- {instruct: "FANCY", platform: "linux/amd64"}
- {instruct: "AVX512", platform: "linux/amd64"}
- {instruct: "AVX2", platform: "linux/amd64"}
- {instruct: "NATIVE", platform: "linux/amd64"}
# for arm64
- {instruct: "NATIVE", platform: "linux/arm64"}
steps:
- name: Move Docker data directory
run: |
sudo systemctl stop docker
sudo mkdir -p /mnt/docker
sudo rsync -avz /var/lib/docker/ /mnt/docker
sudo rm -rf /var/lib/docker
sudo ln -s /mnt/docker /var/lib/docker
sudo systemctl start docker
- name: Checkout repository
uses: actions/checkout@v4
-
name: Set up QEMU
uses: docker/setup-qemu-action@v3
- name: Move Docker data directory
run: |
sudo systemctl stop docker
sudo mkdir -p /mnt/docker
sudo rsync -avz /var/lib/docker/ /mnt/docker
sudo rm -rf /var/lib/docker
sudo ln -s /mnt/docker /var/lib/docker
sudo systemctl start docker
-
name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
-
name: Login to Docker Hub
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_TOKEN }}
-
name: Build and push for amd64
if: matrix.platform == 'linux/amd64'
uses: docker/build-push-action@v6
with:
push: true
platforms: |
linux/amd64
tags: |
${{ env.DOCKERHUB_REPO }}:latest-${{ matrix.instruct }}
${{ env.DOCKERHUB_REPO }}:${{ github.event.release.tag_name }}-${{ matrix.instruct }}
build-args: |
CPU_INSTRUCT=${{ matrix.instruct }}
-
name: Build and push for arm64
if: matrix.platform == 'linux/arm64'
uses: docker/build-push-action@v6
with:
push: true
platforms: |
linux/arm64
tags: |
${{ env.DOCKERHUB_REPO }}:latest-${{ matrix.instruct }}
${{ env.DOCKERHUB_REPO }}:${{ github.event.release.tag_name }}-${{ matrix.instruct }}
build-args: |
CPU_INSTRUCT=${{ matrix.instruct }}
- name: Login to Docker Hub
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_TOKEN }}
- name: Determine build parameters
id: params
run: |
# Determine if we should push
if [ "${{ github.event_name }}" = "release" ]; then
echo "should_push=true" >> $GITHUB_OUTPUT
echo "push_simplified=true" >> $GITHUB_OUTPUT
elif [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
echo "should_push=${{ inputs.push_to_dockerhub }}" >> $GITHUB_OUTPUT
echo "push_simplified=${{ inputs.push_simplified_tag }}" >> $GITHUB_OUTPUT
else
echo "should_push=false" >> $GITHUB_OUTPUT
echo "push_simplified=false" >> $GITHUB_OUTPUT
fi
# Determine CUDA version
if [ "${{ github.event_name }}" = "workflow_dispatch" ] && [ -n "${{ inputs.cuda_version }}" ]; then
echo "cuda_version=${{ inputs.cuda_version }}" >> $GITHUB_OUTPUT
else
echo "cuda_version=12.8.1" >> $GITHUB_OUTPUT
fi
# Determine Ubuntu mirror setting
if [ "${{ github.event_name }}" = "workflow_dispatch" ] && [ -n "${{ inputs.ubuntu_mirror }}" ]; then
echo "ubuntu_mirror=${{ inputs.ubuntu_mirror }}" >> $GITHUB_OUTPUT
else
echo "ubuntu_mirror=0" >> $GITHUB_OUTPUT
fi
- name: Build and push Docker image
run: |
cd docker
# Build command arguments
BUILD_ARGS=(
--cuda-version "${{ steps.params.outputs.cuda_version }}"
--ubuntu-mirror "${{ steps.params.outputs.ubuntu_mirror }}"
--repository "${{ env.DOCKERHUB_REPO }}"
)
# Add simplified tag option if enabled
if [ "${{ steps.params.outputs.push_simplified }}" = "true" ]; then
BUILD_ARGS+=(--also-push-simplified)
fi
# Add HTTP proxy if available
if [ -n "${{ secrets.HTTP_PROXY }}" ]; then
BUILD_ARGS+=(--http-proxy "${{ secrets.HTTP_PROXY }}")
fi
# Add HTTPS proxy if available
if [ -n "${{ secrets.HTTPS_PROXY }}" ]; then
BUILD_ARGS+=(--https-proxy "${{ secrets.HTTPS_PROXY }}")
fi
# Dry run if not pushing
if [ "${{ steps.params.outputs.should_push }}" != "true" ]; then
BUILD_ARGS+=(--dry-run)
fi
# Execute build script
./push-to-dockerhub.sh "${BUILD_ARGS[@]}"
- name: Display image information
if: steps.params.outputs.should_push == 'true'
run: |
echo "::notice title=Docker Image::Image pushed successfully to ${{ env.DOCKERHUB_REPO }}"
echo "Pull command: docker pull ${{ env.DOCKERHUB_REPO }}:v\$(VERSION)-cu\$(CUDA_SHORT)"


@@ -1,71 +0,0 @@
name: Install / Test KTransformers
run-name: Install / Test KTransformers
on:
workflow_dispatch:
inputs:
job_to_run:
description: "Which job to run?"
required: true
default: "test"
type: choice
options:
- create-install-test
- install-test
- test
jobs:
Install-Test-KTransformers:
runs-on: self-hosted
steps:
- run: echo "🎉 The job was automatically triggered by a ${{ github.event_name }} event."
- run: echo "🔎 The name of your branch is ${{ github.ref }} and your repository is ${{ github.repository }}."
- name: Check out repository code
uses: actions/checkout@v4
- run: echo "💡 The ${{ github.repository }} repository has been cloned to the runner."
- name: Remove old conda environment
continue-on-error: true
if: contains(inputs.job_to_run, 'create')
run: |
source /home/qujing3/anaconda3/etc/profile.d/conda.sh
conda env remove --name ktransformers-dev -y
- name: Create conda environment
if: contains(inputs.job_to_run, 'create')
run: |
source /home/qujing3/anaconda3/etc/profile.d/conda.sh
conda create --name ktransformers-dev python=3.11
conda activate ktransformers-dev
conda install -c conda-forge libstdcxx-ng -y
- name: Install dependencies
if: contains(inputs.job_to_run, 'create')
run: |
source /home/qujing3/anaconda3/etc/profile.d/conda.sh
conda activate ktransformers-dev
pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126
pip3 install packaging ninja cpufeature numpy
pip install ~/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiTRUE-cp311-cp311-linux_x86_64.whl
- name: Install KTransformers
if: contains(inputs.job_to_run, 'install')
run: |
source /home/qujing3/anaconda3/etc/profile.d/conda.sh
conda activate ktransformers-dev
pip3 uninstall ktransformers -y
cd ${{ github.workspace }}
git submodule init
git submodule update
bash install.sh
- name: Test Local Chat 1
run: |
set -e
source /home/qujing3/anaconda3/etc/profile.d/conda.sh
conda activate ktransformers-dev
export PATH=/usr/local/cuda-12.4/bin:$PATH
export LD_LIBRARY_PATH=/usr/local/cuda-12.4/lib64:$LD_LIBRARY_PATH
export CUDA_HOME=/usr/local/cuda-12.4
cd ${{ github.workspace }}
echo "Running Local Chat 1 (book.txt) ..."
python ktransformers/local_chat_test.py --model_path /home/qujing3/models/DeepSeek-R1-Q4_K_M/config --gguf_path /home/qujing3/models/DeepSeek-R1-Q4_K_M/ --max_new_tokens 256 --cpu_infer 64 --prompt_file /home/qujing3/prompts/book.txt > log1.txt
sed -n '/Prompt:/,$p' log1.txt
echo "Running Local Chat 2 [force think] (chinese.txt) ..."
python ktransformers/local_chat_test.py --model_path /home/qujing3/models/DeepSeek-R1-Q4_K_M/config --gguf_path /home/qujing3/models/DeepSeek-R1-Q4_K_M/ --max_new_tokens 256 --cpu_infer 64 --prompt_file /home/qujing3/prompts/chinese.txt -f > log2.txt
sed -n '/Prompt:/,$p' log2.txt
- run: echo "This job's status is ${{ job.status }}."


@@ -1,231 +0,0 @@
name: Build Wheels
on:
workflow_dispatch:
inputs:
release:
description: 'Release? 1 = yes, 0 = no'
default: '0'
required: true
type: string
jobs:
build_wheels:
name: ${{ matrix.os }} Python=${{ matrix.pyver }} CUDA=${{ matrix.cuda }} CPU_INSTRUCT=${{ matrix.instruct }} Torch=${{ matrix.torch }}
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
include:
# Ubuntu
- { os: ubuntu-20.04, pyver: '3.12', cuda: '12.5.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '124'}
- { os: ubuntu-20.04, pyver: '3.12', cuda: '12.5.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'}
- { os: ubuntu-20.04, pyver: '3.12', cuda: '12.5.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'}
- { os: ubuntu-20.04, pyver: '3.12', cuda: '12.4.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '124'}
- { os: ubuntu-20.04, pyver: '3.12', cuda: '12.4.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'}
- { os: ubuntu-20.04, pyver: '3.12', cuda: '12.4.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'}
- { os: ubuntu-20.04, pyver: '3.12', cuda: '12.2.2', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '121'}
- { os: ubuntu-20.04, pyver: '3.12', cuda: '12.2.2', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
- { os: ubuntu-20.04, pyver: '3.12', cuda: '12.2.2', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
- { os: ubuntu-20.04, pyver: '3.12', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '121'}
- { os: ubuntu-20.04, pyver: '3.12', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
- { os: ubuntu-20.04, pyver: '3.12', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
- { os: ubuntu-20.04, pyver: '3.12', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '121'}
- { os: ubuntu-20.04, pyver: '3.12', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
- { os: ubuntu-20.04, pyver: '3.12', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
- { os: ubuntu-20.04, pyver: '3.12', cuda: '12.1.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '121'}
- { os: ubuntu-20.04, pyver: '3.12', cuda: '12.1.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
- { os: ubuntu-20.04, pyver: '3.12', cuda: '12.1.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
- { os: ubuntu-20.04, pyver: '3.11', cuda: '12.5.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '124'}
- { os: ubuntu-20.04, pyver: '3.11', cuda: '12.5.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'}
- { os: ubuntu-20.04, pyver: '3.11', cuda: '12.5.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'}
- { os: ubuntu-20.04, pyver: '3.11', cuda: '12.4.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '124'}
- { os: ubuntu-20.04, pyver: '3.11', cuda: '12.4.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'}
- { os: ubuntu-20.04, pyver: '3.11', cuda: '12.4.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'}
- { os: ubuntu-20.04, pyver: '3.11', cuda: '12.2.2', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '121'}
- { os: ubuntu-20.04, pyver: '3.11', cuda: '12.2.2', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
- { os: ubuntu-20.04, pyver: '3.11', cuda: '12.2.2', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
- { os: ubuntu-20.04, pyver: '3.11', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '121'}
- { os: ubuntu-20.04, pyver: '3.11', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
- { os: ubuntu-20.04, pyver: '3.11', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
- { os: ubuntu-20.04, pyver: '3.11', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '121'}
- { os: ubuntu-20.04, pyver: '3.11', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
- { os: ubuntu-20.04, pyver: '3.11', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
- { os: ubuntu-20.04, pyver: '3.11', cuda: '12.1.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '121'}
- { os: ubuntu-20.04, pyver: '3.11', cuda: '12.1.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
- { os: ubuntu-20.04, pyver: '3.11', cuda: '12.1.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
- { os: ubuntu-20.04, pyver: '3.10', cuda: '12.5.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '124'}
- { os: ubuntu-20.04, pyver: '3.10', cuda: '12.5.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'}
- { os: ubuntu-20.04, pyver: '3.10', cuda: '12.5.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'}
- { os: ubuntu-20.04, pyver: '3.10', cuda: '12.4.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '124'}
- { os: ubuntu-20.04, pyver: '3.10', cuda: '12.4.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'}
- { os: ubuntu-20.04, pyver: '3.10', cuda: '12.4.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'}
- { os: ubuntu-20.04, pyver: '3.10', cuda: '12.2.2', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '121'}
- { os: ubuntu-20.04, pyver: '3.10', cuda: '12.2.2', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
- { os: ubuntu-20.04, pyver: '3.10', cuda: '12.2.2', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
- { os: ubuntu-20.04, pyver: '3.10', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '121'}
- { os: ubuntu-20.04, pyver: '3.10', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
- { os: ubuntu-20.04, pyver: '3.10', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
- { os: ubuntu-20.04, pyver: '3.10', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '121'}
- { os: ubuntu-20.04, pyver: '3.10', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
- { os: ubuntu-20.04, pyver: '3.10', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
- { os: ubuntu-20.04, pyver: '3.10', cuda: '12.1.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '121'}
- { os: ubuntu-20.04, pyver: '3.10', cuda: '12.1.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
- { os: ubuntu-20.04, pyver: '3.10', cuda: '12.1.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
# Windows
- { os: windows-2022, pyver: '3.12', cuda: '12.5.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'}
- { os: windows-2022, pyver: '3.12', cuda: '12.5.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'}
- { os: windows-2022, pyver: '3.12', cuda: '12.4.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'}
- { os: windows-2022, pyver: '3.12', cuda: '12.4.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'}
- { os: windows-2022, pyver: '3.12', cuda: '12.2.2', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
- { os: windows-2022, pyver: '3.12', cuda: '12.2.2', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
- { os: windows-2022, pyver: '3.12', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
- { os: windows-2022, pyver: '3.12', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
- { os: windows-2022, pyver: '3.12', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
- { os: windows-2022, pyver: '3.12', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
- { os: windows-2022, pyver: '3.12', cuda: '12.1.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
- { os: windows-2022, pyver: '3.12', cuda: '12.1.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
- { os: windows-2022, pyver: '3.11', cuda: '12.5.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'}
- { os: windows-2022, pyver: '3.11', cuda: '12.5.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'}
- { os: windows-2022, pyver: '3.11', cuda: '12.4.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'}
- { os: windows-2022, pyver: '3.11', cuda: '12.4.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'}
- { os: windows-2022, pyver: '3.11', cuda: '12.2.2', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
- { os: windows-2022, pyver: '3.11', cuda: '12.2.2', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
- { os: windows-2022, pyver: '3.11', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
- { os: windows-2022, pyver: '3.11', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
- { os: windows-2022, pyver: '3.11', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
- { os: windows-2022, pyver: '3.11', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
- { os: windows-2022, pyver: '3.11', cuda: '12.1.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
- { os: windows-2022, pyver: '3.11', cuda: '12.1.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
- { os: windows-2022, pyver: '3.10', cuda: '12.5.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'}
- { os: windows-2022, pyver: '3.10', cuda: '12.5.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'}
- { os: windows-2022, pyver: '3.10', cuda: '12.4.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '124'}
- { os: windows-2022, pyver: '3.10', cuda: '12.4.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'}
- { os: windows-2022, pyver: '3.10', cuda: '12.2.2', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
- { os: windows-2022, pyver: '3.10', cuda: '12.2.2', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
- { os: windows-2022, pyver: '3.10', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
- { os: windows-2022, pyver: '3.10', cuda: '12.1.1', torch: '2.4.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
- { os: windows-2022, pyver: '3.10', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
- { os: windows-2022, pyver: '3.10', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
- { os: windows-2022, pyver: '3.10', cuda: '12.1.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX512', torch_cu: '121'}
- { os: windows-2022, pyver: '3.10', cuda: '12.1.1', torch: '2.3.0', cudaarch: '8.0;8.6;8.7;8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
defaults:
run:
shell: pwsh
steps:
- uses: actions/checkout@v3
- name: Free Disk Space
uses: jlumbroso/free-disk-space@v1.3.1
if: runner.os == 'Linux'
with:
tool-cache: true
android: true
dotnet: true
haskell: true
large-packages: false
swap-storage: true
- uses: actions/setup-python@v4
with:
python-version: ${{ matrix.pyver }}
- name: check_space
run: |
if($IsLinux) {df -h}
if($IsWindows) {Get-PSDrive -PSProvider 'FileSystem'}
- uses: actions/setup-node@v4
with:
node-version: 20
- name: Setup Mamba
if: matrix.cuda != ''
uses: conda-incubator/setup-miniconda@v3
with:
activate-environment: "ktransformers"
python-version: ${{ matrix.pyver }}
miniforge-variant: Miniforge3
miniforge-version: latest
use-mamba: true
add-pip-as-python-dependency: true
auto-activate-base: false
- name: build web
run: |
cd ktransformers/website/
npm install
npm run build
cd ../../
- name: build for cuda
if: matrix.cuda != ''
env:
USE_BALANCE_SERVE: "1"
run: |
git submodule init
git submodule update
if($IsWindows){
$originalPath = Get-Location
Import-Module 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
Enter-VsDevShell -VsInstallPath 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise' -DevCmdArguments '-arch=x64 -host_arch=x64'
$env:DISTUTILS_USE_SDK=1
Set-Location $originalPath
}
$cudaVersion = '${{ matrix.cuda }}'
$env:MAMBA_NO_LOW_SPEED_LIMIT = 1
mamba install -y -c nvidia/label/cuda-$cudaVersion cuda-toolkit cuda-runtime
$env:CUDA_PATH = $env:CONDA_PREFIX
$env:CUDA_HOME = $env:CONDA_PREFIX
if ($IsLinux) {
$env:LD_LIBRARY_PATH = $env:CONDA_PREFIX + '/lib:' + $env:LD_LIBRARY_PATH
$env:LD_LIBRARY_PATH = $env:CONDA_PREFIX + '/lib/python${{ matrix.pyver }}/site-packages/nvidia/nvjitlink/lib:' + $env:LD_LIBRARY_PATH
if (!(Test-Path $env:CUDA_HOME/lib64)) {
New-Item -ItemType SymbolicLink -Path $env:CUDA_HOME/lib64 -Target $env:CUDA_HOME/lib
}
}
if ($IsWindows) {
if (Test-Path -Path "$env:CUDA_PATH/Library/bin/nvcc.exe"){
$env:CUDA_PATH = "$env:CUDA_PATH/Library"
$env:CUDA_HOME = $env:CUDA_PATH
}
$env:PATH = "$env:CUDA_PATH/bin;" + $env:PATH
$directory = "$env:CUDA_PATH/lib/x64/"
if (-not (Test-Path -Path $directory)) {
New-Item -ItemType Directory -Path $directory
Write-Output "Directory '$directory' created."
}
cp $env:CUDA_PATH/lib/*.lib $env:CUDA_PATH/lib/x64/
$env:INCLUDE =$env:CUDA_PATH + "/include/targets/x64;" + $env:INCLUDE
$env:INCLUDE =$env:CONDA_PREFIX + "/include;" + $env:INCLUDE
}
python -m pip install torch==${{ matrix.torch }} torchvision torchaudio --index-url https://download.pytorch.org/whl/cu${{ matrix.torch_cu }}
python -m pip install cpufeature build wheel ninja packaging setuptools
$env:KTRANSFORMERS_FORCE_BUILD = "TRUE"
$env:CPU_INSTRUCT = '${{ matrix.instruct }}'
$env:TORCH_CUDA_ARCH_LIST = '${{ matrix.cudaarch }}'
python -m build --no-isolation --verbose
- name: create Release dir
run: |
if ($IsWindows) {
$env:date = $(Get-Date -Format "yyyy-MM-dd")
New-Item -ItemType Directory -Force -Path "$Env:USERPROFILE\.ssh"
$Env:SSH_PATH = "$Env:USERPROFILE\.ssh\id_rsa"
Set-Content -Path $Env:SSH_PATH -Value "${{ secrets.SSH_PRIVATE_KEY }}"
(Get-Content -Path $Env:SSH_PATH).Replace("`r`n","`n") | Set-Content -Path $Env:SSH_PATH
chmod 600 $Env:SSH_PATH
}
if ($IsLinux) {
$env:date = $(date +%Y-%m-%d)
mkdir -p ~/.ssh/
echo "${{ secrets.SSH_PRIVATE_KEY }}" > ~/.ssh/id_rsa
chmod 600 ~/.ssh/id_rsa
}
ssh -p ${{ secrets.SSH_PORT }} -o StrictHostKeyChecking=no root@${{ secrets.SSH_SERVER }} "mkdir -p /mnt/data/release-$env:date"
scp -P ${{ secrets.SSH_PORT }} -o StrictHostKeyChecking=no dist/*.whl root@${{ secrets.SSH_SERVER }}:/mnt/data/release-$env:date/


@@ -1,141 +0,0 @@
name: Build Wheels Tests
on:
workflow_dispatch:
inputs:
release:
description: 'Release? 1 = yes, 0 = no'
default: '0'
required: true
type: string
jobs:
build_wheels:
name: ${{ matrix.os }} Python=${{ matrix.pyver }} CUDA=${{ matrix.cuda }} CPU_INSTRUCT=${{ matrix.instruct }} Torch=${{ matrix.torch }}
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
include:
# Ubuntu
- { os: ubuntu-20.04, pyver: '3.12', cuda: '12.5.1', torch: '2.4.0', cudaarch: '8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '124'}
- { os: ubuntu-20.04, pyver: '3.12', cuda: '12.2.2', torch: '2.3.0', cudaarch: '8.9;9.0+PTX', instruct: 'FANCY', torch_cu: '121'}
- { os: windows-2022, pyver: '3.11', cuda: '12.5.1', torch: '2.4.0', cudaarch: '8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '124'}
- { os: windows-2022, pyver: '3.12', cuda: '12.1.1', torch: '2.3.0', cudaarch: '8.9;9.0+PTX', instruct: 'AVX2', torch_cu: '121'}
defaults:
run:
shell: pwsh
steps:
- uses: actions/checkout@v3
- name: Free Disk Space
uses: jlumbroso/free-disk-space@v1.3.1
if: runner.os == 'Linux'
with:
tool-cache: true
android: true
dotnet: true
haskell: true
large-packages: false
swap-storage: true
- uses: actions/setup-python@v4
with:
python-version: ${{ matrix.pyver }}
- name: check_space
run: |
if($IsLinux) {df -h}
if($IsWindows) {Get-PSDrive -PSProvider 'FileSystem'}
- uses: actions/setup-node@v4
with:
node-version: 20
- name: Setup Mamba
if: matrix.cuda != ''
uses: conda-incubator/setup-miniconda@v3
with:
activate-environment: "ktransformers"
python-version: ${{ matrix.pyver }}
miniforge-variant: Miniforge3
miniforge-version: latest
use-mamba: true
add-pip-as-python-dependency: true
auto-activate-base: false
- name: build web
run: |
cd ktransformers/website/
npm install
npm run build
cd ../../
- name: build for cuda
if: matrix.cuda != ''
run: |
git submodule init
git submodule update
if($IsWindows){
$originalPath = Get-Location
Import-Module 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
Enter-VsDevShell -VsInstallPath 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise' -DevCmdArguments '-arch=x64 -host_arch=x64'
$env:DISTUTILS_USE_SDK=1
Set-Location $originalPath
}
$cudaVersion = '${{ matrix.cuda }}'
$env:MAMBA_NO_LOW_SPEED_LIMIT = 1
mamba install -y -c nvidia/label/cuda-$cudaVersion cuda-toolkit cuda-runtime
$env:CUDA_PATH = $env:CONDA_PREFIX
$env:CUDA_HOME = $env:CONDA_PREFIX
if ($IsLinux) {
$env:LD_LIBRARY_PATH = $env:CONDA_PREFIX + '/lib:' + $env:LD_LIBRARY_PATH
$env:LD_LIBRARY_PATH = $env:CONDA_PREFIX + '/lib/python${{ matrix.pyver }}/site-packages/nvidia/nvjitlink/lib:' + $env:LD_LIBRARY_PATH
if (!(Test-Path $env:CUDA_HOME/lib64)) {
New-Item -ItemType SymbolicLink -Path $env:CUDA_HOME/lib64 -Target $env:CUDA_HOME/lib
}
}
if ($IsWindows) {
if (Test-Path -Path "$env:CUDA_PATH/Library/bin/nvcc.exe"){
$env:CUDA_PATH = "$env:CUDA_PATH/Library"
$env:CUDA_HOME = $env:CUDA_PATH
}
$env:PATH = "$env:CUDA_PATH/bin;" + $env:PATH
$directory = "$env:CUDA_PATH/lib/x64/"
if (-not (Test-Path -Path $directory)) {
New-Item -ItemType Directory -Path $directory
Write-Output "Directory '$directory' created."
}
cp $env:CUDA_PATH/lib/*.lib $env:CUDA_PATH/lib/x64/
$env:INCLUDE =$env:CUDA_PATH + "/include/targets/x64;" + $env:INCLUDE
$env:INCLUDE =$env:CONDA_PREFIX + "/include;" + $env:INCLUDE
}
python -m pip install torch==${{ matrix.torch }} torchvision torchaudio --index-url https://download.pytorch.org/whl/cu${{ matrix.torch_cu }}
python -m pip install cpufeature build wheel ninja packaging setuptools
$env:KTRANSFORMERS_FORCE_BUILD = "TRUE"
$env:CPU_INSTRUCT = '${{ matrix.instruct }}'
$env:TORCH_CUDA_ARCH_LIST = '${{ matrix.cudaarch }}'
python -m build --no-isolation --verbose
- name: create Release dir
run: |
if ($IsWindows) {
$env:date = $(Get-Date -Format "yyyy-MM-dd")
New-Item -ItemType Directory -Force -Path "$Env:USERPROFILE\.ssh"
$Env:SSH_PATH = "$Env:USERPROFILE\.ssh\id_rsa"
Set-Content -Path $Env:SSH_PATH -Value "${{ secrets.SSH_PRIVATE_KEY }}"
(Get-Content -Path $Env:SSH_PATH).Replace("`r`n","`n") | Set-Content -Path $Env:SSH_PATH
chmod 600 $Env:SSH_PATH
}
if ($IsLinux) {
$env:date = $(date +%Y-%m-%d)
mkdir -p ~/.ssh/
echo "${{ secrets.SSH_PRIVATE_KEY }}" > ~/.ssh/id_rsa
chmod 600 ~/.ssh/id_rsa
}
ssh -p ${{ secrets.SSH_PORT }} -o StrictHostKeyChecking=no root@${{ secrets.SSH_SERVER }} "mkdir -p /mnt/data/release-$env:date"
scp -P ${{ secrets.SSH_PORT }} -o StrictHostKeyChecking=no dist/*.whl root@${{ secrets.SSH_SERVER }}:/mnt/data/release-$env:date/

.github/workflows/release-fake-tag.yml (new file, 36 lines)

@@ -0,0 +1,36 @@
name: Release Fake Tag
on:
push:
branches:
- main
paths:
- "version.py"
workflow_dispatch:
permissions:
contents: write
jobs:
publish:
if: github.repository == 'kvcache-ai/ktransformers'
runs-on: ubuntu-latest
environment: 'prod'
steps:
- name: Checkout repository
uses: actions/checkout@v4
with:
token: ${{ secrets.GITHUB_TOKEN }}
- name: Get version
id: get_version
run: |
version=$(cat version.py | grep '__version__' | cut -d'"' -f2)
echo "TAG=v$version" >> $GITHUB_OUTPUT
- name: Create and push tag
run: |
git config user.name "ktransformers-bot"
git config user.email "ktransformers-bot@users.noreply.github.com"
git tag ${{ steps.get_version.outputs.TAG }}
git push origin ${{ steps.get_version.outputs.TAG }}
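The tag derivation in the `Get version` step can be sanity-checked locally. A minimal sketch, assuming `version.py` defines `__version__` as a double-quoted string, as the workflow expects:

```bash
# Reproduce the workflow's grep/cut extraction against a sample version.py.
echo '__version__ = "0.4.3"' > /tmp/version.py
version=$(grep '__version__' /tmp/version.py | cut -d'"' -f2)
echo "TAG=v$version"  # prints: TAG=v0.4.3
```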

.github/workflows/release-pypi.yml (new file, 163 lines)

@@ -0,0 +1,163 @@
name: Release to PyPI
on:
push:
branches:
- main
paths:
- "version.py"
workflow_dispatch:
inputs:
test_pypi:
description: 'Publish to TestPyPI instead of PyPI (for testing)'
required: false
default: 'false'
type: choice
options:
- 'true'
- 'false'
permissions:
contents: read
jobs:
build-kt-kernel:
name: Build kt-kernel CPU-only (Python ${{ matrix.python-version }})
runs-on: ubuntu-20.04
strategy:
fail-fast: false
matrix:
python-version: ['3.10', '3.11', '3.12']
steps:
- name: Checkout repository
uses: actions/checkout@v4
with:
submodules: recursive
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
- name: Install system dependencies
run: |
sudo apt-get update
sudo apt-get install -y cmake libhwloc-dev pkg-config libnuma-dev
- name: Install Python build tools
run: |
python -m pip install --upgrade pip
pip install build wheel setuptools
- name: Build kt-kernel wheel (CPU-only, multi-variant)
working-directory: kt-kernel
env:
CPUINFER_BUILD_ALL_VARIANTS: '1'
CPUINFER_USE_CUDA: '0'
CPUINFER_BUILD_TYPE: 'Release'
CPUINFER_PARALLEL: '4'
CPUINFER_FORCE_REBUILD: '1'
run: |
echo "Building kt-kernel CPU-only with all CPU variants (AMX, AVX512, AVX2)"
python -m build --wheel --no-isolation -v
- name: List generated wheels
working-directory: kt-kernel
run: |
echo "Generated wheels:"
ls -lh dist/
- name: Test wheel import
working-directory: kt-kernel
run: |
pip install dist/*.whl
python -c "import kt_kernel; print('✓ Import successful'); print(f'CPU variant detected: {kt_kernel.__cpu_variant__}'); print(f'Version: {kt_kernel.__version__}')"
- name: Verify wheel contains all variants
working-directory: kt-kernel
run: |
echo "Checking wheel contents for CPU variants..."
python -m zipfile -l dist/*.whl | grep "_kt_kernel_ext_" || echo "ERROR: No variant .so files found!"
python -m zipfile -l dist/*.whl | grep "_kt_kernel_ext_amx.cpython" && echo "✓ AMX variant found" || echo "✗ AMX variant missing"
python -m zipfile -l dist/*.whl | grep "_kt_kernel_ext_avx512.cpython" && echo "✓ AVX512 variant found" || echo "✗ AVX512 variant missing"
python -m zipfile -l dist/*.whl | grep "_kt_kernel_ext_avx2.cpython" && echo "✓ AVX2 variant found" || echo "✗ AVX2 variant missing"
- name: Upload wheel artifact
uses: actions/upload-artifact@v3
with:
name: kt-kernel-wheels-py${{ matrix.python-version }}
path: kt-kernel/dist/*.whl
retention-days: 7
publish-pypi:
name: Publish to PyPI
needs: build-kt-kernel
runs-on: ubuntu-latest
if: github.repository == 'kvcache-ai/ktransformers' && github.ref == 'refs/heads/main'
environment: prod
permissions:
id-token: write # For trusted publishing (OIDC)
contents: read
steps:
- name: Download all wheel artifacts
uses: actions/download-artifact@v3
with:
path: artifacts/
- name: Organize wheels into dist/
run: |
mkdir -p dist/
find artifacts/ -name "*.whl" -exec cp {} dist/ \;
echo "Wheels to publish:"
ls -lh dist/
- name: Get version from wheel
id: get_version
run: |
# Extract version from first wheel filename
wheel_name=$(ls dist/*.whl | head -1 | xargs basename)
# Extract version (format: kt_kernel-X.Y.Z-...)
version=$(echo "$wheel_name" | sed 's/kt_kernel-\([0-9.]*\)-.*/\1/')
echo "VERSION=$version" >> $GITHUB_OUTPUT
echo "Publishing version: $version"
- name: Publish to TestPyPI (if requested)
if: github.event.inputs.test_pypi == 'true'
uses: pypa/gh-action-pypi-publish@release/v1
with:
repository-url: https://test.pypi.org/legacy/
password: ${{ secrets.TEST_PYPI_API_TOKEN }}
skip-existing: true
print-hash: true
- name: Publish to PyPI
if: github.event.inputs.test_pypi != 'true'
uses: pypa/gh-action-pypi-publish@release/v1
with:
password: ${{ secrets.PYPI_API_TOKEN }}
skip-existing: true
print-hash: true
- name: Create release summary
run: |
echo "## 🎉 kt-kernel v${{ steps.get_version.outputs.VERSION }} Published to PyPI" >> $GITHUB_STEP_SUMMARY
echo "" >> $GITHUB_STEP_SUMMARY
echo "### Installation" >> $GITHUB_STEP_SUMMARY
echo '```bash' >> $GITHUB_STEP_SUMMARY
echo "pip install kt-kernel==${{ steps.get_version.outputs.VERSION }}" >> $GITHUB_STEP_SUMMARY
echo '```' >> $GITHUB_STEP_SUMMARY
echo "" >> $GITHUB_STEP_SUMMARY
echo "### Published Wheels" >> $GITHUB_STEP_SUMMARY
echo "Total: $(ls -1 dist/*.whl | wc -l) wheels (3 Python versions: 3.10, 3.11, 3.12)" >> $GITHUB_STEP_SUMMARY
echo "" >> $GITHUB_STEP_SUMMARY
echo "### Features" >> $GITHUB_STEP_SUMMARY
echo "**CPU-only build with multi-variant support:**" >> $GITHUB_STEP_SUMMARY
echo "- ✅ AMX (Intel Sapphire Rapids+)" >> $GITHUB_STEP_SUMMARY
echo "- ✅ AVX512 (Intel Skylake-X/Ice Lake/Cascade Lake)" >> $GITHUB_STEP_SUMMARY
echo "- ✅ AVX2 (Maximum compatibility)" >> $GITHUB_STEP_SUMMARY
echo "" >> $GITHUB_STEP_SUMMARY
echo "**Runtime CPU detection:** Automatically selects the best variant for your CPU" >> $GITHUB_STEP_SUMMARY
echo "" >> $GITHUB_STEP_SUMMARY
echo "PyPI link: https://pypi.org/project/kt-kernel/#history" >> $GITHUB_STEP_SUMMARY


@@ -1,24 +0,0 @@
name: Human Eval Score
run-name: Human Eval Score
on: workflow_dispatch
jobs:
Human-Eval-Score:
runs-on: self-hosted
steps:
- run: echo "🎉 The job was automatically triggered by a ${{ github.event_name }} event."
- run: echo "🔎 The name of your branch is ${{ github.ref }} and your repository is ${{ github.repository }}."
- name: Check out repository code
uses: actions/checkout@v4
- run: echo "💡 The ${{ github.repository }} repository has been cloned to the runner."
- name: Human Eval Run
run: |
set -e
source /home/qujing3/anaconda3/etc/profile.d/conda.sh
conda activate ktransformers-dev
export PATH=/usr/local/cuda-12.4/bin:$PATH
export LD_LIBRARY_PATH=/usr/local/cuda-12.4/lib64:$LD_LIBRARY_PATH
export CUDA_HOME=/usr/local/cuda-12.4
cd ${{ github.workspace }}
python ktransformers/tests/score.py
- run: echo "This job's status is ${{ job.status }}."

docker/Dockerfile (new file, 408 lines)

@@ -0,0 +1,408 @@
ARG CUDA_VERSION=12.8.1
FROM docker.1ms.run/nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu24.04 AS base
ARG TARGETARCH
ARG GRACE_BLACKWELL=0
ARG HOPPER_SBO=0
ARG CPU_VARIANT=x86-intel-multi
ARG BUILD_ALL_CPU_VARIANTS=1
# Proxy settings for build-time network access
ARG HTTP_PROXY
ARG HTTPS_PROXY
ARG http_proxy
ARG https_proxy
ENV HTTP_PROXY=${HTTP_PROXY} \
HTTPS_PROXY=${HTTPS_PROXY} \
http_proxy=${http_proxy} \
https_proxy=${https_proxy}
ARG GRACE_BLACKWELL_DEEPEP_BRANCH=gb200_blog_part_2
ARG HOPPER_SBO_DEEPEP_COMMIT=9f2fc4b3182a51044ae7ecb6610f7c9c3258c4d6
ARG DEEPEP_COMMIT=9af0e0d0e74f3577af1979c9b9e1ac2cad0104ee
ARG BUILD_AND_DOWNLOAD_PARALLEL=8
ARG SGL_KERNEL_VERSION=0.3.19
ARG SGL_VERSION=0.5.6.post1
ARG USE_LATEST_SGLANG=0
ARG GDRCOPY_VERSION=2.5.1
ARG UBUNTU_MIRROR
ARG GITHUB_ARTIFACTORY=github.com
ARG FLASHINFER_VERSION=0.5.3
# ktransformers wheel version (cu128torch28 for CUDA 12.8 + PyTorch 2.8)
ARG KTRANSFORMERS_VERSION=0.4.2
ARG KTRANSFORMERS_WHEEL=ktransformers-0.4.2+cu128torch28fancy-cp312-cp312-linux_x86_64.whl
# flash_attn wheel for fine-tune env
ARG FLASH_ATTN_WHEEL=flash_attn-2.8.3+cu12torch2.8cxx11abiTRUE-cp312-cp312-linux_x86_64.whl
ENV DEBIAN_FRONTEND=noninteractive \
CUDA_HOME=/usr/local/cuda \
GDRCOPY_HOME=/usr/src/gdrdrv-${GDRCOPY_VERSION}/ \
FLASHINFER_VERSION=${FLASHINFER_VERSION}
# Add GKE default lib and bin locations
ENV PATH="${PATH}:/usr/local/nvidia/bin" \
LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/nvidia/lib:/usr/local/nvidia/lib64"
# Replace Ubuntu sources with Tsinghua mirror for Ubuntu 24.04 (noble)
RUN if [ -n "$UBUNTU_MIRROR" ]; then \
echo "deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ noble main restricted universe multiverse" > /etc/apt/sources.list && \
echo "deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ noble-updates main restricted universe multiverse" >> /etc/apt/sources.list && \
echo "deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ noble-backports main restricted universe multiverse" >> /etc/apt/sources.list && \
echo "deb http://security.ubuntu.com/ubuntu/ noble-security main restricted universe multiverse" >> /etc/apt/sources.list && \
rm -f /etc/apt/sources.list.d/ubuntu.sources; \
fi
# Install system dependencies (organized by category for better caching)
RUN --mount=type=cache,target=/var/cache/apt,id=base-apt \
echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
&& apt-get update && apt-get install -y --no-install-recommends --allow-change-held-packages \
# Core system utilities
tzdata \
ca-certificates \
software-properties-common \
netcat-openbsd \
kmod \
unzip \
openssh-server \
curl \
wget \
lsof \
locales \
# Build essentials
build-essential \
cmake \
perl \
patchelf \
ccache \
git \
git-lfs \
# MPI and NUMA
libopenmpi-dev \
libnuma1 \
libnuma-dev \
numactl \
# transformers multimodal VLM
ffmpeg \
# InfiniBand/RDMA
libibverbs-dev \
libibverbs1 \
libibumad3 \
librdmacm1 \
libnl-3-200 \
libnl-route-3-200 \
libnl-route-3-dev \
libnl-3-dev \
ibverbs-providers \
infiniband-diags \
perftest \
# Development libraries
libgoogle-glog-dev \
libgtest-dev \
libjsoncpp-dev \
libunwind-dev \
libboost-all-dev \
libssl-dev \
libgrpc-dev \
libgrpc++-dev \
libprotobuf-dev \
protobuf-compiler \
protobuf-compiler-grpc \
pybind11-dev \
libhiredis-dev \
libcurl4-openssl-dev \
libczmq4 \
libczmq-dev \
libfabric-dev \
# Package building tools
devscripts \
debhelper \
fakeroot \
dkms \
check \
libsubunit0 \
libsubunit-dev \
# Development tools
gdb \
ninja-build \
vim \
tmux \
htop \
zsh \
tree \
less \
rdma-core \
# NCCL
libnccl2 \
libnccl-dev \
&& rm -rf /var/lib/apt/lists/* \
&& apt-get clean
# GDRCopy installation
RUN mkdir -p /tmp/gdrcopy && cd /tmp \
&& curl --retry 3 --retry-delay 2 -fsSL -o v${GDRCOPY_VERSION}.tar.gz \
https://${GITHUB_ARTIFACTORY}/NVIDIA/gdrcopy/archive/refs/tags/v${GDRCOPY_VERSION}.tar.gz \
&& tar -xzf v${GDRCOPY_VERSION}.tar.gz && rm v${GDRCOPY_VERSION}.tar.gz \
&& cd gdrcopy-${GDRCOPY_VERSION}/packages \
&& CUDA=/usr/local/cuda ./build-deb-packages.sh \
&& dpkg -i gdrdrv-dkms_*.deb libgdrapi_*.deb gdrcopy-tests_*.deb gdrcopy_*.deb \
&& cd / && rm -rf /tmp/gdrcopy
# Fix DeepEP IBGDA symlink
RUN ln -sf /usr/lib/$(uname -m)-linux-gnu/libmlx5.so.1 /usr/lib/$(uname -m)-linux-gnu/libmlx5.so
# Set up locale
RUN locale-gen en_US.UTF-8
ENV LANG=en_US.UTF-8 \
LANGUAGE=en_US:en \
LC_ALL=en_US.UTF-8
########################################################
########## Install Miniconda ###########################
########################################################
RUN mkdir -p /opt/miniconda3 \
&& wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /opt/miniconda3/miniconda.sh \
&& bash /opt/miniconda3/miniconda.sh -b -u -p /opt/miniconda3 \
&& rm /opt/miniconda3/miniconda.sh
# Add conda to PATH
ENV PATH="/opt/miniconda3/bin:${PATH}"
# Accept conda TOS
RUN conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/main \
&& conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/r
# Configure conda to use Tsinghua mirror
RUN conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/main \
&& conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free \
&& conda config --set show_channel_urls yes
########################################################
########## Dual Conda Environment Setup ################
########################################################
FROM base AS framework
ARG CUDA_VERSION
ARG BUILD_AND_DOWNLOAD_PARALLEL
ARG SGL_KERNEL_VERSION
ARG SGL_VERSION
ARG USE_LATEST_SGLANG
ARG FLASHINFER_VERSION
ARG GRACE_BLACKWELL
ARG GRACE_BLACKWELL_DEEPEP_BRANCH
ARG HOPPER_SBO
ARG HOPPER_SBO_DEEPEP_COMMIT
ARG DEEPEP_COMMIT
ARG GITHUB_ARTIFACTORY
ARG KTRANSFORMERS_VERSION
ARG KTRANSFORMERS_WHEEL
ARG FLASH_ATTN_WHEEL
WORKDIR /workspace
# Create two conda environments with Python 3.12
RUN conda create -n serve python=3.12 -y \
&& conda create -n fine-tune python=3.12 -y
# Set pip mirror for both conda envs
RUN /opt/miniconda3/envs/serve/bin/pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple \
&& /opt/miniconda3/envs/fine-tune/bin/pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
# Clone repositories
# Use kvcache-ai/sglang fork with kimi_k2 branch
RUN git clone https://${GITHUB_ARTIFACTORY}/kvcache-ai/sglang.git /workspace/sglang \
&& cd /workspace/sglang && git checkout kimi_k2
RUN git clone --depth 1 https://${GITHUB_ARTIFACTORY}/hiyouga/LLaMA-Factory.git /workspace/LLaMA-Factory \
&& git clone --depth 1 https://${GITHUB_ARTIFACTORY}/kvcache-ai/ktransformers.git /workspace/ktransformers \
&& cd /workspace/ktransformers && git submodule update --init --recursive
# Download ktransformers wheel and flash_attn wheel for fine-tune env
RUN curl --retry 3 --retry-delay 2 -fsSL -o /workspace/${KTRANSFORMERS_WHEEL} \
https://${GITHUB_ARTIFACTORY}/kvcache-ai/ktransformers/releases/download/v${KTRANSFORMERS_VERSION}/${KTRANSFORMERS_WHEEL} \
&& curl --retry 3 --retry-delay 2 -fsSL -o /workspace/${FLASH_ATTN_WHEEL} \
https://${GITHUB_ARTIFACTORY}/Dao-AILab/flash-attention/releases/download/v2.8.3/${FLASH_ATTN_WHEEL}
########################################################
# Environment 1: serve (sglang + kt-kernel)
########################################################
# Upgrade pip and install basic tools in serve env
RUN --mount=type=cache,target=/root/.cache/pip \
/opt/miniconda3/envs/serve/bin/pip install --upgrade pip setuptools wheel html5lib six
# Install sgl-kernel
RUN --mount=type=cache,target=/root/.cache/pip \
case "$CUDA_VERSION" in \
12.6.1) CUINDEX=126 ;; \
12.8.1) CUINDEX=128 ;; \
12.9.1) CUINDEX=129 ;; \
13.0.1) CUINDEX=130 ;; \
*) echo "Unsupported CUDA version: $CUDA_VERSION" && exit 1 ;; \
esac \
&& if [ "$CUDA_VERSION" = "12.6.1" ]; then \
/opt/miniconda3/envs/serve/bin/pip install https://${GITHUB_ARTIFACTORY}/sgl-project/whl/releases/download/v${SGL_KERNEL_VERSION}/sgl_kernel-${SGL_KERNEL_VERSION}+cu124-cp310-abi3-manylinux2014_$(uname -m).whl --force-reinstall --no-deps \
; \
elif [ "$CUDA_VERSION" = "12.8.1" ] || [ "$CUDA_VERSION" = "12.9.1" ]; then \
/opt/miniconda3/envs/serve/bin/pip install sgl-kernel==${SGL_KERNEL_VERSION} \
; \
elif [ "$CUDA_VERSION" = "13.0.1" ]; then \
/opt/miniconda3/envs/serve/bin/pip install https://github.com/sgl-project/whl/releases/download/v${SGL_KERNEL_VERSION}/sgl_kernel-${SGL_KERNEL_VERSION}+cu130-cp310-abi3-manylinux2014_$(uname -m).whl --force-reinstall --no-deps \
; \
fi
# Install SGLang in serve env
RUN --mount=type=cache,target=/root/.cache/pip \
case "$CUDA_VERSION" in \
12.6.1) CUINDEX=126 ;; \
12.8.1) CUINDEX=128 ;; \
12.9.1) CUINDEX=129 ;; \
13.0.1) CUINDEX=130 ;; \
esac \
&& cd /workspace/sglang \
&& /opt/miniconda3/envs/serve/bin/pip install -e "python[all]" --extra-index-url https://download.pytorch.org/whl/cu${CUINDEX}
# Download FlashInfer cubin for serve env
RUN --mount=type=cache,target=/root/.cache/pip \
FLASHINFER_CUBIN_DOWNLOAD_THREADS=${BUILD_AND_DOWNLOAD_PARALLEL} FLASHINFER_LOGGING_LEVEL=warning \
/opt/miniconda3/envs/serve/bin/python -m flashinfer --download-cubin
# Install DeepEP in serve env
RUN set -eux; \
if [ "$GRACE_BLACKWELL" = "1" ]; then \
git clone https://github.com/fzyzcjy/DeepEP.git /workspace/DeepEP && \
cd /workspace/DeepEP && \
git checkout ${GRACE_BLACKWELL_DEEPEP_BRANCH} && \
sed -i 's/#define NUM_CPU_TIMEOUT_SECS 100/#define NUM_CPU_TIMEOUT_SECS 1000/' csrc/kernels/configs.cuh; \
elif [ "$HOPPER_SBO" = "1" ]; then \
git clone https://github.com/deepseek-ai/DeepEP.git -b antgroup-opt /workspace/DeepEP && \
cd /workspace/DeepEP && \
git checkout ${HOPPER_SBO_DEEPEP_COMMIT} && \
sed -i 's/#define NUM_CPU_TIMEOUT_SECS 100/#define NUM_CPU_TIMEOUT_SECS 1000/' csrc/kernels/configs.cuh; \
else \
curl --retry 3 --retry-delay 2 -fsSL -o /tmp/${DEEPEP_COMMIT}.zip \
https://${GITHUB_ARTIFACTORY}/deepseek-ai/DeepEP/archive/${DEEPEP_COMMIT}.zip && \
unzip -q /tmp/${DEEPEP_COMMIT}.zip -d /tmp && rm /tmp/${DEEPEP_COMMIT}.zip && \
mv /tmp/DeepEP-${DEEPEP_COMMIT} /workspace/DeepEP && \
cd /workspace/DeepEP && \
sed -i 's/#define NUM_CPU_TIMEOUT_SECS 100/#define NUM_CPU_TIMEOUT_SECS 1000/' csrc/kernels/configs.cuh; \
fi
RUN --mount=type=cache,target=/root/.cache/pip \
cd /workspace/DeepEP && \
case "$CUDA_VERSION" in \
12.6.1) CHOSEN_TORCH_CUDA_ARCH_LIST='9.0' ;; \
12.8.1) CHOSEN_TORCH_CUDA_ARCH_LIST='9.0;10.0' ;; \
12.9.1|13.0.1) CHOSEN_TORCH_CUDA_ARCH_LIST='9.0;10.0;10.3' ;; \
*) echo "Unsupported CUDA version: $CUDA_VERSION" && exit 1 ;; \
esac && \
. /opt/miniconda3/etc/profile.d/conda.sh && conda activate serve && \
TORCH_CUDA_ARCH_LIST="${CHOSEN_TORCH_CUDA_ARCH_LIST}" MAX_JOBS=${BUILD_AND_DOWNLOAD_PARALLEL} \
pip install --no-build-isolation .
# Install NCCL for serve env
RUN --mount=type=cache,target=/root/.cache/pip \
if [ "${CUDA_VERSION%%.*}" = "12" ]; then \
/opt/miniconda3/envs/serve/bin/pip install nvidia-nccl-cu12==2.28.3 --force-reinstall --no-deps ; \
elif [ "${CUDA_VERSION%%.*}" = "13" ]; then \
/opt/miniconda3/envs/serve/bin/pip install nvidia-nccl-cu13==2.28.3 --force-reinstall --no-deps ; \
fi
# Install kt-kernel in serve env with all CPU variants
RUN . /opt/miniconda3/etc/profile.d/conda.sh && conda activate serve \
&& cd /workspace/ktransformers/kt-kernel \
&& CPUINFER_BUILD_ALL_VARIANTS=1 ./install.sh build
########################################################
# Environment 2: fine-tune (LLaMA-Factory + ktransformers)
########################################################
# Install dependency libraries for ktransformers (CUDA 11.8 runtime required)
RUN conda install -n fine-tune -y -c conda-forge libstdcxx-ng gcc_impl_linux-64 \
&& conda install -n fine-tune -y -c nvidia/label/cuda-11.8.0 cuda-runtime
# Install PyTorch 2.8 in fine-tune env
RUN --mount=type=cache,target=/root/.cache/pip \
case "$CUDA_VERSION" in \
12.6.1) CUINDEX=126 ;; \
12.8.1) CUINDEX=128 ;; \
12.9.1) CUINDEX=129 ;; \
13.0.1) CUINDEX=130 ;; \
esac \
&& /opt/miniconda3/envs/fine-tune/bin/pip install --upgrade pip setuptools wheel \
&& /opt/miniconda3/envs/fine-tune/bin/pip install \
torch==2.8.0 \
torchvision \
torchaudio \
--extra-index-url https://download.pytorch.org/whl/cu${CUINDEX}
# Install LLaMA-Factory in fine-tune env
RUN --mount=type=cache,target=/root/.cache/pip \
cd /workspace/LLaMA-Factory \
&& /opt/miniconda3/envs/fine-tune/bin/pip install -e ".[torch,metrics]" --no-build-isolation
# Install ktransformers wheel in fine-tune env
RUN --mount=type=cache,target=/root/.cache/pip \
/opt/miniconda3/envs/fine-tune/bin/pip install /workspace/${KTRANSFORMERS_WHEEL}
# Install flash_attn wheel in fine-tune env
RUN --mount=type=cache,target=/root/.cache/pip \
/opt/miniconda3/envs/fine-tune/bin/pip install /workspace/${FLASH_ATTN_WHEEL}
# Install NCCL for fine-tune env
RUN --mount=type=cache,target=/root/.cache/pip \
if [ "${CUDA_VERSION%%.*}" = "12" ]; then \
/opt/miniconda3/envs/fine-tune/bin/pip install nvidia-nccl-cu12==2.28.3 --force-reinstall --no-deps ; \
elif [ "${CUDA_VERSION%%.*}" = "13" ]; then \
/opt/miniconda3/envs/fine-tune/bin/pip install nvidia-nccl-cu13==2.28.3 --force-reinstall --no-deps ; \
fi
########################################################
# Cleanup and final setup
########################################################
# Clean up downloaded wheels
RUN rm -f /workspace/${KTRANSFORMERS_WHEEL} /workspace/${FLASH_ATTN_WHEEL}
# Initialize conda for bash
RUN /opt/miniconda3/bin/conda init bash
# Create shell aliases for convenience
RUN echo '\n# Conda environment aliases\nalias serve="conda activate serve"\nalias finetune="conda activate fine-tune"' >> /root/.bashrc
########################################################
# Extract version information for image naming
########################################################
# Extract versions from each component and save to versions.env
RUN set -x && \
# SGLang version (from version.py file)
cd /workspace/sglang/python/sglang && \
SGLANG_VERSION=$(python3 -c "exec(open('version.py').read()); print(__version__)" 2>/dev/null || echo "unknown") && \
echo "SGLANG_VERSION=$SGLANG_VERSION" > /workspace/versions.env && \
echo "Extracted SGLang version: $SGLANG_VERSION" && \
\
# KTransformers version (from version.py in repo)
cd /workspace/ktransformers && \
KTRANSFORMERS_VERSION=$(python3 -c "exec(open('version.py').read()); print(__version__)" 2>/dev/null || echo "unknown") && \
echo "KTRANSFORMERS_VERSION=$KTRANSFORMERS_VERSION" >> /workspace/versions.env && \
echo "Extracted KTransformers version: $KTRANSFORMERS_VERSION" && \
\
# LLaMA-Factory version (from fine-tune environment)
. /opt/miniconda3/etc/profile.d/conda.sh && conda activate fine-tune && \
cd /workspace/LLaMA-Factory && \
LLAMAFACTORY_VERSION=$(python -c "import sys; sys.path.insert(0, 'src'); from llamafactory import __version__; print(__version__)" 2>/dev/null || echo "unknown") && \
echo "LLAMAFACTORY_VERSION=$LLAMAFACTORY_VERSION" >> /workspace/versions.env && \
echo "Extracted LLaMA-Factory version: $LLAMAFACTORY_VERSION" && \
\
# Display all versions
echo "=== Version Summary ===" && \
cat /workspace/versions.env
WORKDIR /workspace
CMD ["/bin/bash"]

docker/README-packaging.md (new file, 387 lines)

@@ -0,0 +1,387 @@
# KTransformers Docker Packaging Guide
This directory contains scripts for building and distributing KTransformers Docker images with standardized naming conventions.
## Overview
The packaging system provides:
- **Automated version detection** from sglang, ktransformers, and LLaMA-Factory
- **Multi-CPU variant support** (AMX, AVX512, AVX2) with runtime auto-detection
- **Standardized naming convention** for easy identification and management
- **Two distribution methods**:
- Local tar file export for offline distribution
- DockerHub publishing for online distribution
## Naming Convention
Docker images follow this naming pattern:
```
sglang-v{sglang_version}_ktransformers-v{ktransformers_version}_{cpu_info}_{gpu_info}_{functionality}_{timestamp}
```
### Example Names
**Tar file:**
```
sglang-v0.5.6_ktransformers-v0.4.3_x86-intel-multi_cu128_sft_llamafactory-v0.9.3_20241212143022.tar
```
**DockerHub tags:**
```
Full tag:
kvcache/ktransformers:sglang-v0.5.6_ktransformers-v0.4.3_x86-intel-multi_cu128_sft_llamafactory-v0.9.3_20241212143022
Simplified tag:
kvcache/ktransformers:v0.4.3-cu128
```
### Name Components
| Component | Description | Example |
|-----------|-------------|---------|
| sglang version | SGLang package version | `v0.5.6` |
| ktransformers version | KTransformers version | `v0.4.3` |
| cpu info | CPU instruction set support | `x86-intel-multi` (includes AMX/AVX512/AVX2) |
| gpu info | CUDA version | `cu128` (CUDA 12.8) |
| functionality | Feature mode | `sft_llamafactory-v0.9.3` or `infer` |
| timestamp | Build time (Beijing/UTC+8) | `20241212143022` |
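As a sanity check, the components can be recovered from a full name with standard shell tools. A minimal sketch, with field patterns assumed from the examples above:

```bash
# Parse an example image name back into its components.
name="sglang-v0.5.6_ktransformers-v0.4.3_x86-intel-multi_cu128_sft_llamafactory-v0.9.3_20241212143022"
sglang=$(echo "$name" | sed -E 's/^sglang-(v[0-9.]+)_.*/\1/')           # v0.5.6
kt=$(echo "$name" | sed -E 's/.*_ktransformers-(v[0-9.]+)_.*/\1/')      # v0.4.3
cpu=$(echo "$name" | grep -oE 'x86-[a-z-]+')                            # x86-intel-multi
gpu=$(echo "$name" | grep -oE 'cu[0-9]+')                               # cu128
stamp=$(echo "$name" | grep -oE '[0-9]{14}$')                           # 20241212143022
echo "$sglang $kt $cpu $gpu $stamp"
```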
## Files
| File | Purpose |
|------|---------|
| `Dockerfile` | Main Dockerfile with multi-CPU build and version extraction |
| `docker-utils.sh` | Shared utility functions for both scripts |
| `build-docker-tar.sh` | Build and export Docker image to tar file |
| `push-to-dockerhub.sh` | Build and push Docker image to DockerHub |
## Prerequisites
- Docker installed and running
- For DockerHub push: Docker Hub account and login (`docker login`)
- Sufficient disk space (at least 20GB recommended)
- Internet access (or local mirrors configured)
## Quick Start
### Build Local Tar File
```bash
cd docker
# Basic build
./build-docker-tar.sh
# With specific CUDA version and mirror
./build-docker-tar.sh \
--cuda-version 12.8.1 \
--ubuntu-mirror 1
# With proxy
./build-docker-tar.sh \
--cuda-version 12.8.1 \
--ubuntu-mirror 1 \
--http-proxy "http://127.0.0.1:16981" \
--https-proxy "http://127.0.0.1:16981" \
--output-dir /path/to/output
```
### Push to DockerHub
```bash
cd docker
# Basic push (requires --repository)
./push-to-dockerhub.sh \
--repository kvcache/ktransformers
# With simplified tag
./push-to-dockerhub.sh \
--cuda-version 12.8.1 \
--repository kvcache/ktransformers \
--also-push-simplified
# Skip build if image exists
./push-to-dockerhub.sh \
--repository kvcache/ktransformers \
--skip-build
```
## Script Options
### build-docker-tar.sh
```
Build Configuration:
--cuda-version VERSION CUDA version (default: 12.8.1)
--ubuntu-mirror 0|1 Use Tsinghua mirror (default: 0)
--http-proxy URL HTTP proxy URL
--https-proxy URL HTTPS proxy URL
--cpu-variant VARIANT CPU variant (default: x86-intel-multi)
--functionality TYPE Mode: sft or infer (default: sft)
Paths:
--dockerfile PATH Path to Dockerfile (default: ./Dockerfile)
--context-dir PATH Build context directory (default: .)
--output-dir PATH Output directory for tar (default: .)
Options:
--dry-run Preview without building
--keep-image Keep Docker image after export
--build-arg KEY=VALUE Additional build arguments
-h, --help Show help message
```
### push-to-dockerhub.sh
```
All options from build-docker-tar.sh, plus:
Registry Settings:
--registry REGISTRY Docker registry (default: docker.io)
--repository REPO Repository name (REQUIRED)
Options:
--skip-build Skip build if image exists
--also-push-simplified Also push simplified tag
--max-retries N Max push retries (default: 3)
--retry-delay SECONDS Delay between retries (default: 5)
```
## Usage Examples
### Example 1: Local Development Build
For testing on your local machine:
```bash
./build-docker-tar.sh \
--cuda-version 12.8.1 \
--output-dir ./builds \
--keep-image
```
This will:
1. Build the Docker image
2. Export to tar in `./builds/` directory
3. Keep the Docker image for local testing
### Example 2: Production Build for Distribution
For creating a production build with mirrors and proxy:
```bash
./build-docker-tar.sh \
--cuda-version 12.8.1 \
--ubuntu-mirror 1 \
--http-proxy "http://127.0.0.1:16981" \
--https-proxy "http://127.0.0.1:16981" \
--output-dir /mnt/data/releases
```
### Example 3: Publish to DockerHub
For publishing to DockerHub:
```bash
# First, login to Docker Hub
docker login
# Then push
./push-to-dockerhub.sh \
--cuda-version 12.8.1 \
--repository kvcache/ktransformers \
--also-push-simplified
```
This creates two tags:
- Full: `kvcache/ktransformers:sglang-v0.5.6_ktransformers-v0.4.3_x86-intel-multi_cu128_sft_llamafactory-v0.9.3_20241212143022`
- Simplified: `kvcache/ktransformers:v0.4.3-cu128`
### Example 4: Dry Run
Preview the build without actually building:
```bash
./build-docker-tar.sh --cuda-version 12.8.1 --dry-run
```
### Example 5: Custom Build Arguments
Pass additional Docker build arguments:
```bash
./build-docker-tar.sh \
--cuda-version 12.8.1 \
--build-arg SGL_VERSION=0.5.7 \
--build-arg FLASHINFER_VERSION=0.5.4
```
## Using the Built Images
### Load from Tar File
```bash
# Load the image
docker load -i sglang-v0.5.6_ktransformers-v0.4.3_x86-intel-multi_cu128_sft_llamafactory-v0.9.3_20241212143022.tar
# Run the container
docker run -it --rm \
--gpus all \
sglang-v0.5.6_ktransformers-v0.4.3_x86-intel-multi_cu128_sft_llamafactory-v0.9.3_20241212143022 \
/bin/bash
```
### Pull from DockerHub
```bash
# Pull with full tag
docker pull kvcache/ktransformers:sglang-v0.5.6_ktransformers-v0.4.3_x86-intel-multi_cu128_sft_llamafactory-v0.9.3_20241212143022
# Or pull with simplified tag
docker pull kvcache/ktransformers:v0.4.3-cu128
# Run the container
docker run -it --rm \
--gpus all \
kvcache/ktransformers:v0.4.3-cu128 \
/bin/bash
```
### Inside the Container
The image contains two conda environments:
```bash
# Activate serve environment (for inference with sglang)
conda activate serve
# or use the alias:
serve
# Activate fine-tune environment (for training with LLaMA-Factory)
conda activate fine-tune
# or use the alias:
finetune
```
## Multi-CPU Variant Support
The Docker image includes all three CPU variants:
- **AMX** - For Intel Sapphire Rapids and newer (4th Gen Xeon+)
- **AVX512** - For Intel Skylake-X, Ice Lake, Cascade Lake
- **AVX2** - Maximum compatibility for older CPUs
The runtime automatically detects your CPU and loads the appropriate variant. To override:
```bash
# Force use of AVX2 variant
export KT_KERNEL_CPU_VARIANT=avx2
python your_script.py
# Enable debug output to see which variant is loaded
export KT_KERNEL_DEBUG=1
python your_script.py
```
## Version Extraction
Versions are automatically extracted during Docker build from:
- **SGLang**: From `sglang.__version__` in serve environment
- **KTransformers**: From `version.py` in ktransformers repository
- **LLaMA-Factory**: From `llamafactory.__version__` in fine-tune environment
The versions are saved to `/workspace/versions.env` in the image:
```bash
# View versions in running container
cat /workspace/versions.env
# Output:
SGLANG_VERSION=0.5.6
KTRANSFORMERS_VERSION=0.4.3
LLAMAFACTORY_VERSION=0.9.3
```
## Troubleshooting
### Build Fails with Out of Disk Space
Check available disk space:
```bash
df -h
```
The build requires approximately 15-20GB of disk space. Clean up Docker:
```bash
docker system prune -a
```
### Version Extraction Fails
If version extraction fails (shows "unknown"), check:
1. The cloned repositories have the correct branches
2. Python packages are properly installed in conda environments
3. Version files exist in expected locations
You can manually verify by running:
```bash
docker run --rm <image> /bin/bash -c "
source /opt/miniconda3/etc/profile.d/conda.sh &&
conda activate serve &&
python -c 'import sglang; print(sglang.__version__)'
"
```
### Push to DockerHub Fails
1. **Check login**: `docker login`
2. **Check repository name**: Must include namespace (e.g., `kvcache/ktransformers`, not just `ktransformers`)
3. **Network issues**: Use the `--max-retries` and `--retry-delay` options (see the example below)
4. **Rate limiting**: DockerHub has pull/push rate limits for free accounts
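For transient network failures, the retry options documented above can be combined, for example:
```bash
# Retry the push up to 5 times, waiting 30 seconds between attempts
./push-to-dockerhub.sh \
  --repository kvcache/ktransformers \
  --max-retries 5 \
  --retry-delay 30
```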
## Advanced Topics
### Custom Dockerfile Location
```bash
./build-docker-tar.sh \
--dockerfile /path/to/custom/Dockerfile \
--context-dir /path/to/build/context
```
### Building Only Inference Image (Future)
Currently, the image always includes both serve and fine-tune environments. To create an inference-only image, modify the Dockerfile to skip the fine-tune environment section.
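In the meantime, `build-docker-tar.sh` already accepts `--functionality infer`; per the current script this flag only affects the generated tag (it is not passed to the Docker build), so the resulting image still contains both environments:
```bash
# Tags the build as "infer" in the exported filename; image contents are unchanged for now
./build-docker-tar.sh \
  --cuda-version 12.8.1 \
  --functionality infer
```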
### Customizing CPU Variants
To build only specific CPU variants, modify `kt-kernel/install.sh` or set environment variables in the Dockerfile.
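As a sketch (the exact Dockerfile layout is an assumption), a single variant can be selected by exporting the same variables that the multi-variant build in `setup.py` sets per variant, instead of enabling all three:
```bash
# Sketch: build only the AVX512 variant instead of all three.
# CPUINFER_CPU_INSTRUCT / CPUINFER_ENABLE_AMX are the per-variant variables
# used by setup.py's multi-variant build; leaving CPUINFER_BUILD_ALL_VARIANTS
# unset (or 0) keeps the original single-variant behavior.
cd kt-kernel
export CPUINFER_BUILD_ALL_VARIANTS=0
export CPUINFER_CPU_INSTRUCT=AVX512
export CPUINFER_ENABLE_AMX=OFF
pip install . -v
```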
### CI/CD Integration
The scripts are designed for manual execution but can be integrated into CI/CD pipelines:
```yaml
# Example GitHub Actions workflow
- name: Build and push Docker image
run: |
cd docker
./push-to-dockerhub.sh \
--cuda-version ${{ matrix.cuda_version }} \
--repository ${{ secrets.DOCKER_REPOSITORY }} \
--also-push-simplified
```
## Support
For issues and questions:
- File an issue at: https://github.com/kvcache-ai/ktransformers/issues
- Check documentation: https://github.com/kvcache-ai/ktransformers
## License
This packaging system is part of KTransformers and follows the same license.

498
docker/build-docker-tar.sh Executable file
View File

@@ -0,0 +1,498 @@
#!/usr/bin/env bash
#
# build-docker-tar.sh - Build Docker image and export to tar file
#
# This script builds a Docker image for ktransformers with standardized naming
# and exports it to a tar file for distribution.
#
# Features:
# - Automatic version detection from built image
# - Standardized naming convention
# - Multi-CPU variant support (AMX/AVX512/AVX2)
# - Configurable build parameters
# - Comprehensive error handling
#
# Usage:
# ./build-docker-tar.sh [OPTIONS]
#
# Example:
# ./build-docker-tar.sh \
# --cuda-version 12.8.1 \
# --ubuntu-mirror 1 \
# --http-proxy "http://127.0.0.1:16981" \
# --output-dir /path/to/output
set -euo pipefail
# Get script directory
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Source utility functions
# shellcheck source=docker-utils.sh
source "$SCRIPT_DIR/docker-utils.sh"
################################################################################
# Default Configuration
################################################################################
# Build parameters
CUDA_VERSION="12.8.1"
UBUNTU_MIRROR="0"
HTTP_PROXY=""
HTTPS_PROXY=""
CPU_VARIANT="x86-intel-multi"
FUNCTIONALITY="sft"
# Paths
DOCKERFILE="$SCRIPT_DIR/Dockerfile"
CONTEXT_DIR="$SCRIPT_DIR"
OUTPUT_DIR="."
# Options
DRY_RUN=false
KEEP_IMAGE=false
EXTRA_BUILD_ARGS=()
################################################################################
# Help Message
################################################################################
usage() {
cat <<EOF
Usage: $0 [OPTIONS]
Build Docker image and export to tar file with standardized naming.
OPTIONS:
Build Configuration:
--cuda-version VERSION CUDA version (default: 12.8.1)
Examples: 12.8.1, 12.6.1, 13.0.1
--ubuntu-mirror 0|1 Use Tsinghua mirror for Ubuntu packages
(default: 0)
--http-proxy URL HTTP proxy URL
Example: http://127.0.0.1:16981
--https-proxy URL HTTPS proxy URL
Example: http://127.0.0.1:16981
--cpu-variant VARIANT CPU variant identifier
(default: x86-intel-multi)
--functionality TYPE Functionality mode: sft or infer
(default: sft, includes LLaMA-Factory)
Paths:
--dockerfile PATH Path to Dockerfile
(default: ./Dockerfile)
--context-dir PATH Docker build context directory
(default: .)
--output-dir PATH Output directory for tar file
(default: current directory)
Options:
--dry-run Preview build command without executing
--keep-image Keep Docker image after exporting tar
--build-arg KEY=VALUE Additional build arguments (can be repeated)
-h, --help Show this help message
EXAMPLES:
# Basic build with default settings
$0
# Build with CUDA 12.8.1 and mirror
$0 --cuda-version 12.8.1 --ubuntu-mirror 1
# Build with proxy and custom output directory
$0 \\
--cuda-version 12.8.1 \\
--http-proxy "http://127.0.0.1:16981" \\
--https-proxy "http://127.0.0.1:16981" \\
--output-dir /mnt/data/docker-images
# Dry run to preview
$0 --cuda-version 12.8.1 --dry-run
OUTPUT:
The tar file will be named following the convention:
sglang-v{ver}_ktransformers-v{ver}_{cpu}_{gpu}_{func}_{timestamp}.tar
Example: sglang-v0.5.6_ktransformers-v0.4.3_x86-intel-multi_cu128_sft_llamafactory-v0.9.3_20241212143022.tar
EOF
exit 0
}
################################################################################
# Argument Parsing
################################################################################
parse_args() {
while [[ $# -gt 0 ]]; do
case "$1" in
--cuda-version)
CUDA_VERSION="$2"
shift 2
;;
--ubuntu-mirror)
UBUNTU_MIRROR="$2"
shift 2
;;
--http-proxy)
HTTP_PROXY="$2"
shift 2
;;
--https-proxy)
HTTPS_PROXY="$2"
shift 2
;;
--cpu-variant)
CPU_VARIANT="$2"
shift 2
;;
--functionality)
FUNCTIONALITY="$2"
shift 2
;;
--dockerfile)
DOCKERFILE="$2"
shift 2
;;
--context-dir)
CONTEXT_DIR="$2"
shift 2
;;
--output-dir)
OUTPUT_DIR="$2"
shift 2
;;
--dry-run)
DRY_RUN=true
shift
;;
--keep-image)
KEEP_IMAGE=true
shift
;;
--build-arg)
EXTRA_BUILD_ARGS+=("--build-arg" "$2")
shift 2
;;
-h|--help)
usage
;;
*)
log_error "Unknown option: $1"
echo "Use -h or --help for usage information"
exit 1
;;
esac
done
}
################################################################################
# Validation
################################################################################
validate_config() {
log_step "Validating configuration"
# Check Docker is running
check_docker_running || exit 1
# Validate CUDA version
validate_cuda_version "$CUDA_VERSION" || exit 1
# Check Dockerfile exists
if [ ! -f "$DOCKERFILE" ]; then
log_error "Dockerfile not found: $DOCKERFILE"
exit 1
fi
log_info "Using Dockerfile: $DOCKERFILE"
# Check context directory exists
if [ ! -d "$CONTEXT_DIR" ]; then
log_error "Context directory not found: $CONTEXT_DIR"
exit 1
fi
log_info "Using context directory: $CONTEXT_DIR"
# Create output directory if it doesn't exist
if [ ! -d "$OUTPUT_DIR" ]; then
log_info "Creating output directory: $OUTPUT_DIR"
mkdir -p "$OUTPUT_DIR"
fi
# Check output directory is writable
check_writable "$OUTPUT_DIR" || exit 1
log_info "Output directory: $OUTPUT_DIR"
# Check disk space (recommend at least 20GB free)
check_disk_space 20 "$OUTPUT_DIR" || {
log_warning "Continuing despite low disk space warning..."
}
# Validate functionality mode
if [[ "$FUNCTIONALITY" != "sft" && "$FUNCTIONALITY" != "infer" ]]; then
log_error "Invalid functionality mode: $FUNCTIONALITY"
log_error "Must be 'sft' or 'infer'"
exit 1
fi
log_success "Configuration validated"
}
################################################################################
# Build Docker Image
################################################################################
build_image() {
local temp_tag="ktransformers:temp-build-$(get_beijing_timestamp)"
log_step "Building Docker image" >&2
log_info "Temporary tag: $temp_tag" >&2
# Prepare build arguments
local build_args=()
build_args+=("--build-arg" "CUDA_VERSION=$CUDA_VERSION")
build_args+=("--build-arg" "UBUNTU_MIRROR=$UBUNTU_MIRROR")
build_args+=("--build-arg" "CPU_VARIANT=$CPU_VARIANT")
build_args+=("--build-arg" "BUILD_ALL_CPU_VARIANTS=1")
# Add proxy settings if provided
if [ -n "$HTTP_PROXY" ]; then
build_args+=("--build-arg" "HTTP_PROXY=$HTTP_PROXY")
fi
if [ -n "$HTTPS_PROXY" ]; then
build_args+=("--build-arg" "HTTPS_PROXY=$HTTPS_PROXY")
fi
    # Add extra build args (guarded so an empty array does not trip `set -u` on older bash)
    if [ ${#EXTRA_BUILD_ARGS[@]} -gt 0 ]; then
        build_args+=("${EXTRA_BUILD_ARGS[@]}")
    fi
# Add network host
build_args+=("--network" "host")
# Build command
local build_cmd=(
docker build
-f "$DOCKERFILE"
"${build_args[@]}"
-t "$temp_tag"
"$CONTEXT_DIR"
)
# Display build command
{
log_info "Build command:"
printf ' %s \\\n' "${build_cmd[@]:0:${#build_cmd[@]}-1}"
printf ' %s\n' "${build_cmd[-1]}"
} >&2
if [ "$DRY_RUN" = true ]; then
log_warning "DRY RUN: Skipping actual build" >&2
echo "$temp_tag"
return 0
fi
# Execute build
log_info "Starting Docker build (this may take 30-60 minutes)..." >&2
if "${build_cmd[@]}" >&2; then
log_success "Docker image built successfully" >&2
echo "$temp_tag"
else
log_error "Docker build failed" >&2
exit 1
fi
}
################################################################################
# Extract Versions and Generate Name
################################################################################
generate_tar_name() {
local image_tag="$1"
local timestamp="$2"
if [ "$DRY_RUN" = true ]; then
log_warning "DRY RUN: Using placeholder versions"
# Use placeholder versions for dry run
local versions="SGLANG_VERSION=0.5.6
KTRANSFORMERS_VERSION=0.4.3
LLAMAFACTORY_VERSION=0.9.3"
else
# Extract versions from image
local versions
versions=$(extract_versions_from_image "$image_tag")
if [ $? -ne 0 ]; then
log_error "Failed to extract versions from image"
exit 1
fi
# Validate versions
if ! validate_versions "$versions"; then
log_error "Version validation failed"
exit 1
fi
fi
# Generate standardized image name
local tar_name
tar_name=$(generate_image_name "$versions" "$CUDA_VERSION" "$CPU_VARIANT" "$FUNCTIONALITY" "$timestamp")
if [ -z "$tar_name" ]; then
log_error "Failed to generate image name"
exit 1
fi
echo "$tar_name"
}
################################################################################
# Export to Tar
################################################################################
export_to_tar() {
local image_tag="$1"
local tar_name="$2"
local tar_path="$OUTPUT_DIR/${tar_name}.tar"
log_step "Exporting image to tar file" >&2
log_info "Output: $tar_path" >&2
if [ "$DRY_RUN" = true ]; then
log_warning "DRY RUN: Skipping actual export" >&2
return 0
fi
# Check if tar file already exists
if [ -f "$tar_path" ]; then
log_warning "Tar file already exists: $tar_path" >&2
read -p "Overwrite? (y/N) " -n 1 -r
        echo >&2
if [[ ! $REPLY =~ ^[Yy]$ ]]; then
log_error "Export cancelled by user" >&2
exit 1
fi
rm -f "$tar_path"
fi
# Tag image with the standardized name before saving
log_info "Tagging image with standardized name: $tar_name" >&2
if ! docker tag "$image_tag" "$tar_name"; then
log_error "Failed to tag image" >&2
exit 1
fi
# Export image with the standardized tag
log_info "Exporting image (this may take several minutes)..." >&2
if docker save -o "$tar_path" "$tar_name"; then
log_success "Image exported successfully" >&2
# Get file size
local size
size=$(du -h "$tar_path" | cut -f1)
log_info "Tar file size: $size" >&2
else
log_error "Failed to export image" >&2
exit 1
fi
echo "$tar_path"
}
################################################################################
# Cleanup
################################################################################
cleanup() {
local image_tag="$1"
if [ "$KEEP_IMAGE" = true ]; then
log_info "Keeping Docker image as requested: $image_tag"
else
cleanup_temp_images "$image_tag"
fi
}
################################################################################
# Main
################################################################################
main() {
log_step "KTransformers Docker Image Build and Export"
# Parse arguments
parse_args "$@"
# Validate configuration
validate_config
# Generate timestamp
TIMESTAMP=$(get_beijing_timestamp)
log_info "Build timestamp: $TIMESTAMP"
# Display configuration
display_summary "Build Configuration" \
"CUDA Version: $CUDA_VERSION" \
"Ubuntu Mirror: $UBUNTU_MIRROR" \
"CPU Variant: $CPU_VARIANT" \
"Functionality: $FUNCTIONALITY" \
"HTTP Proxy: ${HTTP_PROXY:-<not set>}" \
"HTTPS Proxy: ${HTTPS_PROXY:-<not set>}" \
"Dockerfile: $DOCKERFILE" \
"Context Dir: $CONTEXT_DIR" \
"Output Dir: $OUTPUT_DIR" \
"Timestamp: $TIMESTAMP" \
"Dry Run: $DRY_RUN"
# Build image
TEMP_TAG=$(build_image)
# Generate tar name
TAR_NAME=$(generate_tar_name "$TEMP_TAG" "$TIMESTAMP")
log_info "Generated tar name: $TAR_NAME.tar"
if [ "$DRY_RUN" = true ]; then
# Display dry-run summary
display_summary "DRY RUN Preview" \
"This is what would be built:" \
"" \
"Temporary Docker tag: $TEMP_TAG" \
"Tar filename: $TAR_NAME.tar" \
"Output path: $OUTPUT_DIR/$TAR_NAME.tar" \
"" \
"After build, you would run:" \
" docker load -i $OUTPUT_DIR/$TAR_NAME.tar" \
" docker run -it --rm ${TAR_NAME} /bin/bash"
log_success "DRY RUN: Preview complete. Remove --dry-run to build."
exit 0
fi
# Export to tar
TAR_PATH=$(export_to_tar "$TEMP_TAG" "$TAR_NAME")
# Cleanup
cleanup "$TEMP_TAG"
# Display summary
display_summary "Build Complete" \
"Docker Image: $TEMP_TAG ($([ "$KEEP_IMAGE" = true ] && echo "kept" || echo "removed"))" \
"Tar File: $TAR_PATH" \
"" \
"To load the image:" \
" docker load -i $TAR_PATH" \
"" \
"To run the container:" \
" docker run -it --rm ${TAR_NAME} /bin/bash"
log_success "All done!"
}
# Run main function
main "$@"

372
docker/docker-utils.sh Executable file
View File

@@ -0,0 +1,372 @@
#!/usr/bin/env bash
#
# docker-utils.sh - Shared utility functions for Docker image build and publish scripts
#
# This script provides common functions for:
# - Timestamp generation (Beijing timezone)
# - Version extraction from Docker images
# - Image name generation following naming conventions
# - Colored logging
# - Validation and error handling
#
# Usage: source docker-utils.sh
set -euo pipefail
# Color codes for logging
COLOR_RED='\033[0;31m'
COLOR_GREEN='\033[0;32m'
COLOR_YELLOW='\033[1;33m'
COLOR_BLUE='\033[0;34m'
COLOR_CYAN='\033[0;36m'
COLOR_RESET='\033[0m'
################################################################################
# Logging Functions
################################################################################
log_info() {
echo -e "${COLOR_BLUE}[INFO]${COLOR_RESET} $*"
}
log_success() {
echo -e "${COLOR_GREEN}[SUCCESS]${COLOR_RESET} $*"
}
log_warning() {
echo -e "${COLOR_YELLOW}[WARNING]${COLOR_RESET} $*"
}
log_error() {
echo -e "${COLOR_RED}[ERROR]${COLOR_RESET} $*" >&2
}
log_step() {
echo -e "\n${COLOR_CYAN}==>${COLOR_RESET} $*"
}
################################################################################
# Timestamp Functions
################################################################################
# Generate timestamp in Beijing timezone (UTC+8)
# Format: YYYYMMDDHHMMSS
# Example: 20241212143022
get_beijing_timestamp() {
    # Setting TZ works for both GNU date (Linux) and BSD date (macOS)
    TZ='Asia/Shanghai' date '+%Y%m%d%H%M%S'
}
################################################################################
# CUDA Version Parsing
################################################################################
# Parse CUDA version to short format
# Input: 12.8.1 or 12.8 or 13.0.1
# Output: cu128 or cu130
parse_cuda_short_version() {
local cuda_version="$1"
# Extract major and minor version
local major minor
major=$(echo "$cuda_version" | cut -d. -f1)
minor=$(echo "$cuda_version" | cut -d. -f2)
# Validate
if [[ ! "$major" =~ ^[0-9]+$ ]] || [[ ! "$minor" =~ ^[0-9]+$ ]]; then
log_error "Invalid CUDA version format: $cuda_version"
log_error "Expected format: X.Y.Z (e.g., 12.8.1)"
return 1
fi
echo "cu${major}${minor}"
}
################################################################################
# Version Extraction
################################################################################
# Extract versions from built Docker image
# Input: image tag (e.g., ktransformers:temp-build-20241212)
# Output: Prints the extracted versions to stdout, one KEY=value per line:
# SGLANG_VERSION=x.y.z
# KTRANSFORMERS_VERSION=x.y.z
# LLAMAFACTORY_VERSION=x.y.z
extract_versions_from_image() {
local image_tag="$1"
log_step "Extracting versions from image: $image_tag"
# Check if image exists
if ! docker image inspect "$image_tag" &>/dev/null; then
log_error "Image not found: $image_tag"
return 1
fi
# Extract versions.env file from the image
local versions_content
versions_content=$(docker run --rm "$image_tag" cat /workspace/versions.env 2>/dev/null)
if [ -z "$versions_content" ]; then
log_error "Failed to extract versions from image"
log_error "The /workspace/versions.env file may not exist in the image"
return 1
fi
# Parse and display versions
log_info "Extracted versions:"
echo "$versions_content" | while IFS= read -r line; do
log_info " $line"
done
# Output the content (caller can parse this or eval it)
echo "$versions_content"
}
# Validate that all required versions were extracted
# Input: versions string (output from extract_versions_from_image)
validate_versions() {
local versions="$1"
local all_valid=true
# Check each required version
for var in SGLANG_VERSION KTRANSFORMERS_VERSION LLAMAFACTORY_VERSION; do
local value
value=$(echo "$versions" | grep "^${var}=" | cut -d= -f2)
if [ -z "$value" ]; then
log_error "Missing version: $var"
all_valid=false
elif [ "$value" = "unknown" ]; then
log_warning "Version is 'unknown': $var"
# Don't fail, but warn user
fi
done
if [ "$all_valid" = false ]; then
return 1
fi
return 0
}
################################################################################
# Image Naming
################################################################################
# Generate standardized image name
# Input:
# $1: versions string (from extract_versions_from_image)
# $2: cuda_version (e.g., 12.8.1)
# $3: cpu_variant (e.g., x86-intel-multi)
# $4: functionality (e.g., sft_llamafactory or infer)
# $5: timestamp (optional, will generate if not provided)
# Output: Standardized image name
# Format: sglang-v{ver}_ktransformers-v{ver}_{cpu}_{gpu}_{func}_{timestamp}
generate_image_name() {
local versions="$1"
local cuda_version="$2"
local cpu_variant="$3"
local functionality="$4"
local timestamp="${5:-$(get_beijing_timestamp)}"
# Parse versions from the versions string
local sglang_ver ktrans_ver llama_ver
sglang_ver=$(echo "$versions" | grep "^SGLANG_VERSION=" | cut -d= -f2)
ktrans_ver=$(echo "$versions" | grep "^KTRANSFORMERS_VERSION=" | cut -d= -f2)
llama_ver=$(echo "$versions" | grep "^LLAMAFACTORY_VERSION=" | cut -d= -f2)
# Validate versions were extracted
if [ -z "$sglang_ver" ] || [ -z "$ktrans_ver" ] || [ -z "$llama_ver" ]; then
log_error "Failed to parse versions from input"
return 1
fi
# Parse CUDA short version
local cuda_short
cuda_short=$(parse_cuda_short_version "$cuda_version")
# Build functionality string
local func_str
if [ "$functionality" = "sft" ]; then
func_str="sft_llamafactory-v${llama_ver}"
else
func_str="infer"
fi
# Generate full image name
# Format: sglang-v{ver}_ktransformers-v{ver}_{cpu}_{gpu}_{func}_{timestamp}
local image_name
image_name="sglang-v${sglang_ver}_ktransformers-v${ktrans_ver}_${cpu_variant}_${cuda_short}_${func_str}_${timestamp}"
echo "$image_name"
}
# Generate simplified tag for DockerHub
# Input:
# $1: ktransformers_version (e.g., 0.4.3)
# $2: cuda_version (e.g., 12.8.1)
# Output: Simplified tag (e.g., v0.4.3-cu128)
generate_simplified_tag() {
local ktrans_ver="$1"
local cuda_version="$2"
local cuda_short
cuda_short=$(parse_cuda_short_version "$cuda_version")
echo "v${ktrans_ver}-${cuda_short}"
}
################################################################################
# Validation Functions
################################################################################
# Check if Docker daemon is running
check_docker_running() {
if ! docker info &>/dev/null; then
log_error "Docker daemon is not running"
log_error "Please start Docker and try again"
return 1
fi
return 0
}
# Check if user is logged into Docker registry
# Input: registry (optional, default: docker.io)
check_docker_login() {
local registry="${1:-docker.io}"
# Best-effort check: confirm the Docker CLI is available (actual auth is only verified at push time)
if ! docker login --help &>/dev/null; then
log_error "Docker CLI is not available"
return 1
fi
# Note: This is a best-effort check
# docker login status is not always easy to check programmatically
log_info "Assuming Docker login is configured"
log_info "If push fails, please run: docker login $registry"
return 0
}
# Validate CUDA version format
validate_cuda_version() {
local cuda_version="$1"
if [[ ! "$cuda_version" =~ ^[0-9]+\.[0-9]+(\.[0-9]+)?$ ]]; then
log_error "Invalid CUDA version format: $cuda_version"
log_error "Expected format: X.Y or X.Y.Z (e.g., 12.8 or 12.8.1)"
return 1
fi
return 0
}
# Check available disk space
# Input: required space in GB
check_disk_space() {
local required_gb="$1"
local output_dir="${2:-.}"
# Get available space in GB (works on Linux and macOS)
local available_kb
if df -k "$output_dir" &>/dev/null; then
available_kb=$(df -k "$output_dir" | tail -1 | awk '{print $4}')
local available_gb=$((available_kb / 1024 / 1024))
log_info "Available disk space: ${available_gb}GB"
if [ "$available_gb" -lt "$required_gb" ]; then
log_warning "Low disk space: ${available_gb}GB available, ${required_gb}GB recommended"
return 1
fi
else
log_warning "Unable to check disk space"
fi
return 0
}
# Check if file/directory exists and is writable
check_writable() {
local path="$1"
if [ -e "$path" ]; then
if [ ! -w "$path" ]; then
log_error "Path exists but is not writable: $path"
return 1
fi
else
# Try to create parent directory to test writability
local parent_dir
parent_dir=$(dirname "$path")
if [ ! -w "$parent_dir" ]; then
log_error "Parent directory is not writable: $parent_dir"
return 1
fi
fi
return 0
}
################################################################################
# Cleanup Functions
################################################################################
# Remove intermediate Docker images
cleanup_temp_images() {
local image_tag="$1"
log_step "Cleaning up temporary image: $image_tag"
if docker image inspect "$image_tag" &>/dev/null; then
docker rmi "$image_tag" &>/dev/null || true
log_success "Cleaned up temporary image"
fi
}
################################################################################
# Display Functions
################################################################################
# Display a summary box
display_summary() {
local title="$1"
shift
local lines=("$@")
local width=80
local border=$(printf '=%.0s' $(seq 1 $width))
echo ""
echo "$border"
echo " $title"
echo "$border"
for line in "${lines[@]}"; do
echo " $line"
done
echo "$border"
echo ""
}
################################################################################
# Export functions
################################################################################
# Export all functions so they can be used by scripts that source this file
export -f log_info log_success log_warning log_error log_step
export -f get_beijing_timestamp
export -f parse_cuda_short_version
export -f extract_versions_from_image validate_versions
export -f generate_image_name generate_simplified_tag
export -f check_docker_running check_docker_login validate_cuda_version
export -f check_disk_space check_writable
export -f cleanup_temp_images
export -f display_summary

1142
docker/push-to-dockerhub.sh Executable file

File diff suppressed because it is too large

View File

@@ -28,7 +28,7 @@ option(KTRANSFORMERS_CPU_MOE_AMD "ktransformers: CPU use moe kernel for amd" OFF
# LTO control
option(CPUINFER_ENABLE_LTO "Enable link time optimization (IPO)" OFF)
project(kt_kernel_ext VERSION 0.1.0)
project(kt_kernel_ext VERSION 0.4.2)
# Choose compilers BEFORE project() so CMake honors them
if(USE_CONDA_TOOLCHAIN)
if(NOT DEFINED ENV{CONDA_PREFIX} OR NOT EXISTS "$ENV{CONDA_PREFIX}")

37
kt-kernel/MANIFEST.in Normal file
View File

@@ -0,0 +1,37 @@
# MANIFEST.in for kt-kernel
# Ensures source distribution includes all necessary files for building from source
# Core build files
include CMakeLists.txt
include CMakePresets.json
include setup.py
include pyproject.toml
include requirements.txt
include README.md
include LICENSE
# CMake modules and configuration
recursive-include cmake *.cmake *.in
# C++ source files
recursive-include cpu_backend *.h *.hpp *.cpp *.c *.cc
recursive-include operators *.h *.hpp *.cpp *.c *.cc
include ext_bindings.cpp
# Python package
recursive-include python *.py
# Third-party dependencies (vendored)
recursive-include third_party *
# Exclude compiled and cache files
global-exclude *.pyc
global-exclude *.pyo
global-exclude __pycache__
global-exclude .git*
global-exclude *.so
global-exclude *.o
global-exclude *.a
global-exclude build
global-exclude dist
global-exclude *.egg-info

View File

@@ -47,14 +47,75 @@ High-performance kernel operations for KTransformers, featuring CPU-optimized Mo
## Installation
### Prerequisites
### Option 1: Install from PyPI (Recommended for Most Users)
Choose the version matching your CUDA installation:
```bash
# For CUDA 11.8
pip install kt-kernel==0.4.2.cu118
# For CUDA 12.1
pip install kt-kernel==0.4.2.cu121
# For CUDA 12.4
pip install kt-kernel==0.4.2.cu124
# For CUDA 12.6
pip install kt-kernel==0.4.2.cu126
```
> **Note**: Replace `0.4.2` with the [latest version](https://pypi.org/project/kt-kernel/#history) if available.
**Features:**
- **Automatic CPU detection**: Detects your CPU and loads the optimal kernel variant
- **Multi-variant wheel**: Includes AMX, AVX512, and AVX2 variants in a single package
- **No compilation needed**: Pre-built wheels for Python 3.10, 3.11, 3.12
- **Multiple CUDA versions**: Choose the version matching your environment
**Requirements:**
- CUDA 11.8+ or 12.x runtime (must match the package version you install)
- PyTorch 2.0+ (install separately, must match CUDA version)
- Linux x86-64
**CPU Variants Included:**
| Variant | CPU Support | Use Case |
|---------|-------------|----------|
| **AMX** | Intel Sapphire Rapids+ | Best performance on latest Intel CPUs |
| **AVX512** | Intel Skylake-X/Ice Lake/Cascade Lake | AVX512-capable CPUs without AMX |
| **AVX2** | Intel Haswell+, AMD Zen+ | Maximum compatibility |
**Check which variant is loaded:**
```python
import kt_kernel
print(f"CPU variant: {kt_kernel.__cpu_variant__}") # 'amx', 'avx512', or 'avx2'
print(f"Version: {kt_kernel.__version__}")
```
**Environment Variables:**
```bash
# Override automatic CPU detection
export KT_KERNEL_CPU_VARIANT=avx2 # or 'avx512', 'amx'
# Enable debug output
export KT_KERNEL_DEBUG=1
python -c "import kt_kernel"
```
---
### Option 2: Install from Source (For AMD, ARM, or Custom Builds)
If you need AMD (BLIS), ARM (KML), or custom CUDA versions, build from source:
#### Prerequisites
First, initialize git submodules:
```bash
git submodule update --init --recursive
```
### Quick Installation (Recommended)
#### Quick Installation
Step 0: Create and activate a conda environment (recommended):
@@ -65,7 +126,7 @@ conda activate kt-kernel
You can now install in two clear steps using the same script.
Option A: Two-step (specify dependencies installation and build separately)
**Option A: Two-step** (specify dependencies installation and build separately)
```bash
# 1) Install system prerequisites (cmake, hwloc, pkg-config)
@@ -76,7 +137,7 @@ Option A: Two-step (specify dependencies installation and build separately)
./install.sh build
```
Option B: One-step
**Option B: One-step**
```bash
./install.sh

View File

@@ -161,6 +161,34 @@ build_step() {
echo "Skipping clean of $REPO_ROOT/build (requested by --no-clean)"
fi
# Check for multi-variant build mode (Docker environment)
if [ "${CPUINFER_BUILD_ALL_VARIANTS:-0}" = "1" ]; then
echo "=========================================="
echo "Building ALL CPU variants (AMX/AVX512/AVX2)"
echo "=========================================="
echo ""
echo "This will build three variants in a single wheel:"
echo " - AMX variant (Intel Sapphire Rapids+)"
echo " - AVX512 variant (Intel Skylake-X/Ice Lake+)"
echo " - AVX2 variant (maximum compatibility)"
echo ""
echo "Runtime CPU detection will automatically select the best variant."
echo ""
export CPUINFER_FORCE_REBUILD=1
export CPUINFER_BUILD_TYPE=${CPUINFER_BUILD_TYPE:-Release}
export CPUINFER_PARALLEL=${CPUINFER_PARALLEL:-8}
echo "Building with:"
echo " CPUINFER_BUILD_ALL_VARIANTS=1"
echo " CPUINFER_BUILD_TYPE=$CPUINFER_BUILD_TYPE"
echo " CPUINFER_PARALLEL=$CPUINFER_PARALLEL"
echo ""
pip install . -v
return 0
fi
if [ "$MANUAL_MODE" = "0" ]; then
# Auto-detection mode
echo "=========================================="

View File

@@ -5,7 +5,8 @@ build-backend = "setuptools.build_meta"
[project]
name = "kt-kernel"
version = "0.1.0"
# Version is dynamically read from ../version.py via setup.py
dynamic = ["version"]
description = "KT-Kernel: High-performance kernel operations for KTransformers (AMX/AVX/KML optimizations)"
readme = "README.md"
authors = [{ name = "kvcache-ai" }]

View File

@@ -5,6 +5,9 @@
KT-Kernel provides high-performance kernel operations for KTransformers,
including CPU-optimized MoE inference with AMX, AVX, and KML support.
The package automatically detects your CPU capabilities and loads the optimal
kernel variant (AMX, AVX512, or AVX2) at runtime.
Example usage:
>>> from kt_kernel import KTMoEWrapper
>>> wrapper = KTMoEWrapper(
@@ -20,11 +23,41 @@ Example usage:
... chunked_prefill_size=512,
... method="AMXINT4"
... )
Check which CPU variant is loaded:
>>> import kt_kernel
>>> print(kt_kernel.__cpu_variant__) # 'amx', 'avx512', or 'avx2'
Environment Variables:
KT_KERNEL_CPU_VARIANT: Override automatic detection ('amx', 'avx512', 'avx2')
KT_KERNEL_DEBUG: Enable debug output ('1' to enable)
"""
from __future__ import annotations
# Detect CPU and load optimal extension variant
from ._cpu_detect import initialize as _initialize_cpu
_kt_kernel_ext, __cpu_variant__ = _initialize_cpu()
# Make the extension module available to other modules in this package
import sys
sys.modules['kt_kernel_ext'] = _kt_kernel_ext
# Also expose kt_kernel_ext as an attribute for backward compatibility
kt_kernel_ext = _kt_kernel_ext
# Import main API
from .experts import KTMoEWrapper
__version__ = "0.1.0"
__all__ = ["KTMoEWrapper"]
# Read version from project root version.py
import os
_root_version_file = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), 'version.py')
if os.path.exists(_root_version_file):
_version_ns = {}
with open(_root_version_file, 'r', encoding='utf-8') as f:
exec(f.read(), _version_ns)
__version__ = _version_ns.get('__version__', '0.4.2')
else:
__version__ = "0.4.2"
__all__ = ["KTMoEWrapper", "kt_kernel_ext", "__cpu_variant__", "__version__"]

View File

@@ -0,0 +1,233 @@
"""
CPU feature detection and optimal kernel loader for kt-kernel.
This module automatically detects CPU capabilities and loads the best available
kernel variant (AMX, AVX512, or AVX2) at runtime.
Environment Variables:
KT_KERNEL_CPU_VARIANT: Override automatic detection ('amx', 'avx512', 'avx2')
KT_KERNEL_DEBUG: Enable debug output ('1' to enable)
Example:
>>> import kt_kernel
>>> print(kt_kernel.__cpu_variant__) # Shows detected variant
# Override detection
>>> import os
>>> os.environ['KT_KERNEL_CPU_VARIANT'] = 'avx2'
>>> import kt_kernel # Will use AVX2 variant
"""
import os
import sys
from pathlib import Path
def detect_cpu_features():
"""
Detect CPU features to determine the best kernel variant.
Detection hierarchy:
1. AMX: Intel Sapphire Rapids+ with AMX support
2. AVX512: CPUs with AVX512F support
3. AVX2: Fallback for maximum compatibility
Returns:
str: 'amx', 'avx512', or 'avx2'
"""
# Check environment override
variant = os.environ.get('KT_KERNEL_CPU_VARIANT', '').lower()
if variant in ['amx', 'avx512', 'avx2']:
if os.environ.get('KT_KERNEL_DEBUG') == '1':
print(f"[kt-kernel] Using environment override: {variant}")
return variant
# Try to read /proc/cpuinfo on Linux
try:
with open('/proc/cpuinfo', 'r') as f:
cpuinfo = f.read().lower()
# Check for AMX support (Intel Sapphire Rapids+)
# AMX requires amx_tile, amx_int8, and amx_bf16
amx_flags = ['amx_tile', 'amx_int8', 'amx_bf16']
has_amx = all(flag in cpuinfo for flag in amx_flags)
if has_amx:
if os.environ.get('KT_KERNEL_DEBUG') == '1':
print("[kt-kernel] Detected AMX support via /proc/cpuinfo")
return 'amx'
# Check for AVX512 support
# AVX512F is the foundation for all AVX512 variants
if 'avx512f' in cpuinfo:
if os.environ.get('KT_KERNEL_DEBUG') == '1':
print("[kt-kernel] Detected AVX512 support via /proc/cpuinfo")
return 'avx512'
# Check for AVX2 support
if 'avx2' in cpuinfo:
if os.environ.get('KT_KERNEL_DEBUG') == '1':
print("[kt-kernel] Detected AVX2 support via /proc/cpuinfo")
return 'avx2'
# Fallback to AVX2 (should be rare on modern CPUs)
if os.environ.get('KT_KERNEL_DEBUG') == '1':
print("[kt-kernel] No AVX2/AVX512/AMX detected, using AVX2 fallback")
return 'avx2'
except FileNotFoundError:
# /proc/cpuinfo doesn't exist (not Linux or in container)
# Try cpufeature package as fallback
if os.environ.get('KT_KERNEL_DEBUG') == '1':
print("[kt-kernel] /proc/cpuinfo not found, trying cpufeature package")
try:
import cpufeature
# Check for AMX
if cpufeature.CPUFeature.get('AMX_TILE', False):
if os.environ.get('KT_KERNEL_DEBUG') == '1':
print("[kt-kernel] Detected AMX support via cpufeature")
return 'amx'
# Check for AVX512
if cpufeature.CPUFeature.get('AVX512F', False):
if os.environ.get('KT_KERNEL_DEBUG') == '1':
print("[kt-kernel] Detected AVX512 support via cpufeature")
return 'avx512'
# Fallback to AVX2
if os.environ.get('KT_KERNEL_DEBUG') == '1':
print("[kt-kernel] Using AVX2 fallback via cpufeature")
return 'avx2'
except ImportError:
# cpufeature not available - ultimate fallback
if os.environ.get('KT_KERNEL_DEBUG') == '1':
print("[kt-kernel] cpufeature not available, using AVX2 fallback")
return 'avx2'
except Exception as e:
# Any other error - safe fallback
if os.environ.get('KT_KERNEL_DEBUG') == '1':
print(f"[kt-kernel] Error during CPU detection: {e}, using AVX2 fallback")
return 'avx2'
def load_extension(variant):
"""
Load the appropriate kt_kernel_ext variant.
Tries to import the specified variant, with automatic fallback to
lower-performance variants if the requested one is not available.
Supports both multi-variant builds (_kt_kernel_ext_amx.*.so) and
single-variant builds (kt_kernel_ext.*.so).
Fallback order: amx -> avx512 -> avx2 -> single-variant
Args:
variant (str): 'amx', 'avx512', or 'avx2'
Returns:
module: The loaded extension module
Raises:
ImportError: If all variants fail to load
"""
import importlib.util
import glob
# The .so files can be named in two ways:
# Multi-variant: _kt_kernel_ext_amx.cpython-311-x86_64-linux-gnu.so
# Single-variant: kt_kernel_ext.cpython-311-x86_64-linux-gnu.so
# Both export PyInit_kt_kernel_ext (the original module name)
try:
# Find the kt_kernel package directory
# We can't import kt_kernel here (circular import), so use __file__
kt_kernel_dir = os.path.dirname(os.path.abspath(__file__))
# Try multi-variant naming first
pattern = os.path.join(kt_kernel_dir, f'_kt_kernel_ext_{variant}.*.so')
so_files = glob.glob(pattern)
if not so_files:
# Try single-variant naming (fallback for builds without CPUINFER_BUILD_ALL_VARIANTS)
pattern = os.path.join(kt_kernel_dir, 'kt_kernel_ext.*.so')
so_files = glob.glob(pattern)
if so_files:
if os.environ.get('KT_KERNEL_DEBUG') == '1':
print(f"[kt-kernel] Multi-variant {variant} not found, using single-variant build")
else:
raise ImportError(f"No .so file found for variant {variant} (tried patterns: {kt_kernel_dir}/_kt_kernel_ext_{variant}.*.so and {kt_kernel_dir}/kt_kernel_ext.*.so)")
so_file = so_files[0]
if os.environ.get('KT_KERNEL_DEBUG') == '1':
print(f"[kt-kernel] Loading {variant} from: {so_file}")
# Load the module manually
# The module exports PyInit_kt_kernel_ext, so we use that as the module name
spec = importlib.util.spec_from_file_location('kt_kernel_ext', so_file)
if spec is None or spec.loader is None:
raise ImportError(f"Failed to create spec for {so_file}")
ext = importlib.util.module_from_spec(spec)
spec.loader.exec_module(ext)
if os.environ.get('KT_KERNEL_DEBUG') == '1':
print(f"[kt-kernel] Successfully loaded {variant.upper()} variant")
return ext
except (ImportError, ModuleNotFoundError, FileNotFoundError) as e:
if os.environ.get('KT_KERNEL_DEBUG') == '1':
print(f"[kt-kernel] Failed to load {variant} variant: {e}")
# Automatic fallback to next best variant
if variant == 'amx':
if os.environ.get('KT_KERNEL_DEBUG') == '1':
print("[kt-kernel] Falling back from AMX to AVX512")
return load_extension('avx512')
elif variant == 'avx512':
if os.environ.get('KT_KERNEL_DEBUG') == '1':
print("[kt-kernel] Falling back from AVX512 to AVX2")
return load_extension('avx2')
else:
# AVX2 is the last fallback - if this fails, we can't continue
raise ImportError(
f"Failed to load kt_kernel extension (variant: {variant}). "
f"Original error: {e}\n"
f"This usually means the kt_kernel package is not properly installed."
)
def initialize():
"""
Detect CPU capabilities and load the optimal extension variant.
This is the main entry point called by kt_kernel.__init__.py.
Returns:
tuple: (extension_module, variant_name)
- extension_module: The loaded C++ extension module
- variant_name: String indicating which variant was loaded ('amx', 'avx512', 'avx2')
Example:
>>> ext, variant = initialize()
>>> print(f"Loaded {variant} variant")
>>> wrapper = ext.AMXMoEWrapper(...)
"""
# Detect CPU features
variant = detect_cpu_features()
if os.environ.get('KT_KERNEL_DEBUG') == '1':
print(f"[kt-kernel] Selected CPU variant: {variant}")
# Load the appropriate extension
ext = load_extension(variant)
if os.environ.get('KT_KERNEL_DEBUG') == '1':
print(f"[kt-kernel] Extension module loaded: {ext.__name__}")
return ext, variant

View File

@@ -229,6 +229,133 @@ class CMakeBuild(build_ext):
return info
def build_extension(self, ext: CMakeExtension):
"""
Main entry point for building the extension.
Checks if multi-variant build is requested (CPUINFER_BUILD_ALL_VARIANTS=1)
and routes to the appropriate build method.
"""
if _env_get_bool("CPUINFER_BUILD_ALL_VARIANTS", False):
# Build all 3 variants (AMX, AVX512, AVX2)
self.build_multi_variants(ext)
else:
# Build single variant (original behavior)
self._build_single_variant(ext)
def build_multi_variants(self, ext: CMakeExtension):
"""
Build all 3 CPU variants (AMX, AVX512, AVX2) in a single wheel.
This method is called when CPUINFER_BUILD_ALL_VARIANTS=1 is set.
It builds three separate extensions with different CPU instruction sets
and renames the output .so files with variant suffixes.
"""
print("=" * 80)
print("Building kt-kernel with ALL CPU variants (AMX, AVX512, AVX2)")
print("=" * 80)
# Define the 3 variants to build
variants = [
{
'name': 'amx',
'env': {
'CPUINFER_CPU_INSTRUCT': 'NATIVE',
'CPUINFER_ENABLE_AMX': 'ON',
},
'description': 'AMX variant (Intel Sapphire Rapids+)'
},
{
'name': 'avx512',
'env': {
'CPUINFER_CPU_INSTRUCT': 'AVX512',
'CPUINFER_ENABLE_AMX': 'OFF',
},
'description': 'AVX512 variant (Intel Skylake-X/Ice Lake/Cascade Lake)'
},
{
'name': 'avx2',
'env': {
'CPUINFER_CPU_INSTRUCT': 'AVX2',
'CPUINFER_ENABLE_AMX': 'OFF',
},
'description': 'AVX2 variant (maximum compatibility)'
}
]
# Save original environment
original_env = os.environ.copy()
extdir = Path(self.get_ext_fullpath(ext.name)).parent.resolve()
for i, variant in enumerate(variants, 1):
print(f"\n{'=' * 80}")
print(f"Building variant {i}/3: {variant['description']}")
print(f"{'=' * 80}\n")
# Set variant-specific environment variables
os.environ.update(variant['env'])
# Use a unique build directory for this variant
original_build_temp = self.build_temp
self.build_temp = str(Path(self.build_temp) / f"variant_{variant['name']}")
try:
# Build this variant (calls the single-variant build logic)
self._build_single_variant(ext)
# Rename the generated .so file to include variant suffix
# Original: kt_kernel_ext.cpython-311-x86_64-linux-gnu.so
# Renamed: _kt_kernel_ext_amx.cpython-311-x86_64-linux-gnu.so
# Extract the base extension name (without package prefix)
# ext.name is "kt_kernel.kt_kernel_ext", we want "kt_kernel_ext"
base_ext_name = ext.name.split('.')[-1]
# Find the newly built .so file
import time
time.sleep(0.5) # Give filesystem time to sync
built_candidates = [
f for f in Path(extdir).glob("*.so")
if f.name.startswith(base_ext_name) and not f.name.startswith(f"_{base_ext_name}_")
]
if not built_candidates:
print(f"WARNING: No .so file found for {base_ext_name} in {extdir}")
print(f"Files in {extdir}:")
for f in Path(extdir).glob("*.so"):
print(f" {f.name}")
for so_file in built_candidates:
# Extract the python tag part (e.g., ".cpython-311-x86_64-linux-gnu.so")
suffix = so_file.name.replace(base_ext_name, "")
new_name = f"_{base_ext_name}_{variant['name']}{suffix}"
new_path = extdir / new_name
print(f"-- Renaming {so_file.name} -> {new_name}")
if new_path.exists():
print(f" WARNING: Target file already exists, removing: {new_path}")
new_path.unlink()
so_file.rename(new_path)
print(f" ✓ Successfully renamed to {new_name}")
finally:
# Restore build_temp for next iteration
self.build_temp = original_build_temp
# Restore original environment
os.environ.clear()
os.environ.update(original_env)
print(f"\n{'=' * 80}")
print("✓ Successfully built all 3 CPU variants")
print(f"{'=' * 80}\n")
def _build_single_variant(self, ext: CMakeExtension):
"""
Build a single CPU variant. This contains the core build logic
extracted from the original build_extension method.
"""
# Auto-detect CUDA toolkit if user did not explicitly set CPUINFER_USE_CUDA
def detect_cuda_toolkit() -> bool:
# Respect CUDA_HOME
@@ -276,6 +403,10 @@ class CMakeBuild(build_ext):
auto_cuda = detect_cuda_toolkit()
os.environ["CPUINFER_USE_CUDA"] = "1" if auto_cuda else "0"
print(f"-- CPUINFER_USE_CUDA not set; auto-detected CUDA toolkit: {'YES' if auto_cuda else 'NO'}")
elif cuda_env:
print("-- CPUINFER_USE_CUDA explicitly enabled")
else:
print("-- CPUINFER_USE_CUDA explicitly disabled")
extdir = Path(self.get_ext_fullpath(ext.name)).parent.resolve()
cfg = default_build_type()
@@ -431,7 +562,15 @@ class CMakeBuild(build_ext):
# Version (simple). If you later add a python package dir, you can read from it.
################################################################################
VERSION = os.environ.get("CPUINFER_VERSION", "0.1.0")
# Import version from shared version.py at project root
_version_file = Path(__file__).resolve().parent.parent / "version.py"
if _version_file.exists():
_version_ns = {}
with open(_version_file, "r", encoding="utf-8") as f:
exec(f.read(), _version_ns)
VERSION = os.environ.get("CPUINFER_VERSION", _version_ns.get("__version__", "0.4.2"))
else:
VERSION = os.environ.get("CPUINFER_VERSION", "0.4.2")
################################################################################
# Setup
@@ -449,7 +588,7 @@ setup(
"kt_kernel": "python",
"kt_kernel.utils": "python/utils",
},
ext_modules=[CMakeExtension("kt_kernel_ext", str(REPO_ROOT))],
ext_modules=[CMakeExtension("kt_kernel.kt_kernel_ext", str(REPO_ROOT))],
cmdclass={"build_ext": CMakeBuild},
zip_safe=False,
classifiers=[

View File

@@ -16,7 +16,8 @@ register_cpu_ci(est_time=30, suite="default")
# Check if kt_kernel_ext is available
try:
import kt_kernel_ext
import kt_kernel # Import kt_kernel first to register kt_kernel_ext
kt_kernel_ext = kt_kernel.kt_kernel_ext # Access the extension module
HAS_KT_KERNEL = True
except ImportError:
HAS_KT_KERNEL = False

View File

@@ -19,7 +19,8 @@ register_cpu_ci(est_time=120, suite="default")
# Check if dependencies are available
try:
import torch
import kt_kernel_ext
import kt_kernel # Import kt_kernel first to register kt_kernel_ext
kt_kernel_ext = kt_kernel.kt_kernel_ext # Access the extension module
HAS_DEPS = True
except ImportError as e:
HAS_DEPS = False

View File

@@ -19,7 +19,8 @@ register_cpu_ci(est_time=120, suite="default")
# Check if dependencies are available
try:
import torch
import kt_kernel_ext
import kt_kernel # Import kt_kernel first to register kt_kernel_ext
kt_kernel_ext = kt_kernel.kt_kernel_ext # Access the extension module
HAS_DEPS = True
except ImportError as e:
HAS_DEPS = False

View File

@@ -19,7 +19,8 @@ register_cpu_ci(est_time=120, suite="default")
# Check if dependencies are available
try:
import torch
import kt_kernel_ext
import kt_kernel # Import kt_kernel first to register kt_kernel_ext
kt_kernel_ext = kt_kernel.kt_kernel_ext # Access the extension module
HAS_DEPS = True
except ImportError as e:
HAS_DEPS = False

View File

@@ -19,7 +19,8 @@ register_cpu_ci(est_time=120, suite="default")
# Check if dependencies are available
try:
import torch
import kt_kernel_ext
import kt_kernel # Import kt_kernel first to register kt_kernel_ext
kt_kernel_ext = kt_kernel.kt_kernel_ext # Access the extension module
HAS_DEPS = True
except ImportError as e:
HAS_DEPS = False

View File

@@ -23,7 +23,8 @@ register_cpu_ci(est_time=300, suite="default")
# Check if dependencies are available
try:
import torch
import kt_kernel_ext
import kt_kernel # Import kt_kernel first to register kt_kernel_ext
kt_kernel_ext = kt_kernel.kt_kernel_ext # Access the extension module
from tqdm import tqdm
HAS_DEPS = True
except ImportError as e:

View File

@@ -23,7 +23,8 @@ register_cpu_ci(est_time=300, suite="default")
# Check if dependencies are available
try:
import torch
import kt_kernel_ext
import kt_kernel # Import kt_kernel first to register kt_kernel_ext
kt_kernel_ext = kt_kernel.kt_kernel_ext # Access the extension module
from tqdm import tqdm
HAS_DEPS = True

View File

@@ -24,7 +24,8 @@ register_cpu_ci(est_time=300, suite="default")
# Check if dependencies are available
try:
import torch
import kt_kernel_ext
import kt_kernel # Import kt_kernel first to register kt_kernel_ext
kt_kernel_ext = kt_kernel.kt_kernel_ext # Access the extension module
from tqdm import tqdm
HAS_DEPS = True
except ImportError as e:

View File

@@ -23,7 +23,8 @@ register_cpu_ci(est_time=300, suite="default")
# Check if dependencies are available
try:
import torch
import kt_kernel_ext
import kt_kernel # Import kt_kernel first to register kt_kernel_ext
kt_kernel_ext = kt_kernel.kt_kernel_ext # Access the extension module
from tqdm import tqdm
HAS_DEPS = True
except ImportError as e:

View File

@@ -1,11 +1,20 @@
#!/usr/bin/env python
# coding=utf-8
'''
Description :
Description :
Author : kkk1nak0
Date : 2024-08-15 07:34:46
Version : 1.0.0
LastEditors : chenxl
LastEditors : chenxl
LastEditTime : 2025-02-15 03:53:02
'''
__version__ = "0.4.1"
import sys
import os
# Import version from shared version.py at project root
_root_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.insert(0, _root_dir)
try:
from version import __version__
finally:
sys.path.pop(0)

6
version.py Normal file
View File

@@ -0,0 +1,6 @@
"""
KTransformers version information.
Shared across kt-kernel and kt-sft modules.
"""
__version__ = "0.4.3"