# Container image for the application: CUDA 12.8 runtime on Ubuntu 24.04 with a
# Python virtual environment in /opt/venv. Dependencies are installed from
# pyproject.toml before the source tree is copied, so the dependency layers
# stay cached when only application code changes.

# Use an official CUDA runtime with Ubuntu as a parent image
FROM nvidia/cuda:12.8.1-runtime-ubuntu24.04

# Install system dependencies.
# DEBIAN_FRONTEND=noninteractive keeps package configuration from prompting
# during the build; the apt lists are removed in the same layer to keep it small.
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        build-essential \
        curl \
        ca-certificates \
        python3.12 \
        python3-pip \
        python3.12-venv \
    && rm -rf /var/lib/apt/lists/*

# Create a virtual environment.
# The interpreter is named explicitly because the extras step below relies on
# tomllib, which is only in the standard library from Python 3.11 onwards.
RUN python3.12 -m venv /opt/venv

# "Activate" the venv for every later instruction and for the running
# container by putting its bin directory first on PATH.
ENV PATH="/opt/venv/bin:$PATH"

# Upgrade pip
RUN pip install --no-cache-dir --upgrade pip

# Set the working directory in the container
WORKDIR /app

# Get requirements
COPY pyproject.toml .

# Install cu12 group first — pins torch+cu128, exllamav2/v3+cu128,
# flash_attn+cu128, and flash-linear-attention.
# The requirement is quoted so the shell cannot treat ".[cu12]" as a glob.
RUN pip install --no-cache-dir ".[cu12]"

# The 'extras' group (infinity-emb, sentence-transformers) is installed
# separately with --no-deps so pip cannot resolve xformers transitively and
# pull a cu130 wheel, which would cause libcudart.so.13 ImportError on driver
# 590.x (cu128-only hosts).
# See: https://github.com/theroyallab/tabbyAPI/issues/414
#
# "pip install --no-deps .[extras]" does NOT achieve this: the members of an
# extras group are dependencies of the project, and --no-deps skips them all,
# so only the project itself would be reinstalled. The group's requirements are
# therefore read out of pyproject.toml and handed to pip directly.
# NOTE(review): assumes the group is declared under
# [project.optional-dependencies].extras — confirm against pyproject.toml.
# NOTE(review): with --no-deps, any dependency of these packages that the cu12
# group does not already provide stays uninstalled — verify the extras import.
RUN python3 -c "import tomllib; print(*tomllib.load(open('pyproject.toml', 'rb'))['project']['optional-dependencies']['extras'], sep=chr(10))" > /tmp/extras.txt \
    && pip install --no-cache-dir --no-deps -r /tmp/extras.txt \
    && rm -f /tmp/extras.txt

# The manifest was only needed for dependency installation; the full copy
# below brings it back together with the rest of the sources.
RUN rm pyproject.toml

# Copy the current directory contents into the container
COPY . .

# Make port 5000 available to the world outside this container
EXPOSE 5000

# Set the entry point
ENTRYPOINT ["python3"]

# Run main.py when the container launches, listening on all interfaces so the
# published port is reachable from outside the container.
CMD ["main.py", "--host", "0.0.0.0"]