co-mono/packages/pods/scripts/pod_setup.sh

#!/usr/bin/env bash
# GPU pod bootstrap for vLLM deployment
set -euo pipefail

# Parse arguments passed from pi CLI.
#
# Recognized options (each one takes exactly one value):
#   --mount <command>      stored in MOUNT_COMMAND
#   --models-path <dir>    stored in MODELS_PATH
#   --hf-token <token>     stored in HF_TOKEN
#   --vllm-api-key <key>   stored in PI_API_KEY
#   --vllm <variant>       stored in VLLM_VERSION (defaults to "release")
MOUNT_COMMAND=""
MODELS_PATH=""
HF_TOKEN=""
PI_API_KEY=""
VLLM_VERSION="release"  # Default to release

#######################################
# Fill the option globals from a list of command-line arguments.
# Globals:   MOUNT_COMMAND, MODELS_PATH, HF_TOKEN, PI_API_KEY,
#            VLLM_VERSION (written)
# Arguments: the command-line arguments to parse
# Outputs:   an error message on stderr for an unknown option or an
#            option that is missing its value
# Returns:   0 on success; exits the shell with status 1 on error
#######################################
parse_args() {
    while [[ $# -gt 0 ]]; do
        case "$1" in
            --mount | --models-path | --hf-token | --vllm-api-key | --vllm)
                # Without this check a trailing option fails on "$2" with
                # bash's generic "unbound variable" message under `set -u`.
                # An explicitly passed empty value ("") is still accepted.
                if [[ $# -lt 2 ]]; then
                    echo "ERROR: Option $1 requires a value" >&2
                    exit 1
                fi
                ;;
            *)
                echo "ERROR: Unknown option: $1" >&2
                exit 1
                ;;
        esac

        case "$1" in
            --mount)        MOUNT_COMMAND="$2" ;;
            --models-path)  MODELS_PATH="$2" ;;
            --hf-token)     HF_TOKEN="$2" ;;
            --vllm-api-key) PI_API_KEY="$2" ;;
            --vllm)         VLLM_VERSION="$2" ;;
        esac
        shift 2
    done
}

parse_args "$@"

# Validate required parameters: stop at the first one that is still empty.
# The list order determines which missing parameter is reported.
for required_var in HF_TOKEN PI_API_KEY MODELS_PATH; do
    # ${!required_var} reads the variable whose name is held in required_var
    if [[ -z "${!required_var}" ]]; then
        echo "ERROR: ${required_var} is required" >&2
        exit 1
    fi
done

echo "=== Starting pod setup ==="

# Install system dependencies.
# apt-get is used rather than apt: apt is the interactive front end and warns
# that its command-line interface is not stable for scripts, and the CUDA
# section of this script already relies on apt-get.
apt-get update -y
apt-get install -y python3-pip python3-venv git build-essential cmake ninja-build curl wget lsb-release htop pkg-config

# --- Install matching CUDA toolkit -------------------------------------------
# Compares the CUDA version reported by the driver (nvidia-smi) with the
# release of the nvcc found on PATH. When they differ, cuda-toolkit-<maj>-<min>
# is installed from NVIDIA's apt repository. In both cases
# /usr/local/cuda-<version>/{bin,lib64} are put first on PATH and
# LD_LIBRARY_PATH.
echo "Checking CUDA driver version..."
# `|| true`: under `set -e` with pipefail a failing pipeline (nvidia-smi
# missing, no "CUDA Version" line) would end the script without a message;
# the format check below reports the problem instead.
DRIVER_CUDA_VERSION=$(nvidia-smi | grep "CUDA Version" | awk '{print $9}') || true
if [[ ! "$DRIVER_CUDA_VERSION" =~ ^[0-9]+\.[0-9]+$ ]]; then
    # Also rejects values such as "N/A" that would otherwise end up in the
    # apt package name and the /usr/local/cuda-* paths below.
    echo "ERROR: Could not determine CUDA version from nvidia-smi (got: '${DRIVER_CUDA_VERSION}')" >&2
    exit 1
fi
echo "Driver supports CUDA: $DRIVER_CUDA_VERSION"

# Check if nvcc exists and its version
if command -v nvcc &> /dev/null; then
    # The release line looks like
    #   "Cuda compilation tools, release 12.4, V12.4.131"
    # Take the major.minor that follows "release"; the sixth whitespace field
    # is the V-prefixed build number and never equals the driver's "12.4".
    NVCC_VERSION=$(nvcc --version | sed -n 's/.*release \([0-9][0-9]*\.[0-9][0-9]*\).*/\1/p')
    echo "Current nvcc version: $NVCC_VERSION"
else
    NVCC_VERSION="none"
    echo "nvcc not found"
fi

# Install CUDA toolkit matching driver version if needed
if [[ "$NVCC_VERSION" != "$DRIVER_CUDA_VERSION" ]]; then
    echo "Installing CUDA Toolkit $DRIVER_CUDA_VERSION to match driver..."

    # Detect Ubuntu version
    UBUNTU_VERSION=$(lsb_release -rs)
    UBUNTU_CODENAME=$(lsb_release -cs)

    echo "Detected Ubuntu $UBUNTU_VERSION ($UBUNTU_CODENAME)"

    # Map Ubuntu version to NVIDIA repo path
    case "$UBUNTU_VERSION" in
        24.04) REPO_PATH="ubuntu2404" ;;
        22.04) REPO_PATH="ubuntu2204" ;;
        20.04) REPO_PATH="ubuntu2004" ;;
        *)
            echo "Warning: Unsupported Ubuntu version $UBUNTU_VERSION, trying ubuntu2204"
            REPO_PATH="ubuntu2204"
            ;;
    esac

    # Add NVIDIA package repositories.
    # The keyring package is downloaded into a private temp directory with an
    # explicit output name: in the current directory a leftover copy from an
    # earlier run would make wget save "<name>.1" and dpkg would then install
    # the stale file.
    KEYRING_DEB="cuda-keyring_1.1-1_all.deb"
    KEYRING_TMP=$(mktemp -d)
    wget -O "${KEYRING_TMP}/${KEYRING_DEB}" \
        "https://developer.download.nvidia.com/compute/cuda/repos/${REPO_PATH}/x86_64/${KEYRING_DEB}"
    dpkg -i "${KEYRING_TMP}/${KEYRING_DEB}"
    rm -rf -- "${KEYRING_TMP:?}"
    apt-get update

    # Install specific CUDA toolkit version
    # Convert version format (12.9 -> 12-9)
    CUDA_VERSION_APT="${DRIVER_CUDA_VERSION//./-}"
    echo "Installing cuda-toolkit-${CUDA_VERSION_APT}..."
    apt-get install -y "cuda-toolkit-${CUDA_VERSION_APT}"

    # Add CUDA to PATH
    export PATH="/usr/local/cuda-${DRIVER_CUDA_VERSION}/bin:$PATH"
    export LD_LIBRARY_PATH="/usr/local/cuda-${DRIVER_CUDA_VERSION}/lib64:${LD_LIBRARY_PATH:-}"

    # Verify installation
    nvcc --version
else
    echo "CUDA toolkit $NVCC_VERSION matches driver version"
    export PATH="/usr/local/cuda-${DRIVER_CUDA_VERSION}/bin:$PATH"
    export LD_LIBRARY_PATH="/usr/local/cuda-${DRIVER_CUDA_VERSION}/lib64:${LD_LIBRARY_PATH:-}"
fi

# --- Install uv (fast Python package manager) --------------------------------
# Download the installer script and pipe it straight into sh, then put
# ~/.local/bin first on PATH for the remainder of this script
# (presumably where the installer places the uv binary).
curl -LsSf https://astral.sh/uv/install.sh | sh
PATH="$HOME/.local/bin:$PATH"
export PATH

# --- Install Python 3.12 if not available ------------------------------------
# Only ask uv for an interpreter when no python3.12 is already on PATH.
command -v python3.12 > /dev/null 2>&1 || {
    echo "Python 3.12 not found. Installing via uv..."
    uv python install 3.12
}

# --- Clean up existing environments and caches -------------------------------
echo "Cleaning up existing environments and caches..."

# Delete directory $1 if it exists, printing message $2 first.
# Does nothing (and prints nothing) when the directory is absent.
remove_stale_dir() {
    local target="$1"
    local notice="$2"

    [ -d "$target" ] || return 0
    echo "$notice"
    rm -rf "$target"
}

# Location of the virtual environment; removed here so it can be rebuilt clean
VENV="$HOME/venv"
remove_stale_dir "$VENV" "Removing existing virtual environment..."

# uv's download cache, dropped so packages are fetched fresh
remove_stale_dir "$HOME/.cache/uv" "Clearing uv cache..."

# vLLM's cache, dropped to avoid conflicts with the new install
remove_stale_dir "$HOME/.cache/vllm" "Clearing vLLM cache..."

# --- Create and activate venv ------------------------------------------------
# Build a Python 3.12 environment at $VENV (uv's --seed flag is passed) and
# activate it in the current shell so later installs target it.
printf '%s\n' "Creating fresh virtual environment..."
uv venv --python 3.12 --seed "$VENV"
. "$VENV/bin/activate"

# --- Install PyTorch and vLLM ------------------------------------------------
# Installs the vLLM build selected by VLLM_VERSION:
#   release  - vLLM >= 0.10.0 from the default index
#   nightly  - latest vLLM from the nightly wheel index
#   gpt-oss  - pinned 0.10.1+gptoss build plus the gpt-oss library
# Any other value aborts the script with exit status 1.
echo "Installing vLLM and dependencies (version: $VLLM_VERSION)..."
case "$VLLM_VERSION" in
    release)
        echo "Installing vLLM release with PyTorch..."
        # Install vLLM with automatic PyTorch backend selection
        # vLLM will automatically install the correct PyTorch version
        #
        # The requirement must be quoted: unquoted, the shell reads ">=0.10.0"
        # as a redirection of stdout to a file named "=0.10.0", so the version
        # constraint is silently dropped and the installer output is lost.
        uv pip install "vllm>=0.10.0" --torch-backend=auto || {
            echo "ERROR: Failed to install vLLM"
            exit 1
        }
        ;;
    nightly)
        echo "Installing vLLM nightly with PyTorch..."
        echo "This will install the latest nightly build of vLLM..."

        # Install vLLM nightly with PyTorch
        uv pip install -U vllm \
            --torch-backend=auto \
            --extra-index-url https://wheels.vllm.ai/nightly || {
            echo "ERROR: Failed to install vLLM nightly"
            exit 1
        }

        echo "vLLM nightly successfully installed!"
        ;;
    gpt-oss)
        echo "Installing GPT-OSS special build with PyTorch nightly..."
        echo "WARNING: This build is ONLY for GPT-OSS models!"
        echo "Installing PyTorch nightly and cutting-edge dependencies..."

        # Convert CUDA version format for PyTorch (12.4 -> cu124)
        PYTORCH_CUDA="cu${DRIVER_CUDA_VERSION/./}"
        echo "Using PyTorch nightly with ${PYTORCH_CUDA} (driver supports ${DRIVER_CUDA_VERSION})"

        # The GPT-OSS build will pull PyTorch nightly and other dependencies
        # via the extra index URLs. We don't pre-install torch here to avoid conflicts.
        uv pip install --pre "vllm==0.10.1+gptoss" \
            --extra-index-url https://wheels.vllm.ai/gpt-oss/ \
            --extra-index-url "https://download.pytorch.org/whl/nightly/${PYTORCH_CUDA}" \
            --index-strategy unsafe-best-match || {
            echo "ERROR: Failed to install GPT-OSS vLLM build"
            echo "This automatically installs PyTorch nightly with ${PYTORCH_CUDA}, Triton nightly, and other dependencies"
            exit 1
        }

        # Install gpt-oss library for tool support.
        # Best effort: a failure here only prints a warning and setup continues.
        uv pip install gpt-oss || {
            echo "WARNING: Failed to install gpt-oss library (needed for tool use)"
        }
        ;;
    *)
        echo "ERROR: Unknown vLLM version: $VLLM_VERSION"
        exit 1
        ;;
esac

# --- Install additional packages ---------------------------------------------
echo "Installing additional packages..."
# tensorrt is intentionally absent from this list for now: it has CUDA 13.0
# compatibility issues (TensorRT still depends on the deprecated
# nvidia-cuda-runtime-cu13 package).
extra_packages=(huggingface-hub psutil hf_transfer)
uv pip install "${extra_packages[@]}"

# --- FlashInfer installation (optional, improves performance) ----------------
# Best effort: a failed install only changes the message that is printed.
echo "Attempting FlashInfer installation (optional)..."
if ! uv pip install flashinfer-python; then
    echo "FlashInfer not available, using Flash Attention instead"
else
    echo "FlashInfer installed successfully"
fi

# --- Mount storage if provided -----------------------------------------------
# Skipped entirely when no mount command was supplied.
if [[ -n "$MOUNT_COMMAND" ]]; then
    echo "Setting up mount..."

    # The mount target has to exist before the command runs
    mkdir -p "$MODELS_PATH"

    # NOTE: MOUNT_COMMAND is executed verbatim by the shell via eval, so it
    # must come from a trusted caller. A failing command is only reported;
    # setup carries on without the mount.
    if ! eval "$MOUNT_COMMAND"; then
        echo "WARNING: Mount command failed, continuing without mount"
    fi

    # Informational check only; the path may legitimately not be a mount point
    if ! mountpoint -q "$MODELS_PATH" 2>/dev/null; then
        echo "Note: $MODELS_PATH is not a mount point (might be local storage)"
    else
        echo "Storage successfully mounted at $MODELS_PATH"
    fi
fi

# --- Model storage setup ------------------------------------------------------
# Ensures MODELS_PATH exists and is writable, creates huggingface/hub inside
# it, and replaces ~/.cache/huggingface with a symlink to that directory.
echo ""
echo "=== Setting up model storage ==="
echo "Storage path: $MODELS_PATH"

# Create the storage directory when it is missing
[[ -d "$MODELS_PATH" ]] || {
    echo "Creating model storage directory: $MODELS_PATH"
    mkdir -p "$MODELS_PATH"
}

# Give up early when we cannot write into it
if ! [[ -w "$MODELS_PATH" ]]; then
    echo "ERROR: Model storage path is not writable: $MODELS_PATH"
    echo "Please check permissions"
    exit 1
fi

hf_store="${MODELS_PATH}/huggingface"   # real location of the cache
hf_link="$HOME/.cache/huggingface"      # path that will point at hf_store

# Lay out the huggingface cache structure inside the models path
mkdir -p "${hf_store}/hub"

# Clear whatever currently occupies the link path (directory, file, or a
# symlink, including a dangling one, hence the extra -L test). Removal
# errors are deliberately ignored here.
if [[ -e "$hf_link" || -L "$hf_link" ]]; then
    echo "Removing existing ~/.cache/huggingface..."
    rm -rf "$hf_link" 2>/dev/null || true
fi

# Make sure the parent of the link exists, then create the link itself
mkdir -p "$HOME/.cache"
ln -s "$hf_store" "$hf_link"
echo "Created symlink: ~/.cache/huggingface -> ${MODELS_PATH}/huggingface"

# The hub directory must be reachable through the link, otherwise abort
if [[ ! -d "${hf_link}/hub" ]]; then
    echo "ERROR: Could not verify model storage setup"
    echo "The symlink was created but the target directory is not accessible"
    exit 1
fi

echo "✓ Model storage configured successfully"

# Report free space: fourth column of the second line of `df -h` output
AVAILABLE_SPACE=$(df -h "$MODELS_PATH" | awk 'NR==2 {print $4}')
echo "Available space: $AVAILABLE_SPACE"

# --- Configure environment ----------------------------------------------------
# Creates ~/.config/vllm/do_not_track (presumably vLLM's usage-stats opt-out
# marker; the VLLM_NO_USAGE_STATS / VLLM_DO_NOT_TRACK exports below point the
# same way), appends the runtime environment to ~/.bashrc, and creates the
# vLLM log directory.
mkdir -p ~/.config/vllm
touch ~/.config/vllm/do_not_track

# Shell-quote the secrets before they are embedded in ~/.bashrc. Writing them
# raw between double quotes would let a value containing ", $, a backtick or a
# backslash corrupt the file or run code at every login. %q emits a form that
# bash reads back as exactly the original string (plain alphanumeric tokens
# are written unchanged).
printf -v HF_TOKEN_QUOTED '%q' "$HF_TOKEN"
printf -v PI_API_KEY_QUOTED '%q' "$PI_API_KEY"

# Write environment to .bashrc for persistence.
# Escaped \$ expansions are evaluated when .bashrc is sourced; unescaped ones
# are substituted now, while the file is being written.
cat >> ~/.bashrc << EOF

# Pi vLLM environment
[ -d "\$HOME/venv" ] && source "\$HOME/venv/bin/activate"
export PATH="/usr/local/cuda-${DRIVER_CUDA_VERSION}/bin:\$HOME/.local/bin:\$PATH"
export LD_LIBRARY_PATH="/usr/local/cuda-${DRIVER_CUDA_VERSION}/lib64:\${LD_LIBRARY_PATH:-}"
export HF_TOKEN=${HF_TOKEN_QUOTED}
export PI_API_KEY=${PI_API_KEY_QUOTED}
export HUGGING_FACE_HUB_TOKEN=${HF_TOKEN_QUOTED}
export HF_HUB_ENABLE_HF_TRANSFER=1
export VLLM_NO_USAGE_STATS=1
export VLLM_DO_NOT_TRACK=1
export VLLM_ALLOW_LONG_MAX_MODEL_LEN=1
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
EOF

# Create log directory for vLLM
mkdir -p ~/.vllm_logs

# --- Output GPU info for pi CLI to parse -------------------------------------
# Prints one JSON object per GPU between the START/END marker lines.
echo ""
echo "===GPU_INFO_START==="
nvidia-smi --query-gpu=index,name,memory.total --format=csv,noheader |
    while IFS=, read -r gpu_index gpu_name gpu_memory; do
        # The CSV fields carry padding around the commas; xargs trims it
        gpu_index=$(xargs <<< "$gpu_index")
        gpu_name=$(xargs <<< "$gpu_name")
        gpu_memory=$(xargs <<< "$gpu_memory")
        # id is emitted bare (numeric index); name and memory as JSON strings
        printf '{"id": %s, "name": "%s", "memory": "%s"}\n' \
            "$gpu_index" "$gpu_name" "$gpu_memory"
    done
echo "===GPU_INFO_END==="

# Closing banner: blank line, then three status lines
printf '%s\n' \
    "" \
    "=== Setup complete ===" \
    "Pod is ready for vLLM deployments" \
    "Models will be cached at: $MODELS_PATH"