Initial monorepo setup with npm workspaces and dual TypeScript configuration

- Set up npm workspaces for three packages: pi-tui, pi-agent, and pi (pods)
- Implemented dual TypeScript configuration:
  - Root tsconfig.json with path mappings for development and type checking
  - Package-specific tsconfig.build.json for clean production builds
- Configured lockstep versioning with sync script for inter-package dependencies
- Added comprehensive documentation for development and publishing workflows
- All packages at version 0.5.0 ready for npm publishing
This commit is contained in:
Mario Zechner 2025-08-09 17:18:38 +02:00
commit a74c5da112
63 changed files with 14558 additions and 0 deletions

362
packages/pods/src/cli.ts Normal file
View file

@ -0,0 +1,362 @@
#!/usr/bin/env node
import chalk from "chalk";
import { spawn } from "child_process";
import { readFileSync } from "fs";
import { dirname, join } from "path";
import { fileURLToPath } from "url";
import { listModels, startModel, stopModel, viewLogs } from "./commands/models.js";
import { listPods, removePodCommand, setupPod, switchActivePod } from "./commands/pods.js";
import { promptModel } from "./commands/prompt.js";
import { getActivePod, loadConfig } from "./config.js";
import { sshExecStream } from "./ssh.js";
// ESM replacements for the CommonJS __filename/__dirname globals.
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);
// Read the package manifest (one level above the compiled output) for version info.
const packageJson = JSON.parse(readFileSync(join(__dirname, "../package.json"), "utf-8"));
/**
 * Print CLI usage for every command group (pod management, model management,
 * agent chat) plus the environment variables the tool reads. The version is
 * taken from package.json loaded at module start.
 */
function printHelp() {
	console.log(`pi v${packageJson.version} - Manage vLLM deployments on GPU pods
Pod Management:
pi pods setup <name> "<ssh>" --mount "<mount>" Setup pod with mount command
Options:
--vllm release Install latest vLLM release >=0.10.0 (default)
--vllm nightly Install vLLM nightly build (latest features)
--vllm gpt-oss Install vLLM 0.10.1+gptoss with PyTorch nightly (GPT-OSS only)
pi pods List all pods (* = active)
pi pods active <name> Switch active pod
pi pods remove <name> Remove pod from local config
pi shell [<name>] Open shell on pod (active or specified)
pi ssh [<name>] "<command>" Run SSH command on pod
Model Management:
pi start <model> --name <name> [options] Start a model
--memory <percent> GPU memory allocation (30%, 50%, 90%)
--context <size> Context window (4k, 8k, 16k, 32k, 64k, 128k)
--gpus <count> Number of GPUs to use (predefined models only)
--vllm <args...> Pass remaining args to vLLM (ignores other options)
pi stop [<name>] Stop model (or all if no name)
pi list List running models
pi logs <name> Stream model logs
pi agent <name> ["<message>"...] [options] Chat with model using agent & tools
pi agent <name> [options] Interactive chat mode
--continue, -c Continue previous session
--json Output as JSONL
(All pi-agent options are supported)
All model commands support --pod <name> to override the active pod.
Environment:
HF_TOKEN HuggingFace token for model downloads
PI_API_KEY API key for vLLM endpoints
PI_CONFIG_DIR Config directory (default: ~/.pi)`);
}
// Parse command line arguments
const args = process.argv.slice(2);
// No args or help flag: show usage and exit successfully.
if (args.length === 0 || args[0] === "--help" || args[0] === "-h") {
	printHelp();
	process.exit(0);
}
if (args[0] === "--version" || args[0] === "-v") {
	console.log(packageJson.version);
	process.exit(0);
}
const command = args[0];
const subcommand = args[1];
// Main command handler. Top-level await is used throughout, so this module
// must run as an ES module under a Node version that supports it.
try {
	// Handle "pi pods" commands
	if (command === "pods") {
		if (!subcommand) {
			// pi pods - list all pods
			listPods();
		} else if (subcommand === "setup") {
			// pi pods setup <name> "<ssh>" [--mount "<mount>"] [--models-path <path>] [--vllm release|nightly|gpt-oss]
			const name = args[2];
			const sshCmd = args[3];
			if (!name || !sshCmd) {
				console.error(
					'Usage: pi pods setup <name> "<ssh>" [--mount "<mount>"] [--models-path <path>] [--vllm release|nightly|gpt-oss]',
				);
				process.exit(1);
			}
			// Parse options (hand-rolled flag scan; each value flag consumes the
			// following token by bumping i).
			const options: { mount?: string; modelsPath?: string; vllm?: "release" | "nightly" | "gpt-oss" } = {};
			for (let i = 4; i < args.length; i++) {
				if (args[i] === "--mount" && i + 1 < args.length) {
					options.mount = args[i + 1];
					i++;
				} else if (args[i] === "--models-path" && i + 1 < args.length) {
					options.modelsPath = args[i + 1];
					i++;
				} else if (args[i] === "--vllm" && i + 1 < args.length) {
					const vllmType = args[i + 1];
					if (vllmType === "release" || vllmType === "nightly" || vllmType === "gpt-oss") {
						options.vllm = vllmType;
					} else {
						console.error(chalk.red(`Invalid vLLM type: ${vllmType}`));
						console.error("Valid options: release, nightly, gpt-oss");
						process.exit(1);
					}
					i++;
				}
			}
			// If --mount provided but no --models-path, try to extract path from mount command
			if (options.mount && !options.modelsPath) {
				// Extract last part of mount command as models path
				const parts = options.mount.trim().split(" ");
				const lastPart = parts[parts.length - 1];
				if (lastPart?.startsWith("/")) {
					options.modelsPath = lastPart;
				}
			}
			await setupPod(name, sshCmd, options);
		} else if (subcommand === "active") {
			// pi pods active <name>
			const name = args[2];
			if (!name) {
				console.error("Usage: pi pods active <name>");
				process.exit(1);
			}
			switchActivePod(name);
		} else if (subcommand === "remove") {
			// pi pods remove <name>
			const name = args[2];
			if (!name) {
				console.error("Usage: pi pods remove <name>");
				process.exit(1);
			}
			removePodCommand(name);
		} else {
			console.error(`Unknown pods subcommand: ${subcommand}`);
			process.exit(1);
		}
	} else {
		// Parse --pod override for model commands. The flag and its value are
		// spliced out of args so positional parsing below is unaffected.
		let podOverride: string | undefined;
		const podIndex = args.indexOf("--pod");
		if (podIndex !== -1 && podIndex + 1 < args.length) {
			podOverride = args[podIndex + 1];
			// Remove --pod and its value from args
			args.splice(podIndex, 2);
		}
		// Handle SSH/shell commands and model commands
		switch (command) {
			case "shell": {
				// pi shell [<name>] - open interactive shell
				const podName = args[1];
				let podInfo: { name: string; pod: import("./types.js").Pod } | null = null;
				if (podName) {
					const config = loadConfig();
					const pod = config.pods[podName];
					if (pod) {
						podInfo = { name: podName, pod };
					}
				} else {
					podInfo = getActivePod();
				}
				if (!podInfo) {
					if (podName) {
						console.error(chalk.red(`Pod '${podName}' not found`));
					} else {
						console.error(chalk.red("No active pod. Use 'pi pods active <name>' to set one."));
					}
					process.exit(1);
				}
				console.log(chalk.green(`Connecting to pod '${podInfo.name}'...`));
				// Execute SSH in interactive mode; stdio "inherit" hands the
				// terminal to ssh so the remote shell is fully interactive.
				const sshArgs = podInfo.pod.ssh.split(" ").slice(1); // Remove 'ssh' from command
				const sshProcess = spawn("ssh", sshArgs, {
					stdio: "inherit",
					env: process.env,
				});
				sshProcess.on("exit", (code) => {
					process.exit(code || 0);
				});
				break;
			}
			case "ssh": {
				// pi ssh [<name>] "<command>" - run command via SSH
				let podName: string | undefined;
				let sshCommand: string;
				if (args.length === 2) {
					// pi ssh "<command>" - use active pod
					sshCommand = args[1];
				} else if (args.length === 3) {
					// pi ssh <name> "<command>"
					podName = args[1];
					sshCommand = args[2];
				} else {
					console.error('Usage: pi ssh [<name>] "<command>"');
					process.exit(1);
				}
				let podInfo: { name: string; pod: import("./types.js").Pod } | null = null;
				if (podName) {
					const config = loadConfig();
					const pod = config.pods[podName];
					if (pod) {
						podInfo = { name: podName, pod };
					}
				} else {
					podInfo = getActivePod();
				}
				if (!podInfo) {
					if (podName) {
						console.error(chalk.red(`Pod '${podName}' not found`));
					} else {
						console.error(chalk.red("No active pod. Use 'pi pods active <name>' to set one."));
					}
					process.exit(1);
				}
				console.log(chalk.gray(`Running on pod '${podInfo.name}': ${sshCommand}`));
				// Execute command and stream output; the remote exit code becomes ours.
				const exitCode = await sshExecStream(podInfo.pod.ssh, sshCommand);
				process.exit(exitCode);
				break;
			}
			case "start": {
				// pi start <model> --name <name> [options]
				const modelId = args[1];
				if (!modelId) {
					// Show available models
					const { showKnownModels } = await import("./commands/models.js");
					await showKnownModels();
					process.exit(0);
				}
				// Parse options. Everything after a bare --vllm is passed through
				// to vLLM untouched (inVllmArgs latches on).
				let name: string | undefined;
				let memory: string | undefined;
				let context: string | undefined;
				let gpus: number | undefined;
				const vllmArgs: string[] = [];
				let inVllmArgs = false;
				for (let i = 2; i < args.length; i++) {
					if (inVllmArgs) {
						vllmArgs.push(args[i]);
					} else if (args[i] === "--name" && i + 1 < args.length) {
						name = args[i + 1];
						i++;
					} else if (args[i] === "--memory" && i + 1 < args.length) {
						memory = args[i + 1];
						i++;
					} else if (args[i] === "--context" && i + 1 < args.length) {
						context = args[i + 1];
						i++;
					} else if (args[i] === "--gpus" && i + 1 < args.length) {
						gpus = parseInt(args[i + 1]);
						if (Number.isNaN(gpus) || gpus < 1) {
							console.error(chalk.red("--gpus must be a positive number"));
							process.exit(1);
						}
						i++;
					} else if (args[i] === "--vllm") {
						inVllmArgs = true;
					}
				}
				if (!name) {
					console.error("--name is required");
					process.exit(1);
				}
				// Warn if --vllm is used with other parameters
				if (vllmArgs.length > 0 && (memory || context || gpus)) {
					console.log(
						chalk.yellow("⚠ Warning: --memory, --context, and --gpus are ignored when --vllm is specified"),
					);
					console.log(chalk.yellow(" Using only custom vLLM arguments"));
					console.log("");
				}
				await startModel(modelId, name, {
					pod: podOverride,
					memory,
					context,
					gpus,
					vllmArgs: vllmArgs.length > 0 ? vllmArgs : undefined,
				});
				break;
			}
			case "stop": {
				// pi stop [name] - stop specific model or all models
				const name = args[1];
				if (!name) {
					// Stop all models on the active pod
					const { stopAllModels } = await import("./commands/models.js");
					await stopAllModels({ pod: podOverride });
				} else {
					await stopModel(name, { pod: podOverride });
				}
				break;
			}
			case "list":
				// pi list
				await listModels({ pod: podOverride });
				break;
			case "logs": {
				// pi logs <name>
				const name = args[1];
				if (!name) {
					console.error("Usage: pi logs <name>");
					process.exit(1);
				}
				await viewLogs(name, { pod: podOverride });
				break;
			}
			case "agent": {
				// pi agent <name> [messages...] [options]
				const name = args[1];
				if (!name) {
					console.error("Usage: pi agent <name> [messages...] [options]");
					process.exit(1);
				}
				const apiKey = process.env.PI_API_KEY;
				// Pass all args after the model name
				const agentArgs = args.slice(2);
				// If no messages provided, it's interactive mode
				await promptModel(name, agentArgs, {
					pod: podOverride,
					apiKey,
				}).catch(() => {
					// Error already handled in promptModel, just exit cleanly
					process.exit(0);
				});
				break;
			}
			default:
				console.error(`Unknown command: ${command}`);
				printHelp();
				process.exit(1);
		}
	}
} catch (error) {
	console.error("Error:", error);
	process.exit(1);
}

View file

@ -0,0 +1,703 @@
import chalk from "chalk";
import { spawn } from "child_process";
import { readFileSync } from "fs";
import { dirname, join } from "path";
import { fileURLToPath } from "url";
import { getActivePod, loadConfig, saveConfig } from "../config.js";
import { getModelConfig, getModelName, isKnownModel } from "../model-configs.js";
import { sshExec } from "../ssh.js";
import type { Pod } from "../types.js";
/**
* Get the pod to use (active or override)
*/
/**
 * Resolve which pod a command should target: an explicit override by name,
 * or otherwise the currently active pod. Terminates the process with an
 * error message when neither can be resolved, so callers always receive a
 * valid pod.
 */
const getPod = (podOverride?: string): { name: string; pod: Pod } => {
	if (!podOverride) {
		const active = getActivePod();
		if (active) {
			return active;
		}
		console.error(chalk.red("No active pod. Use 'pi pods active <name>' to set one."));
		process.exit(1);
	}
	const entry = loadConfig().pods[podOverride];
	if (!entry) {
		console.error(chalk.red(`Pod '${podOverride}' not found`));
		process.exit(1);
	}
	return { name: podOverride, pod: entry };
};
/**
* Find next available port starting from 8001
*/
/**
 * Find the lowest free port at or above 8001, skipping any port already
 * claimed by a model registered on the pod.
 */
const getNextPort = (pod: Pod): number => {
	const taken = new Set(Object.values(pod.models).map((m) => m.port));
	let candidate = 8001;
	while (taken.has(candidate)) {
		candidate += 1;
	}
	return candidate;
};
/**
* Select GPUs for model deployment (round-robin)
*/
/**
 * Choose `count` GPU ids for a new model deployment, preferring the GPUs
 * with the fewest models already assigned (simple load balancing).
 * When the full GPU complement is requested, every GPU id is returned
 * without counting usage.
 */
const selectGPUs = (pod: Pod, count: number = 1): number[] => {
	// Requesting every GPU: no balancing needed.
	if (count === pod.gpus.length) {
		return pod.gpus.map((g) => g.id);
	}
	// Tally how many running models currently occupy each GPU.
	const usage = new Map<number, number>(pod.gpus.map((g) => [g.id, 0]));
	for (const model of Object.values(pod.models)) {
		for (const id of model.gpu) {
			usage.set(id, (usage.get(id) || 0) + 1);
		}
	}
	// Stable sort keeps the pod's GPU order on ties; take the least used.
	return [...usage.entries()]
		.sort((a, b) => a[1] - b[1])
		.slice(0, count)
		.map(([id]) => id);
};
/**
* Start a model
*/
/**
 * Start a model on a pod: pick a port and GPUs, render the model_run.sh
 * template, upload it over SSH, launch it detached (survives disconnect),
 * record it in the local config, and then tail the remote log until vLLM
 * reports "Application startup complete" (or the user presses Ctrl+C).
 *
 * Exits the process on any validation failure rather than throwing.
 */
export const startModel = async (
	modelId: string,
	name: string,
	options: {
		pod?: string;
		vllmArgs?: string[];
		memory?: string;
		context?: string;
		gpus?: number;
	},
) => {
	const { name: podName, pod } = getPod(options.pod);
	// Validation
	if (!pod.modelsPath) {
		console.error(chalk.red("Pod does not have a models path configured"));
		process.exit(1);
	}
	if (pod.models[name]) {
		console.error(chalk.red(`Model '${name}' already exists on pod '${podName}'`));
		process.exit(1);
	}
	const port = getNextPort(pod);
	// Determine GPU allocation and vLLM args. Three paths: raw --vllm args,
	// a known model (config lookup), or an unknown model (single-GPU default).
	let gpus: number[] = [];
	let vllmArgs: string[] = [];
	let modelConfig = null;
	if (options.vllmArgs?.length) {
		// Custom args override everything
		vllmArgs = options.vllmArgs;
		console.log(chalk.gray("Using custom vLLM args, GPU allocation managed by vLLM"));
	} else if (isKnownModel(modelId)) {
		// Handle --gpus parameter for known models
		if (options.gpus) {
			// Validate GPU count
			if (options.gpus > pod.gpus.length) {
				console.error(chalk.red(`Error: Requested ${options.gpus} GPUs but pod only has ${pod.gpus.length}`));
				process.exit(1);
			}
			// Try to find config for requested GPU count
			modelConfig = getModelConfig(modelId, pod.gpus, options.gpus);
			if (modelConfig) {
				gpus = selectGPUs(pod, options.gpus);
				vllmArgs = [...(modelConfig.args || [])];
			} else {
				console.error(
					chalk.red(`Model '${getModelName(modelId)}' does not have a configuration for ${options.gpus} GPU(s)`),
				);
				console.error(chalk.yellow("Available configurations:"));
				// Show available configurations
				for (let gpuCount = 1; gpuCount <= pod.gpus.length; gpuCount++) {
					const config = getModelConfig(modelId, pod.gpus, gpuCount);
					if (config) {
						console.error(chalk.gray(` - ${gpuCount} GPU(s)`));
					}
				}
				process.exit(1);
			}
		} else {
			// Find best config for this hardware (original behavior):
			// try the largest GPU count first and work downwards.
			for (let gpuCount = pod.gpus.length; gpuCount >= 1; gpuCount--) {
				modelConfig = getModelConfig(modelId, pod.gpus, gpuCount);
				if (modelConfig) {
					gpus = selectGPUs(pod, gpuCount);
					vllmArgs = [...(modelConfig.args || [])];
					break;
				}
			}
			if (!modelConfig) {
				console.error(chalk.red(`Model '${getModelName(modelId)}' not compatible with this pod's GPUs`));
				process.exit(1);
			}
		}
	} else {
		// Unknown model
		if (options.gpus) {
			console.error(chalk.red("Error: --gpus can only be used with predefined models"));
			console.error(chalk.yellow("For custom models, use --vllm with tensor-parallel-size or similar arguments"));
			process.exit(1);
		}
		// Single GPU default
		gpus = selectGPUs(pod, 1);
		console.log(chalk.gray("Unknown model, defaulting to single GPU"));
	}
	// Apply memory/context overrides (only when not running raw --vllm args).
	// NOTE(review): the filters below drop only tokens *containing* the flag
	// name — this assumes config args use single "--flag=value" tokens; a
	// separate value token would be left orphaned. Verify against model-configs.
	if (!options.vllmArgs?.length) {
		if (options.memory) {
			const fraction = parseFloat(options.memory.replace("%", "")) / 100;
			vllmArgs = vllmArgs.filter((arg) => !arg.includes("gpu-memory-utilization"));
			vllmArgs.push("--gpu-memory-utilization", String(fraction));
		}
		if (options.context) {
			// Shorthand sizes; anything else is parsed as a raw token count.
			const contextSizes: Record<string, number> = {
				"4k": 4096,
				"8k": 8192,
				"16k": 16384,
				"32k": 32768,
				"64k": 65536,
				"128k": 131072,
			};
			const maxTokens = contextSizes[options.context.toLowerCase()] || parseInt(options.context);
			vllmArgs = vllmArgs.filter((arg) => !arg.includes("max-model-len"));
			vllmArgs.push("--max-model-len", String(maxTokens));
		}
	}
	// Show what we're doing
	console.log(chalk.green(`Starting model '${name}' on pod '${podName}'...`));
	console.log(`Model: ${modelId}`);
	console.log(`Port: ${port}`);
	console.log(`GPU(s): ${gpus.length ? gpus.join(", ") : "Managed by vLLM"}`);
	if (modelConfig?.notes) console.log(chalk.yellow(`Note: ${modelConfig.notes}`));
	console.log("");
	// Read and customize model_run.sh script with our values
	const scriptPath = join(dirname(fileURLToPath(import.meta.url)), "../../scripts/model_run.sh");
	let scriptContent = readFileSync(scriptPath, "utf-8");
	// Replace placeholders - no escaping needed, heredoc with 'EOF' is literal
	scriptContent = scriptContent
		.replace("{{MODEL_ID}}", modelId)
		.replace("{{NAME}}", name)
		.replace("{{PORT}}", String(port))
		.replace("{{VLLM_ARGS}}", vllmArgs.join(" "));
	// Upload customized script
	// NOTE(review): `result` is never inspected — a failed upload is not
	// detected until the launch step; consider checking it.
	const result = await sshExec(
		pod.ssh,
		`cat > /tmp/model_run_${name}.sh << 'EOF'
${scriptContent}
EOF
chmod +x /tmp/model_run_${name}.sh`,
	);
	// Prepare environment
	// NOTE(review): values are wrapped in single quotes without escaping —
	// a token containing a quote would break the remote command. Confirm
	// tokens/keys can never contain quotes, or escape them.
	const env = [
		`HF_TOKEN='${process.env.HF_TOKEN}'`,
		`PI_API_KEY='${process.env.PI_API_KEY}'`,
		`HF_HUB_ENABLE_HF_TRANSFER=1`,
		`VLLM_NO_USAGE_STATS=1`,
		`PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True`,
		`FORCE_COLOR=1`,
		`TERM=xterm-256color`,
		...(gpus.length === 1 ? [`CUDA_VISIBLE_DEVICES=${gpus[0]}`] : []),
		...Object.entries(modelConfig?.env || {}).map(([k, v]) => `${k}='${v}'`),
	]
		.map((e) => `export ${e}`)
		.join("\n");
	// Start the model runner with script command for pseudo-TTY (preserves colors)
	// Note: We use script to preserve colors and create a log file
	// setsid creates a new session so it survives SSH disconnection
	const startCmd = `
${env}
mkdir -p ~/.vllm_logs
# Create a wrapper that monitors the script command
cat > /tmp/model_wrapper_${name}.sh << 'WRAPPER'
#!/bin/bash
script -q -f -c "/tmp/model_run_${name}.sh" ~/.vllm_logs/${name}.log
exit_code=$?
echo "Script exited with code $exit_code" >> ~/.vllm_logs/${name}.log
exit $exit_code
WRAPPER
chmod +x /tmp/model_wrapper_${name}.sh
setsid /tmp/model_wrapper_${name}.sh </dev/null >/dev/null 2>&1 &
echo $!
exit 0
`;
	// The remote command echoes the wrapper's PID; a non-numeric echo means launch failed.
	const pidResult = await sshExec(pod.ssh, startCmd);
	const pid = parseInt(pidResult.stdout.trim());
	if (!pid) {
		console.error(chalk.red("Failed to start model runner"));
		process.exit(1);
	}
	// Save to config
	const config = loadConfig();
	config.pods[podName].models[name] = { model: modelId, port, gpu: gpus, pid };
	saveConfig(config);
	console.log(`Model runner started with PID: ${pid}`);
	console.log("Streaming logs... (waiting for startup)\n");
	// Small delay to ensure log file is created
	await new Promise((resolve) => setTimeout(resolve, 500));
	// Stream logs with color support, watching for startup complete
	const sshParts = pod.ssh.split(" ");
	const sshCommand = sshParts[0]; // "ssh"
	const sshArgs = sshParts.slice(1); // ["root@86.38.238.55"]
	const host = sshArgs[0].split("@")[1] || "localhost";
	const tailCmd = `tail -f ~/.vllm_logs/${name}.log`;
	// Build the full args array for spawn
	const fullArgs = [...sshArgs, tailCmd];
	const logProcess = spawn(sshCommand, fullArgs, {
		stdio: ["inherit", "pipe", "pipe"], // capture stdout and stderr
		env: { ...process.env, FORCE_COLOR: "1" },
	});
	let interrupted = false;
	let startupComplete = false;
	// Handle Ctrl+C: stop tailing but leave the remote deployment running.
	const sigintHandler = () => {
		interrupted = true;
		logProcess.kill();
	};
	process.on("SIGINT", sigintHandler);
	// Process log output line by line
	const processOutput = (data: Buffer) => {
		const lines = data.toString().split("\n");
		for (const line of lines) {
			if (line) {
				console.log(line); // Echo the line to console
				// Check for startup complete message
				if (line.includes("Application startup complete")) {
					startupComplete = true;
					logProcess.kill(); // Stop tailing logs
				}
			}
		}
	};
	logProcess.stdout?.on("data", processOutput);
	logProcess.stderr?.on("data", processOutput);
	// Wait for the tail process to end (startup seen, Ctrl+C, or stream closed).
	await new Promise<void>((resolve) => logProcess.on("exit", resolve));
	process.removeListener("SIGINT", sigintHandler);
	if (startupComplete) {
		// Model started successfully - output connection details
		console.log("\n" + chalk.green("✓ Model started successfully!"));
		console.log("\n" + chalk.bold("Connection Details:"));
		console.log(chalk.cyan("─".repeat(50)));
		console.log(chalk.white("Base URL: ") + chalk.yellow(`http://${host}:${port}/v1`));
		console.log(chalk.white("Model: ") + chalk.yellow(modelId));
		console.log(chalk.white("API Key: ") + chalk.yellow(process.env.PI_API_KEY || "(not set)"));
		console.log(chalk.cyan("─".repeat(50)));
		console.log("\n" + chalk.bold("Export for shell:"));
		console.log(chalk.gray(`export OPENAI_BASE_URL="http://${host}:${port}/v1"`));
		console.log(chalk.gray(`export OPENAI_API_KEY="${process.env.PI_API_KEY || "your-api-key"}"`));
		console.log(chalk.gray(`export OPENAI_MODEL="${modelId}"`));
		console.log("\n" + chalk.bold("Example usage:"));
		console.log(
			chalk.gray(`
# Python
from openai import OpenAI
client = OpenAI() # Uses env vars
response = client.chat.completions.create(
model="${modelId}",
messages=[{"role": "user", "content": "Hello!"}]
)
# CLI
curl $OPENAI_BASE_URL/chat/completions \\
-H "Authorization: Bearer $OPENAI_API_KEY" \\
-H "Content-Type: application/json" \\
-d '{"model":"${modelId}","messages":[{"role":"user","content":"Hi"}]}'`),
		);
		console.log("");
		console.log(chalk.cyan(`Chat with model: pi agent ${name} "Your message"`));
		console.log(chalk.cyan(`Interactive mode: pi agent ${name} -i`));
		console.log(chalk.cyan(`Monitor logs: pi logs ${name}`));
		console.log(chalk.cyan(`Stop model: pi stop ${name}`));
	} else if (interrupted) {
		console.log(chalk.yellow("\n\nStopped monitoring. Model deployment continues in background."));
		console.log(chalk.cyan(`Chat with model: pi agent ${name} "Your message"`));
		console.log(chalk.cyan(`Check status: pi logs ${name}`));
		console.log(chalk.cyan(`Stop model: pi stop ${name}`));
	} else {
		console.log(chalk.yellow("\n\nLog stream ended. Model may still be running."));
		console.log(chalk.cyan(`Chat with model: pi agent ${name} "Your message"`));
		console.log(chalk.cyan(`Check status: pi logs ${name}`));
		console.log(chalk.cyan(`Stop model: pi stop ${name}`));
	}
};
/**
* Stop a model
*/
/**
 * Stop a single named model: terminate its wrapper process tree on the pod
 * over SSH and remove it from the local config. Exits the process when the
 * model is not registered on the resolved pod.
 */
export const stopModel = async (name: string, options: { pod?: string }) => {
	const { name: podName, pod } = getPod(options.pod);
	const entry = pod.models[name];
	if (!entry) {
		console.error(chalk.red(`Model '${name}' not found on pod '${podName}'`));
		process.exit(1);
	}
	console.log(chalk.yellow(`Stopping model '${name}' on pod '${podName}'...`));
	// Terminate children first, then the wrapper itself; "|| true" keeps the
	// remote command from failing on PIDs that already exited.
	await sshExec(
		pod.ssh,
		`
# Kill the script process and all its children
pkill -TERM -P ${entry.pid} 2>/dev/null || true
kill ${entry.pid} 2>/dev/null || true
`,
	);
	// Forget the model locally.
	const config = loadConfig();
	delete config.pods[podName].models[name];
	saveConfig(config);
	console.log(chalk.green(`✓ Model '${name}' stopped`));
};
/**
* Stop all models on a pod
*/
/**
 * Stop every model on the pod in a single SSH round-trip, then clear the
 * pod's model registry in the local config. No-op when nothing is running.
 */
export const stopAllModels = async (options: { pod?: string }) => {
	const { name: podName, pod } = getPod(options.pod);
	const names = Object.keys(pod.models);
	if (!names.length) {
		console.log(`No models running on pod '${podName}'`);
		return;
	}
	console.log(chalk.yellow(`Stopping ${names.length} model(s) on pod '${podName}'...`));
	// One remote loop terminates every wrapper process and its children.
	const pids = Object.values(pod.models).map((m) => m.pid);
	await sshExec(
		pod.ssh,
		`
for PID in ${pids.join(" ")}; do
pkill -TERM -P $PID 2>/dev/null || true
kill $PID 2>/dev/null || true
done
`,
	);
	// Wipe the registry locally.
	const config = loadConfig();
	config.pods[podName].models = {};
	saveConfig(config);
	console.log(chalk.green(`✓ Stopped all models: ${names.join(", ")}`));
};
/**
* List all models
*/
/**
 * List all models registered on the pod, with their port, GPU assignment,
 * PID, and endpoint URL, then verify each one over SSH: is the wrapper PID
 * alive, is the vLLM /health endpoint responding, or does the log tail show
 * a crash? Prints cleanup guidance when anything is dead.
 */
export const listModels = async (options: { pod?: string }) => {
	const { name: podName, pod } = getPod(options.pod);
	const modelNames = Object.keys(pod.models);
	if (modelNames.length === 0) {
		console.log(`No models running on pod '${podName}'`);
		return;
	}
	// Get pod SSH host for URL display
	const sshParts = pod.ssh.split(" ");
	const host = sshParts.find((p) => p.includes("@"))?.split("@")[1] || "unknown";
	console.log(`Models on pod '${chalk.bold(podName)}':`);
	for (const name of modelNames) {
		const model = pod.models[name];
		const gpuStr =
			model.gpu.length > 1
				? `GPUs ${model.gpu.join(",")}`
				: model.gpu.length === 1
					? `GPU ${model.gpu[0]}`
					: "GPU unknown";
		console.log(` ${chalk.green(name)} - Port ${model.port} - ${gpuStr} - PID ${model.pid}`);
		console.log(` Model: ${chalk.gray(model.model)}`);
		console.log(` URL: ${chalk.cyan(`http://${host}:${model.port}/v1`)}`);
	}
	// Optionally verify processes are still running
	// (one SSH round-trip per model, so this is sequential and can be slow
	// with many models).
	console.log("");
	console.log("Verifying processes...");
	let anyDead = false;
	for (const name of modelNames) {
		const model = pod.models[name];
		// Check both the wrapper process and if vLLM is responding
		const checkCmd = `
# Check if wrapper process exists
if ps -p ${model.pid} > /dev/null 2>&1; then
# Process exists, now check if vLLM is responding
if curl -s -f http://localhost:${model.port}/health > /dev/null 2>&1; then
echo "running"
else
# Check if it's still starting up
if tail -n 20 ~/.vllm_logs/${name}.log 2>/dev/null | grep -q "ERROR\\|Failed\\|Cuda error\\|died"; then
echo "crashed"
else
echo "starting"
fi
fi
else
echo "dead"
fi
`;
		const result = await sshExec(pod.ssh, checkCmd);
		const status = result.stdout.trim();
		if (status === "dead") {
			console.log(chalk.red(` ${name}: Process ${model.pid} is not running`));
			anyDead = true;
		} else if (status === "crashed") {
			console.log(chalk.red(` ${name}: vLLM crashed (check logs with 'pi logs ${name}')`));
			anyDead = true;
		} else if (status === "starting") {
			console.log(chalk.yellow(` ${name}: Still starting up...`));
		}
	}
	if (anyDead) {
		console.log("");
		console.log(chalk.yellow("Some models are not running. Clean up with:"));
		console.log(chalk.cyan(" pi stop <name>"));
	} else {
		console.log(chalk.green("✓ All processes verified"));
	}
};
/**
* View model logs
*/
/**
 * Stream a model's log file over SSH (`tail -f`), inheriting stdio so ANSI
 * colors pass through and Ctrl+C ends the stream naturally. Exits the
 * process when the model is not registered on the resolved pod.
 */
export const viewLogs = async (name: string, options: { pod?: string }) => {
	const { name: podName, pod } = getPod(options.pod);
	if (!pod.models[name]) {
		console.error(chalk.red(`Model '${name}' not found on pod '${podName}'`));
		process.exit(1);
	}
	console.log(chalk.green(`Streaming logs for '${name}' on pod '${podName}'...`));
	console.log(chalk.gray("Press Ctrl+C to stop"));
	console.log("");
	// Re-use the pod's stored ssh command line, appending the remote tail.
	const [sshBin, ...sshRest] = pod.ssh.split(" ");
	const child = spawn(sshBin, [...sshRest, `tail -f ~/.vllm_logs/${name}.log`], {
		stdio: "inherit",
		env: {
			...process.env,
			FORCE_COLOR: "1",
		},
	});
	// Block until the SSH session ends (tail killed or connection closed).
	await new Promise<void>((resolve) => {
		child.on("exit", () => resolve());
	});
};
/**
* Show known models and their hardware requirements
*/
/**
 * Print the catalog of predefined models from models.json, grouped by model
 * family. When an active pod exists, models are split into compatible (a
 * config fits the pod's GPU count/type) and incompatible, with the minimum
 * hardware each model needs; with no active pod, everything is listed with
 * its minimum hardware.
 */
export const showKnownModels = async () => {
	// NOTE(review): `assert { type: "json" }` import assertions are the older
	// syntax; newer Node versions use `with { type: "json" }` — confirm the
	// supported Node range before changing.
	const modelsJson = await import("../models.json", { assert: { type: "json" } });
	const models = modelsJson.default.models;
	// Get active pod info if available
	const activePod = getActivePod();
	let podGpuCount = 0;
	let podGpuType = "";
	if (activePod) {
		podGpuCount = activePod.pod.gpus.length;
		// Extract GPU type from name (e.g., "NVIDIA H200" -> "H200")
		podGpuType = activePod.pod.gpus[0]?.name?.replace("NVIDIA", "")?.trim()?.split(" ")[0] || "";
		console.log(chalk.bold(`Known Models for ${activePod.name} (${podGpuCount}x ${podGpuType || "GPU"}):\n`));
	} else {
		console.log(chalk.bold("Known Models:\n"));
		console.log(chalk.yellow("No active pod. Use 'pi pods active <name>' to filter compatible models.\n"));
	}
	console.log("Usage: pi start <model> --name <name> [options]\n");
	// Group models by compatibility and family
	const compatible: Record<string, Array<{ id: string; name: string; config: string; notes?: string }>> = {};
	const incompatible: Record<string, Array<{ id: string; name: string; minGpu: string; notes?: string }>> = {};
	for (const [modelId, info] of Object.entries(models)) {
		const modelInfo = info as any;
		// Family = text before the first dash in the display name.
		const family = modelInfo.name.split("-")[0] || "Other";
		let isCompatible = false;
		let compatibleConfig = "";
		let minGpu = "Unknown";
		let minNotes: string | undefined;
		if (modelInfo.configs && modelInfo.configs.length > 0) {
			// Sort configs by GPU count to find minimum
			const sortedConfigs = [...modelInfo.configs].sort((a: any, b: any) => (a.gpuCount || 1) - (b.gpuCount || 1));
			// Find minimum requirements
			const minConfig = sortedConfigs[0];
			const minGpuCount = minConfig.gpuCount || 1;
			const gpuTypes = minConfig.gpuTypes?.join("/") || "H100/H200";
			if (minGpuCount === 1) {
				minGpu = `1x ${gpuTypes}`;
			} else {
				minGpu = `${minGpuCount}x ${gpuTypes}`;
			}
			minNotes = minConfig.notes || modelInfo.notes;
			// Check compatibility with active pod
			if (activePod && podGpuCount > 0) {
				// Find best matching config for this pod (smallest GPU count
				// that fits, since sortedConfigs is ascending).
				for (const config of sortedConfigs) {
					const configGpuCount = config.gpuCount || 1;
					const configGpuTypes = config.gpuTypes || [];
					// Check if we have enough GPUs
					if (configGpuCount <= podGpuCount) {
						// Check if GPU type matches (if specified); substring
						// match in either direction tolerates partial names.
						if (
							configGpuTypes.length === 0 ||
							configGpuTypes.some((type: string) => podGpuType.includes(type) || type.includes(podGpuType))
						) {
							isCompatible = true;
							if (configGpuCount === 1) {
								compatibleConfig = `1x ${podGpuType}`;
							} else {
								compatibleConfig = `${configGpuCount}x ${podGpuType}`;
							}
							minNotes = config.notes || modelInfo.notes;
							break;
						}
					}
				}
			}
		}
		const modelEntry = {
			id: modelId,
			name: modelInfo.name,
			notes: minNotes,
		};
		if (activePod && isCompatible) {
			if (!compatible[family]) {
				compatible[family] = [];
			}
			compatible[family].push({ ...modelEntry, config: compatibleConfig });
		} else {
			if (!incompatible[family]) {
				incompatible[family] = [];
			}
			incompatible[family].push({ ...modelEntry, minGpu });
		}
	}
	// Display compatible models first
	if (activePod && Object.keys(compatible).length > 0) {
		console.log(chalk.green.bold("✓ Compatible Models:\n"));
		const sortedFamilies = Object.keys(compatible).sort();
		for (const family of sortedFamilies) {
			console.log(chalk.cyan(`${family} Models:`));
			const modelList = compatible[family].sort((a, b) => a.name.localeCompare(b.name));
			for (const model of modelList) {
				console.log(` ${chalk.green(model.id)}`);
				console.log(` Name: ${model.name}`);
				console.log(` Config: ${model.config}`);
				if (model.notes) {
					console.log(chalk.gray(` Note: ${model.notes}`));
				}
				console.log("");
			}
		}
	}
	// Display incompatible models
	if (Object.keys(incompatible).length > 0) {
		if (activePod && Object.keys(compatible).length > 0) {
			console.log(chalk.red.bold("✗ Incompatible Models (need more/different GPUs):\n"));
		}
		const sortedFamilies = Object.keys(incompatible).sort();
		for (const family of sortedFamilies) {
			if (!activePod) {
				console.log(chalk.cyan(`${family} Models:`));
			} else {
				console.log(chalk.gray(`${family} Models:`));
			}
			const modelList = incompatible[family].sort((a, b) => a.name.localeCompare(b.name));
			for (const model of modelList) {
				const color = activePod ? chalk.gray : chalk.green;
				console.log(` ${color(model.id)}`);
				console.log(chalk.gray(` Name: ${model.name}`));
				console.log(chalk.gray(` Min Hardware: ${model.minGpu}`));
				if (model.notes && !activePod) {
					console.log(chalk.gray(` Note: ${model.notes}`));
				}
				if (activePod) {
					console.log(""); // Less verbose for incompatible models when filtered
				} else {
					console.log("");
				}
			}
		}
	}
	console.log(chalk.gray("\nFor unknown models, defaults to single GPU deployment."));
	console.log(chalk.gray("Use --vllm to pass custom arguments to vLLM."));
};

View file

@ -0,0 +1,205 @@
import chalk from "chalk";
import { dirname, join } from "path";
import { fileURLToPath } from "url";
import { addPod, loadConfig, removePod, setActivePod } from "../config.js";
import { scpFile, sshExec, sshExecStream } from "../ssh.js";
import type { GPU, Pod } from "../types.js";
// ESM replacements for the CommonJS __filename/__dirname globals.
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);
/**
* List all pods
*/
/**
 * Print every configured pod, marking the active one with '*', together with
 * a GPU summary, the installed vLLM version (when recorded), the SSH target,
 * and the models path. Warns on gpt-oss builds, which only serve GPT-OSS
 * models.
 */
export const listPods = () => {
	const config = loadConfig();
	const names = Object.keys(config.pods);
	if (!names.length) {
		console.log("No pods configured. Use 'pi pods setup' to add a pod.");
		return;
	}
	console.log("Configured pods:");
	for (const name of names) {
		const pod = config.pods[name];
		const marker = config.active === name ? chalk.green("*") : " ";
		const gpuCount = pod.gpus?.length || 0;
		const gpuInfo = gpuCount > 0 ? `${gpuCount}x ${pod.gpus[0].name}` : "no GPUs detected";
		const vllmInfo = pod.vllmVersion ? ` (vLLM: ${pod.vllmVersion})` : "";
		console.log(`${marker} ${chalk.bold(name)} - ${gpuInfo}${vllmInfo} - ${pod.ssh}`);
		if (pod.modelsPath) {
			console.log(` Models: ${pod.modelsPath}`);
		}
		// Special-purpose build: flag it so users don't run ordinary models on it.
		if (pod.vllmVersion === "gpt-oss") {
			console.log(chalk.yellow(` ⚠️ GPT-OSS build - only for GPT-OSS models`));
		}
	}
};
/**
 * Setup a new pod
 *
 * End-to-end provisioning flow for a fresh GPU pod:
 *   1. validate required env vars (HF_TOKEN, PI_API_KEY),
 *   2. determine the models path (explicit option or last token of --mount),
 *   3. verify SSH connectivity,
 *   4. copy scripts/pod_setup.sh to /tmp on the remote host and run it,
 *   5. detect GPUs via nvidia-smi,
 *   6. persist the pod in local config (the first pod added becomes active).
 *
 * Exits the process with code 1 on any failure.
 *
 * @param name local name to register the pod under
 * @param sshCmd full SSH invocation, e.g. "ssh -p 22 root@1.2.3.4"
 * @param options mount command, models path override, and vLLM variant
 */
export const setupPod = async (
	name: string,
	sshCmd: string,
	options: { mount?: string; modelsPath?: string; vllm?: "release" | "nightly" | "gpt-oss" },
) => {
	// Validate environment variables
	const hfToken = process.env.HF_TOKEN;
	const vllmApiKey = process.env.PI_API_KEY;
	if (!hfToken) {
		console.error(chalk.red("ERROR: HF_TOKEN environment variable is required"));
		console.error("Get a token from: https://huggingface.co/settings/tokens");
		console.error("Then run: export HF_TOKEN=your_token_here");
		process.exit(1);
	}
	if (!vllmApiKey) {
		console.error(chalk.red("ERROR: PI_API_KEY environment variable is required"));
		console.error("Set an API key: export PI_API_KEY=your_api_key_here");
		process.exit(1);
	}
	// Determine models path
	let modelsPath = options.modelsPath;
	if (!modelsPath && options.mount) {
		// Extract path from mount command if not explicitly provided
		// e.g., "mount -t nfs ... /mnt/sfs" -> "/mnt/sfs"
		const parts = options.mount.split(" ");
		modelsPath = parts[parts.length - 1];
	}
	if (!modelsPath) {
		console.error(chalk.red("ERROR: --models-path is required (or must be extractable from --mount)"));
		process.exit(1);
	}
	console.log(chalk.green(`Setting up pod '${name}'...`));
	console.log(`SSH: ${sshCmd}`);
	console.log(`Models path: ${modelsPath}`);
	console.log(
		`vLLM version: ${options.vllm || "release"} ${options.vllm === "gpt-oss" ? chalk.yellow("(GPT-OSS special build)") : ""}`,
	);
	if (options.mount) {
		console.log(`Mount command: ${options.mount}`);
	}
	console.log("");
	// Test SSH connection before doing any remote work
	console.log("Testing SSH connection...");
	const testResult = await sshExec(sshCmd, "echo 'SSH OK'");
	if (testResult.exitCode !== 0) {
		console.error(chalk.red("Failed to connect via SSH"));
		console.error(testResult.stderr);
		process.exit(1);
	}
	console.log(chalk.green("✓ SSH connection successful"));
	// Copy setup script (path resolved relative to the compiled package layout)
	console.log("Copying setup script...");
	const scriptPath = join(__dirname, "../../scripts/pod_setup.sh");
	const success = await scpFile(sshCmd, scriptPath, "/tmp/pod_setup.sh");
	if (!success) {
		console.error(chalk.red("Failed to copy setup script"));
		process.exit(1);
	}
	console.log(chalk.green("✓ Setup script copied"));
	// Build setup command
	// NOTE(review): values are wrapped in single quotes but not escaped — a
	// token, mount command, or path containing "'" would break the remote
	// command line. Confirm these values never contain quotes, or add escaping.
	let setupCmd = `bash /tmp/pod_setup.sh --models-path '${modelsPath}' --hf-token '${hfToken}' --vllm-api-key '${vllmApiKey}'`;
	if (options.mount) {
		setupCmd += ` --mount '${options.mount}'`;
	}
	// Add vLLM version flag
	const vllmVersion = options.vllm || "release";
	setupCmd += ` --vllm '${vllmVersion}'`;
	// Run setup script
	console.log("");
	console.log(chalk.yellow("Running setup (this will take 2-5 minutes)..."));
	console.log("");
	// Use forceTTY to preserve colors from apt, pip, etc.
	const exitCode = await sshExecStream(sshCmd, setupCmd, { forceTTY: true });
	if (exitCode !== 0) {
		console.error(chalk.red("\nSetup failed. Check the output above for errors."));
		process.exit(1);
	}
	// Parse GPU info from setup output
	console.log("");
	console.log("Detecting GPU configuration...");
	const gpuResult = await sshExec(sshCmd, "nvidia-smi --query-gpu=index,name,memory.total --format=csv,noheader");
	const gpus: GPU[] = [];
	if (gpuResult.exitCode === 0 && gpuResult.stdout) {
		const lines = gpuResult.stdout.trim().split("\n");
		for (const line of lines) {
			// CSV row: "<index>, <name>, <memory.total>"
			const [id, name, memory] = line.split(",").map((s) => s.trim());
			if (id !== undefined) {
				gpus.push({
					id: parseInt(id),
					name: name || "Unknown",
					memory: memory || "Unknown",
				});
			}
		}
	}
	console.log(chalk.green(`✓ Detected ${gpus.length} GPU(s)`));
	for (const gpu of gpus) {
		console.log(` GPU ${gpu.id}: ${gpu.name} (${gpu.memory})`);
	}
	// Save pod configuration (addPod makes the first registered pod active)
	const pod: Pod = {
		ssh: sshCmd,
		gpus,
		models: {},
		modelsPath,
		vllmVersion: options.vllm || "release",
	};
	addPod(name, pod);
	console.log("");
	console.log(chalk.green(`✓ Pod '${name}' setup complete and set as active pod`));
	console.log("");
	console.log("You can now deploy models with:");
	console.log(chalk.cyan(` pi start <model> --name <name>`));
};
/**
 * Switch active pod
 *
 * Validates the pod name before marking it active; on an unknown name,
 * lists the configured pods and exits with code 1.
 */
export const switchActivePod = (name: string) => {
	const config = loadConfig();
	const known = name in config.pods && Boolean(config.pods[name]);
	if (!known) {
		console.error(chalk.red(`Pod '${name}' not found`));
		console.log("\nAvailable pods:");
		Object.keys(config.pods).forEach((podName) => {
			console.log(` ${podName}`);
		});
		process.exit(1);
	}
	setActivePod(name);
	console.log(chalk.green(`✓ Switched active pod to '${name}'`));
};
/**
 * Remove a pod from config
 *
 * Deletes the local configuration entry only — the remote machine is
 * left untouched. Exits with code 1 when the pod name is unknown.
 */
export const removePodCommand = (name: string) => {
	if (!loadConfig().pods[name]) {
		console.error(chalk.red(`Pod '${name}' not found`));
		process.exit(1);
	}
	removePod(name);
	console.log(chalk.green(`✓ Removed pod '${name}' from configuration`));
	console.log(chalk.yellow("Note: This only removes the local configuration. The remote pod is not affected."));
};

View file

@ -0,0 +1,85 @@
import { main as agentMain } from "@mariozechner/pi-agent";
import chalk from "chalk";
import { getActivePod, loadConfig } from "../config.js";
// ────────────────────────────────────────────────────────────────────────────────
// Types
// ────────────────────────────────────────────────────────────────────────────────
interface PromptOptions {
	// Pod name to target; when omitted the configured active pod is used.
	pod?: string;
	// API key forwarded to the agent; falls back to PI_API_KEY, then "dummy".
	apiKey?: string;
}
// ────────────────────────────────────────────────────────────────────────────────
// Main prompt function
// ────────────────────────────────────────────────────────────────────────────────
/**
 * Launch the pi-agent against a model deployed on a pod.
 *
 * Resolves the pod (explicit opts.pod or the active pod) and the named
 * model deployment, points the agent at the pod's vLLM endpoint, injects
 * a code-navigation system prompt, and forwards all user-provided args.
 *
 * Exits the process with code 1 when the pod/model cannot be resolved or
 * the agent throws.
 *
 * @param modelName deployment name as registered in the pod's models map
 * @param userArgs  extra CLI args passed straight through to the agent
 * @param opts      optional pod override and API key
 */
export async function promptModel(modelName: string, userArgs: string[], opts: PromptOptions = {}) {
	// Get pod and model configuration.
	const activePod = opts.pod ? { name: opts.pod, pod: loadConfig().pods[opts.pod] } : getActivePod();
	// Guard both "no active pod" and an unknown --pod name: previously a
	// non-existent opts.pod produced a truthy wrapper with pod === undefined,
	// which bypassed this check and crashed below with a TypeError.
	if (!activePod || !activePod.pod) {
		console.error(chalk.red("No active pod. Use 'pi pods active <name>' to set one."));
		process.exit(1);
	}
	const { name: podName, pod } = activePod;
	const modelConfig = pod.models[modelName];
	if (!modelConfig) {
		console.error(chalk.red(`Model '${modelName}' not found on pod '${podName}'`));
		process.exit(1);
	}
	// Extract host from SSH string (first "user@host" token).
	const host =
		pod.ssh
			.split(" ")
			.find((p) => p.includes("@"))
			?.split("@")[1] ?? "localhost";
	// Build the system prompt for code navigation
	const systemPrompt = `You help the user understand and navigate the codebase in the current working directory.
You can read files, list directories, and execute shell commands via the respective tools.
Do not output file contents you read via the read_file tool directly, unless asked to.
Do not output markdown tables as part of your responses.
Keep your responses concise and relevant to the user's request.
File paths you output must include line numbers where possible, e.g. "src/index.ts:10-20" for lines 10 to 20 in src/index.ts.
Current working directory: ${process.cwd()}`;
	// Build arguments for agent main function
	const args: string[] = [];
	// Add base configuration that we control
	args.push(
		"--base-url",
		`http://${host}:${modelConfig.port}/v1`,
		"--model",
		modelConfig.model,
		"--api-key",
		opts.apiKey || process.env.PI_API_KEY || "dummy",
		"--api",
		// GPT-OSS builds only support tool calls via the /v1/responses API.
		modelConfig.model.toLowerCase().includes("gpt-oss") ? "responses" : "completions",
		"--system-prompt",
		systemPrompt,
	);
	// Pass through all user-provided arguments
	// This includes messages, --continue, --json, etc.
	args.push(...userArgs);
	// Call agent main function directly
	try {
		await agentMain(args);
	} catch (err: unknown) {
		// Narrow before reading .message — the thrown value may not be an Error.
		const message = err instanceof Error ? err.message : String(err);
		console.error(chalk.red(`Agent error: ${message}`));
		process.exit(1);
	}
}

View file

@ -0,0 +1,80 @@
import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
import { homedir } from "os";
import { join } from "path";
import type { Config, Pod } from "./types.js";
// Resolve the config directory (PI_CONFIG_DIR env override, else ~/.pi),
// creating it on first use so callers can always write into it.
const getConfigDir = (): string => {
	const dir = process.env.PI_CONFIG_DIR || join(homedir(), ".pi");
	if (!existsSync(dir)) {
		mkdirSync(dir, { recursive: true });
	}
	return dir;
};
// Full path of the pods config file inside the config directory.
const getConfigPath = (): string => join(getConfigDir(), "pods.json");
export const loadConfig = (): Config => {
const configPath = getConfigPath();
if (!existsSync(configPath)) {
// Return empty config if file doesn't exist
return { pods: {} };
}
try {
const data = readFileSync(configPath, "utf-8");
return JSON.parse(data);
} catch (e) {
console.error(`Error reading config: ${e}`);
return { pods: {} };
}
};
export const saveConfig = (config: Config): void => {
const configPath = getConfigPath();
try {
writeFileSync(configPath, JSON.stringify(config, null, 2));
} catch (e) {
console.error(`Error saving config: ${e}`);
process.exit(1);
}
};
// Return the active pod with its name, or null when none is set or the
// recorded name no longer exists in the config.
export const getActivePod = (): { name: string; pod: Pod } | null => {
	const { active, pods } = loadConfig();
	if (!active || !pods[active]) {
		return null;
	}
	return { name: active, pod: pods[active] };
};
// Register (or overwrite) a pod and persist the config.
export const addPod = (name: string, pod: Pod): void => {
	const config = loadConfig();
	config.pods[name] = pod;
	// The first pod added becomes the active one automatically.
	if (!config.active) config.active = name;
	saveConfig(config);
};
// Delete a pod entry and persist; clears the active marker if it pointed
// at the removed pod.
export const removePod = (name: string): void => {
	const config = loadConfig();
	delete config.pods[name];
	if (config.active === name) config.active = undefined;
	saveConfig(config);
};
// Mark an existing pod as active; unknown names are fatal (exit code 1).
export const setActivePod = (name: string): void => {
	const config = loadConfig();
	const entry = config.pods[name];
	if (!entry) {
		console.error(`Pod '${name}' not found`);
		process.exit(1);
	}
	config.active = name;
	saveConfig(config);
};

View file

@ -0,0 +1,2 @@
// Main library exports
export * from "./types.js";

View file

@ -0,0 +1,111 @@
import { readFileSync } from "fs";
import { dirname, join } from "path";
import { fileURLToPath } from "url";
import type { GPU } from "./types.js";
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);
// One deployment recipe for a model: vLLM CLI args plus optional env,
// valid for a specific GPU count (and optionally specific GPU types).
interface ModelConfig {
	// Number of GPUs this config targets (matched against the request).
	gpuCount: number;
	// GPU type substrings (e.g. "H100"); empty/absent means any type.
	gpuTypes?: string[];
	// Extra vLLM command-line arguments.
	args: string[];
	// Environment variables to set for the vLLM process.
	env?: Record<string, string>;
	// Config-specific operator notes.
	notes?: string;
}
// Catalogue entry for a model: display name plus all known configs.
interface ModelInfo {
	name: string;
	configs: ModelConfig[];
	// Model-level notes, used when the chosen config has none.
	notes?: string;
}
// Shape of models.json: model id -> catalogue entry.
interface ModelsData {
	models: Record<string, ModelInfo>;
}
// Load models configuration - resolve relative to this file so the JSON
// ships alongside the compiled output. Read once, synchronously, at
// module load; an unreadable/invalid file throws on import.
const modelsJsonPath = join(__dirname, "models.json");
const modelsData: ModelsData = JSON.parse(readFileSync(modelsJsonPath, "utf-8"));
/**
 * Get the best configuration for a model based on available GPUs
 *
 * Two-pass match: prefer a config matching both the requested GPU count
 * and the detected GPU type; fall back to the first config matching the
 * count alone. Returns null for unknown models or when nothing fits.
 */
export const getModelConfig = (
	modelId: string,
	gpus: GPU[],
	requestedGpuCount: number,
): { args: string[]; env?: Record<string, string>; notes?: string } | null => {
	const modelInfo = modelsData.models[modelId];
	if (!modelInfo) {
		// Unknown model, no default config
		return null;
	}
	// Extract GPU type from the first GPU name (e.g., "NVIDIA H200" -> "H200")
	const gpuType = gpus[0]?.name?.replace("NVIDIA", "")?.trim()?.split(" ")[0] || "";
	const matchesType = (config: ModelConfig): boolean =>
		!config.gpuTypes ||
		config.gpuTypes.length === 0 ||
		config.gpuTypes.some((type) => gpuType.includes(type) || type.includes(gpuType));
	// Candidates with the right GPU count, in catalogue order.
	const byCount = modelInfo.configs.filter((config) => config.gpuCount === requestedGpuCount);
	// Exact (count + type) match wins; otherwise relax to count-only.
	const chosen = byCount.find(matchesType) ?? byCount[0];
	if (!chosen) {
		// No suitable config found
		return null;
	}
	// Return copies so callers can mutate freely.
	return {
		args: [...chosen.args],
		env: chosen.env ? { ...chosen.env } : undefined,
		notes: chosen.notes || modelInfo.notes,
	};
};
/**
 * Check if a model is known
 *
 * True when the id exists in the bundled models catalogue.
 */
export const isKnownModel = (modelId: string): boolean => modelId in modelsData.models;
/**
 * Get all known models
 *
 * Returns the catalogue's model ids in definition order.
 */
export const getKnownModels = (): string[] => Object.keys(modelsData.models);
/**
 * Get model display name
 *
 * Falls back to the raw model id for unknown models.
 */
export const getModelName = (modelId: string): string => {
	const info = modelsData.models[modelId];
	return info?.name || modelId;
};

View file

@ -0,0 +1,305 @@
{
"models": {
"Qwen/Qwen2.5-Coder-32B-Instruct": {
"name": "Qwen2.5-Coder-32B",
"configs": [
{
"gpuCount": 1,
"gpuTypes": ["H100", "H200"],
"args": ["--tool-call-parser", "hermes", "--enable-auto-tool-choice"]
},
{
"gpuCount": 2,
"gpuTypes": ["H100", "H200"],
"args": ["--tensor-parallel-size", "2", "--tool-call-parser", "hermes", "--enable-auto-tool-choice"]
}
]
},
"Qwen/Qwen3-Coder-30B-A3B-Instruct": {
"name": "Qwen3-Coder-30B",
"configs": [
{
"gpuCount": 1,
"gpuTypes": ["H100", "H200"],
"args": ["--enable-auto-tool-choice", "--tool-call-parser", "qwen3_coder"],
"notes": "Fits comfortably on single GPU. ~60GB model weight."
},
{
"gpuCount": 2,
"gpuTypes": ["H100", "H200"],
"args": [
"--tensor-parallel-size",
"2",
"--enable-auto-tool-choice",
"--tool-call-parser",
"qwen3_coder"
],
"notes": "For higher throughput/longer context."
}
]
},
"Qwen/Qwen3-Coder-30B-A3B-Instruct-FP8": {
"name": "Qwen3-Coder-30B-FP8",
"configs": [
{
"gpuCount": 1,
"gpuTypes": ["H100", "H200"],
"args": ["--enable-auto-tool-choice", "--tool-call-parser", "qwen3_coder"],
"env": {
"VLLM_USE_DEEP_GEMM": "1"
},
"notes": "FP8 quantized, ~30GB model weight. Excellent for single GPU deployment."
}
]
},
"Qwen/Qwen3-Coder-480B-A35B-Instruct": {
"name": "Qwen3-Coder-480B",
"configs": [
{
"gpuCount": 8,
"gpuTypes": ["H200", "H20"],
"args": [
"--tensor-parallel-size",
"8",
"--max-model-len",
"32000",
"--enable-auto-tool-choice",
"--tool-call-parser",
"qwen3_coder"
],
"notes": "Cannot serve full 262K context on single node. Reduce max-model-len or increase gpu-memory-utilization."
}
]
},
"Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8": {
"name": "Qwen3-Coder-480B-FP8",
"configs": [
{
"gpuCount": 8,
"gpuTypes": ["H200", "H20"],
"args": [
"--max-model-len",
"131072",
"--enable-expert-parallel",
"--data-parallel-size",
"8",
"--enable-auto-tool-choice",
"--tool-call-parser",
"qwen3_coder"
],
"env": {
"VLLM_USE_DEEP_GEMM": "1"
},
"notes": "Use data-parallel mode (not tensor-parallel) to avoid weight quantization errors."
}
]
},
"openai/gpt-oss-20b": {
"name": "GPT-OSS-20B",
"configs": [
{
"gpuCount": 1,
"gpuTypes": ["H100", "H200"],
"args": ["--async-scheduling"]
},
{
"gpuCount": 1,
"gpuTypes": ["B200"],
"args": ["--async-scheduling"],
"env": {
"VLLM_USE_TRTLLM_ATTENTION": "1",
"VLLM_USE_TRTLLM_DECODE_ATTENTION": "1",
"VLLM_USE_TRTLLM_CONTEXT_ATTENTION": "1",
"VLLM_USE_FLASHINFER_MXFP4_MOE": "1"
}
}
],
"notes": "Requires vLLM 0.10.1+gptoss. Tools/function calls only via /v1/responses endpoint."
},
"openai/gpt-oss-120b": {
"name": "GPT-OSS-120B",
"configs": [
{
"gpuCount": 1,
"gpuTypes": ["H100", "H200"],
"args": ["--async-scheduling", "--gpu-memory-utilization", "0.95", "--max-num-batched-tokens", "1024"],
"notes": "Single GPU deployment. Requires vLLM 0.10.1+gptoss. Tools/function calls only via /v1/responses endpoint."
},
{
"gpuCount": 2,
"gpuTypes": ["H100", "H200"],
"args": ["--tensor-parallel-size", "2", "--async-scheduling", "--gpu-memory-utilization", "0.94"],
"notes": "Recommended for H100/H200. Requires vLLM 0.10.1+gptoss. Tools/function calls only via /v1/responses endpoint."
},
{
"gpuCount": 4,
"gpuTypes": ["H100", "H200"],
"args": ["--tensor-parallel-size", "4", "--async-scheduling"],
"notes": "Higher throughput. Requires vLLM 0.10.1+gptoss. Tools/function calls only via /v1/responses endpoint."
},
{
"gpuCount": 8,
"gpuTypes": ["H100", "H200"],
"args": ["--tensor-parallel-size", "8", "--async-scheduling"],
"notes": "Maximum throughput for evaluation workloads. Requires vLLM 0.10.1+gptoss. Tools/function calls only via /v1/responses endpoint."
}
]
},
"zai-org/GLM-4.5": {
"name": "GLM-4.5",
"configs": [
{
"gpuCount": 16,
"gpuTypes": ["H100"],
"args": [
"--tensor-parallel-size",
"16",
"--tool-call-parser",
"glm4_moe",
"--reasoning-parser",
"glm4_moe",
"--enable-auto-tool-choice"
]
},
{
"gpuCount": 8,
"gpuTypes": ["H200"],
"args": [
"--tensor-parallel-size",
"8",
"--tool-call-parser",
"glm4_moe",
"--reasoning-parser",
"glm4_moe",
"--enable-auto-tool-choice"
]
}
],
"notes": "Models default to thinking mode. For full 128K context, double the GPU count."
},
"zai-org/GLM-4.5-FP8": {
"name": "GLM-4.5-FP8",
"configs": [
{
"gpuCount": 8,
"gpuTypes": ["H100"],
"args": [
"--tensor-parallel-size",
"8",
"--tool-call-parser",
"glm4_moe",
"--reasoning-parser",
"glm4_moe",
"--enable-auto-tool-choice"
]
},
{
"gpuCount": 4,
"gpuTypes": ["H200"],
"args": [
"--tensor-parallel-size",
"4",
"--tool-call-parser",
"glm4_moe",
"--reasoning-parser",
"glm4_moe",
"--enable-auto-tool-choice"
]
}
]
},
"zai-org/GLM-4.5-Air-FP8": {
"name": "GLM-4.5-Air-FP8",
"configs": [
{
"gpuCount": 2,
"gpuTypes": ["H100"],
"args": [
"--tensor-parallel-size",
"2",
"--tool-call-parser",
"glm4_moe",
"--reasoning-parser",
"glm4_moe",
"--enable-auto-tool-choice",
"--quantization",
"fp8"
],
"env": {
"VLLM_ATTENTION_BACKEND": "XFORMERS"
},
"notes": "FP8 model requires vLLM with proper FP8 support or MTP module"
},
{
"gpuCount": 1,
"gpuTypes": ["H200"],
"args": [
"--tool-call-parser",
"glm4_moe",
"--reasoning-parser",
"glm4_moe",
"--enable-auto-tool-choice",
"--quantization",
"fp8"
],
"env": {
"VLLM_ATTENTION_BACKEND": "XFORMERS"
},
"notes": "FP8 model requires vLLM with proper FP8 support or MTP module"
}
]
},
"zai-org/GLM-4.5-Air": {
"name": "GLM-4.5-Air",
"configs": [
{
"gpuCount": 2,
"gpuTypes": ["H100", "H200"],
"args": [
"--tensor-parallel-size",
"2",
"--tool-call-parser",
"glm4_moe",
"--reasoning-parser",
"glm4_moe",
"--enable-auto-tool-choice"
],
"notes": "Non-quantized BF16 version, more compatible"
},
{
"gpuCount": 1,
"gpuTypes": ["H200"],
"args": [
"--tool-call-parser",
"glm4_moe",
"--reasoning-parser",
"glm4_moe",
"--enable-auto-tool-choice",
"--gpu-memory-utilization",
"0.95"
],
"notes": "Single H200 can fit the BF16 model with high memory utilization"
}
]
},
"moonshotai/Kimi-K2-Instruct": {
"name": "Kimi-K2",
"configs": [
{
"gpuCount": 16,
"gpuTypes": ["H200", "H20"],
"args": [
"--tensor-parallel-size",
"16",
"--trust-remote-code",
"--enable-auto-tool-choice",
"--tool-call-parser",
"kimi_k2"
],
"notes": "Pure TP mode. For >16 GPUs, combine with pipeline-parallelism."
}
],
"notes": "Requires vLLM v0.10.0rc1+. Minimum 16 GPUs for FP8 with 128k context."
}
}
}

151
packages/pods/src/ssh.ts Normal file
View file

@ -0,0 +1,151 @@
import { type SpawnOptions, spawn } from "child_process";
// Captured outcome of a single remote command run over SSH.
export interface SSHResult {
	// Accumulated standard output of the remote command.
	stdout: string;
	// Accumulated standard error; replaced by the spawn error message
	// when the ssh process itself fails to start.
	stderr: string;
	// Process exit code; 1 when the ssh process failed to spawn.
	exitCode: number;
}
/**
 * Execute an SSH command and return the result
 *
 * @param sshCmd  full SSH invocation, e.g. "ssh root@1.2.3.4" or "ssh -p 22 root@1.2.3.4"
 * @param command remote command, appended as the final argument
 * @param options keepAlive adds ServerAliveInterval/CountMax flags for long runs
 * @returns captured stdout/stderr and exit code; never rejects — spawn
 *          failures resolve with exitCode 1 and the error message in stderr
 */
export const sshExec = async (
	sshCmd: string,
	command: string,
	options?: { keepAlive?: boolean },
): Promise<SSHResult> => {
	// Split the configured SSH command into binary + arguments.
	const [binary, ...rest] = sshCmd.split(" ").filter(Boolean);
	const args = options?.keepAlive
		? // Keepalive every 30 seconds, tolerate 120 missed replies (~60 minutes).
			["-o", "ServerAliveInterval=30", "-o", "ServerAliveCountMax=120", ...rest]
		: rest;
	args.push(command);
	return new Promise((resolve) => {
		const proc = spawn(binary, args, {
			stdio: ["ignore", "pipe", "pipe"],
		});
		const outChunks: string[] = [];
		const errChunks: string[] = [];
		proc.stdout.on("data", (chunk) => outChunks.push(chunk.toString()));
		proc.stderr.on("data", (chunk) => errChunks.push(chunk.toString()));
		proc.on("close", (code) => {
			resolve({ stdout: outChunks.join(""), stderr: errChunks.join(""), exitCode: code || 0 });
		});
		proc.on("error", (spawnErr) => {
			// Spawn failure (e.g. binary not found): report via exitCode/stderr.
			resolve({ stdout: outChunks.join(""), stderr: spawnErr.message, exitCode: 1 });
		});
	});
};
/**
 * Execute an SSH command with streaming output to console
 *
 * @param options silent discards all output; forceTTY prepends -t (so remote
 *                tools keep their colored output); keepAlive adds
 *                ServerAliveInterval/CountMax flags
 * @returns exit code; never rejects — spawn failures resolve with 1
 */
export const sshExecStream = async (
	sshCmd: string,
	command: string,
	options?: { silent?: boolean; forceTTY?: boolean; keepAlive?: boolean },
): Promise<number> => {
	const parts = sshCmd.split(" ").filter(Boolean);
	const binary = parts[0];
	let args = parts.slice(1);
	// Request a pseudo-terminal unless the caller already passed -t themselves.
	if (options?.forceTTY && !parts.includes("-t")) {
		args = ["-t", ...args];
	}
	if (options?.keepAlive) {
		// Keepalive every 30 seconds, tolerate 120 missed replies (~60 minutes).
		args = ["-o", "ServerAliveInterval=30", "-o", "ServerAliveCountMax=120", ...args];
	}
	args.push(command);
	const spawnOpts: SpawnOptions = options?.silent
		? { stdio: ["ignore", "ignore", "ignore"] }
		: { stdio: "inherit" };
	return new Promise((resolve) => {
		const proc = spawn(binary, args, spawnOpts);
		proc.on("close", (code) => resolve(code || 0));
		proc.on("error", () => resolve(1));
	});
};
/**
 * Copy a file to remote via SCP
 *
 * Derives the destination host (first non-flag token after "ssh") and the
 * port (value following -p, defaulting to 22) from the SSH command, then
 * runs scp with the uppercase -P port flag.
 *
 * NOTE(review): flags that take a value other than -p (e.g. "-i keyfile")
 * would have their value misparsed as the host — confirm callers only use
 * plain "ssh [-p N] user@host" strings.
 *
 * @returns true when scp exits 0; false on parse failure or scp error
 */
export const scpFile = async (sshCmd: string, localPath: string, remotePath: string): Promise<boolean> => {
	const parts = sshCmd.split(" ").filter(Boolean);
	let host = "";
	let port = "22";
	// Skip the "ssh" binary itself, then scan for -p and the host token.
	for (let i = 1; i < parts.length; ) {
		const token = parts[i];
		if (token === "-p" && i + 1 < parts.length) {
			port = parts[i + 1];
			i += 2;
		} else if (!token.startsWith("-")) {
			host = token;
			break;
		} else {
			i++;
		}
	}
	if (!host) {
		console.error("Could not parse host from SSH command");
		return false;
	}
	// Build SCP command (scp uses -P, not -p, for the port).
	const scpArgs = ["-P", port, localPath, `${host}:${remotePath}`];
	return new Promise((resolve) => {
		const proc = spawn("scp", scpArgs, { stdio: "inherit" });
		proc.on("close", (code) => resolve(code === 0));
		proc.on("error", () => resolve(false));
	});
};

View file

@ -0,0 +1,27 @@
// Core type definitions for pi

// A single GPU on a pod, as parsed from
// `nvidia-smi --query-gpu=index,name,memory.total` during pod setup.
export interface GPU {
	// GPU index (nvidia-smi "index" column).
	id: number;
	// Device name, e.g. "NVIDIA H200".
	name: string;
	// Total memory as the human-readable string nvidia-smi reports.
	memory: string;
}

// A vLLM model deployment running on a pod.
export interface Model {
	// Model id served by vLLM (used for --model and API detection).
	model: string;
	// Port the vLLM server listens on (agent connects to http://host:port/v1).
	port: number;
	gpu: number[]; // Array of GPU IDs for multi-GPU deployment
	// Remote process id of the vLLM server.
	pid: number;
}

// A configured GPU pod reachable over SSH.
export interface Pod {
	// Full SSH invocation, e.g. "ssh -p 22 root@1.2.3.4".
	ssh: string;
	gpus: GPU[];
	// Running model deployments keyed by deployment name.
	models: Record<string, Model>;
	// Directory on the pod where model weights are stored/mounted.
	modelsPath?: string;
	vllmVersion?: "release" | "nightly" | "gpt-oss"; // Track which vLLM version is installed
}

// On-disk config shape (pods.json): all pods plus the active one.
export interface Config {
	pods: Record<string, Pod>;
	// Name of the active pod; undefined when none is selected.
	active?: string;
}