Initial monorepo setup with npm workspaces and dual TypeScript configuration

- Set up npm workspaces for three packages: pi-tui, pi-agent, and pi (pods)
- Implemented dual TypeScript configuration:
  - Root tsconfig.json with path mappings for development and type checking
  - Package-specific tsconfig.build.json for clean production builds
- Configured lockstep versioning with sync script for inter-package dependencies
- Added comprehensive documentation for development and publishing workflows
- All packages at version 0.5.0 ready for npm publishing
This commit is contained in:
Mario Zechner 2025-08-09 17:18:38 +02:00
commit a74c5da112
63 changed files with 14558 additions and 0 deletions

362
packages/pods/src/cli.ts Normal file
View file

@ -0,0 +1,362 @@
#!/usr/bin/env node
import chalk from "chalk";
import { spawn } from "child_process";
import { readFileSync } from "fs";
import { dirname, join } from "path";
import { fileURLToPath } from "url";
import { listModels, startModel, stopModel, viewLogs } from "./commands/models.js";
import { listPods, removePodCommand, setupPod, switchActivePod } from "./commands/pods.js";
import { promptModel } from "./commands/prompt.js";
import { getActivePod, loadConfig } from "./config.js";
import { sshExecStream } from "./ssh.js";
// ESM replacements for the CommonJS __filename/__dirname globals.
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);
// Read the package manifest (one level above the compiled output) for version info.
const packageJson = JSON.parse(readFileSync(join(__dirname, "../package.json"), "utf-8"));
/**
 * Print CLI usage for every command group (pod management, model management,
 * agent chat) plus the environment variables the tool reads. The version is
 * taken from package.json loaded at module start.
 */
function printHelp() {
	console.log(`pi v${packageJson.version} - Manage vLLM deployments on GPU pods
Pod Management:
pi pods setup <name> "<ssh>" --mount "<mount>" Setup pod with mount command
Options:
--vllm release Install latest vLLM release >=0.10.0 (default)
--vllm nightly Install vLLM nightly build (latest features)
--vllm gpt-oss Install vLLM 0.10.1+gptoss with PyTorch nightly (GPT-OSS only)
pi pods List all pods (* = active)
pi pods active <name> Switch active pod
pi pods remove <name> Remove pod from local config
pi shell [<name>] Open shell on pod (active or specified)
pi ssh [<name>] "<command>" Run SSH command on pod
Model Management:
pi start <model> --name <name> [options] Start a model
--memory <percent> GPU memory allocation (30%, 50%, 90%)
--context <size> Context window (4k, 8k, 16k, 32k, 64k, 128k)
--gpus <count> Number of GPUs to use (predefined models only)
--vllm <args...> Pass remaining args to vLLM (ignores other options)
pi stop [<name>] Stop model (or all if no name)
pi list List running models
pi logs <name> Stream model logs
pi agent <name> ["<message>"...] [options] Chat with model using agent & tools
pi agent <name> [options] Interactive chat mode
--continue, -c Continue previous session
--json Output as JSONL
(All pi-agent options are supported)
All model commands support --pod <name> to override the active pod.
Environment:
HF_TOKEN HuggingFace token for model downloads
PI_API_KEY API key for vLLM endpoints
PI_CONFIG_DIR Config directory (default: ~/.pi)`);
}
// Parse command line arguments
const args = process.argv.slice(2);
// No args or help flag: show usage and exit successfully.
if (args.length === 0 || args[0] === "--help" || args[0] === "-h") {
	printHelp();
	process.exit(0);
}
if (args[0] === "--version" || args[0] === "-v") {
	console.log(packageJson.version);
	process.exit(0);
}
const command = args[0];
const subcommand = args[1];
// Main command handler. Top-level await is used throughout, so this module
// must run as an ES module under a Node version that supports it.
try {
	// Handle "pi pods" commands
	if (command === "pods") {
		if (!subcommand) {
			// pi pods - list all pods
			listPods();
		} else if (subcommand === "setup") {
			// pi pods setup <name> "<ssh>" [--mount "<mount>"] [--models-path <path>] [--vllm release|nightly|gpt-oss]
			const name = args[2];
			const sshCmd = args[3];
			if (!name || !sshCmd) {
				console.error(
					'Usage: pi pods setup <name> "<ssh>" [--mount "<mount>"] [--models-path <path>] [--vllm release|nightly|gpt-oss]',
				);
				process.exit(1);
			}
			// Parse options (hand-rolled flag scan; each value flag consumes the
			// following token by bumping i).
			const options: { mount?: string; modelsPath?: string; vllm?: "release" | "nightly" | "gpt-oss" } = {};
			for (let i = 4; i < args.length; i++) {
				if (args[i] === "--mount" && i + 1 < args.length) {
					options.mount = args[i + 1];
					i++;
				} else if (args[i] === "--models-path" && i + 1 < args.length) {
					options.modelsPath = args[i + 1];
					i++;
				} else if (args[i] === "--vllm" && i + 1 < args.length) {
					const vllmType = args[i + 1];
					if (vllmType === "release" || vllmType === "nightly" || vllmType === "gpt-oss") {
						options.vllm = vllmType;
					} else {
						console.error(chalk.red(`Invalid vLLM type: ${vllmType}`));
						console.error("Valid options: release, nightly, gpt-oss");
						process.exit(1);
					}
					i++;
				}
			}
			// If --mount provided but no --models-path, try to extract path from mount command
			if (options.mount && !options.modelsPath) {
				// Extract last part of mount command as models path
				const parts = options.mount.trim().split(" ");
				const lastPart = parts[parts.length - 1];
				if (lastPart?.startsWith("/")) {
					options.modelsPath = lastPart;
				}
			}
			await setupPod(name, sshCmd, options);
		} else if (subcommand === "active") {
			// pi pods active <name>
			const name = args[2];
			if (!name) {
				console.error("Usage: pi pods active <name>");
				process.exit(1);
			}
			switchActivePod(name);
		} else if (subcommand === "remove") {
			// pi pods remove <name>
			const name = args[2];
			if (!name) {
				console.error("Usage: pi pods remove <name>");
				process.exit(1);
			}
			removePodCommand(name);
		} else {
			console.error(`Unknown pods subcommand: ${subcommand}`);
			process.exit(1);
		}
	} else {
		// Parse --pod override for model commands. The flag and its value are
		// spliced out of args so positional parsing below is unaffected.
		let podOverride: string | undefined;
		const podIndex = args.indexOf("--pod");
		if (podIndex !== -1 && podIndex + 1 < args.length) {
			podOverride = args[podIndex + 1];
			// Remove --pod and its value from args
			args.splice(podIndex, 2);
		}
		// Handle SSH/shell commands and model commands
		switch (command) {
			case "shell": {
				// pi shell [<name>] - open interactive shell
				const podName = args[1];
				let podInfo: { name: string; pod: import("./types.js").Pod } | null = null;
				if (podName) {
					const config = loadConfig();
					const pod = config.pods[podName];
					if (pod) {
						podInfo = { name: podName, pod };
					}
				} else {
					podInfo = getActivePod();
				}
				if (!podInfo) {
					if (podName) {
						console.error(chalk.red(`Pod '${podName}' not found`));
					} else {
						console.error(chalk.red("No active pod. Use 'pi pods active <name>' to set one."));
					}
					process.exit(1);
				}
				console.log(chalk.green(`Connecting to pod '${podInfo.name}'...`));
				// Execute SSH in interactive mode; stdio "inherit" hands the
				// terminal to ssh so the remote shell is fully interactive.
				const sshArgs = podInfo.pod.ssh.split(" ").slice(1); // Remove 'ssh' from command
				const sshProcess = spawn("ssh", sshArgs, {
					stdio: "inherit",
					env: process.env,
				});
				sshProcess.on("exit", (code) => {
					process.exit(code || 0);
				});
				break;
			}
			case "ssh": {
				// pi ssh [<name>] "<command>" - run command via SSH
				let podName: string | undefined;
				let sshCommand: string;
				if (args.length === 2) {
					// pi ssh "<command>" - use active pod
					sshCommand = args[1];
				} else if (args.length === 3) {
					// pi ssh <name> "<command>"
					podName = args[1];
					sshCommand = args[2];
				} else {
					console.error('Usage: pi ssh [<name>] "<command>"');
					process.exit(1);
				}
				let podInfo: { name: string; pod: import("./types.js").Pod } | null = null;
				if (podName) {
					const config = loadConfig();
					const pod = config.pods[podName];
					if (pod) {
						podInfo = { name: podName, pod };
					}
				} else {
					podInfo = getActivePod();
				}
				if (!podInfo) {
					if (podName) {
						console.error(chalk.red(`Pod '${podName}' not found`));
					} else {
						console.error(chalk.red("No active pod. Use 'pi pods active <name>' to set one."));
					}
					process.exit(1);
				}
				console.log(chalk.gray(`Running on pod '${podInfo.name}': ${sshCommand}`));
				// Execute command and stream output; the remote exit code becomes ours.
				const exitCode = await sshExecStream(podInfo.pod.ssh, sshCommand);
				process.exit(exitCode);
				break;
			}
			case "start": {
				// pi start <model> --name <name> [options]
				const modelId = args[1];
				if (!modelId) {
					// Show available models
					const { showKnownModels } = await import("./commands/models.js");
					await showKnownModels();
					process.exit(0);
				}
				// Parse options. Everything after a bare --vllm is passed through
				// to vLLM untouched (inVllmArgs latches on).
				let name: string | undefined;
				let memory: string | undefined;
				let context: string | undefined;
				let gpus: number | undefined;
				const vllmArgs: string[] = [];
				let inVllmArgs = false;
				for (let i = 2; i < args.length; i++) {
					if (inVllmArgs) {
						vllmArgs.push(args[i]);
					} else if (args[i] === "--name" && i + 1 < args.length) {
						name = args[i + 1];
						i++;
					} else if (args[i] === "--memory" && i + 1 < args.length) {
						memory = args[i + 1];
						i++;
					} else if (args[i] === "--context" && i + 1 < args.length) {
						context = args[i + 1];
						i++;
					} else if (args[i] === "--gpus" && i + 1 < args.length) {
						gpus = parseInt(args[i + 1]);
						if (Number.isNaN(gpus) || gpus < 1) {
							console.error(chalk.red("--gpus must be a positive number"));
							process.exit(1);
						}
						i++;
					} else if (args[i] === "--vllm") {
						inVllmArgs = true;
					}
				}
				if (!name) {
					console.error("--name is required");
					process.exit(1);
				}
				// Warn if --vllm is used with other parameters
				if (vllmArgs.length > 0 && (memory || context || gpus)) {
					console.log(
						chalk.yellow("⚠ Warning: --memory, --context, and --gpus are ignored when --vllm is specified"),
					);
					console.log(chalk.yellow(" Using only custom vLLM arguments"));
					console.log("");
				}
				await startModel(modelId, name, {
					pod: podOverride,
					memory,
					context,
					gpus,
					vllmArgs: vllmArgs.length > 0 ? vllmArgs : undefined,
				});
				break;
			}
			case "stop": {
				// pi stop [name] - stop specific model or all models
				const name = args[1];
				if (!name) {
					// Stop all models on the active pod
					const { stopAllModels } = await import("./commands/models.js");
					await stopAllModels({ pod: podOverride });
				} else {
					await stopModel(name, { pod: podOverride });
				}
				break;
			}
			case "list":
				// pi list
				await listModels({ pod: podOverride });
				break;
			case "logs": {
				// pi logs <name>
				const name = args[1];
				if (!name) {
					console.error("Usage: pi logs <name>");
					process.exit(1);
				}
				await viewLogs(name, { pod: podOverride });
				break;
			}
			case "agent": {
				// pi agent <name> [messages...] [options]
				const name = args[1];
				if (!name) {
					console.error("Usage: pi agent <name> [messages...] [options]");
					process.exit(1);
				}
				const apiKey = process.env.PI_API_KEY;
				// Pass all args after the model name
				const agentArgs = args.slice(2);
				// If no messages provided, it's interactive mode
				await promptModel(name, agentArgs, {
					pod: podOverride,
					apiKey,
				}).catch(() => {
					// Error already handled in promptModel, just exit cleanly
					process.exit(0);
				});
				break;
			}
			default:
				console.error(`Unknown command: ${command}`);
				printHelp();
				process.exit(1);
		}
	}
} catch (error) {
	console.error("Error:", error);
	process.exit(1);
}

View file

@ -0,0 +1,703 @@
import chalk from "chalk";
import { spawn } from "child_process";
import { readFileSync } from "fs";
import { dirname, join } from "path";
import { fileURLToPath } from "url";
import { getActivePod, loadConfig, saveConfig } from "../config.js";
import { getModelConfig, getModelName, isKnownModel } from "../model-configs.js";
import { sshExec } from "../ssh.js";
import type { Pod } from "../types.js";
/**
* Get the pod to use (active or override)
*/
/**
 * Resolve which pod a command should target: an explicit override by name,
 * or otherwise the currently active pod. Terminates the process with an
 * error message when neither can be resolved, so callers always receive a
 * valid pod.
 */
const getPod = (podOverride?: string): { name: string; pod: Pod } => {
	if (!podOverride) {
		const active = getActivePod();
		if (active) {
			return active;
		}
		console.error(chalk.red("No active pod. Use 'pi pods active <name>' to set one."));
		process.exit(1);
	}
	const entry = loadConfig().pods[podOverride];
	if (!entry) {
		console.error(chalk.red(`Pod '${podOverride}' not found`));
		process.exit(1);
	}
	return { name: podOverride, pod: entry };
};
/**
* Find next available port starting from 8001
*/
/**
 * Find the lowest free port at or above 8001, skipping any port already
 * claimed by a model registered on the pod.
 */
const getNextPort = (pod: Pod): number => {
	const taken = new Set(Object.values(pod.models).map((m) => m.port));
	let candidate = 8001;
	while (taken.has(candidate)) {
		candidate += 1;
	}
	return candidate;
};
/**
* Select GPUs for model deployment (round-robin)
*/
/**
 * Choose `count` GPU ids for a new model deployment, preferring the GPUs
 * with the fewest models already assigned (simple load balancing).
 * When the full GPU complement is requested, every GPU id is returned
 * without counting usage.
 */
const selectGPUs = (pod: Pod, count: number = 1): number[] => {
	// Requesting every GPU: no balancing needed.
	if (count === pod.gpus.length) {
		return pod.gpus.map((g) => g.id);
	}
	// Tally how many running models currently occupy each GPU.
	const usage = new Map<number, number>(pod.gpus.map((g) => [g.id, 0]));
	for (const model of Object.values(pod.models)) {
		for (const id of model.gpu) {
			usage.set(id, (usage.get(id) || 0) + 1);
		}
	}
	// Stable sort keeps the pod's GPU order on ties; take the least used.
	return [...usage.entries()]
		.sort((a, b) => a[1] - b[1])
		.slice(0, count)
		.map(([id]) => id);
};
/**
* Start a model
*/
/**
 * Start a model on a pod: pick a port and GPUs, render the model_run.sh
 * template, upload it over SSH, launch it detached (survives disconnect),
 * record it in the local config, and then tail the remote log until vLLM
 * reports "Application startup complete" (or the user presses Ctrl+C).
 *
 * Exits the process on any validation failure rather than throwing.
 */
export const startModel = async (
	modelId: string,
	name: string,
	options: {
		pod?: string;
		vllmArgs?: string[];
		memory?: string;
		context?: string;
		gpus?: number;
	},
) => {
	const { name: podName, pod } = getPod(options.pod);
	// Validation
	if (!pod.modelsPath) {
		console.error(chalk.red("Pod does not have a models path configured"));
		process.exit(1);
	}
	if (pod.models[name]) {
		console.error(chalk.red(`Model '${name}' already exists on pod '${podName}'`));
		process.exit(1);
	}
	const port = getNextPort(pod);
	// Determine GPU allocation and vLLM args. Three paths: raw --vllm args,
	// a known model (config lookup), or an unknown model (single-GPU default).
	let gpus: number[] = [];
	let vllmArgs: string[] = [];
	let modelConfig = null;
	if (options.vllmArgs?.length) {
		// Custom args override everything
		vllmArgs = options.vllmArgs;
		console.log(chalk.gray("Using custom vLLM args, GPU allocation managed by vLLM"));
	} else if (isKnownModel(modelId)) {
		// Handle --gpus parameter for known models
		if (options.gpus) {
			// Validate GPU count
			if (options.gpus > pod.gpus.length) {
				console.error(chalk.red(`Error: Requested ${options.gpus} GPUs but pod only has ${pod.gpus.length}`));
				process.exit(1);
			}
			// Try to find config for requested GPU count
			modelConfig = getModelConfig(modelId, pod.gpus, options.gpus);
			if (modelConfig) {
				gpus = selectGPUs(pod, options.gpus);
				vllmArgs = [...(modelConfig.args || [])];
			} else {
				console.error(
					chalk.red(`Model '${getModelName(modelId)}' does not have a configuration for ${options.gpus} GPU(s)`),
				);
				console.error(chalk.yellow("Available configurations:"));
				// Show available configurations
				for (let gpuCount = 1; gpuCount <= pod.gpus.length; gpuCount++) {
					const config = getModelConfig(modelId, pod.gpus, gpuCount);
					if (config) {
						console.error(chalk.gray(` - ${gpuCount} GPU(s)`));
					}
				}
				process.exit(1);
			}
		} else {
			// Find best config for this hardware (original behavior):
			// try the largest GPU count first and work downwards.
			for (let gpuCount = pod.gpus.length; gpuCount >= 1; gpuCount--) {
				modelConfig = getModelConfig(modelId, pod.gpus, gpuCount);
				if (modelConfig) {
					gpus = selectGPUs(pod, gpuCount);
					vllmArgs = [...(modelConfig.args || [])];
					break;
				}
			}
			if (!modelConfig) {
				console.error(chalk.red(`Model '${getModelName(modelId)}' not compatible with this pod's GPUs`));
				process.exit(1);
			}
		}
	} else {
		// Unknown model
		if (options.gpus) {
			console.error(chalk.red("Error: --gpus can only be used with predefined models"));
			console.error(chalk.yellow("For custom models, use --vllm with tensor-parallel-size or similar arguments"));
			process.exit(1);
		}
		// Single GPU default
		gpus = selectGPUs(pod, 1);
		console.log(chalk.gray("Unknown model, defaulting to single GPU"));
	}
	// Apply memory/context overrides (only when not running raw --vllm args).
	// NOTE(review): the filters below drop only tokens *containing* the flag
	// name — this assumes config args use single "--flag=value" tokens; a
	// separate value token would be left orphaned. Verify against model-configs.
	if (!options.vllmArgs?.length) {
		if (options.memory) {
			const fraction = parseFloat(options.memory.replace("%", "")) / 100;
			vllmArgs = vllmArgs.filter((arg) => !arg.includes("gpu-memory-utilization"));
			vllmArgs.push("--gpu-memory-utilization", String(fraction));
		}
		if (options.context) {
			// Shorthand sizes; anything else is parsed as a raw token count.
			const contextSizes: Record<string, number> = {
				"4k": 4096,
				"8k": 8192,
				"16k": 16384,
				"32k": 32768,
				"64k": 65536,
				"128k": 131072,
			};
			const maxTokens = contextSizes[options.context.toLowerCase()] || parseInt(options.context);
			vllmArgs = vllmArgs.filter((arg) => !arg.includes("max-model-len"));
			vllmArgs.push("--max-model-len", String(maxTokens));
		}
	}
	// Show what we're doing
	console.log(chalk.green(`Starting model '${name}' on pod '${podName}'...`));
	console.log(`Model: ${modelId}`);
	console.log(`Port: ${port}`);
	console.log(`GPU(s): ${gpus.length ? gpus.join(", ") : "Managed by vLLM"}`);
	if (modelConfig?.notes) console.log(chalk.yellow(`Note: ${modelConfig.notes}`));
	console.log("");
	// Read and customize model_run.sh script with our values
	const scriptPath = join(dirname(fileURLToPath(import.meta.url)), "../../scripts/model_run.sh");
	let scriptContent = readFileSync(scriptPath, "utf-8");
	// Replace placeholders - no escaping needed, heredoc with 'EOF' is literal
	scriptContent = scriptContent
		.replace("{{MODEL_ID}}", modelId)
		.replace("{{NAME}}", name)
		.replace("{{PORT}}", String(port))
		.replace("{{VLLM_ARGS}}", vllmArgs.join(" "));
	// Upload customized script
	// NOTE(review): `result` is never inspected — a failed upload is not
	// detected until the launch step; consider checking it.
	const result = await sshExec(
		pod.ssh,
		`cat > /tmp/model_run_${name}.sh << 'EOF'
${scriptContent}
EOF
chmod +x /tmp/model_run_${name}.sh`,
	);
	// Prepare environment
	// NOTE(review): values are wrapped in single quotes without escaping —
	// a token containing a quote would break the remote command. Confirm
	// tokens/keys can never contain quotes, or escape them.
	const env = [
		`HF_TOKEN='${process.env.HF_TOKEN}'`,
		`PI_API_KEY='${process.env.PI_API_KEY}'`,
		`HF_HUB_ENABLE_HF_TRANSFER=1`,
		`VLLM_NO_USAGE_STATS=1`,
		`PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True`,
		`FORCE_COLOR=1`,
		`TERM=xterm-256color`,
		...(gpus.length === 1 ? [`CUDA_VISIBLE_DEVICES=${gpus[0]}`] : []),
		...Object.entries(modelConfig?.env || {}).map(([k, v]) => `${k}='${v}'`),
	]
		.map((e) => `export ${e}`)
		.join("\n");
	// Start the model runner with script command for pseudo-TTY (preserves colors)
	// Note: We use script to preserve colors and create a log file
	// setsid creates a new session so it survives SSH disconnection
	const startCmd = `
${env}
mkdir -p ~/.vllm_logs
# Create a wrapper that monitors the script command
cat > /tmp/model_wrapper_${name}.sh << 'WRAPPER'
#!/bin/bash
script -q -f -c "/tmp/model_run_${name}.sh" ~/.vllm_logs/${name}.log
exit_code=$?
echo "Script exited with code $exit_code" >> ~/.vllm_logs/${name}.log
exit $exit_code
WRAPPER
chmod +x /tmp/model_wrapper_${name}.sh
setsid /tmp/model_wrapper_${name}.sh </dev/null >/dev/null 2>&1 &
echo $!
exit 0
`;
	// The remote command echoes the wrapper's PID; a non-numeric echo means launch failed.
	const pidResult = await sshExec(pod.ssh, startCmd);
	const pid = parseInt(pidResult.stdout.trim());
	if (!pid) {
		console.error(chalk.red("Failed to start model runner"));
		process.exit(1);
	}
	// Save to config
	const config = loadConfig();
	config.pods[podName].models[name] = { model: modelId, port, gpu: gpus, pid };
	saveConfig(config);
	console.log(`Model runner started with PID: ${pid}`);
	console.log("Streaming logs... (waiting for startup)\n");
	// Small delay to ensure log file is created
	await new Promise((resolve) => setTimeout(resolve, 500));
	// Stream logs with color support, watching for startup complete
	const sshParts = pod.ssh.split(" ");
	const sshCommand = sshParts[0]; // "ssh"
	const sshArgs = sshParts.slice(1); // ["root@86.38.238.55"]
	const host = sshArgs[0].split("@")[1] || "localhost";
	const tailCmd = `tail -f ~/.vllm_logs/${name}.log`;
	// Build the full args array for spawn
	const fullArgs = [...sshArgs, tailCmd];
	const logProcess = spawn(sshCommand, fullArgs, {
		stdio: ["inherit", "pipe", "pipe"], // capture stdout and stderr
		env: { ...process.env, FORCE_COLOR: "1" },
	});
	let interrupted = false;
	let startupComplete = false;
	// Handle Ctrl+C: stop tailing but leave the remote deployment running.
	const sigintHandler = () => {
		interrupted = true;
		logProcess.kill();
	};
	process.on("SIGINT", sigintHandler);
	// Process log output line by line
	const processOutput = (data: Buffer) => {
		const lines = data.toString().split("\n");
		for (const line of lines) {
			if (line) {
				console.log(line); // Echo the line to console
				// Check for startup complete message
				if (line.includes("Application startup complete")) {
					startupComplete = true;
					logProcess.kill(); // Stop tailing logs
				}
			}
		}
	};
	logProcess.stdout?.on("data", processOutput);
	logProcess.stderr?.on("data", processOutput);
	// Wait for the tail process to end (startup seen, Ctrl+C, or stream closed).
	await new Promise<void>((resolve) => logProcess.on("exit", resolve));
	process.removeListener("SIGINT", sigintHandler);
	if (startupComplete) {
		// Model started successfully - output connection details
		console.log("\n" + chalk.green("✓ Model started successfully!"));
		console.log("\n" + chalk.bold("Connection Details:"));
		console.log(chalk.cyan("─".repeat(50)));
		console.log(chalk.white("Base URL: ") + chalk.yellow(`http://${host}:${port}/v1`));
		console.log(chalk.white("Model: ") + chalk.yellow(modelId));
		console.log(chalk.white("API Key: ") + chalk.yellow(process.env.PI_API_KEY || "(not set)"));
		console.log(chalk.cyan("─".repeat(50)));
		console.log("\n" + chalk.bold("Export for shell:"));
		console.log(chalk.gray(`export OPENAI_BASE_URL="http://${host}:${port}/v1"`));
		console.log(chalk.gray(`export OPENAI_API_KEY="${process.env.PI_API_KEY || "your-api-key"}"`));
		console.log(chalk.gray(`export OPENAI_MODEL="${modelId}"`));
		console.log("\n" + chalk.bold("Example usage:"));
		console.log(
			chalk.gray(`
# Python
from openai import OpenAI
client = OpenAI() # Uses env vars
response = client.chat.completions.create(
model="${modelId}",
messages=[{"role": "user", "content": "Hello!"}]
)
# CLI
curl $OPENAI_BASE_URL/chat/completions \\
-H "Authorization: Bearer $OPENAI_API_KEY" \\
-H "Content-Type: application/json" \\
-d '{"model":"${modelId}","messages":[{"role":"user","content":"Hi"}]}'`),
		);
		console.log("");
		console.log(chalk.cyan(`Chat with model: pi agent ${name} "Your message"`));
		console.log(chalk.cyan(`Interactive mode: pi agent ${name} -i`));
		console.log(chalk.cyan(`Monitor logs: pi logs ${name}`));
		console.log(chalk.cyan(`Stop model: pi stop ${name}`));
	} else if (interrupted) {
		console.log(chalk.yellow("\n\nStopped monitoring. Model deployment continues in background."));
		console.log(chalk.cyan(`Chat with model: pi agent ${name} "Your message"`));
		console.log(chalk.cyan(`Check status: pi logs ${name}`));
		console.log(chalk.cyan(`Stop model: pi stop ${name}`));
	} else {
		console.log(chalk.yellow("\n\nLog stream ended. Model may still be running."));
		console.log(chalk.cyan(`Chat with model: pi agent ${name} "Your message"`));
		console.log(chalk.cyan(`Check status: pi logs ${name}`));
		console.log(chalk.cyan(`Stop model: pi stop ${name}`));
	}
};
/**
* Stop a model
*/
/**
 * Stop a single named model: terminate its wrapper process tree on the pod
 * over SSH and remove it from the local config. Exits the process when the
 * model is not registered on the resolved pod.
 */
export const stopModel = async (name: string, options: { pod?: string }) => {
	const { name: podName, pod } = getPod(options.pod);
	const entry = pod.models[name];
	if (!entry) {
		console.error(chalk.red(`Model '${name}' not found on pod '${podName}'`));
		process.exit(1);
	}
	console.log(chalk.yellow(`Stopping model '${name}' on pod '${podName}'...`));
	// Terminate children first, then the wrapper itself; "|| true" keeps the
	// remote command from failing on PIDs that already exited.
	await sshExec(
		pod.ssh,
		`
# Kill the script process and all its children
pkill -TERM -P ${entry.pid} 2>/dev/null || true
kill ${entry.pid} 2>/dev/null || true
`,
	);
	// Forget the model locally.
	const config = loadConfig();
	delete config.pods[podName].models[name];
	saveConfig(config);
	console.log(chalk.green(`✓ Model '${name}' stopped`));
};
/**
* Stop all models on a pod
*/
/**
 * Stop every model on the pod in a single SSH round-trip, then clear the
 * pod's model registry in the local config. No-op when nothing is running.
 */
export const stopAllModels = async (options: { pod?: string }) => {
	const { name: podName, pod } = getPod(options.pod);
	const names = Object.keys(pod.models);
	if (!names.length) {
		console.log(`No models running on pod '${podName}'`);
		return;
	}
	console.log(chalk.yellow(`Stopping ${names.length} model(s) on pod '${podName}'...`));
	// One remote loop terminates every wrapper process and its children.
	const pids = Object.values(pod.models).map((m) => m.pid);
	await sshExec(
		pod.ssh,
		`
for PID in ${pids.join(" ")}; do
pkill -TERM -P $PID 2>/dev/null || true
kill $PID 2>/dev/null || true
done
`,
	);
	// Wipe the registry locally.
	const config = loadConfig();
	config.pods[podName].models = {};
	saveConfig(config);
	console.log(chalk.green(`✓ Stopped all models: ${names.join(", ")}`));
};
/**
* List all models
*/
/**
 * List all models registered on the pod, with their port, GPU assignment,
 * PID, and endpoint URL, then verify each one over SSH: is the wrapper PID
 * alive, is the vLLM /health endpoint responding, or does the log tail show
 * a crash? Prints cleanup guidance when anything is dead.
 */
export const listModels = async (options: { pod?: string }) => {
	const { name: podName, pod } = getPod(options.pod);
	const modelNames = Object.keys(pod.models);
	if (modelNames.length === 0) {
		console.log(`No models running on pod '${podName}'`);
		return;
	}
	// Get pod SSH host for URL display
	const sshParts = pod.ssh.split(" ");
	const host = sshParts.find((p) => p.includes("@"))?.split("@")[1] || "unknown";
	console.log(`Models on pod '${chalk.bold(podName)}':`);
	for (const name of modelNames) {
		const model = pod.models[name];
		const gpuStr =
			model.gpu.length > 1
				? `GPUs ${model.gpu.join(",")}`
				: model.gpu.length === 1
					? `GPU ${model.gpu[0]}`
					: "GPU unknown";
		console.log(` ${chalk.green(name)} - Port ${model.port} - ${gpuStr} - PID ${model.pid}`);
		console.log(` Model: ${chalk.gray(model.model)}`);
		console.log(` URL: ${chalk.cyan(`http://${host}:${model.port}/v1`)}`);
	}
	// Optionally verify processes are still running
	// (one SSH round-trip per model, so this is sequential and can be slow
	// with many models).
	console.log("");
	console.log("Verifying processes...");
	let anyDead = false;
	for (const name of modelNames) {
		const model = pod.models[name];
		// Check both the wrapper process and if vLLM is responding
		const checkCmd = `
# Check if wrapper process exists
if ps -p ${model.pid} > /dev/null 2>&1; then
# Process exists, now check if vLLM is responding
if curl -s -f http://localhost:${model.port}/health > /dev/null 2>&1; then
echo "running"
else
# Check if it's still starting up
if tail -n 20 ~/.vllm_logs/${name}.log 2>/dev/null | grep -q "ERROR\\|Failed\\|Cuda error\\|died"; then
echo "crashed"
else
echo "starting"
fi
fi
else
echo "dead"
fi
`;
		const result = await sshExec(pod.ssh, checkCmd);
		const status = result.stdout.trim();
		if (status === "dead") {
			console.log(chalk.red(` ${name}: Process ${model.pid} is not running`));
			anyDead = true;
		} else if (status === "crashed") {
			console.log(chalk.red(` ${name}: vLLM crashed (check logs with 'pi logs ${name}')`));
			anyDead = true;
		} else if (status === "starting") {
			console.log(chalk.yellow(` ${name}: Still starting up...`));
		}
	}
	if (anyDead) {
		console.log("");
		console.log(chalk.yellow("Some models are not running. Clean up with:"));
		console.log(chalk.cyan(" pi stop <name>"));
	} else {
		console.log(chalk.green("✓ All processes verified"));
	}
};
/**
* View model logs
*/
/**
 * Stream a model's log file over SSH (`tail -f`), inheriting stdio so ANSI
 * colors pass through and Ctrl+C ends the stream naturally. Exits the
 * process when the model is not registered on the resolved pod.
 */
export const viewLogs = async (name: string, options: { pod?: string }) => {
	const { name: podName, pod } = getPod(options.pod);
	if (!pod.models[name]) {
		console.error(chalk.red(`Model '${name}' not found on pod '${podName}'`));
		process.exit(1);
	}
	console.log(chalk.green(`Streaming logs for '${name}' on pod '${podName}'...`));
	console.log(chalk.gray("Press Ctrl+C to stop"));
	console.log("");
	// Re-use the pod's stored ssh command line, appending the remote tail.
	const [sshBin, ...sshRest] = pod.ssh.split(" ");
	const child = spawn(sshBin, [...sshRest, `tail -f ~/.vllm_logs/${name}.log`], {
		stdio: "inherit",
		env: {
			...process.env,
			FORCE_COLOR: "1",
		},
	});
	// Block until the SSH session ends (tail killed or connection closed).
	await new Promise<void>((resolve) => {
		child.on("exit", () => resolve());
	});
};
/**
* Show known models and their hardware requirements
*/
/**
 * Print the catalog of predefined models from models.json, grouped by model
 * family. When an active pod exists, models are split into compatible (a
 * config fits the pod's GPU count/type) and incompatible, with the minimum
 * hardware each model needs; with no active pod, everything is listed with
 * its minimum hardware.
 */
export const showKnownModels = async () => {
	// NOTE(review): `assert { type: "json" }` import assertions are the older
	// syntax; newer Node versions use `with { type: "json" }` — confirm the
	// supported Node range before changing.
	const modelsJson = await import("../models.json", { assert: { type: "json" } });
	const models = modelsJson.default.models;
	// Get active pod info if available
	const activePod = getActivePod();
	let podGpuCount = 0;
	let podGpuType = "";
	if (activePod) {
		podGpuCount = activePod.pod.gpus.length;
		// Extract GPU type from name (e.g., "NVIDIA H200" -> "H200")
		podGpuType = activePod.pod.gpus[0]?.name?.replace("NVIDIA", "")?.trim()?.split(" ")[0] || "";
		console.log(chalk.bold(`Known Models for ${activePod.name} (${podGpuCount}x ${podGpuType || "GPU"}):\n`));
	} else {
		console.log(chalk.bold("Known Models:\n"));
		console.log(chalk.yellow("No active pod. Use 'pi pods active <name>' to filter compatible models.\n"));
	}
	console.log("Usage: pi start <model> --name <name> [options]\n");
	// Group models by compatibility and family
	const compatible: Record<string, Array<{ id: string; name: string; config: string; notes?: string }>> = {};
	const incompatible: Record<string, Array<{ id: string; name: string; minGpu: string; notes?: string }>> = {};
	for (const [modelId, info] of Object.entries(models)) {
		const modelInfo = info as any;
		// Family = text before the first dash in the display name.
		const family = modelInfo.name.split("-")[0] || "Other";
		let isCompatible = false;
		let compatibleConfig = "";
		let minGpu = "Unknown";
		let minNotes: string | undefined;
		if (modelInfo.configs && modelInfo.configs.length > 0) {
			// Sort configs by GPU count to find minimum
			const sortedConfigs = [...modelInfo.configs].sort((a: any, b: any) => (a.gpuCount || 1) - (b.gpuCount || 1));
			// Find minimum requirements
			const minConfig = sortedConfigs[0];
			const minGpuCount = minConfig.gpuCount || 1;
			const gpuTypes = minConfig.gpuTypes?.join("/") || "H100/H200";
			if (minGpuCount === 1) {
				minGpu = `1x ${gpuTypes}`;
			} else {
				minGpu = `${minGpuCount}x ${gpuTypes}`;
			}
			minNotes = minConfig.notes || modelInfo.notes;
			// Check compatibility with active pod
			if (activePod && podGpuCount > 0) {
				// Find best matching config for this pod (smallest GPU count
				// that fits, since sortedConfigs is ascending).
				for (const config of sortedConfigs) {
					const configGpuCount = config.gpuCount || 1;
					const configGpuTypes = config.gpuTypes || [];
					// Check if we have enough GPUs
					if (configGpuCount <= podGpuCount) {
						// Check if GPU type matches (if specified); substring
						// match in either direction tolerates partial names.
						if (
							configGpuTypes.length === 0 ||
							configGpuTypes.some((type: string) => podGpuType.includes(type) || type.includes(podGpuType))
						) {
							isCompatible = true;
							if (configGpuCount === 1) {
								compatibleConfig = `1x ${podGpuType}`;
							} else {
								compatibleConfig = `${configGpuCount}x ${podGpuType}`;
							}
							minNotes = config.notes || modelInfo.notes;
							break;
						}
					}
				}
			}
		}
		const modelEntry = {
			id: modelId,
			name: modelInfo.name,
			notes: minNotes,
		};
		if (activePod && isCompatible) {
			if (!compatible[family]) {
				compatible[family] = [];
			}
			compatible[family].push({ ...modelEntry, config: compatibleConfig });
		} else {
			if (!incompatible[family]) {
				incompatible[family] = [];
			}
			incompatible[family].push({ ...modelEntry, minGpu });
		}
	}
	// Display compatible models first
	if (activePod && Object.keys(compatible).length > 0) {
		console.log(chalk.green.bold("✓ Compatible Models:\n"));
		const sortedFamilies = Object.keys(compatible).sort();
		for (const family of sortedFamilies) {
			console.log(chalk.cyan(`${family} Models:`));
			const modelList = compatible[family].sort((a, b) => a.name.localeCompare(b.name));
			for (const model of modelList) {
				console.log(` ${chalk.green(model.id)}`);
				console.log(` Name: ${model.name}`);
				console.log(` Config: ${model.config}`);
				if (model.notes) {
					console.log(chalk.gray(` Note: ${model.notes}`));
				}
				console.log("");
			}
		}
	}
	// Display incompatible models
	if (Object.keys(incompatible).length > 0) {
		if (activePod && Object.keys(compatible).length > 0) {
			console.log(chalk.red.bold("✗ Incompatible Models (need more/different GPUs):\n"));
		}
		const sortedFamilies = Object.keys(incompatible).sort();
		for (const family of sortedFamilies) {
			if (!activePod) {
				console.log(chalk.cyan(`${family} Models:`));
			} else {
				console.log(chalk.gray(`${family} Models:`));
			}
			const modelList = incompatible[family].sort((a, b) => a.name.localeCompare(b.name));
			for (const model of modelList) {
				const color = activePod ? chalk.gray : chalk.green;
				console.log(` ${color(model.id)}`);
				console.log(chalk.gray(` Name: ${model.name}`));
				console.log(chalk.gray(` Min Hardware: ${model.minGpu}`));
				if (model.notes && !activePod) {
					console.log(chalk.gray(` Note: ${model.notes}`));
				}
				if (activePod) {
					console.log(""); // Less verbose for incompatible models when filtered
				} else {
					console.log("");
				}
			}
		}
	}
	console.log(chalk.gray("\nFor unknown models, defaults to single GPU deployment."));
	console.log(chalk.gray("Use --vllm to pass custom arguments to vLLM."));
};

View file

@ -0,0 +1,205 @@
import chalk from "chalk";
import { dirname, join } from "path";
import { fileURLToPath } from "url";
import { addPod, loadConfig, removePod, setActivePod } from "../config.js";
import { scpFile, sshExec, sshExecStream } from "../ssh.js";
import type { GPU, Pod } from "../types.js";
// ESM replacements for the CommonJS __filename/__dirname globals.
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);
/**
* List all pods
*/
/**
 * Print every configured pod, marking the active one with '*', together with
 * a GPU summary, the installed vLLM version (when recorded), the SSH target,
 * and the models path. Warns on gpt-oss builds, which only serve GPT-OSS
 * models.
 */
export const listPods = () => {
	const config = loadConfig();
	const names = Object.keys(config.pods);
	if (!names.length) {
		console.log("No pods configured. Use 'pi pods setup' to add a pod.");
		return;
	}
	console.log("Configured pods:");
	for (const name of names) {
		const pod = config.pods[name];
		const marker = config.active === name ? chalk.green("*") : " ";
		const gpuCount = pod.gpus?.length || 0;
		const gpuInfo = gpuCount > 0 ? `${gpuCount}x ${pod.gpus[0].name}` : "no GPUs detected";
		const vllmInfo = pod.vllmVersion ? ` (vLLM: ${pod.vllmVersion})` : "";
		console.log(`${marker} ${chalk.bold(name)} - ${gpuInfo}${vllmInfo} - ${pod.ssh}`);
		if (pod.modelsPath) {
			console.log(` Models: ${pod.modelsPath}`);
		}
		// Special-purpose build: flag it so users don't run ordinary models on it.
		if (pod.vllmVersion === "gpt-oss") {
			console.log(chalk.yellow(` ⚠️ GPT-OSS build - only for GPT-OSS models`));
		}
	}
};
/**
 * Setup a new pod
 *
 * End-to-end provisioning flow for a fresh GPU pod:
 *   1. validate required env vars (HF_TOKEN, PI_API_KEY),
 *   2. determine the models path (explicit option or last token of --mount),
 *   3. verify SSH connectivity,
 *   4. copy scripts/pod_setup.sh to /tmp on the remote host and run it,
 *   5. detect GPUs via nvidia-smi,
 *   6. persist the pod in local config (the first pod added becomes active).
 *
 * Exits the process with code 1 on any failure.
 *
 * @param name local name to register the pod under
 * @param sshCmd full SSH invocation, e.g. "ssh -p 22 root@1.2.3.4"
 * @param options mount command, models path override, and vLLM variant
 */
export const setupPod = async (
	name: string,
	sshCmd: string,
	options: { mount?: string; modelsPath?: string; vllm?: "release" | "nightly" | "gpt-oss" },
) => {
	// Validate environment variables
	const hfToken = process.env.HF_TOKEN;
	const vllmApiKey = process.env.PI_API_KEY;
	if (!hfToken) {
		console.error(chalk.red("ERROR: HF_TOKEN environment variable is required"));
		console.error("Get a token from: https://huggingface.co/settings/tokens");
		console.error("Then run: export HF_TOKEN=your_token_here");
		process.exit(1);
	}
	if (!vllmApiKey) {
		console.error(chalk.red("ERROR: PI_API_KEY environment variable is required"));
		console.error("Set an API key: export PI_API_KEY=your_api_key_here");
		process.exit(1);
	}
	// Determine models path
	let modelsPath = options.modelsPath;
	if (!modelsPath && options.mount) {
		// Extract path from mount command if not explicitly provided
		// e.g., "mount -t nfs ... /mnt/sfs" -> "/mnt/sfs"
		const parts = options.mount.split(" ");
		modelsPath = parts[parts.length - 1];
	}
	if (!modelsPath) {
		console.error(chalk.red("ERROR: --models-path is required (or must be extractable from --mount)"));
		process.exit(1);
	}
	console.log(chalk.green(`Setting up pod '${name}'...`));
	console.log(`SSH: ${sshCmd}`);
	console.log(`Models path: ${modelsPath}`);
	console.log(
		`vLLM version: ${options.vllm || "release"} ${options.vllm === "gpt-oss" ? chalk.yellow("(GPT-OSS special build)") : ""}`,
	);
	if (options.mount) {
		console.log(`Mount command: ${options.mount}`);
	}
	console.log("");
	// Test SSH connection before doing any remote work
	console.log("Testing SSH connection...");
	const testResult = await sshExec(sshCmd, "echo 'SSH OK'");
	if (testResult.exitCode !== 0) {
		console.error(chalk.red("Failed to connect via SSH"));
		console.error(testResult.stderr);
		process.exit(1);
	}
	console.log(chalk.green("✓ SSH connection successful"));
	// Copy setup script (path resolved relative to the compiled package layout)
	console.log("Copying setup script...");
	const scriptPath = join(__dirname, "../../scripts/pod_setup.sh");
	const success = await scpFile(sshCmd, scriptPath, "/tmp/pod_setup.sh");
	if (!success) {
		console.error(chalk.red("Failed to copy setup script"));
		process.exit(1);
	}
	console.log(chalk.green("✓ Setup script copied"));
	// Build setup command
	// NOTE(review): values are wrapped in single quotes but not escaped — a
	// token, mount command, or path containing "'" would break the remote
	// command line. Confirm these values never contain quotes, or add escaping.
	let setupCmd = `bash /tmp/pod_setup.sh --models-path '${modelsPath}' --hf-token '${hfToken}' --vllm-api-key '${vllmApiKey}'`;
	if (options.mount) {
		setupCmd += ` --mount '${options.mount}'`;
	}
	// Add vLLM version flag
	const vllmVersion = options.vllm || "release";
	setupCmd += ` --vllm '${vllmVersion}'`;
	// Run setup script
	console.log("");
	console.log(chalk.yellow("Running setup (this will take 2-5 minutes)..."));
	console.log("");
	// Use forceTTY to preserve colors from apt, pip, etc.
	const exitCode = await sshExecStream(sshCmd, setupCmd, { forceTTY: true });
	if (exitCode !== 0) {
		console.error(chalk.red("\nSetup failed. Check the output above for errors."));
		process.exit(1);
	}
	// Parse GPU info from setup output
	console.log("");
	console.log("Detecting GPU configuration...");
	const gpuResult = await sshExec(sshCmd, "nvidia-smi --query-gpu=index,name,memory.total --format=csv,noheader");
	const gpus: GPU[] = [];
	if (gpuResult.exitCode === 0 && gpuResult.stdout) {
		const lines = gpuResult.stdout.trim().split("\n");
		for (const line of lines) {
			// CSV row: "<index>, <name>, <memory.total>"
			const [id, name, memory] = line.split(",").map((s) => s.trim());
			if (id !== undefined) {
				gpus.push({
					id: parseInt(id),
					name: name || "Unknown",
					memory: memory || "Unknown",
				});
			}
		}
	}
	console.log(chalk.green(`✓ Detected ${gpus.length} GPU(s)`));
	for (const gpu of gpus) {
		console.log(` GPU ${gpu.id}: ${gpu.name} (${gpu.memory})`);
	}
	// Save pod configuration (addPod makes the first registered pod active)
	const pod: Pod = {
		ssh: sshCmd,
		gpus,
		models: {},
		modelsPath,
		vllmVersion: options.vllm || "release",
	};
	addPod(name, pod);
	console.log("");
	console.log(chalk.green(`✓ Pod '${name}' setup complete and set as active pod`));
	console.log("");
	console.log("You can now deploy models with:");
	console.log(chalk.cyan(` pi start <model> --name <name>`));
};
/**
 * Switch active pod
 *
 * Validates the pod name before marking it active; on an unknown name,
 * lists the configured pods and exits with code 1.
 */
export const switchActivePod = (name: string) => {
	const config = loadConfig();
	const known = name in config.pods && Boolean(config.pods[name]);
	if (!known) {
		console.error(chalk.red(`Pod '${name}' not found`));
		console.log("\nAvailable pods:");
		Object.keys(config.pods).forEach((podName) => {
			console.log(` ${podName}`);
		});
		process.exit(1);
	}
	setActivePod(name);
	console.log(chalk.green(`✓ Switched active pod to '${name}'`));
};
/**
 * Remove a pod from config
 *
 * Deletes the local configuration entry only — the remote machine is
 * left untouched. Exits with code 1 when the pod name is unknown.
 */
export const removePodCommand = (name: string) => {
	if (!loadConfig().pods[name]) {
		console.error(chalk.red(`Pod '${name}' not found`));
		process.exit(1);
	}
	removePod(name);
	console.log(chalk.green(`✓ Removed pod '${name}' from configuration`));
	console.log(chalk.yellow("Note: This only removes the local configuration. The remote pod is not affected."));
};

View file

@ -0,0 +1,85 @@
import { main as agentMain } from "@mariozechner/pi-agent";
import chalk from "chalk";
import { getActivePod, loadConfig } from "../config.js";
// ────────────────────────────────────────────────────────────────────────────────
// Types
// ────────────────────────────────────────────────────────────────────────────────
interface PromptOptions {
	// Pod name to target; when omitted the configured active pod is used.
	pod?: string;
	// API key forwarded to the agent; falls back to PI_API_KEY, then "dummy".
	apiKey?: string;
}
// ────────────────────────────────────────────────────────────────────────────────
// Main prompt function
// ────────────────────────────────────────────────────────────────────────────────
/**
 * Launch the pi-agent against a model deployed on a pod.
 *
 * Resolves the pod (explicit opts.pod or the active pod) and the named
 * model deployment, points the agent at the pod's vLLM endpoint, injects
 * a code-navigation system prompt, and forwards all user-provided args.
 *
 * Exits the process with code 1 when the pod/model cannot be resolved or
 * the agent throws.
 *
 * @param modelName deployment name as registered in the pod's models map
 * @param userArgs  extra CLI args passed straight through to the agent
 * @param opts      optional pod override and API key
 */
export async function promptModel(modelName: string, userArgs: string[], opts: PromptOptions = {}) {
	// Get pod and model configuration.
	const activePod = opts.pod ? { name: opts.pod, pod: loadConfig().pods[opts.pod] } : getActivePod();
	// Guard both "no active pod" and an unknown --pod name: previously a
	// non-existent opts.pod produced a truthy wrapper with pod === undefined,
	// which bypassed this check and crashed below with a TypeError.
	if (!activePod || !activePod.pod) {
		console.error(chalk.red("No active pod. Use 'pi pods active <name>' to set one."));
		process.exit(1);
	}
	const { name: podName, pod } = activePod;
	const modelConfig = pod.models[modelName];
	if (!modelConfig) {
		console.error(chalk.red(`Model '${modelName}' not found on pod '${podName}'`));
		process.exit(1);
	}
	// Extract host from SSH string (first "user@host" token).
	const host =
		pod.ssh
			.split(" ")
			.find((p) => p.includes("@"))
			?.split("@")[1] ?? "localhost";
	// Build the system prompt for code navigation
	const systemPrompt = `You help the user understand and navigate the codebase in the current working directory.
You can read files, list directories, and execute shell commands via the respective tools.
Do not output file contents you read via the read_file tool directly, unless asked to.
Do not output markdown tables as part of your responses.
Keep your responses concise and relevant to the user's request.
File paths you output must include line numbers where possible, e.g. "src/index.ts:10-20" for lines 10 to 20 in src/index.ts.
Current working directory: ${process.cwd()}`;
	// Build arguments for agent main function
	const args: string[] = [];
	// Add base configuration that we control
	args.push(
		"--base-url",
		`http://${host}:${modelConfig.port}/v1`,
		"--model",
		modelConfig.model,
		"--api-key",
		opts.apiKey || process.env.PI_API_KEY || "dummy",
		"--api",
		// GPT-OSS builds only support tool calls via the /v1/responses API.
		modelConfig.model.toLowerCase().includes("gpt-oss") ? "responses" : "completions",
		"--system-prompt",
		systemPrompt,
	);
	// Pass through all user-provided arguments
	// This includes messages, --continue, --json, etc.
	args.push(...userArgs);
	// Call agent main function directly
	try {
		await agentMain(args);
	} catch (err: unknown) {
		// Narrow before reading .message — the thrown value may not be an Error.
		const message = err instanceof Error ? err.message : String(err);
		console.error(chalk.red(`Agent error: ${message}`));
		process.exit(1);
	}
}

View file

@ -0,0 +1,80 @@
import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
import { homedir } from "os";
import { join } from "path";
import type { Config, Pod } from "./types.js";
// Resolve the config directory (PI_CONFIG_DIR env override, else ~/.pi),
// creating it on first use so callers can always write into it.
const getConfigDir = (): string => {
	const dir = process.env.PI_CONFIG_DIR || join(homedir(), ".pi");
	if (!existsSync(dir)) {
		mkdirSync(dir, { recursive: true });
	}
	return dir;
};
// Full path of the pods config file inside the config directory.
const getConfigPath = (): string => join(getConfigDir(), "pods.json");
export const loadConfig = (): Config => {
const configPath = getConfigPath();
if (!existsSync(configPath)) {
// Return empty config if file doesn't exist
return { pods: {} };
}
try {
const data = readFileSync(configPath, "utf-8");
return JSON.parse(data);
} catch (e) {
console.error(`Error reading config: ${e}`);
return { pods: {} };
}
};
export const saveConfig = (config: Config): void => {
const configPath = getConfigPath();
try {
writeFileSync(configPath, JSON.stringify(config, null, 2));
} catch (e) {
console.error(`Error saving config: ${e}`);
process.exit(1);
}
};
// Return the active pod with its name, or null when none is set or the
// recorded name no longer exists in the config.
export const getActivePod = (): { name: string; pod: Pod } | null => {
	const { active, pods } = loadConfig();
	if (!active || !pods[active]) {
		return null;
	}
	return { name: active, pod: pods[active] };
};
// Register (or overwrite) a pod and persist the config.
export const addPod = (name: string, pod: Pod): void => {
	const config = loadConfig();
	config.pods[name] = pod;
	// The first pod added becomes the active one automatically.
	if (!config.active) config.active = name;
	saveConfig(config);
};
// Delete a pod entry and persist; clears the active marker if it pointed
// at the removed pod.
export const removePod = (name: string): void => {
	const config = loadConfig();
	delete config.pods[name];
	if (config.active === name) config.active = undefined;
	saveConfig(config);
};
// Mark an existing pod as active; unknown names are fatal (exit code 1).
export const setActivePod = (name: string): void => {
	const config = loadConfig();
	const entry = config.pods[name];
	if (!entry) {
		console.error(`Pod '${name}' not found`);
		process.exit(1);
	}
	config.active = name;
	saveConfig(config);
};

View file

@ -0,0 +1,2 @@
// Main library exports
export * from "./types.js";

View file

@ -0,0 +1,111 @@
import { readFileSync } from "fs";
import { dirname, join } from "path";
import { fileURLToPath } from "url";
import type { GPU } from "./types.js";
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);
// One deployment recipe for a model: vLLM CLI args plus optional env,
// valid for a specific GPU count (and optionally specific GPU types).
interface ModelConfig {
	// Number of GPUs this config targets (matched against the request).
	gpuCount: number;
	// GPU type substrings (e.g. "H100"); empty/absent means any type.
	gpuTypes?: string[];
	// Extra vLLM command-line arguments.
	args: string[];
	// Environment variables to set for the vLLM process.
	env?: Record<string, string>;
	// Config-specific operator notes.
	notes?: string;
}
// Catalogue entry for a model: display name plus all known configs.
interface ModelInfo {
	name: string;
	configs: ModelConfig[];
	// Model-level notes, used when the chosen config has none.
	notes?: string;
}
// Shape of models.json: model id -> catalogue entry.
interface ModelsData {
	models: Record<string, ModelInfo>;
}
// Load models configuration - resolve relative to this file so the JSON
// ships alongside the compiled output. Read once, synchronously, at
// module load; an unreadable/invalid file throws on import.
const modelsJsonPath = join(__dirname, "models.json");
const modelsData: ModelsData = JSON.parse(readFileSync(modelsJsonPath, "utf-8"));
/**
 * Get the best configuration for a model based on available GPUs
 *
 * Two-pass match: prefer a config matching both the requested GPU count
 * and the detected GPU type; fall back to the first config matching the
 * count alone. Returns null for unknown models or when nothing fits.
 */
export const getModelConfig = (
	modelId: string,
	gpus: GPU[],
	requestedGpuCount: number,
): { args: string[]; env?: Record<string, string>; notes?: string } | null => {
	const modelInfo = modelsData.models[modelId];
	if (!modelInfo) {
		// Unknown model, no default config
		return null;
	}
	// Extract GPU type from the first GPU name (e.g., "NVIDIA H200" -> "H200")
	const gpuType = gpus[0]?.name?.replace("NVIDIA", "")?.trim()?.split(" ")[0] || "";
	const matchesType = (config: ModelConfig): boolean =>
		!config.gpuTypes ||
		config.gpuTypes.length === 0 ||
		config.gpuTypes.some((type) => gpuType.includes(type) || type.includes(gpuType));
	// Candidates with the right GPU count, in catalogue order.
	const byCount = modelInfo.configs.filter((config) => config.gpuCount === requestedGpuCount);
	// Exact (count + type) match wins; otherwise relax to count-only.
	const chosen = byCount.find(matchesType) ?? byCount[0];
	if (!chosen) {
		// No suitable config found
		return null;
	}
	// Return copies so callers can mutate freely.
	return {
		args: [...chosen.args],
		env: chosen.env ? { ...chosen.env } : undefined,
		notes: chosen.notes || modelInfo.notes,
	};
};
/**
 * Check if a model is known
 *
 * True when the id exists in the bundled models catalogue.
 */
export const isKnownModel = (modelId: string): boolean => modelId in modelsData.models;
/**
 * Get all known models
 *
 * Returns the catalogue's model ids in definition order.
 */
export const getKnownModels = (): string[] => Object.keys(modelsData.models);
/**
 * Get model display name
 *
 * Falls back to the raw model id for unknown models.
 */
export const getModelName = (modelId: string): string => {
	const info = modelsData.models[modelId];
	return info?.name || modelId;
};

View file

@ -0,0 +1,305 @@
{
"models": {
"Qwen/Qwen2.5-Coder-32B-Instruct": {
"name": "Qwen2.5-Coder-32B",
"configs": [
{
"gpuCount": 1,
"gpuTypes": ["H100", "H200"],
"args": ["--tool-call-parser", "hermes", "--enable-auto-tool-choice"]
},
{
"gpuCount": 2,
"gpuTypes": ["H100", "H200"],
"args": ["--tensor-parallel-size", "2", "--tool-call-parser", "hermes", "--enable-auto-tool-choice"]
}
]
},
"Qwen/Qwen3-Coder-30B-A3B-Instruct": {
"name": "Qwen3-Coder-30B",
"configs": [
{
"gpuCount": 1,
"gpuTypes": ["H100", "H200"],
"args": ["--enable-auto-tool-choice", "--tool-call-parser", "qwen3_coder"],
"notes": "Fits comfortably on single GPU. ~60GB model weight."
},
{
"gpuCount": 2,
"gpuTypes": ["H100", "H200"],
"args": [
"--tensor-parallel-size",
"2",
"--enable-auto-tool-choice",
"--tool-call-parser",
"qwen3_coder"
],
"notes": "For higher throughput/longer context."
}
]
},
"Qwen/Qwen3-Coder-30B-A3B-Instruct-FP8": {
"name": "Qwen3-Coder-30B-FP8",
"configs": [
{
"gpuCount": 1,
"gpuTypes": ["H100", "H200"],
"args": ["--enable-auto-tool-choice", "--tool-call-parser", "qwen3_coder"],
"env": {
"VLLM_USE_DEEP_GEMM": "1"
},
"notes": "FP8 quantized, ~30GB model weight. Excellent for single GPU deployment."
}
]
},
"Qwen/Qwen3-Coder-480B-A35B-Instruct": {
"name": "Qwen3-Coder-480B",
"configs": [
{
"gpuCount": 8,
"gpuTypes": ["H200", "H20"],
"args": [
"--tensor-parallel-size",
"8",
"--max-model-len",
"32000",
"--enable-auto-tool-choice",
"--tool-call-parser",
"qwen3_coder"
],
"notes": "Cannot serve full 262K context on single node. Reduce max-model-len or increase gpu-memory-utilization."
}
]
},
"Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8": {
"name": "Qwen3-Coder-480B-FP8",
"configs": [
{
"gpuCount": 8,
"gpuTypes": ["H200", "H20"],
"args": [
"--max-model-len",
"131072",
"--enable-expert-parallel",
"--data-parallel-size",
"8",
"--enable-auto-tool-choice",
"--tool-call-parser",
"qwen3_coder"
],
"env": {
"VLLM_USE_DEEP_GEMM": "1"
},
"notes": "Use data-parallel mode (not tensor-parallel) to avoid weight quantization errors."
}
]
},
"openai/gpt-oss-20b": {
"name": "GPT-OSS-20B",
"configs": [
{
"gpuCount": 1,
"gpuTypes": ["H100", "H200"],
"args": ["--async-scheduling"]
},
{
"gpuCount": 1,
"gpuTypes": ["B200"],
"args": ["--async-scheduling"],
"env": {
"VLLM_USE_TRTLLM_ATTENTION": "1",
"VLLM_USE_TRTLLM_DECODE_ATTENTION": "1",
"VLLM_USE_TRTLLM_CONTEXT_ATTENTION": "1",
"VLLM_USE_FLASHINFER_MXFP4_MOE": "1"
}
}
],
"notes": "Requires vLLM 0.10.1+gptoss. Tools/function calls only via /v1/responses endpoint."
},
"openai/gpt-oss-120b": {
"name": "GPT-OSS-120B",
"configs": [
{
"gpuCount": 1,
"gpuTypes": ["H100", "H200"],
"args": ["--async-scheduling", "--gpu-memory-utilization", "0.95", "--max-num-batched-tokens", "1024"],
"notes": "Single GPU deployment. Requires vLLM 0.10.1+gptoss. Tools/function calls only via /v1/responses endpoint."
},
{
"gpuCount": 2,
"gpuTypes": ["H100", "H200"],
"args": ["--tensor-parallel-size", "2", "--async-scheduling", "--gpu-memory-utilization", "0.94"],
"notes": "Recommended for H100/H200. Requires vLLM 0.10.1+gptoss. Tools/function calls only via /v1/responses endpoint."
},
{
"gpuCount": 4,
"gpuTypes": ["H100", "H200"],
"args": ["--tensor-parallel-size", "4", "--async-scheduling"],
"notes": "Higher throughput. Requires vLLM 0.10.1+gptoss. Tools/function calls only via /v1/responses endpoint."
},
{
"gpuCount": 8,
"gpuTypes": ["H100", "H200"],
"args": ["--tensor-parallel-size", "8", "--async-scheduling"],
"notes": "Maximum throughput for evaluation workloads. Requires vLLM 0.10.1+gptoss. Tools/function calls only via /v1/responses endpoint."
}
]
},
"zai-org/GLM-4.5": {
"name": "GLM-4.5",
"configs": [
{
"gpuCount": 16,
"gpuTypes": ["H100"],
"args": [
"--tensor-parallel-size",
"16",
"--tool-call-parser",
"glm4_moe",
"--reasoning-parser",
"glm4_moe",
"--enable-auto-tool-choice"
]
},
{
"gpuCount": 8,
"gpuTypes": ["H200"],
"args": [
"--tensor-parallel-size",
"8",
"--tool-call-parser",
"glm4_moe",
"--reasoning-parser",
"glm4_moe",
"--enable-auto-tool-choice"
]
}
],
"notes": "Models default to thinking mode. For full 128K context, double the GPU count."
},
"zai-org/GLM-4.5-FP8": {
"name": "GLM-4.5-FP8",
"configs": [
{
"gpuCount": 8,
"gpuTypes": ["H100"],
"args": [
"--tensor-parallel-size",
"8",
"--tool-call-parser",
"glm4_moe",
"--reasoning-parser",
"glm4_moe",
"--enable-auto-tool-choice"
]
},
{
"gpuCount": 4,
"gpuTypes": ["H200"],
"args": [
"--tensor-parallel-size",
"4",
"--tool-call-parser",
"glm4_moe",
"--reasoning-parser",
"glm4_moe",
"--enable-auto-tool-choice"
]
}
]
},
"zai-org/GLM-4.5-Air-FP8": {
"name": "GLM-4.5-Air-FP8",
"configs": [
{
"gpuCount": 2,
"gpuTypes": ["H100"],
"args": [
"--tensor-parallel-size",
"2",
"--tool-call-parser",
"glm4_moe",
"--reasoning-parser",
"glm4_moe",
"--enable-auto-tool-choice",
"--quantization",
"fp8"
],
"env": {
"VLLM_ATTENTION_BACKEND": "XFORMERS"
},
"notes": "FP8 model requires vLLM with proper FP8 support or MTP module"
},
{
"gpuCount": 1,
"gpuTypes": ["H200"],
"args": [
"--tool-call-parser",
"glm4_moe",
"--reasoning-parser",
"glm4_moe",
"--enable-auto-tool-choice",
"--quantization",
"fp8"
],
"env": {
"VLLM_ATTENTION_BACKEND": "XFORMERS"
},
"notes": "FP8 model requires vLLM with proper FP8 support or MTP module"
}
]
},
"zai-org/GLM-4.5-Air": {
"name": "GLM-4.5-Air",
"configs": [
{
"gpuCount": 2,
"gpuTypes": ["H100", "H200"],
"args": [
"--tensor-parallel-size",
"2",
"--tool-call-parser",
"glm4_moe",
"--reasoning-parser",
"glm4_moe",
"--enable-auto-tool-choice"
],
"notes": "Non-quantized BF16 version, more compatible"
},
{
"gpuCount": 1,
"gpuTypes": ["H200"],
"args": [
"--tool-call-parser",
"glm4_moe",
"--reasoning-parser",
"glm4_moe",
"--enable-auto-tool-choice",
"--gpu-memory-utilization",
"0.95"
],
"notes": "Single H200 can fit the BF16 model with high memory utilization"
}
]
},
"moonshotai/Kimi-K2-Instruct": {
"name": "Kimi-K2",
"configs": [
{
"gpuCount": 16,
"gpuTypes": ["H200", "H20"],
"args": [
"--tensor-parallel-size",
"16",
"--trust-remote-code",
"--enable-auto-tool-choice",
"--tool-call-parser",
"kimi_k2"
],
"notes": "Pure TP mode. For >16 GPUs, combine with pipeline-parallelism."
}
],
"notes": "Requires vLLM v0.10.0rc1+. Minimum 16 GPUs for FP8 with 128k context."
}
}
}

151
packages/pods/src/ssh.ts Normal file
View file

@ -0,0 +1,151 @@
import { type SpawnOptions, spawn } from "child_process";
// Captured outcome of a single remote command run over SSH.
export interface SSHResult {
	// Accumulated standard output of the remote command.
	stdout: string;
	// Accumulated standard error; replaced by the spawn error message
	// when the ssh process itself fails to start.
	stderr: string;
	// Process exit code; 1 when the ssh process failed to spawn.
	exitCode: number;
}
/**
 * Execute an SSH command and return the result
 *
 * @param sshCmd  full SSH invocation, e.g. "ssh root@1.2.3.4" or "ssh -p 22 root@1.2.3.4"
 * @param command remote command, appended as the final argument
 * @param options keepAlive adds ServerAliveInterval/CountMax flags for long runs
 * @returns captured stdout/stderr and exit code; never rejects — spawn
 *          failures resolve with exitCode 1 and the error message in stderr
 */
export const sshExec = async (
	sshCmd: string,
	command: string,
	options?: { keepAlive?: boolean },
): Promise<SSHResult> => {
	// Split the configured SSH command into binary + arguments.
	const [binary, ...rest] = sshCmd.split(" ").filter(Boolean);
	const args = options?.keepAlive
		? // Keepalive every 30 seconds, tolerate 120 missed replies (~60 minutes).
			["-o", "ServerAliveInterval=30", "-o", "ServerAliveCountMax=120", ...rest]
		: rest;
	args.push(command);
	return new Promise((resolve) => {
		const proc = spawn(binary, args, {
			stdio: ["ignore", "pipe", "pipe"],
		});
		const outChunks: string[] = [];
		const errChunks: string[] = [];
		proc.stdout.on("data", (chunk) => outChunks.push(chunk.toString()));
		proc.stderr.on("data", (chunk) => errChunks.push(chunk.toString()));
		proc.on("close", (code) => {
			resolve({ stdout: outChunks.join(""), stderr: errChunks.join(""), exitCode: code || 0 });
		});
		proc.on("error", (spawnErr) => {
			// Spawn failure (e.g. binary not found): report via exitCode/stderr.
			resolve({ stdout: outChunks.join(""), stderr: spawnErr.message, exitCode: 1 });
		});
	});
};
/**
 * Execute an SSH command with streaming output to console
 *
 * @param options silent discards all output; forceTTY prepends -t (so remote
 *                tools keep their colored output); keepAlive adds
 *                ServerAliveInterval/CountMax flags
 * @returns exit code; never rejects — spawn failures resolve with 1
 */
export const sshExecStream = async (
	sshCmd: string,
	command: string,
	options?: { silent?: boolean; forceTTY?: boolean; keepAlive?: boolean },
): Promise<number> => {
	const parts = sshCmd.split(" ").filter(Boolean);
	const binary = parts[0];
	let args = parts.slice(1);
	// Request a pseudo-terminal unless the caller already passed -t themselves.
	if (options?.forceTTY && !parts.includes("-t")) {
		args = ["-t", ...args];
	}
	if (options?.keepAlive) {
		// Keepalive every 30 seconds, tolerate 120 missed replies (~60 minutes).
		args = ["-o", "ServerAliveInterval=30", "-o", "ServerAliveCountMax=120", ...args];
	}
	args.push(command);
	const spawnOpts: SpawnOptions = options?.silent
		? { stdio: ["ignore", "ignore", "ignore"] }
		: { stdio: "inherit" };
	return new Promise((resolve) => {
		const proc = spawn(binary, args, spawnOpts);
		proc.on("close", (code) => resolve(code || 0));
		proc.on("error", () => resolve(1));
	});
};
/**
 * Copy a file to remote via SCP
 *
 * Derives the destination host (first non-flag token after "ssh") and the
 * port (value following -p, defaulting to 22) from the SSH command, then
 * runs scp with the uppercase -P port flag.
 *
 * NOTE(review): flags that take a value other than -p (e.g. "-i keyfile")
 * would have their value misparsed as the host — confirm callers only use
 * plain "ssh [-p N] user@host" strings.
 *
 * @returns true when scp exits 0; false on parse failure or scp error
 */
export const scpFile = async (sshCmd: string, localPath: string, remotePath: string): Promise<boolean> => {
	const parts = sshCmd.split(" ").filter(Boolean);
	let host = "";
	let port = "22";
	// Skip the "ssh" binary itself, then scan for -p and the host token.
	for (let i = 1; i < parts.length; ) {
		const token = parts[i];
		if (token === "-p" && i + 1 < parts.length) {
			port = parts[i + 1];
			i += 2;
		} else if (!token.startsWith("-")) {
			host = token;
			break;
		} else {
			i++;
		}
	}
	if (!host) {
		console.error("Could not parse host from SSH command");
		return false;
	}
	// Build SCP command (scp uses -P, not -p, for the port).
	const scpArgs = ["-P", port, localPath, `${host}:${remotePath}`];
	return new Promise((resolve) => {
		const proc = spawn("scp", scpArgs, { stdio: "inherit" });
		proc.on("close", (code) => resolve(code === 0));
		proc.on("error", () => resolve(false));
	});
};

View file

@ -0,0 +1,27 @@
// Core type definitions for pi

// A single GPU on a pod, as parsed from
// `nvidia-smi --query-gpu=index,name,memory.total` during pod setup.
export interface GPU {
	// GPU index (nvidia-smi "index" column).
	id: number;
	// Device name, e.g. "NVIDIA H200".
	name: string;
	// Total memory as the human-readable string nvidia-smi reports.
	memory: string;
}

// A vLLM model deployment running on a pod.
export interface Model {
	// Model id served by vLLM (used for --model and API detection).
	model: string;
	// Port the vLLM server listens on (agent connects to http://host:port/v1).
	port: number;
	gpu: number[]; // Array of GPU IDs for multi-GPU deployment
	// Remote process id of the vLLM server.
	pid: number;
}

// A configured GPU pod reachable over SSH.
export interface Pod {
	// Full SSH invocation, e.g. "ssh -p 22 root@1.2.3.4".
	ssh: string;
	gpus: GPU[];
	// Running model deployments keyed by deployment name.
	models: Record<string, Model>;
	// Directory on the pod where model weights are stored/mounted.
	modelsPath?: string;
	vllmVersion?: "release" | "nightly" | "gpt-oss"; // Track which vLLM version is installed
}

// On-disk config shape (pods.json): all pods plus the active one.
export interface Config {
	pods: Record<string, Pod>;
	// Name of the active pod; undefined when none is selected.
	active?: string;
}