Initial monorepo setup with npm workspaces and dual TypeScript configuration

- Set up npm workspaces for three packages: pi-tui, pi-agent, and pi (pods)
- Implemented dual TypeScript configuration:
  - Root tsconfig.json with path mappings for development and type checking
  - Package-specific tsconfig.build.json for clean production builds
- Configured lockstep versioning with sync script for inter-package dependencies
- Added comprehensive documentation for development and publishing workflows
- All packages at version 0.5.0 ready for npm publishing
This commit is contained in:
Mario Zechner 2025-08-09 17:18:38 +02:00
commit a74c5da112
63 changed files with 14558 additions and 0 deletions

View file

@ -0,0 +1,703 @@
import chalk from "chalk";
import { spawn } from "child_process";
import { readFileSync } from "fs";
import { dirname, join } from "path";
import { fileURLToPath } from "url";
import { getActivePod, loadConfig, saveConfig } from "../config.js";
import { getModelConfig, getModelName, isKnownModel } from "../model-configs.js";
import { sshExec } from "../ssh.js";
import type { Pod } from "../types.js";
/**
 * Resolve which pod to operate on.
 *
 * When `podOverride` is given, it is looked up in the config; otherwise
 * the currently active pod is used. Exits the process (code 1) with an
 * error message when the lookup fails.
 */
const getPod = (podOverride?: string): { name: string; pod: Pod } => {
  if (!podOverride) {
    const active = getActivePod();
    if (!active) {
      console.error(chalk.red("No active pod. Use 'pi pods active <name>' to set one."));
      process.exit(1);
    }
    return active;
  }
  const { pods } = loadConfig();
  const pod = pods[podOverride];
  if (!pod) {
    console.error(chalk.red(`Pod '${podOverride}' not found`));
    process.exit(1);
  }
  return { name: podOverride, pod };
};
/**
 * Find the next free port for a new model, scanning upward from 8001
 * past every port already claimed by a model deployed on this pod.
 */
const getNextPort = (pod: Pod): number => {
  const taken = new Set(Object.values(pod.models).map((m) => m.port));
  let candidate = 8001;
  while (taken.has(candidate)) {
    candidate += 1;
  }
  return candidate;
};
/**
 * Choose which GPU ids a new model should occupy.
 *
 * Requesting exactly the pod's GPU count returns every id. Otherwise
 * GPUs are ranked by how many existing models already run on them and
 * the `count` least-loaded ids are returned (round-robin balancing;
 * ties keep the pod's GPU ordering thanks to stable sort).
 */
const selectGPUs = (pod: Pod, count: number = 1): number[] => {
  if (count === pod.gpus.length) {
    return pod.gpus.map((gpu) => gpu.id);
  }
  // Tally current assignments per GPU id, starting everything at zero.
  const usage = new Map<number, number>(pod.gpus.map((gpu) => [gpu.id, 0]));
  for (const model of Object.values(pod.models)) {
    for (const id of model.gpu) {
      usage.set(id, (usage.get(id) ?? 0) + 1);
    }
  }
  // Least-used first, then take the requested number of ids.
  return [...usage.entries()]
    .sort(([, a], [, b]) => a - b)
    .map(([id]) => id)
    .slice(0, count);
};
/**
 * Deploy a model on a pod and stream its startup logs.
 *
 * Flow: resolve the target pod, pick a free port and GPUs, build vLLM
 * arguments (custom --vllm args, a known-model config, or a single-GPU
 * default), upload a customized run script over SSH, launch it detached
 * via setsid so it survives disconnect, record the deployment in local
 * config, then tail the remote log until "Application startup complete"
 * appears, Ctrl+C is pressed, or the stream ends.
 *
 * @param modelId Model identifier to deploy (e.g. a HuggingFace repo id).
 * @param name Unique deployment name on the pod; used for log/script file names.
 * @param options.pod Pod name override (defaults to the active pod).
 * @param options.vllmArgs Custom vLLM args; when set they override all GPU/config selection.
 * @param options.memory GPU memory utilization, e.g. "80%" (known-model path only).
 * @param options.context Context size ("4k".."128k" or a raw token count).
 * @param options.gpus Number of GPUs to use; only valid for known models.
 */
export const startModel = async (
modelId: string,
name: string,
options: {
pod?: string;
vllmArgs?: string[];
memory?: string;
context?: string;
gpus?: number;
},
) => {
const { name: podName, pod } = getPod(options.pod);
// Validation
if (!pod.modelsPath) {
console.error(chalk.red("Pod does not have a models path configured"));
process.exit(1);
}
if (pod.models[name]) {
console.error(chalk.red(`Model '${name}' already exists on pod '${podName}'`));
process.exit(1);
}
const port = getNextPort(pod);
// Determine GPU allocation and vLLM args
let gpus: number[] = [];
let vllmArgs: string[] = [];
let modelConfig = null;
if (options.vllmArgs?.length) {
// Custom args override everything; gpus stays empty so no
// CUDA_VISIBLE_DEVICES is exported below.
vllmArgs = options.vllmArgs;
console.log(chalk.gray("Using custom vLLM args, GPU allocation managed by vLLM"));
} else if (isKnownModel(modelId)) {
// Handle --gpus parameter for known models
if (options.gpus) {
// Validate GPU count
if (options.gpus > pod.gpus.length) {
console.error(chalk.red(`Error: Requested ${options.gpus} GPUs but pod only has ${pod.gpus.length}`));
process.exit(1);
}
// Try to find config for requested GPU count
modelConfig = getModelConfig(modelId, pod.gpus, options.gpus);
if (modelConfig) {
gpus = selectGPUs(pod, options.gpus);
vllmArgs = [...(modelConfig.args || [])];
} else {
console.error(
chalk.red(`Model '${getModelName(modelId)}' does not have a configuration for ${options.gpus} GPU(s)`),
);
console.error(chalk.yellow("Available configurations:"));
// Show available configurations
for (let gpuCount = 1; gpuCount <= pod.gpus.length; gpuCount++) {
const config = getModelConfig(modelId, pod.gpus, gpuCount);
if (config) {
console.error(chalk.gray(` - ${gpuCount} GPU(s)`));
}
}
process.exit(1);
}
} else {
// Find best config for this hardware (original behavior):
// prefer configs that use more GPUs, falling back to fewer.
for (let gpuCount = pod.gpus.length; gpuCount >= 1; gpuCount--) {
modelConfig = getModelConfig(modelId, pod.gpus, gpuCount);
if (modelConfig) {
gpus = selectGPUs(pod, gpuCount);
vllmArgs = [...(modelConfig.args || [])];
break;
}
}
if (!modelConfig) {
console.error(chalk.red(`Model '${getModelName(modelId)}' not compatible with this pod's GPUs`));
process.exit(1);
}
}
} else {
// Unknown model
if (options.gpus) {
console.error(chalk.red("Error: --gpus can only be used with predefined models"));
console.error(chalk.yellow("For custom models, use --vllm with tensor-parallel-size or similar arguments"));
process.exit(1);
}
// Single GPU default
gpus = selectGPUs(pod, 1);
console.log(chalk.gray("Unknown model, defaulting to single GPU"));
}
// Apply memory/context overrides (skipped when custom --vllm args were
// given, since those override everything).
if (!options.vllmArgs?.length) {
if (options.memory) {
// e.g. "80%" -> 0.8; replaces any config-provided utilization flag.
const fraction = parseFloat(options.memory.replace("%", "")) / 100;
vllmArgs = vllmArgs.filter((arg) => !arg.includes("gpu-memory-utilization"));
vllmArgs.push("--gpu-memory-utilization", String(fraction));
}
if (options.context) {
const contextSizes: Record<string, number> = {
"4k": 4096,
"8k": 8192,
"16k": 16384,
"32k": 32768,
"64k": 65536,
"128k": 131072,
};
// Shorthand ("32k") or a raw token count; replaces any config-provided flag.
const maxTokens = contextSizes[options.context.toLowerCase()] || parseInt(options.context);
vllmArgs = vllmArgs.filter((arg) => !arg.includes("max-model-len"));
vllmArgs.push("--max-model-len", String(maxTokens));
}
}
// Show what we're doing
console.log(chalk.green(`Starting model '${name}' on pod '${podName}'...`));
console.log(`Model: ${modelId}`);
console.log(`Port: ${port}`);
console.log(`GPU(s): ${gpus.length ? gpus.join(", ") : "Managed by vLLM"}`);
if (modelConfig?.notes) console.log(chalk.yellow(`Note: ${modelConfig.notes}`));
console.log("");
// Read and customize model_run.sh script with our values
const scriptPath = join(dirname(fileURLToPath(import.meta.url)), "../../scripts/model_run.sh");
let scriptContent = readFileSync(scriptPath, "utf-8");
// Replace placeholders - no escaping needed, heredoc with 'EOF' is literal
scriptContent = scriptContent
.replace("{{MODEL_ID}}", modelId)
.replace("{{NAME}}", name)
.replace("{{PORT}}", String(port))
.replace("{{VLLM_ARGS}}", vllmArgs.join(" "));
// Upload customized script
// NOTE(review): `result` is never inspected — a failed upload goes
// undetected until the runner fails later. Consider checking exitCode.
const result = await sshExec(
pod.ssh,
`cat > /tmp/model_run_${name}.sh << 'EOF'
${scriptContent}
EOF
chmod +x /tmp/model_run_${name}.sh`,
);
// Prepare environment
// NOTE(review): if HF_TOKEN/PI_API_KEY are unset locally, the literal
// string "undefined" is exported on the pod — confirm this is intended.
const env = [
`HF_TOKEN='${process.env.HF_TOKEN}'`,
`PI_API_KEY='${process.env.PI_API_KEY}'`,
`HF_HUB_ENABLE_HF_TRANSFER=1`,
`VLLM_NO_USAGE_STATS=1`,
`PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True`,
`FORCE_COLOR=1`,
`TERM=xterm-256color`,
...(gpus.length === 1 ? [`CUDA_VISIBLE_DEVICES=${gpus[0]}`] : []),
...Object.entries(modelConfig?.env || {}).map(([k, v]) => `${k}='${v}'`),
]
.map((e) => `export ${e}`)
.join("\n");
// Start the model runner with script command for pseudo-TTY (preserves colors)
// Note: We use script to preserve colors and create a log file
// setsid creates a new session so it survives SSH disconnection
const startCmd = `
${env}
mkdir -p ~/.vllm_logs
# Create a wrapper that monitors the script command
cat > /tmp/model_wrapper_${name}.sh << 'WRAPPER'
#!/bin/bash
script -q -f -c "/tmp/model_run_${name}.sh" ~/.vllm_logs/${name}.log
exit_code=$?
echo "Script exited with code $exit_code" >> ~/.vllm_logs/${name}.log
exit $exit_code
WRAPPER
chmod +x /tmp/model_wrapper_${name}.sh
setsid /tmp/model_wrapper_${name}.sh </dev/null >/dev/null 2>&1 &
echo $!
exit 0
`;
// The remote `echo $!` prints the wrapper's PID; NaN/0 means launch failed.
const pidResult = await sshExec(pod.ssh, startCmd);
const pid = parseInt(pidResult.stdout.trim());
if (!pid) {
console.error(chalk.red("Failed to start model runner"));
process.exit(1);
}
// Save to config
const config = loadConfig();
config.pods[podName].models[name] = { model: modelId, port, gpu: gpus, pid };
saveConfig(config);
console.log(`Model runner started with PID: ${pid}`);
console.log("Streaming logs... (waiting for startup)\n");
// Small delay to ensure log file is created
await new Promise((resolve) => setTimeout(resolve, 500));
// Stream logs with color support, watching for startup complete
const sshParts = pod.ssh.split(" ");
const sshCommand = sshParts[0]; // "ssh"
const sshArgs = sshParts.slice(1); // ["root@86.38.238.55"]
// Host is used only for the connection-details banner printed below.
const host = sshArgs[0].split("@")[1] || "localhost";
const tailCmd = `tail -f ~/.vllm_logs/${name}.log`;
// Build the full args array for spawn
const fullArgs = [...sshArgs, tailCmd];
const logProcess = spawn(sshCommand, fullArgs, {
stdio: ["inherit", "pipe", "pipe"], // capture stdout and stderr
env: { ...process.env, FORCE_COLOR: "1" },
});
let interrupted = false;
let startupComplete = false;
// Handle Ctrl+C: stop tailing locally; the remote deployment keeps running.
const sigintHandler = () => {
interrupted = true;
logProcess.kill();
};
process.on("SIGINT", sigintHandler);
// Process log output line by line
const processOutput = (data: Buffer) => {
const lines = data.toString().split("\n");
for (const line of lines) {
if (line) {
console.log(line); // Echo the line to console
// Check for startup complete message
if (line.includes("Application startup complete")) {
startupComplete = true;
logProcess.kill(); // Stop tailing logs
}
}
}
};
logProcess.stdout?.on("data", processOutput);
logProcess.stderr?.on("data", processOutput);
// Wait until the tail process exits (killed above or stream closed).
await new Promise<void>((resolve) => logProcess.on("exit", resolve));
process.removeListener("SIGINT", sigintHandler);
if (startupComplete) {
// Model started successfully - output connection details
console.log("\n" + chalk.green("✓ Model started successfully!"));
console.log("\n" + chalk.bold("Connection Details:"));
console.log(chalk.cyan("─".repeat(50)));
console.log(chalk.white("Base URL: ") + chalk.yellow(`http://${host}:${port}/v1`));
console.log(chalk.white("Model: ") + chalk.yellow(modelId));
console.log(chalk.white("API Key: ") + chalk.yellow(process.env.PI_API_KEY || "(not set)"));
console.log(chalk.cyan("─".repeat(50)));
console.log("\n" + chalk.bold("Export for shell:"));
console.log(chalk.gray(`export OPENAI_BASE_URL="http://${host}:${port}/v1"`));
console.log(chalk.gray(`export OPENAI_API_KEY="${process.env.PI_API_KEY || "your-api-key"}"`));
console.log(chalk.gray(`export OPENAI_MODEL="${modelId}"`));
console.log("\n" + chalk.bold("Example usage:"));
console.log(
chalk.gray(`
# Python
from openai import OpenAI
client = OpenAI() # Uses env vars
response = client.chat.completions.create(
model="${modelId}",
messages=[{"role": "user", "content": "Hello!"}]
)
# CLI
curl $OPENAI_BASE_URL/chat/completions \\
-H "Authorization: Bearer $OPENAI_API_KEY" \\
-H "Content-Type: application/json" \\
-d '{"model":"${modelId}","messages":[{"role":"user","content":"Hi"}]}'`),
);
console.log("");
console.log(chalk.cyan(`Chat with model: pi agent ${name} "Your message"`));
console.log(chalk.cyan(`Interactive mode: pi agent ${name} -i`));
console.log(chalk.cyan(`Monitor logs: pi logs ${name}`));
console.log(chalk.cyan(`Stop model: pi stop ${name}`));
} else if (interrupted) {
console.log(chalk.yellow("\n\nStopped monitoring. Model deployment continues in background."));
console.log(chalk.cyan(`Chat with model: pi agent ${name} "Your message"`));
console.log(chalk.cyan(`Check status: pi logs ${name}`));
console.log(chalk.cyan(`Stop model: pi stop ${name}`));
} else {
console.log(chalk.yellow("\n\nLog stream ended. Model may still be running."));
console.log(chalk.cyan(`Chat with model: pi agent ${name} "Your message"`));
console.log(chalk.cyan(`Check status: pi logs ${name}`));
console.log(chalk.cyan(`Stop model: pi stop ${name}`));
}
};
/**
 * Stop a single model on a pod.
 *
 * Sends TERM to the model's wrapper process and its children over SSH,
 * then deletes the model entry from the local configuration.
 */
export const stopModel = async (name: string, options: { pod?: string }) => {
  const { name: podName, pod } = getPod(options.pod);
  const target = pod.models[name];
  if (!target) {
    console.error(chalk.red(`Model '${name}' not found on pod '${podName}'`));
    process.exit(1);
  }
  console.log(chalk.yellow(`Stopping model '${name}' on pod '${podName}'...`));
  // Terminate the whole process tree started by the wrapper script.
  await sshExec(
    pod.ssh,
    `
# Kill the script process and all its children
pkill -TERM -P ${target.pid} 2>/dev/null || true
kill ${target.pid} 2>/dev/null || true
`,
  );
  // Forget the model locally now that it is gone remotely.
  const config = loadConfig();
  delete config.pods[podName].models[name];
  saveConfig(config);
  console.log(chalk.green(`✓ Model '${name}' stopped`));
};
/**
 * Stop every model on a pod.
 *
 * Kills each wrapper process tree in a single SSH invocation, then
 * clears the pod's model table in the local configuration.
 */
export const stopAllModels = async (options: { pod?: string }) => {
  const { name: podName, pod } = getPod(options.pod);
  const modelNames = Object.keys(pod.models);
  if (!modelNames.length) {
    console.log(`No models running on pod '${podName}'`);
    return;
  }
  console.log(chalk.yellow(`Stopping ${modelNames.length} model(s) on pod '${podName}'...`));
  // One remote loop terminates every wrapper and its children.
  const pids = Object.values(pod.models).map((m) => m.pid);
  await sshExec(
    pod.ssh,
    `
for PID in ${pids.join(" ")}; do
pkill -TERM -P $PID 2>/dev/null || true
kill $PID 2>/dev/null || true
done
`,
  );
  // Wipe the local record of every model on this pod.
  const config = loadConfig();
  config.pods[podName].models = {};
  saveConfig(config);
  console.log(chalk.green(`✓ Stopped all models: ${modelNames.join(", ")}`));
};
/**
 * List every model configured on a pod, then verify each one remotely.
 *
 * For each model prints port, GPU assignment, PID, and the OpenAI-style
 * base URL. A follow-up SSH health check classifies each model as
 * running (health endpoint responds), starting (process alive, endpoint
 * not yet up, no errors in recent log), crashed (error markers in the
 * log tail), or dead (wrapper process gone).
 */
export const listModels = async (options: { pod?: string }) => {
const { name: podName, pod } = getPod(options.pod);
const modelNames = Object.keys(pod.models);
if (modelNames.length === 0) {
console.log(`No models running on pod '${podName}'`);
return;
}
// Get pod SSH host for URL display
const sshParts = pod.ssh.split(" ");
const host = sshParts.find((p) => p.includes("@"))?.split("@")[1] || "unknown";
console.log(`Models on pod '${chalk.bold(podName)}':`);
for (const name of modelNames) {
const model = pod.models[name];
const gpuStr =
model.gpu.length > 1
? `GPUs ${model.gpu.join(",")}`
: model.gpu.length === 1
? `GPU ${model.gpu[0]}`
: "GPU unknown";
console.log(` ${chalk.green(name)} - Port ${model.port} - ${gpuStr} - PID ${model.pid}`);
console.log(` Model: ${chalk.gray(model.model)}`);
console.log(` URL: ${chalk.cyan(`http://${host}:${model.port}/v1`)}`);
}
// Optionally verify processes are still running
console.log("");
console.log("Verifying processes...");
let anyDead = false;
// One SSH round-trip per model; checks run sequentially.
for (const name of modelNames) {
const model = pod.models[name];
// Check both the wrapper process and if vLLM is responding
const checkCmd = `
# Check if wrapper process exists
if ps -p ${model.pid} > /dev/null 2>&1; then
# Process exists, now check if vLLM is responding
if curl -s -f http://localhost:${model.port}/health > /dev/null 2>&1; then
echo "running"
else
# Check if it's still starting up
if tail -n 20 ~/.vllm_logs/${name}.log 2>/dev/null | grep -q "ERROR\\|Failed\\|Cuda error\\|died"; then
echo "crashed"
else
echo "starting"
fi
fi
else
echo "dead"
fi
`;
const result = await sshExec(pod.ssh, checkCmd);
const status = result.stdout.trim();
if (status === "dead") {
console.log(chalk.red(` ${name}: Process ${model.pid} is not running`));
anyDead = true;
} else if (status === "crashed") {
console.log(chalk.red(` ${name}: vLLM crashed (check logs with 'pi logs ${name}')`));
anyDead = true;
} else if (status === "starting") {
console.log(chalk.yellow(` ${name}: Still starting up...`));
}
// "running" prints nothing — the summary below covers the healthy case.
}
if (anyDead) {
console.log("");
console.log(chalk.yellow("Some models are not running. Clean up with:"));
console.log(chalk.cyan(" pi stop <name>"));
} else {
console.log(chalk.green("✓ All processes verified"));
}
};
/**
 * Stream a model's log file from its pod to the local terminal.
 *
 * Spawns `ssh ... tail -f` with inherited stdio so colors pass through
 * untouched; resolves once the tail process exits (e.g. on Ctrl+C).
 */
export const viewLogs = async (name: string, options: { pod?: string }) => {
  const { name: podName, pod } = getPod(options.pod);
  const model = pod.models[name];
  if (!model) {
    console.error(chalk.red(`Model '${name}' not found on pod '${podName}'`));
    process.exit(1);
  }
  console.log(chalk.green(`Streaming logs for '${name}' on pod '${podName}'...`));
  console.log(chalk.gray("Press Ctrl+C to stop"));
  console.log("");
  // The configured SSH string is "<command> <args...>"; split it apart
  // and append the remote tail command as the final argument.
  const [sshCommand, ...sshArgs] = pod.ssh.split(" ");
  const logProcess = spawn(sshCommand, [...sshArgs, `tail -f ~/.vllm_logs/${name}.log`], {
    stdio: "inherit",
    env: { ...process.env, FORCE_COLOR: "1" },
  });
  // Block until the stream is closed by the user or the remote side.
  await new Promise<void>((resolve) => {
    logProcess.on("exit", () => resolve());
  });
};
/**
 * Print the catalog of predefined models, grouped by family.
 *
 * When an active pod exists, models are split into compatible (a config
 * exists that fits the pod's GPU count and type) and incompatible
 * sections; otherwise all models are listed with their minimum hardware
 * requirements. Compatibility matching is substring-based on the GPU
 * type name (e.g. "H200").
 */
export const showKnownModels = async () => {
// Dynamic JSON import keeps the catalog out of the main bundle path.
const modelsJson = await import("../models.json", { assert: { type: "json" } });
const models = modelsJson.default.models;
// Get active pod info if available
const activePod = getActivePod();
let podGpuCount = 0;
let podGpuType = "";
if (activePod) {
podGpuCount = activePod.pod.gpus.length;
// Extract GPU type from name (e.g., "NVIDIA H200" -> "H200")
podGpuType = activePod.pod.gpus[0]?.name?.replace("NVIDIA", "")?.trim()?.split(" ")[0] || "";
console.log(chalk.bold(`Known Models for ${activePod.name} (${podGpuCount}x ${podGpuType || "GPU"}):\n`));
} else {
console.log(chalk.bold("Known Models:\n"));
console.log(chalk.yellow("No active pod. Use 'pi pods active <name>' to filter compatible models.\n"));
}
console.log("Usage: pi start <model> --name <name> [options]\n");
// Group models by compatibility and family
const compatible: Record<string, Array<{ id: string; name: string; config: string; notes?: string }>> = {};
const incompatible: Record<string, Array<{ id: string; name: string; minGpu: string; notes?: string }>> = {};
for (const [modelId, info] of Object.entries(models)) {
// NOTE(review): catalog entries are untyped JSON, hence the `any`;
// assumes each entry has `name` and optionally `configs`/`notes`.
const modelInfo = info as any;
// Family is the first dash-separated segment of the display name.
const family = modelInfo.name.split("-")[0] || "Other";
let isCompatible = false;
let compatibleConfig = "";
let minGpu = "Unknown";
let minNotes: string | undefined;
if (modelInfo.configs && modelInfo.configs.length > 0) {
// Sort configs by GPU count to find minimum
const sortedConfigs = [...modelInfo.configs].sort((a: any, b: any) => (a.gpuCount || 1) - (b.gpuCount || 1));
// Find minimum requirements
const minConfig = sortedConfigs[0];
const minGpuCount = minConfig.gpuCount || 1;
const gpuTypes = minConfig.gpuTypes?.join("/") || "H100/H200";
if (minGpuCount === 1) {
minGpu = `1x ${gpuTypes}`;
} else {
minGpu = `${minGpuCount}x ${gpuTypes}`;
}
minNotes = minConfig.notes || modelInfo.notes;
// Check compatibility with active pod
if (activePod && podGpuCount > 0) {
// Find best matching config for this pod (smallest GPU count that fits).
for (const config of sortedConfigs) {
const configGpuCount = config.gpuCount || 1;
const configGpuTypes = config.gpuTypes || [];
// Check if we have enough GPUs
if (configGpuCount <= podGpuCount) {
// Check if GPU type matches (if specified); empty gpuTypes means "any".
if (
configGpuTypes.length === 0 ||
configGpuTypes.some((type: string) => podGpuType.includes(type) || type.includes(podGpuType))
) {
isCompatible = true;
if (configGpuCount === 1) {
compatibleConfig = `1x ${podGpuType}`;
} else {
compatibleConfig = `${configGpuCount}x ${podGpuType}`;
}
minNotes = config.notes || modelInfo.notes;
break;
}
}
}
}
}
const modelEntry = {
id: modelId,
name: modelInfo.name,
notes: minNotes,
};
// Without an active pod, everything lands in `incompatible`, which the
// display section below then renders as a plain (ungrouped) listing.
if (activePod && isCompatible) {
if (!compatible[family]) {
compatible[family] = [];
}
compatible[family].push({ ...modelEntry, config: compatibleConfig });
} else {
if (!incompatible[family]) {
incompatible[family] = [];
}
incompatible[family].push({ ...modelEntry, minGpu });
}
}
// Display compatible models first
if (activePod && Object.keys(compatible).length > 0) {
console.log(chalk.green.bold("✓ Compatible Models:\n"));
const sortedFamilies = Object.keys(compatible).sort();
for (const family of sortedFamilies) {
console.log(chalk.cyan(`${family} Models:`));
const modelList = compatible[family].sort((a, b) => a.name.localeCompare(b.name));
for (const model of modelList) {
console.log(` ${chalk.green(model.id)}`);
console.log(` Name: ${model.name}`));
console.log(` Config: ${model.config}`);
if (model.notes) {
console.log(chalk.gray(` Note: ${model.notes}`));
}
console.log("");
}
}
}
// Display incompatible models
if (Object.keys(incompatible).length > 0) {
if (activePod && Object.keys(compatible).length > 0) {
console.log(chalk.red.bold("✗ Incompatible Models (need more/different GPUs):\n"));
}
const sortedFamilies = Object.keys(incompatible).sort();
for (const family of sortedFamilies) {
if (!activePod) {
console.log(chalk.cyan(`${family} Models:`));
} else {
console.log(chalk.gray(`${family} Models:`));
}
const modelList = incompatible[family].sort((a, b) => a.name.localeCompare(b.name));
for (const model of modelList) {
// Without an active pod this is the normal listing, so keep green ids.
const color = activePod ? chalk.gray : chalk.green;
console.log(` ${color(model.id)}`);
console.log(chalk.gray(` Name: ${model.name}`));
console.log(chalk.gray(` Min Hardware: ${model.minGpu}`));
if (model.notes && !activePod) {
console.log(chalk.gray(` Note: ${model.notes}`));
}
if (activePod) {
console.log(""); // Less verbose for incompatible models when filtered
} else {
console.log("");
}
}
}
}
console.log(chalk.gray("\nFor unknown models, defaults to single GPU deployment."));
console.log(chalk.gray("Use --vllm to pass custom arguments to vLLM."));
};

View file

@ -0,0 +1,205 @@
import chalk from "chalk";
import { dirname, join } from "path";
import { fileURLToPath } from "url";
import { addPod, loadConfig, removePod, setActivePod } from "../config.js";
import { scpFile, sshExec, sshExecStream } from "../ssh.js";
import type { GPU, Pod } from "../types.js";
// ESM has no built-in __filename/__dirname; derive them from the module
// URL so the bundled pod_setup.sh script can be located relative to this file.
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);
/**
 * Print every configured pod, marking the active one with '*' and
 * summarizing its GPUs, vLLM build, SSH target, and models path.
 */
export const listPods = () => {
  const config = loadConfig();
  const podNames = Object.keys(config.pods);
  if (!podNames.length) {
    console.log("No pods configured. Use 'pi pods setup' to add a pod.");
    return;
  }
  console.log("Configured pods:");
  for (const name of podNames) {
    const pod = config.pods[name];
    const marker = config.active === name ? chalk.green("*") : " ";
    const gpuCount = pod.gpus?.length || 0;
    const gpuInfo = gpuCount > 0 ? `${gpuCount}x ${pod.gpus[0].name}` : "no GPUs detected";
    const vllmInfo = pod.vllmVersion ? ` (vLLM: ${pod.vllmVersion})` : "";
    console.log(`${marker} ${chalk.bold(name)} - ${gpuInfo}${vllmInfo} - ${pod.ssh}`);
    if (pod.modelsPath) {
      console.log(` Models: ${pod.modelsPath}`);
    }
    // The gpt-oss build only serves GPT-OSS models; warn so users don't
    // deploy other models to it by accident.
    if (pod.vllmVersion === "gpt-oss") {
      console.log(chalk.yellow(` ⚠️ GPT-OSS build - only for GPT-OSS models`));
    }
  }
};
/**
 * Provision a remote machine as a pod and register it in local config.
 *
 * Steps: validate HF_TOKEN/PI_API_KEY env vars, determine the models
 * path (explicit or extracted from the mount command), test SSH, copy
 * and run scripts/pod_setup.sh remotely, detect GPUs via nvidia-smi,
 * then save the pod (which also becomes the active pod via addPod).
 *
 * @param name Name under which the pod is saved locally.
 * @param sshCmd Full SSH invocation string, e.g. "ssh root@host".
 * @param options.mount Optional mount command to run on the pod.
 * @param options.modelsPath Where models are stored on the pod.
 * @param options.vllm Which vLLM build to install (default "release").
 */
export const setupPod = async (
name: string,
sshCmd: string,
options: { mount?: string; modelsPath?: string; vllm?: "release" | "nightly" | "gpt-oss" },
) => {
// Validate environment variables
const hfToken = process.env.HF_TOKEN;
const vllmApiKey = process.env.PI_API_KEY;
if (!hfToken) {
console.error(chalk.red("ERROR: HF_TOKEN environment variable is required"));
console.error("Get a token from: https://huggingface.co/settings/tokens");
console.error("Then run: export HF_TOKEN=your_token_here");
process.exit(1);
}
if (!vllmApiKey) {
console.error(chalk.red("ERROR: PI_API_KEY environment variable is required"));
console.error("Set an API key: export PI_API_KEY=your_api_key_here");
process.exit(1);
}
// Determine models path
let modelsPath = options.modelsPath;
if (!modelsPath && options.mount) {
// Extract path from mount command if not explicitly provided
// e.g., "mount -t nfs ... /mnt/sfs" -> "/mnt/sfs"
const parts = options.mount.split(" ");
modelsPath = parts[parts.length - 1];
}
if (!modelsPath) {
console.error(chalk.red("ERROR: --models-path is required (or must be extractable from --mount)"));
process.exit(1);
}
console.log(chalk.green(`Setting up pod '${name}'...`));
console.log(`SSH: ${sshCmd}`);
console.log(`Models path: ${modelsPath}`);
console.log(
`vLLM version: ${options.vllm || "release"} ${options.vllm === "gpt-oss" ? chalk.yellow("(GPT-OSS special build)") : ""}`,
);
if (options.mount) {
console.log(`Mount command: ${options.mount}`);
}
console.log("");
// Test SSH connection
console.log("Testing SSH connection...");
const testResult = await sshExec(sshCmd, "echo 'SSH OK'");
if (testResult.exitCode !== 0) {
console.error(chalk.red("Failed to connect via SSH"));
console.error(testResult.stderr);
process.exit(1);
}
console.log(chalk.green("✓ SSH connection successful"));
// Copy setup script
console.log("Copying setup script...");
const scriptPath = join(__dirname, "../../scripts/pod_setup.sh");
const success = await scpFile(sshCmd, scriptPath, "/tmp/pod_setup.sh");
if (!success) {
console.error(chalk.red("Failed to copy setup script"));
process.exit(1);
}
console.log(chalk.green("✓ Setup script copied"));
// Build setup command
// NOTE(review): values are wrapped in single quotes but not escaped —
// a token or path containing a ' would break (or inject into) the
// remote shell command. Confirm inputs are trusted or add escaping.
let setupCmd = `bash /tmp/pod_setup.sh --models-path '${modelsPath}' --hf-token '${hfToken}' --vllm-api-key '${vllmApiKey}'`;
if (options.mount) {
setupCmd += ` --mount '${options.mount}'`;
}
// Add vLLM version flag
const vllmVersion = options.vllm || "release";
setupCmd += ` --vllm '${vllmVersion}'`;
// Run setup script
console.log("");
console.log(chalk.yellow("Running setup (this will take 2-5 minutes)..."));
console.log("");
// Use forceTTY to preserve colors from apt, pip, etc.
const exitCode = await sshExecStream(sshCmd, setupCmd, { forceTTY: true });
if (exitCode !== 0) {
console.error(chalk.red("\nSetup failed. Check the output above for errors."));
process.exit(1);
}
// Parse GPU info from setup output
console.log("");
console.log("Detecting GPU configuration...");
const gpuResult = await sshExec(sshCmd, "nvidia-smi --query-gpu=index,name,memory.total --format=csv,noheader");
const gpus: GPU[] = [];
// A failed nvidia-smi leaves `gpus` empty; the pod is still saved.
if (gpuResult.exitCode === 0 && gpuResult.stdout) {
const lines = gpuResult.stdout.trim().split("\n");
for (const line of lines) {
// CSV row: "<index>, <name>, <memory.total>"
const [id, name, memory] = line.split(",").map((s) => s.trim());
if (id !== undefined) {
gpus.push({
id: parseInt(id),
name: name || "Unknown",
memory: memory || "Unknown",
});
}
}
}
console.log(chalk.green(`✓ Detected ${gpus.length} GPU(s)`));
for (const gpu of gpus) {
console.log(` GPU ${gpu.id}: ${gpu.name} (${gpu.memory})`);
}
// Save pod configuration
const pod: Pod = {
ssh: sshCmd,
gpus,
models: {},
modelsPath,
vllmVersion: options.vllm || "release",
};
addPod(name, pod);
console.log("");
console.log(chalk.green(`✓ Pod '${name}' setup complete and set as active pod`));
console.log("");
console.log("You can now deploy models with:");
console.log(chalk.cyan(` pi start <model> --name <name>`));
};
/**
 * Make a configured pod the active one for subsequent commands.
 * Exits with the list of available pods when the name is unknown.
 */
export const switchActivePod = (name: string) => {
  const config = loadConfig();
  if (!config.pods[name]) {
    console.error(chalk.red(`Pod '${name}' not found`));
    console.log("\nAvailable pods:");
    Object.keys(config.pods).forEach((podName) => {
      console.log(` ${podName}`);
    });
    process.exit(1);
  }
  setActivePod(name);
  console.log(chalk.green(`✓ Switched active pod to '${name}'`));
};
/**
 * Delete a pod entry from the local configuration.
 * The remote machine itself is left untouched.
 */
export const removePodCommand = (name: string) => {
  const { pods } = loadConfig();
  if (!(name in pods)) {
    console.error(chalk.red(`Pod '${name}' not found`));
    process.exit(1);
  }
  removePod(name);
  console.log(chalk.green(`✓ Removed pod '${name}' from configuration`));
  console.log(chalk.yellow("Note: This only removes the local configuration. The remote pod is not affected."));
};

View file

@ -0,0 +1,85 @@
import { main as agentMain } from "@mariozechner/pi-agent";
import chalk from "chalk";
import { getActivePod, loadConfig } from "../config.js";
// ────────────────────────────────────────────────────────────────────────────────
// Types
// ────────────────────────────────────────────────────────────────────────────────
/** Options accepted by promptModel. */
interface PromptOptions {
// Pod name override; when omitted the active pod from config is used.
pod?: string;
// API key for the model server; falls back to PI_API_KEY, then "dummy".
apiKey?: string;
}
// ────────────────────────────────────────────────────────────────────────────────
// Main prompt function
// ────────────────────────────────────────────────────────────────────────────────
/**
 * Run the pi-agent CLI against a model deployed on a pod.
 *
 * Resolves the target pod (explicit override or active pod), looks up
 * the named model, then invokes the agent's main() with the base URL,
 * model, API key, API flavor, and a code-navigation system prompt,
 * followed by all user-provided arguments.
 *
 * @param modelName Deployment name of the model on the pod.
 * @param userArgs Extra CLI args passed straight through to the agent
 *   (messages, --continue, --json, etc.).
 * @param opts Optional pod override and API key.
 */
export async function promptModel(modelName: string, userArgs: string[], opts: PromptOptions = {}) {
// Resolve the pod: explicit --pod override or the active pod.
// Fix: previously an unknown override produced `{ name, pod: undefined }`,
// which is truthy and slipped past the guard, crashing below with a
// TypeError at `pod.models[...]`. Validate the lookup result explicitly.
const activePod = opts.pod ? { name: opts.pod, pod: loadConfig().pods[opts.pod] } : getActivePod();
if (!activePod || !activePod.pod) {
const message = opts.pod ? `Pod '${opts.pod}' not found` : "No active pod. Use 'pi pods active <name>' to set one.";
console.error(chalk.red(message));
process.exit(1);
}
const { name: podName, pod } = activePod;
const modelConfig = pod.models[modelName];
if (!modelConfig) {
console.error(chalk.red(`Model '${modelName}' not found on pod '${podName}'`));
process.exit(1);
}
// Extract host from SSH string (e.g. "ssh root@1.2.3.4" -> "1.2.3.4")
const host =
pod.ssh
.split(" ")
.find((p) => p.includes("@"))
?.split("@")[1] ?? "localhost";
// Build the system prompt for code navigation
const systemPrompt = `You help the user understand and navigate the codebase in the current working directory.
You can read files, list directories, and execute shell commands via the respective tools.
Do not output file contents you read via the read_file tool directly, unless asked to.
Do not output markdown tables as part of your responses.
Keep your responses concise and relevant to the user's request.
File paths you output must include line numbers where possible, e.g. "src/index.ts:10-20" for lines 10 to 20 in src/index.ts.
Current working directory: ${process.cwd()}`;
// Build arguments for agent main function
const args: string[] = [];
// Add base configuration that we control
args.push(
"--base-url",
`http://${host}:${modelConfig.port}/v1`,
"--model",
modelConfig.model,
"--api-key",
opts.apiKey || process.env.PI_API_KEY || "dummy",
"--api",
// GPT-OSS models speak the Responses API; everything else uses completions.
modelConfig.model.toLowerCase().includes("gpt-oss") ? "responses" : "completions",
"--system-prompt",
systemPrompt,
);
// Pass through all user-provided arguments
// This includes messages, --continue, --json, etc.
args.push(...userArgs);
// Call agent main function directly
try {
await agentMain(args);
} catch (err: unknown) {
// Narrow the unknown error; non-Error throws get stringified instead of
// printing "undefined" as the old `err.message` on `any` did.
const message = err instanceof Error ? err.message : String(err);
console.error(chalk.red(`Agent error: ${message}`));
process.exit(1);
}
}