mirror of
https://github.com/getcompanion-ai/co-mono.git
synced 2026-04-21 06:04:44 +00:00
Initial monorepo setup with npm workspaces and dual TypeScript configuration
- Set up npm workspaces for three packages: pi-tui, pi-agent, and pi (pods) - Implemented dual TypeScript configuration: - Root tsconfig.json with path mappings for development and type checking - Package-specific tsconfig.build.json for clean production builds - Configured lockstep versioning with sync script for inter-package dependencies - Added comprehensive documentation for development and publishing workflows - All packages at version 0.5.0 ready for npm publishing
This commit is contained in:
commit
a74c5da112
63 changed files with 14558 additions and 0 deletions
362
packages/pods/src/cli.ts
Normal file
362
packages/pods/src/cli.ts
Normal file
|
|
@ -0,0 +1,362 @@
|
|||
#!/usr/bin/env node
|
||||
import chalk from "chalk";
|
||||
import { spawn } from "child_process";
|
||||
import { readFileSync } from "fs";
|
||||
import { dirname, join } from "path";
|
||||
import { fileURLToPath } from "url";
|
||||
import { listModels, startModel, stopModel, viewLogs } from "./commands/models.js";
|
||||
import { listPods, removePodCommand, setupPod, switchActivePod } from "./commands/pods.js";
|
||||
import { promptModel } from "./commands/prompt.js";
|
||||
import { getActivePod, loadConfig } from "./config.js";
|
||||
import { sshExecStream } from "./ssh.js";
|
||||
|
||||
// Reconstruct __filename/__dirname, which ESM does not provide, so that
// package.json can be located relative to the compiled module.
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);

// Load our own package.json at startup; used for --version and help output.
// NOTE(review): the "../package.json" path assumes the built file sits one
// level below the package root (e.g. dist/cli.js) — confirm build layout.
const packageJson = JSON.parse(readFileSync(join(__dirname, "../package.json"), "utf-8"));
|
||||
|
||||
/**
 * Print CLI usage text, including the current package version.
 * Invoked when no arguments are given, for --help/-h, and after an
 * unknown command.
 */
function printHelp() {
  // NOTE(review): leading indentation of the help text was reconstructed
  // from a whitespace-mangled copy — verify spacing against the original.
  console.log(`pi v${packageJson.version} - Manage vLLM deployments on GPU pods

Pod Management:
  pi pods setup <name> "<ssh>" --mount "<mount>"  Setup pod with mount command
    Options:
    --vllm release     Install latest vLLM release >=0.10.0 (default)
    --vllm nightly     Install vLLM nightly build (latest features)
    --vllm gpt-oss     Install vLLM 0.10.1+gptoss with PyTorch nightly (GPT-OSS only)
  pi pods                                  List all pods (* = active)
  pi pods active <name>                    Switch active pod
  pi pods remove <name>                    Remove pod from local config
  pi shell [<name>]                        Open shell on pod (active or specified)
  pi ssh [<name>] "<command>"              Run SSH command on pod

Model Management:
  pi start <model> --name <name> [options] Start a model
    --memory <percent>   GPU memory allocation (30%, 50%, 90%)
    --context <size>     Context window (4k, 8k, 16k, 32k, 64k, 128k)
    --gpus <count>       Number of GPUs to use (predefined models only)
    --vllm <args...>     Pass remaining args to vLLM (ignores other options)
  pi stop [<name>]                         Stop model (or all if no name)
  pi list                                  List running models
  pi logs <name>                           Stream model logs
  pi agent <name> ["<message>"...] [options]  Chat with model using agent & tools
  pi agent <name> [options]                Interactive chat mode
    --continue, -c       Continue previous session
    --json               Output as JSONL
    (All pi-agent options are supported)

All model commands support --pod <name> to override the active pod.

Environment:
  HF_TOKEN        HuggingFace token for model downloads
  PI_API_KEY      API key for vLLM endpoints
  PI_CONFIG_DIR   Config directory (default: ~/.pi)`);
}
|
||||
|
||||
// Parse command line arguments (drop "node" and the script path).
const args = process.argv.slice(2);

// No arguments, --help, or -h: show usage and exit successfully.
if (args.length === 0 || args[0] === "--help" || args[0] === "-h") {
  printHelp();
  process.exit(0);
}

// --version / -v: print the bare version string (script-friendly) and exit.
if (args[0] === "--version" || args[0] === "-v") {
  console.log(packageJson.version);
  process.exit(0);
}

const command = args[0];
const subcommand = args[1];

// Main command handler. All async command implementations are awaited here;
// any uncaught error falls through to the catch at the bottom.
try {
  // Handle "pi pods" commands
  if (command === "pods") {
    if (!subcommand) {
      // pi pods - list all pods
      listPods();
    } else if (subcommand === "setup") {
      // pi pods setup <name> "<ssh>" [--mount "<mount>"] [--models-path <path>] [--vllm release|nightly|gpt-oss]
      const name = args[2];
      const sshCmd = args[3];

      if (!name || !sshCmd) {
        console.error(
          'Usage: pi pods setup <name> "<ssh>" [--mount "<mount>"] [--models-path <path>] [--vllm release|nightly|gpt-oss]',
        );
        process.exit(1);
      }

      // Parse options (flag + value pairs, starting after <name> and <ssh>)
      const options: { mount?: string; modelsPath?: string; vllm?: "release" | "nightly" | "gpt-oss" } = {};
      for (let i = 4; i < args.length; i++) {
        if (args[i] === "--mount" && i + 1 < args.length) {
          options.mount = args[i + 1];
          i++; // skip the consumed value
        } else if (args[i] === "--models-path" && i + 1 < args.length) {
          options.modelsPath = args[i + 1];
          i++;
        } else if (args[i] === "--vllm" && i + 1 < args.length) {
          const vllmType = args[i + 1];
          if (vllmType === "release" || vllmType === "nightly" || vllmType === "gpt-oss") {
            options.vllm = vllmType;
          } else {
            console.error(chalk.red(`Invalid vLLM type: ${vllmType}`));
            console.error("Valid options: release, nightly, gpt-oss");
            process.exit(1);
          }
          i++;
        }
      }

      // If --mount provided but no --models-path, try to extract path from mount command
      if (options.mount && !options.modelsPath) {
        // Extract last part of mount command as models path (heuristic: the
        // mount target is conventionally the final absolute-path argument).
        const parts = options.mount.trim().split(" ");
        const lastPart = parts[parts.length - 1];
        if (lastPart?.startsWith("/")) {
          options.modelsPath = lastPart;
        }
      }

      await setupPod(name, sshCmd, options);
    } else if (subcommand === "active") {
      // pi pods active <name>
      const name = args[2];
      if (!name) {
        console.error("Usage: pi pods active <name>");
        process.exit(1);
      }
      switchActivePod(name);
    } else if (subcommand === "remove") {
      // pi pods remove <name>
      const name = args[2];
      if (!name) {
        console.error("Usage: pi pods remove <name>");
        process.exit(1);
      }
      removePodCommand(name);
    } else {
      console.error(`Unknown pods subcommand: ${subcommand}`);
      process.exit(1);
    }
  } else {
    // Parse --pod override for model commands. The flag and its value are
    // removed from args so positional parsing below is unaffected.
    let podOverride: string | undefined;
    const podIndex = args.indexOf("--pod");
    if (podIndex !== -1 && podIndex + 1 < args.length) {
      podOverride = args[podIndex + 1];
      // Remove --pod and its value from args
      args.splice(podIndex, 2);
    }

    // Handle SSH/shell commands and model commands
    switch (command) {
      case "shell": {
        // pi shell [<name>] - open interactive shell
        const podName = args[1];
        let podInfo: { name: string; pod: import("./types.js").Pod } | null = null;

        // Resolve an explicitly named pod, otherwise fall back to the active one.
        if (podName) {
          const config = loadConfig();
          const pod = config.pods[podName];
          if (pod) {
            podInfo = { name: podName, pod };
          }
        } else {
          podInfo = getActivePod();
        }

        if (!podInfo) {
          if (podName) {
            console.error(chalk.red(`Pod '${podName}' not found`));
          } else {
            console.error(chalk.red("No active pod. Use 'pi pods active <name>' to set one."));
          }
          process.exit(1);
        }

        console.log(chalk.green(`Connecting to pod '${podInfo.name}'...`));

        // Execute SSH in interactive mode, inheriting our TTY so the remote
        // shell is fully usable.
        const sshArgs = podInfo.pod.ssh.split(" ").slice(1); // Remove 'ssh' from command
        const sshProcess = spawn("ssh", sshArgs, {
          stdio: "inherit",
          env: process.env,
        });

        // Propagate the remote shell's exit code to our own process.
        sshProcess.on("exit", (code) => {
          process.exit(code || 0);
        });
        break;
      }
      case "ssh": {
        // pi ssh [<name>] "<command>" - run command via SSH
        let podName: string | undefined;
        let sshCommand: string;

        if (args.length === 2) {
          // pi ssh "<command>" - use active pod
          sshCommand = args[1];
        } else if (args.length === 3) {
          // pi ssh <name> "<command>"
          podName = args[1];
          sshCommand = args[2];
        } else {
          console.error('Usage: pi ssh [<name>] "<command>"');
          process.exit(1);
        }

        let podInfo: { name: string; pod: import("./types.js").Pod } | null = null;

        // Same pod-resolution logic as "shell" above.
        if (podName) {
          const config = loadConfig();
          const pod = config.pods[podName];
          if (pod) {
            podInfo = { name: podName, pod };
          }
        } else {
          podInfo = getActivePod();
        }

        if (!podInfo) {
          if (podName) {
            console.error(chalk.red(`Pod '${podName}' not found`));
          } else {
            console.error(chalk.red("No active pod. Use 'pi pods active <name>' to set one."));
          }
          process.exit(1);
        }

        console.log(chalk.gray(`Running on pod '${podInfo.name}': ${sshCommand}`));

        // Execute command and stream output; exit with the remote exit code.
        const exitCode = await sshExecStream(podInfo.pod.ssh, sshCommand);
        process.exit(exitCode);
        break;
      }
      case "start": {
        // pi start <model> --name <name> [options]
        const modelId = args[1];
        if (!modelId) {
          // No model given: show available models instead of erroring.
          const { showKnownModels } = await import("./commands/models.js");
          await showKnownModels();
          process.exit(0);
        }

        // Parse options. Everything after a bare --vllm flag is passed
        // through to vLLM verbatim.
        let name: string | undefined;
        let memory: string | undefined;
        let context: string | undefined;
        let gpus: number | undefined;
        const vllmArgs: string[] = [];
        let inVllmArgs = false;

        for (let i = 2; i < args.length; i++) {
          if (inVllmArgs) {
            vllmArgs.push(args[i]);
          } else if (args[i] === "--name" && i + 1 < args.length) {
            name = args[i + 1];
            i++;
          } else if (args[i] === "--memory" && i + 1 < args.length) {
            memory = args[i + 1];
            i++;
          } else if (args[i] === "--context" && i + 1 < args.length) {
            context = args[i + 1];
            i++;
          } else if (args[i] === "--gpus" && i + 1 < args.length) {
            gpus = parseInt(args[i + 1]);
            if (Number.isNaN(gpus) || gpus < 1) {
              console.error(chalk.red("--gpus must be a positive number"));
              process.exit(1);
            }
            i++;
          } else if (args[i] === "--vllm") {
            inVllmArgs = true;
          }
        }

        if (!name) {
          console.error("--name is required");
          process.exit(1);
        }

        // Warn if --vllm is used with other parameters (custom vLLM args
        // take full control of memory/context/GPU settings).
        if (vllmArgs.length > 0 && (memory || context || gpus)) {
          console.log(
            chalk.yellow("⚠ Warning: --memory, --context, and --gpus are ignored when --vllm is specified"),
          );
          console.log(chalk.yellow("  Using only custom vLLM arguments"));
          console.log("");
        }

        await startModel(modelId, name, {
          pod: podOverride,
          memory,
          context,
          gpus,
          vllmArgs: vllmArgs.length > 0 ? vllmArgs : undefined,
        });
        break;
      }
      case "stop": {
        // pi stop [name] - stop specific model or all models
        const name = args[1];
        if (!name) {
          // Stop all models on the active pod
          const { stopAllModels } = await import("./commands/models.js");
          await stopAllModels({ pod: podOverride });
        } else {
          await stopModel(name, { pod: podOverride });
        }
        break;
      }
      case "list":
        // pi list
        await listModels({ pod: podOverride });
        break;
      case "logs": {
        // pi logs <name>
        const name = args[1];
        if (!name) {
          console.error("Usage: pi logs <name>");
          process.exit(1);
        }
        await viewLogs(name, { pod: podOverride });
        break;
      }
      case "agent": {
        // pi agent <name> [messages...] [options]
        const name = args[1];
        if (!name) {
          console.error("Usage: pi agent <name> [messages...] [options]");
          process.exit(1);
        }

        const apiKey = process.env.PI_API_KEY;

        // Pass all args after the model name through to the agent.
        const agentArgs = args.slice(2);

        // If no messages provided, it's interactive mode
        await promptModel(name, agentArgs, {
          pod: podOverride,
          apiKey,
        }).catch(() => {
          // Error already handled in promptModel, just exit cleanly
          // NOTE(review): this exits 0 on agent failure — confirm callers
          // don't rely on a non-zero exit code here.
          process.exit(0);
        });
        break;
      }
      default:
        console.error(`Unknown command: ${command}`);
        printHelp();
        process.exit(1);
    }
  }
} catch (error) {
  // Last-resort handler for anything the commands did not catch themselves.
  console.error("Error:", error);
  process.exit(1);
}
|
||||
703
packages/pods/src/commands/models.ts
Normal file
703
packages/pods/src/commands/models.ts
Normal file
|
|
@ -0,0 +1,703 @@
|
|||
import chalk from "chalk";
|
||||
import { spawn } from "child_process";
|
||||
import { readFileSync } from "fs";
|
||||
import { dirname, join } from "path";
|
||||
import { fileURLToPath } from "url";
|
||||
import { getActivePod, loadConfig, saveConfig } from "../config.js";
|
||||
import { getModelConfig, getModelName, isKnownModel } from "../model-configs.js";
|
||||
import { sshExec } from "../ssh.js";
|
||||
import type { Pod } from "../types.js";
|
||||
|
||||
/**
|
||||
* Get the pod to use (active or override)
|
||||
*/
|
||||
const getPod = (podOverride?: string): { name: string; pod: Pod } => {
|
||||
if (podOverride) {
|
||||
const config = loadConfig();
|
||||
const pod = config.pods[podOverride];
|
||||
if (!pod) {
|
||||
console.error(chalk.red(`Pod '${podOverride}' not found`));
|
||||
process.exit(1);
|
||||
}
|
||||
return { name: podOverride, pod };
|
||||
}
|
||||
|
||||
const active = getActivePod();
|
||||
if (!active) {
|
||||
console.error(chalk.red("No active pod. Use 'pi pods active <name>' to set one."));
|
||||
process.exit(1);
|
||||
}
|
||||
return active;
|
||||
};
|
||||
|
||||
/**
|
||||
* Find next available port starting from 8001
|
||||
*/
|
||||
const getNextPort = (pod: Pod): number => {
|
||||
const usedPorts = Object.values(pod.models).map((m) => m.port);
|
||||
let port = 8001;
|
||||
while (usedPorts.includes(port)) {
|
||||
port++;
|
||||
}
|
||||
return port;
|
||||
};
|
||||
|
||||
/**
|
||||
* Select GPUs for model deployment (round-robin)
|
||||
*/
|
||||
const selectGPUs = (pod: Pod, count: number = 1): number[] => {
|
||||
if (count === pod.gpus.length) {
|
||||
// Use all GPUs
|
||||
return pod.gpus.map((g) => g.id);
|
||||
}
|
||||
|
||||
// Count GPU usage across all models
|
||||
const gpuUsage = new Map<number, number>();
|
||||
for (const gpu of pod.gpus) {
|
||||
gpuUsage.set(gpu.id, 0);
|
||||
}
|
||||
|
||||
for (const model of Object.values(pod.models)) {
|
||||
for (const gpuId of model.gpu) {
|
||||
gpuUsage.set(gpuId, (gpuUsage.get(gpuId) || 0) + 1);
|
||||
}
|
||||
}
|
||||
|
||||
// Sort GPUs by usage (least used first)
|
||||
const sortedGPUs = Array.from(gpuUsage.entries())
|
||||
.sort((a, b) => a[1] - b[1])
|
||||
.map((entry) => entry[0]);
|
||||
|
||||
// Return the least used GPUs
|
||||
return sortedGPUs.slice(0, count);
|
||||
};
|
||||
|
||||
/**
 * Start a model on a pod.
 *
 * Flow: resolve the target pod, pick a free port and a GPU set, render the
 * bundled model_run.sh template, upload it over SSH, launch it detached
 * (setsid, under `script` so a colorized log file is produced), then tail
 * the log until vLLM prints "Application startup complete" or the user
 * interrupts.
 *
 * @param modelId Model identifier (HuggingFace id or a key known to
 *                model-configs).
 * @param name    Local deployment name; must be unique on the pod.
 * @param options pod: override the active pod; vllmArgs: raw vLLM args that
 *                bypass memory/context/gpus handling; memory: GPU memory
 *                percentage string like "90%"; context: context window like
 *                "32k" or a raw token count; gpus: GPU count (predefined
 *                models only).
 */
export const startModel = async (
  modelId: string,
  name: string,
  options: {
    pod?: string;
    vllmArgs?: string[];
    memory?: string;
    context?: string;
    gpus?: number;
  },
) => {
  const { name: podName, pod } = getPod(options.pod);

  // Validation: pod must have a models path and the name must be unused.
  if (!pod.modelsPath) {
    console.error(chalk.red("Pod does not have a models path configured"));
    process.exit(1);
  }
  if (pod.models[name]) {
    console.error(chalk.red(`Model '${name}' already exists on pod '${podName}'`));
    process.exit(1);
  }

  const port = getNextPort(pod);

  // Determine GPU allocation and vLLM args
  let gpus: number[] = [];
  let vllmArgs: string[] = [];
  let modelConfig = null;

  if (options.vllmArgs?.length) {
    // Custom args override everything; GPU selection is left to vLLM.
    vllmArgs = options.vllmArgs;
    console.log(chalk.gray("Using custom vLLM args, GPU allocation managed by vLLM"));
  } else if (isKnownModel(modelId)) {
    // Handle --gpus parameter for known models
    if (options.gpus) {
      // Validate GPU count against what the pod actually has.
      if (options.gpus > pod.gpus.length) {
        console.error(chalk.red(`Error: Requested ${options.gpus} GPUs but pod only has ${pod.gpus.length}`));
        process.exit(1);
      }

      // Try to find a predefined config for the requested GPU count.
      modelConfig = getModelConfig(modelId, pod.gpus, options.gpus);
      if (modelConfig) {
        gpus = selectGPUs(pod, options.gpus);
        vllmArgs = [...(modelConfig.args || [])];
      } else {
        console.error(
          chalk.red(`Model '${getModelName(modelId)}' does not have a configuration for ${options.gpus} GPU(s)`),
        );
        console.error(chalk.yellow("Available configurations:"));

        // Show which GPU counts DO have a configuration for this model.
        for (let gpuCount = 1; gpuCount <= pod.gpus.length; gpuCount++) {
          const config = getModelConfig(modelId, pod.gpus, gpuCount);
          if (config) {
            console.error(chalk.gray(`  - ${gpuCount} GPU(s)`));
          }
        }
        process.exit(1);
      }
    } else {
      // Find best config for this hardware (original behavior): prefer the
      // largest GPU count that has a predefined configuration.
      for (let gpuCount = pod.gpus.length; gpuCount >= 1; gpuCount--) {
        modelConfig = getModelConfig(modelId, pod.gpus, gpuCount);
        if (modelConfig) {
          gpus = selectGPUs(pod, gpuCount);
          vllmArgs = [...(modelConfig.args || [])];
          break;
        }
      }
      if (!modelConfig) {
        console.error(chalk.red(`Model '${getModelName(modelId)}' not compatible with this pod's GPUs`));
        process.exit(1);
      }
    }
  } else {
    // Unknown model: --gpus is rejected, single-GPU deployment is assumed.
    if (options.gpus) {
      console.error(chalk.red("Error: --gpus can only be used with predefined models"));
      console.error(chalk.yellow("For custom models, use --vllm with tensor-parallel-size or similar arguments"));
      process.exit(1);
    }
    // Single GPU default
    gpus = selectGPUs(pod, 1);
    console.log(chalk.gray("Unknown model, defaulting to single GPU"));
  }

  // Apply memory/context overrides (skipped entirely when raw --vllm args
  // were supplied, matching the warning printed by the CLI layer).
  if (!options.vllmArgs?.length) {
    if (options.memory) {
      // "90%" -> 0.9; replaces any preset --gpu-memory-utilization value.
      const fraction = parseFloat(options.memory.replace("%", "")) / 100;
      vllmArgs = vllmArgs.filter((arg) => !arg.includes("gpu-memory-utilization"));
      vllmArgs.push("--gpu-memory-utilization", String(fraction));
    }
    if (options.context) {
      // Map shorthand sizes to token counts; fall back to parsing a number.
      const contextSizes: Record<string, number> = {
        "4k": 4096,
        "8k": 8192,
        "16k": 16384,
        "32k": 32768,
        "64k": 65536,
        "128k": 131072,
      };
      const maxTokens = contextSizes[options.context.toLowerCase()] || parseInt(options.context);
      vllmArgs = vllmArgs.filter((arg) => !arg.includes("max-model-len"));
      vllmArgs.push("--max-model-len", String(maxTokens));
    }
  }

  // Show what we're doing
  console.log(chalk.green(`Starting model '${name}' on pod '${podName}'...`));
  console.log(`Model: ${modelId}`);
  console.log(`Port: ${port}`);
  console.log(`GPU(s): ${gpus.length ? gpus.join(", ") : "Managed by vLLM"}`);
  if (modelConfig?.notes) console.log(chalk.yellow(`Note: ${modelConfig.notes}`));
  console.log("");

  // Read and customize model_run.sh script with our values
  const scriptPath = join(dirname(fileURLToPath(import.meta.url)), "../../scripts/model_run.sh");
  let scriptContent = readFileSync(scriptPath, "utf-8");

  // Replace placeholders - no escaping needed, heredoc with 'EOF' is literal
  scriptContent = scriptContent
    .replace("{{MODEL_ID}}", modelId)
    .replace("{{NAME}}", name)
    .replace("{{PORT}}", String(port))
    .replace("{{VLLM_ARGS}}", vllmArgs.join(" "));

  // Upload customized script via a quoted heredoc (no shell expansion).
  // NOTE(review): `result` is never inspected — an upload failure is not
  // detected here; confirm whether sshExec throws on failure.
  const result = await sshExec(
    pod.ssh,
    `cat > /tmp/model_run_${name}.sh << 'EOF'
${scriptContent}
EOF
chmod +x /tmp/model_run_${name}.sh`,
  );

  // Prepare environment exports for the remote launch.
  // NOTE(review): when HF_TOKEN / PI_API_KEY are unset locally this exports
  // the literal string 'undefined' — confirm intended.
  const env = [
    `HF_TOKEN='${process.env.HF_TOKEN}'`,
    `PI_API_KEY='${process.env.PI_API_KEY}'`,
    `HF_HUB_ENABLE_HF_TRANSFER=1`,
    `VLLM_NO_USAGE_STATS=1`,
    `PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True`,
    `FORCE_COLOR=1`,
    `TERM=xterm-256color`,
    // Pin a single GPU via CUDA_VISIBLE_DEVICES; multi-GPU is left to vLLM.
    ...(gpus.length === 1 ? [`CUDA_VISIBLE_DEVICES=${gpus[0]}`] : []),
    ...Object.entries(modelConfig?.env || {}).map(([k, v]) => `${k}='${v}'`),
  ]
    .map((e) => `export ${e}`)
    .join("\n");

  // Start the model runner with script command for pseudo-TTY (preserves colors)
  // Note: We use script to preserve colors and create a log file
  // setsid creates a new session so it survives SSH disconnection
  const startCmd = `
${env}
mkdir -p ~/.vllm_logs
# Create a wrapper that monitors the script command
cat > /tmp/model_wrapper_${name}.sh << 'WRAPPER'
#!/bin/bash
script -q -f -c "/tmp/model_run_${name}.sh" ~/.vllm_logs/${name}.log
exit_code=$?
echo "Script exited with code $exit_code" >> ~/.vllm_logs/${name}.log
exit $exit_code
WRAPPER
chmod +x /tmp/model_wrapper_${name}.sh
setsid /tmp/model_wrapper_${name}.sh </dev/null >/dev/null 2>&1 &
echo $!
exit 0
`;

  // The only stdout of startCmd is `echo $!` — the wrapper's PID.
  const pidResult = await sshExec(pod.ssh, startCmd);
  const pid = parseInt(pidResult.stdout.trim());
  if (!pid) {
    console.error(chalk.red("Failed to start model runner"));
    process.exit(1);
  }

  // Save to config so list/stop/logs can find this deployment later.
  const config = loadConfig();
  config.pods[podName].models[name] = { model: modelId, port, gpu: gpus, pid };
  saveConfig(config);

  console.log(`Model runner started with PID: ${pid}`);
  console.log("Streaming logs... (waiting for startup)\n");

  // Small delay to ensure log file is created
  await new Promise((resolve) => setTimeout(resolve, 500));

  // Stream logs with color support, watching for startup complete
  const sshParts = pod.ssh.split(" ");
  const sshCommand = sshParts[0]; // "ssh"
  const sshArgs = sshParts.slice(1); // e.g. ["root@<host>"]
  const host = sshArgs[0].split("@")[1] || "localhost";
  const tailCmd = `tail -f ~/.vllm_logs/${name}.log`;

  // Build the full args array for spawn
  const fullArgs = [...sshArgs, tailCmd];

  const logProcess = spawn(sshCommand, fullArgs, {
    stdio: ["inherit", "pipe", "pipe"], // capture stdout and stderr
    env: { ...process.env, FORCE_COLOR: "1" },
  });

  let interrupted = false;
  let startupComplete = false;

  // Handle Ctrl+C: stop tailing locally; the remote model keeps running.
  const sigintHandler = () => {
    interrupted = true;
    logProcess.kill();
  };
  process.on("SIGINT", sigintHandler);

  // Process log output line by line, echoing and watching for the vLLM
  // startup marker.
  const processOutput = (data: Buffer) => {
    const lines = data.toString().split("\n");
    for (const line of lines) {
      if (line) {
        console.log(line); // Echo the line to console

        // Check for startup complete message
        if (line.includes("Application startup complete")) {
          startupComplete = true;
          logProcess.kill(); // Stop tailing logs
        }
      }
    }
  };

  logProcess.stdout?.on("data", processOutput);
  logProcess.stderr?.on("data", processOutput);

  // Wait for the tail process to end, then stop intercepting Ctrl+C.
  await new Promise<void>((resolve) => logProcess.on("exit", resolve));
  process.removeListener("SIGINT", sigintHandler);

  if (startupComplete) {
    // Model started successfully - output connection details
    console.log("\n" + chalk.green("✓ Model started successfully!"));
    console.log("\n" + chalk.bold("Connection Details:"));
    console.log(chalk.cyan("─".repeat(50)));
    console.log(chalk.white("Base URL: ") + chalk.yellow(`http://${host}:${port}/v1`));
    console.log(chalk.white("Model: ") + chalk.yellow(modelId));
    console.log(chalk.white("API Key: ") + chalk.yellow(process.env.PI_API_KEY || "(not set)"));
    console.log(chalk.cyan("─".repeat(50)));

    console.log("\n" + chalk.bold("Export for shell:"));
    console.log(chalk.gray(`export OPENAI_BASE_URL="http://${host}:${port}/v1"`));
    console.log(chalk.gray(`export OPENAI_API_KEY="${process.env.PI_API_KEY || "your-api-key"}"`));
    console.log(chalk.gray(`export OPENAI_MODEL="${modelId}"`));

    console.log("\n" + chalk.bold("Example usage:"));
    console.log(
      chalk.gray(`
# Python
from openai import OpenAI
client = OpenAI()  # Uses env vars
response = client.chat.completions.create(
  model="${modelId}",
  messages=[{"role": "user", "content": "Hello!"}]
)

# CLI
curl $OPENAI_BASE_URL/chat/completions \\
  -H "Authorization: Bearer $OPENAI_API_KEY" \\
  -H "Content-Type: application/json" \\
  -d '{"model":"${modelId}","messages":[{"role":"user","content":"Hi"}]}'`),
    );
    console.log("");
    console.log(chalk.cyan(`Chat with model: pi agent ${name} "Your message"`));
    console.log(chalk.cyan(`Interactive mode: pi agent ${name} -i`));
    console.log(chalk.cyan(`Monitor logs: pi logs ${name}`));
    console.log(chalk.cyan(`Stop model: pi stop ${name}`));
  } else if (interrupted) {
    console.log(chalk.yellow("\n\nStopped monitoring. Model deployment continues in background."));
    console.log(chalk.cyan(`Chat with model: pi agent ${name} "Your message"`));
    console.log(chalk.cyan(`Check status: pi logs ${name}`));
    console.log(chalk.cyan(`Stop model: pi stop ${name}`));
  } else {
    console.log(chalk.yellow("\n\nLog stream ended. Model may still be running."));
    console.log(chalk.cyan(`Chat with model: pi agent ${name} "Your message"`));
    console.log(chalk.cyan(`Check status: pi logs ${name}`));
    console.log(chalk.cyan(`Stop model: pi stop ${name}`));
  }
};
|
||||
|
||||
/**
|
||||
* Stop a model
|
||||
*/
|
||||
export const stopModel = async (name: string, options: { pod?: string }) => {
|
||||
const { name: podName, pod } = getPod(options.pod);
|
||||
|
||||
const model = pod.models[name];
|
||||
if (!model) {
|
||||
console.error(chalk.red(`Model '${name}' not found on pod '${podName}'`));
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
console.log(chalk.yellow(`Stopping model '${name}' on pod '${podName}'...`));
|
||||
|
||||
// Kill the script process and all its children
|
||||
// Using pkill to kill the process and all children
|
||||
const killCmd = `
|
||||
# Kill the script process and all its children
|
||||
pkill -TERM -P ${model.pid} 2>/dev/null || true
|
||||
kill ${model.pid} 2>/dev/null || true
|
||||
`;
|
||||
await sshExec(pod.ssh, killCmd);
|
||||
|
||||
// Remove from config
|
||||
const config = loadConfig();
|
||||
delete config.pods[podName].models[name];
|
||||
saveConfig(config);
|
||||
|
||||
console.log(chalk.green(`✓ Model '${name}' stopped`));
|
||||
};
|
||||
|
||||
/**
|
||||
* Stop all models on a pod
|
||||
*/
|
||||
export const stopAllModels = async (options: { pod?: string }) => {
|
||||
const { name: podName, pod } = getPod(options.pod);
|
||||
|
||||
const modelNames = Object.keys(pod.models);
|
||||
if (modelNames.length === 0) {
|
||||
console.log(`No models running on pod '${podName}'`);
|
||||
return;
|
||||
}
|
||||
|
||||
console.log(chalk.yellow(`Stopping ${modelNames.length} model(s) on pod '${podName}'...`));
|
||||
|
||||
// Kill all script processes and their children
|
||||
const pids = Object.values(pod.models).map((m) => m.pid);
|
||||
const killCmd = `
|
||||
for PID in ${pids.join(" ")}; do
|
||||
pkill -TERM -P $PID 2>/dev/null || true
|
||||
kill $PID 2>/dev/null || true
|
||||
done
|
||||
`;
|
||||
await sshExec(pod.ssh, killCmd);
|
||||
|
||||
// Clear all models from config
|
||||
const config = loadConfig();
|
||||
config.pods[podName].models = {};
|
||||
saveConfig(config);
|
||||
|
||||
console.log(chalk.green(`✓ Stopped all models: ${modelNames.join(", ")}`));
|
||||
};
|
||||
|
||||
/**
 * List the models recorded for a pod and verify each one's live state over
 * SSH, reporting running / starting / crashed / dead per model.
 */
export const listModels = async (options: { pod?: string }) => {
  const { name: podName, pod } = getPod(options.pod);

  const modelNames = Object.keys(pod.models);
  if (modelNames.length === 0) {
    console.log(`No models running on pod '${podName}'`);
    return;
  }

  // Get pod SSH host for URL display (the part after "@" in the ssh command).
  const sshParts = pod.ssh.split(" ");
  const host = sshParts.find((p) => p.includes("@"))?.split("@")[1] || "unknown";

  console.log(`Models on pod '${chalk.bold(podName)}':`);
  for (const name of modelNames) {
    const model = pod.models[name];
    // Human-readable GPU assignment ("GPU 0", "GPUs 0,1", or unknown when
    // allocation was left to vLLM).
    const gpuStr =
      model.gpu.length > 1
        ? `GPUs ${model.gpu.join(",")}`
        : model.gpu.length === 1
          ? `GPU ${model.gpu[0]}`
          : "GPU unknown";
    console.log(`  ${chalk.green(name)} - Port ${model.port} - ${gpuStr} - PID ${model.pid}`);
    console.log(`    Model: ${chalk.gray(model.model)}`);
    console.log(`    URL: ${chalk.cyan(`http://${host}:${model.port}/v1`)}`);
  }

  // Optionally verify processes are still running.
  // NOTE(review): one SSH round-trip per model — sequential by design?
  console.log("");
  console.log("Verifying processes...");
  let anyDead = false;
  for (const name of modelNames) {
    const model = pod.models[name];
    // Check both the wrapper process and whether vLLM answers its /health
    // endpoint; fall back to scanning the log tail for crash markers.
    const checkCmd = `
# Check if wrapper process exists
if ps -p ${model.pid} > /dev/null 2>&1; then
# Process exists, now check if vLLM is responding
if curl -s -f http://localhost:${model.port}/health > /dev/null 2>&1; then
echo "running"
else
# Check if it's still starting up
if tail -n 20 ~/.vllm_logs/${name}.log 2>/dev/null | grep -q "ERROR\\|Failed\\|Cuda error\\|died"; then
echo "crashed"
else
echo "starting"
fi
fi
else
echo "dead"
fi
`;
    const result = await sshExec(pod.ssh, checkCmd);
    const status = result.stdout.trim();
    if (status === "dead") {
      console.log(chalk.red(`  ${name}: Process ${model.pid} is not running`));
      anyDead = true;
    } else if (status === "crashed") {
      console.log(chalk.red(`  ${name}: vLLM crashed (check logs with 'pi logs ${name}')`));
      anyDead = true;
    } else if (status === "starting") {
      console.log(chalk.yellow(`  ${name}: Still starting up...`));
    }
    // "running" is intentionally silent; the summary below covers it.
  }

  if (anyDead) {
    console.log("");
    console.log(chalk.yellow("Some models are not running. Clean up with:"));
    console.log(chalk.cyan("  pi stop <name>"));
  } else {
    console.log(chalk.green("✓ All processes verified"));
  }
};
|
||||
|
||||
/**
|
||||
* View model logs
|
||||
*/
|
||||
export const viewLogs = async (name: string, options: { pod?: string }) => {
|
||||
const { name: podName, pod } = getPod(options.pod);
|
||||
|
||||
const model = pod.models[name];
|
||||
if (!model) {
|
||||
console.error(chalk.red(`Model '${name}' not found on pod '${podName}'`));
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
console.log(chalk.green(`Streaming logs for '${name}' on pod '${podName}'...`));
|
||||
console.log(chalk.gray("Press Ctrl+C to stop"));
|
||||
console.log("");
|
||||
|
||||
// Stream logs with color preservation
|
||||
const sshParts = pod.ssh.split(" ");
|
||||
const sshCommand = sshParts[0]; // "ssh"
|
||||
const sshArgs = sshParts.slice(1); // ["root@86.38.238.55"]
|
||||
const tailCmd = `tail -f ~/.vllm_logs/${name}.log`;
|
||||
|
||||
const logProcess = spawn(sshCommand, [...sshArgs, tailCmd], {
|
||||
stdio: "inherit",
|
||||
env: {
|
||||
...process.env,
|
||||
FORCE_COLOR: "1",
|
||||
},
|
||||
});
|
||||
|
||||
// Wait for process to exit
|
||||
await new Promise<void>((resolve) => {
|
||||
logProcess.on("exit", () => resolve());
|
||||
});
|
||||
};
|
||||
|
||||
/**
 * Show known models and their hardware requirements.
 *
 * When an active pod is configured, models are split into "compatible"
 * (a config exists that fits the pod's GPU count/type) and "incompatible"
 * groups; otherwise all models are listed with their minimum hardware.
 * Purely informational — prints to stdout, never mutates state.
 */
export const showKnownModels = async () => {
	// NOTE(review): `assert { type: "json" }` import attributes are deprecated
	// in newer Node versions in favor of `with { type: "json" }` — confirm the
	// supported Node range before changing.
	const modelsJson = await import("../models.json", { assert: { type: "json" } });
	const models = modelsJson.default.models;

	// Get active pod info if available
	const activePod = getActivePod();
	let podGpuCount = 0;
	let podGpuType = "";

	if (activePod) {
		podGpuCount = activePod.pod.gpus.length;
		// Extract GPU type from name (e.g., "NVIDIA H200" -> "H200")
		podGpuType = activePod.pod.gpus[0]?.name?.replace("NVIDIA", "")?.trim()?.split(" ")[0] || "";

		console.log(chalk.bold(`Known Models for ${activePod.name} (${podGpuCount}x ${podGpuType || "GPU"}):\n`));
	} else {
		console.log(chalk.bold("Known Models:\n"));
		console.log(chalk.yellow("No active pod. Use 'pi pods active <name>' to filter compatible models.\n"));
	}

	console.log("Usage: pi start <model> --name <name> [options]\n");

	// Group models by compatibility and family. Keys are model families
	// (first dash-separated token of the display name), values are entries
	// to render.
	const compatible: Record<string, Array<{ id: string; name: string; config: string; notes?: string }>> = {};
	const incompatible: Record<string, Array<{ id: string; name: string; minGpu: string; notes?: string }>> = {};

	for (const [modelId, info] of Object.entries(models)) {
		const modelInfo = info as any;
		const family = modelInfo.name.split("-")[0] || "Other";

		let isCompatible = false;
		let compatibleConfig = "";
		let minGpu = "Unknown";
		let minNotes: string | undefined;

		if (modelInfo.configs && modelInfo.configs.length > 0) {
			// Sort configs by GPU count to find minimum
			const sortedConfigs = [...modelInfo.configs].sort((a: any, b: any) => (a.gpuCount || 1) - (b.gpuCount || 1));

			// Find minimum requirements
			const minConfig = sortedConfigs[0];
			const minGpuCount = minConfig.gpuCount || 1;
			const gpuTypes = minConfig.gpuTypes?.join("/") || "H100/H200";

			if (minGpuCount === 1) {
				minGpu = `1x ${gpuTypes}`;
			} else {
				minGpu = `${minGpuCount}x ${gpuTypes}`;
			}

			minNotes = minConfig.notes || modelInfo.notes;

			// Check compatibility with active pod
			if (activePod && podGpuCount > 0) {
				// Find best matching config for this pod (smallest GPU count
				// that fits, thanks to the sort above).
				for (const config of sortedConfigs) {
					const configGpuCount = config.gpuCount || 1;
					const configGpuTypes = config.gpuTypes || [];

					// Check if we have enough GPUs
					if (configGpuCount <= podGpuCount) {
						// Check if GPU type matches (if specified); substring
						// match in both directions tolerates partial names.
						if (
							configGpuTypes.length === 0 ||
							configGpuTypes.some((type: string) => podGpuType.includes(type) || type.includes(podGpuType))
						) {
							isCompatible = true;
							if (configGpuCount === 1) {
								compatibleConfig = `1x ${podGpuType}`;
							} else {
								compatibleConfig = `${configGpuCount}x ${podGpuType}`;
							}
							minNotes = config.notes || modelInfo.notes;
							break;
						}
					}
				}
			}
		}

		const modelEntry = {
			id: modelId,
			name: modelInfo.name,
			notes: minNotes,
		};

		if (activePod && isCompatible) {
			if (!compatible[family]) {
				compatible[family] = [];
			}
			compatible[family].push({ ...modelEntry, config: compatibleConfig });
		} else {
			if (!incompatible[family]) {
				incompatible[family] = [];
			}
			incompatible[family].push({ ...modelEntry, minGpu });
		}
	}

	// Display compatible models first
	if (activePod && Object.keys(compatible).length > 0) {
		console.log(chalk.green.bold("✓ Compatible Models:\n"));

		const sortedFamilies = Object.keys(compatible).sort();
		for (const family of sortedFamilies) {
			console.log(chalk.cyan(`${family} Models:`));

			const modelList = compatible[family].sort((a, b) => a.name.localeCompare(b.name));

			for (const model of modelList) {
				console.log(`  ${chalk.green(model.id)}`);
				console.log(`    Name: ${model.name}`);
				console.log(`    Config: ${model.config}`);
				if (model.notes) {
					console.log(chalk.gray(`    Note: ${model.notes}`));
				}
				console.log("");
			}
		}
	}

	// Display incompatible models
	if (Object.keys(incompatible).length > 0) {
		if (activePod && Object.keys(compatible).length > 0) {
			console.log(chalk.red.bold("✗ Incompatible Models (need more/different GPUs):\n"));
		}

		const sortedFamilies = Object.keys(incompatible).sort();
		for (const family of sortedFamilies) {
			if (!activePod) {
				console.log(chalk.cyan(`${family} Models:`));
			} else {
				console.log(chalk.gray(`${family} Models:`));
			}

			const modelList = incompatible[family].sort((a, b) => a.name.localeCompare(b.name));

			for (const model of modelList) {
				const color = activePod ? chalk.gray : chalk.green;
				console.log(`  ${color(model.id)}`);
				console.log(chalk.gray(`    Name: ${model.name}`));
				console.log(chalk.gray(`    Min Hardware: ${model.minGpu}`));
				if (model.notes && !activePod) {
					console.log(chalk.gray(`    Note: ${model.notes}`));
				}
				// NOTE(review): both branches below print the same empty line;
				// the if/else is redundant and could be collapsed.
				if (activePod) {
					console.log(""); // Less verbose for incompatible models when filtered
				} else {
					console.log("");
				}
			}
		}
	}

	console.log(chalk.gray("\nFor unknown models, defaults to single GPU deployment."));
	console.log(chalk.gray("Use --vllm to pass custom arguments to vLLM."));
};
|
||||
205
packages/pods/src/commands/pods.ts
Normal file
205
packages/pods/src/commands/pods.ts
Normal file
|
|
@ -0,0 +1,205 @@
|
|||
import chalk from "chalk";
|
||||
import { dirname, join } from "path";
|
||||
import { fileURLToPath } from "url";
|
||||
import { addPod, loadConfig, removePod, setActivePod } from "../config.js";
|
||||
import { scpFile, sshExec, sshExecStream } from "../ssh.js";
|
||||
import type { GPU, Pod } from "../types.js";
|
||||
|
||||
const __filename = fileURLToPath(import.meta.url);
|
||||
const __dirname = dirname(__filename);
|
||||
|
||||
/**
|
||||
* List all pods
|
||||
*/
|
||||
export const listPods = () => {
|
||||
const config = loadConfig();
|
||||
const podNames = Object.keys(config.pods);
|
||||
|
||||
if (podNames.length === 0) {
|
||||
console.log("No pods configured. Use 'pi pods setup' to add a pod.");
|
||||
return;
|
||||
}
|
||||
|
||||
console.log("Configured pods:");
|
||||
for (const name of podNames) {
|
||||
const pod = config.pods[name];
|
||||
const isActive = config.active === name;
|
||||
const marker = isActive ? chalk.green("*") : " ";
|
||||
const gpuCount = pod.gpus?.length || 0;
|
||||
const gpuInfo = gpuCount > 0 ? `${gpuCount}x ${pod.gpus[0].name}` : "no GPUs detected";
|
||||
const vllmInfo = pod.vllmVersion ? ` (vLLM: ${pod.vllmVersion})` : "";
|
||||
console.log(`${marker} ${chalk.bold(name)} - ${gpuInfo}${vllmInfo} - ${pod.ssh}`);
|
||||
if (pod.modelsPath) {
|
||||
console.log(` Models: ${pod.modelsPath}`);
|
||||
}
|
||||
if (pod.vllmVersion === "gpt-oss") {
|
||||
console.log(chalk.yellow(` ⚠️ GPT-OSS build - only for GPT-OSS models`));
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
/**
 * Setup a new pod.
 *
 * Validates required environment variables (HF_TOKEN, PI_API_KEY),
 * determines the models path, tests the SSH connection, copies and runs
 * the remote setup script, detects the GPU configuration via nvidia-smi,
 * and finally persists the pod in the local config (first pod becomes
 * active — see addPod).
 *
 * @param name    Local name for the pod.
 * @param sshCmd  Full SSH invocation, e.g. "ssh root@1.2.3.4".
 * @param options mount command, explicit models path, and vLLM flavor.
 *                Exits the process on any validation or setup failure.
 */
export const setupPod = async (
	name: string,
	sshCmd: string,
	options: { mount?: string; modelsPath?: string; vllm?: "release" | "nightly" | "gpt-oss" },
) => {
	// Validate environment variables
	const hfToken = process.env.HF_TOKEN;
	const vllmApiKey = process.env.PI_API_KEY;

	if (!hfToken) {
		console.error(chalk.red("ERROR: HF_TOKEN environment variable is required"));
		console.error("Get a token from: https://huggingface.co/settings/tokens");
		console.error("Then run: export HF_TOKEN=your_token_here");
		process.exit(1);
	}

	if (!vllmApiKey) {
		console.error(chalk.red("ERROR: PI_API_KEY environment variable is required"));
		console.error("Set an API key: export PI_API_KEY=your_api_key_here");
		process.exit(1);
	}

	// Determine models path
	let modelsPath = options.modelsPath;
	if (!modelsPath && options.mount) {
		// Extract path from mount command if not explicitly provided
		// e.g., "mount -t nfs ... /mnt/sfs" -> "/mnt/sfs"
		const parts = options.mount.split(" ");
		modelsPath = parts[parts.length - 1];
	}

	if (!modelsPath) {
		console.error(chalk.red("ERROR: --models-path is required (or must be extractable from --mount)"));
		process.exit(1);
	}

	console.log(chalk.green(`Setting up pod '${name}'...`));
	console.log(`SSH: ${sshCmd}`);
	console.log(`Models path: ${modelsPath}`);
	console.log(
		`vLLM version: ${options.vllm || "release"} ${options.vllm === "gpt-oss" ? chalk.yellow("(GPT-OSS special build)") : ""}`,
	);
	if (options.mount) {
		console.log(`Mount command: ${options.mount}`);
	}
	console.log("");

	// Test SSH connection
	console.log("Testing SSH connection...");
	const testResult = await sshExec(sshCmd, "echo 'SSH OK'");
	if (testResult.exitCode !== 0) {
		console.error(chalk.red("Failed to connect via SSH"));
		console.error(testResult.stderr);
		process.exit(1);
	}
	console.log(chalk.green("✓ SSH connection successful"));

	// Copy setup script (shipped with the package, two levels up from
	// the compiled commands directory).
	console.log("Copying setup script...");
	const scriptPath = join(__dirname, "../../scripts/pod_setup.sh");
	const success = await scpFile(sshCmd, scriptPath, "/tmp/pod_setup.sh");
	if (!success) {
		console.error(chalk.red("Failed to copy setup script"));
		process.exit(1);
	}
	console.log(chalk.green("✓ Setup script copied"));

	// Build setup command.
	// NOTE(review): tokens and the mount command are single-quoted but not
	// escaped — a value containing a single quote breaks (or injects into)
	// the remote shell command. Confirm inputs can never contain quotes,
	// or escape them.
	let setupCmd = `bash /tmp/pod_setup.sh --models-path '${modelsPath}' --hf-token '${hfToken}' --vllm-api-key '${vllmApiKey}'`;
	if (options.mount) {
		setupCmd += ` --mount '${options.mount}'`;
	}
	// Add vLLM version flag
	const vllmVersion = options.vllm || "release";
	setupCmd += ` --vllm '${vllmVersion}'`;

	// Run setup script
	console.log("");
	console.log(chalk.yellow("Running setup (this will take 2-5 minutes)..."));
	console.log("");

	// Use forceTTY to preserve colors from apt, pip, etc.
	const exitCode = await sshExecStream(sshCmd, setupCmd, { forceTTY: true });
	if (exitCode !== 0) {
		console.error(chalk.red("\nSetup failed. Check the output above for errors."));
		process.exit(1);
	}

	// Parse GPU info from setup output
	console.log("");
	console.log("Detecting GPU configuration...");
	const gpuResult = await sshExec(sshCmd, "nvidia-smi --query-gpu=index,name,memory.total --format=csv,noheader");

	// GPU detection is best-effort: a failed nvidia-smi just yields an
	// empty list rather than aborting setup.
	const gpus: GPU[] = [];
	if (gpuResult.exitCode === 0 && gpuResult.stdout) {
		const lines = gpuResult.stdout.trim().split("\n");
		for (const line of lines) {
			// CSV row: "index, name, memory.total".
			// NOTE: `name` here shadows the `name` parameter (pod name).
			const [id, name, memory] = line.split(",").map((s) => s.trim());
			if (id !== undefined) {
				gpus.push({
					id: parseInt(id),
					name: name || "Unknown",
					memory: memory || "Unknown",
				});
			}
		}
	}

	console.log(chalk.green(`✓ Detected ${gpus.length} GPU(s)`));
	for (const gpu of gpus) {
		console.log(`  GPU ${gpu.id}: ${gpu.name} (${gpu.memory})`);
	}

	// Save pod configuration
	const pod: Pod = {
		ssh: sshCmd,
		gpus,
		models: {},
		modelsPath,
		vllmVersion: options.vllm || "release",
	};

	addPod(name, pod);
	console.log("");
	console.log(chalk.green(`✓ Pod '${name}' setup complete and set as active pod`));
	console.log("");
	console.log("You can now deploy models with:");
	console.log(chalk.cyan(` pi start <model> --name <name>`));
};
|
||||
|
||||
/**
|
||||
* Switch active pod
|
||||
*/
|
||||
export const switchActivePod = (name: string) => {
|
||||
const config = loadConfig();
|
||||
if (!config.pods[name]) {
|
||||
console.error(chalk.red(`Pod '${name}' not found`));
|
||||
console.log("\nAvailable pods:");
|
||||
for (const podName of Object.keys(config.pods)) {
|
||||
console.log(` ${podName}`);
|
||||
}
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
setActivePod(name);
|
||||
console.log(chalk.green(`✓ Switched active pod to '${name}'`));
|
||||
};
|
||||
|
||||
/**
|
||||
* Remove a pod from config
|
||||
*/
|
||||
export const removePodCommand = (name: string) => {
|
||||
const config = loadConfig();
|
||||
if (!config.pods[name]) {
|
||||
console.error(chalk.red(`Pod '${name}' not found`));
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
removePod(name);
|
||||
console.log(chalk.green(`✓ Removed pod '${name}' from configuration`));
|
||||
console.log(chalk.yellow("Note: This only removes the local configuration. The remote pod is not affected."));
|
||||
};
|
||||
85
packages/pods/src/commands/prompt.ts
Normal file
85
packages/pods/src/commands/prompt.ts
Normal file
|
|
@ -0,0 +1,85 @@
|
|||
import { main as agentMain } from "@mariozechner/pi-agent";
|
||||
import chalk from "chalk";
|
||||
import { getActivePod, loadConfig } from "../config.js";
|
||||
|
||||
// ────────────────────────────────────────────────────────────────────────────────
|
||||
// Types
|
||||
// ────────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
// Options accepted by promptModel.
interface PromptOptions {
	// Pod name to target; when omitted, the active pod is used.
	pod?: string;
	// API key for the vLLM endpoint; falls back to PI_API_KEY, then "dummy".
	apiKey?: string;
}
|
||||
|
||||
// ────────────────────────────────────────────────────────────────────────────────
|
||||
// Main prompt function
|
||||
// ────────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Run the pi-agent against a model deployed on a pod.
 *
 * Resolves the pod and model, builds the OpenAI-compatible base URL from
 * the pod's SSH host and the model's port, injects a code-navigation
 * system prompt, and forwards all remaining user arguments to the agent.
 * Exits the process when the pod/model cannot be resolved or the agent throws.
 */
export async function promptModel(modelName: string, userArgs: string[], opts: PromptOptions = {}) {
	// Get pod and model configuration.
	// NOTE(review): when opts.pod names a pod that does not exist, this still
	// produces a truthy object with pod === undefined, so the guard below is
	// skipped and `pod.models` throws a TypeError instead of a clean error.
	const activePod = opts.pod ? { name: opts.pod, pod: loadConfig().pods[opts.pod] } : getActivePod();

	if (!activePod) {
		console.error(chalk.red("No active pod. Use 'pi pods active <name>' to set one."));
		process.exit(1);
	}

	const { name: podName, pod } = activePod;
	const modelConfig = pod.models[modelName];

	if (!modelConfig) {
		console.error(chalk.red(`Model '${modelName}' not found on pod '${podName}'`));
		process.exit(1);
	}

	// Extract host from SSH string (first "user@host" token; falls back to
	// localhost when no token contains "@").
	const host =
		pod.ssh
			.split(" ")
			.find((p) => p.includes("@"))
			?.split("@")[1] ?? "localhost";

	// Build the system prompt for code navigation
	const systemPrompt = `You help the user understand and navigate the codebase in the current working directory.

You can read files, list directories, and execute shell commands via the respective tools.

Do not output file contents you read via the read_file tool directly, unless asked to.

Do not output markdown tables as part of your responses.

Keep your responses concise and relevant to the user's request.

File paths you output must include line numbers where possible, e.g. "src/index.ts:10-20" for lines 10 to 20 in src/index.ts.

Current working directory: ${process.cwd()}`;

	// Build arguments for agent main function
	const args: string[] = [];

	// Add base configuration that we control.
	// GPT-OSS models only support tool calls via the /v1/responses API,
	// hence the "responses" vs "completions" switch on the model name.
	args.push(
		"--base-url",
		`http://${host}:${modelConfig.port}/v1`,
		"--model",
		modelConfig.model,
		"--api-key",
		opts.apiKey || process.env.PI_API_KEY || "dummy",
		"--api",
		modelConfig.model.toLowerCase().includes("gpt-oss") ? "responses" : "completions",
		"--system-prompt",
		systemPrompt,
	);

	// Pass through all user-provided arguments
	// This includes messages, --continue, --json, etc.
	args.push(...userArgs);

	// Call agent main function directly
	try {
		await agentMain(args);
	} catch (err: any) {
		console.error(chalk.red(`Agent error: ${err.message}`));
		process.exit(1);
	}
}
|
||||
80
packages/pods/src/config.ts
Normal file
80
packages/pods/src/config.ts
Normal file
|
|
@ -0,0 +1,80 @@
|
|||
import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
|
||||
import { homedir } from "os";
|
||||
import { join } from "path";
|
||||
import type { Config, Pod } from "./types.js";
|
||||
|
||||
// Get config directory from env or use default
|
||||
const getConfigDir = (): string => {
|
||||
const configDir = process.env.PI_CONFIG_DIR || join(homedir(), ".pi");
|
||||
if (!existsSync(configDir)) {
|
||||
mkdirSync(configDir, { recursive: true });
|
||||
}
|
||||
return configDir;
|
||||
};
|
||||
|
||||
const getConfigPath = (): string => {
|
||||
return join(getConfigDir(), "pods.json");
|
||||
};
|
||||
|
||||
export const loadConfig = (): Config => {
|
||||
const configPath = getConfigPath();
|
||||
if (!existsSync(configPath)) {
|
||||
// Return empty config if file doesn't exist
|
||||
return { pods: {} };
|
||||
}
|
||||
try {
|
||||
const data = readFileSync(configPath, "utf-8");
|
||||
return JSON.parse(data);
|
||||
} catch (e) {
|
||||
console.error(`Error reading config: ${e}`);
|
||||
return { pods: {} };
|
||||
}
|
||||
};
|
||||
|
||||
export const saveConfig = (config: Config): void => {
|
||||
const configPath = getConfigPath();
|
||||
try {
|
||||
writeFileSync(configPath, JSON.stringify(config, null, 2));
|
||||
} catch (e) {
|
||||
console.error(`Error saving config: ${e}`);
|
||||
process.exit(1);
|
||||
}
|
||||
};
|
||||
|
||||
export const getActivePod = (): { name: string; pod: Pod } | null => {
|
||||
const config = loadConfig();
|
||||
if (!config.active || !config.pods[config.active]) {
|
||||
return null;
|
||||
}
|
||||
return { name: config.active, pod: config.pods[config.active] };
|
||||
};
|
||||
|
||||
export const addPod = (name: string, pod: Pod): void => {
|
||||
const config = loadConfig();
|
||||
config.pods[name] = pod;
|
||||
// If no active pod, make this one active
|
||||
if (!config.active) {
|
||||
config.active = name;
|
||||
}
|
||||
saveConfig(config);
|
||||
};
|
||||
|
||||
export const removePod = (name: string): void => {
|
||||
const config = loadConfig();
|
||||
delete config.pods[name];
|
||||
// If this was the active pod, clear active
|
||||
if (config.active === name) {
|
||||
config.active = undefined;
|
||||
}
|
||||
saveConfig(config);
|
||||
};
|
||||
|
||||
export const setActivePod = (name: string): void => {
|
||||
const config = loadConfig();
|
||||
if (!config.pods[name]) {
|
||||
console.error(`Pod '${name}' not found`);
|
||||
process.exit(1);
|
||||
}
|
||||
config.active = name;
|
||||
saveConfig(config);
|
||||
};
|
||||
2
packages/pods/src/index.ts
Normal file
2
packages/pods/src/index.ts
Normal file
|
|
@ -0,0 +1,2 @@
|
|||
// Main library exports
|
||||
export * from "./types.js";
|
||||
111
packages/pods/src/model-configs.ts
Normal file
111
packages/pods/src/model-configs.ts
Normal file
|
|
@ -0,0 +1,111 @@
|
|||
import { readFileSync } from "fs";
|
||||
import { dirname, join } from "path";
|
||||
import { fileURLToPath } from "url";
|
||||
import type { GPU } from "./types.js";
|
||||
|
||||
const __filename = fileURLToPath(import.meta.url);
|
||||
const __dirname = dirname(__filename);
|
||||
|
||||
// One deployment recipe for a model: GPU requirements plus the vLLM
// launch arguments and environment it needs.
interface ModelConfig {
	// Number of GPUs this config requires.
	gpuCount: number;
	// Acceptable GPU families (e.g. "H100"); absent/empty means any GPU.
	gpuTypes?: string[];
	// Extra vLLM CLI arguments for this config.
	args: string[];
	// Extra environment variables to set when launching vLLM.
	env?: Record<string, string>;
	// Human-readable caveats specific to this config.
	notes?: string;
}

// A known model: display name plus its supported deployment configs.
interface ModelInfo {
	name: string;
	configs: ModelConfig[];
	// Model-wide caveats, used as fallback when a config has no notes.
	notes?: string;
}

// Top-level shape of models.json.
interface ModelsData {
	models: Record<string, ModelInfo>;
}
|
||||
|
||||
// Load models configuration - resolve relative to this file
// (assumes models.json sits next to the compiled output at runtime —
// TODO confirm the build copies it there).
const modelsJsonPath = join(__dirname, "models.json");
const modelsData: ModelsData = JSON.parse(readFileSync(modelsJsonPath, "utf-8"));
|
||||
|
||||
/**
|
||||
* Get the best configuration for a model based on available GPUs
|
||||
*/
|
||||
export const getModelConfig = (
|
||||
modelId: string,
|
||||
gpus: GPU[],
|
||||
requestedGpuCount: number,
|
||||
): { args: string[]; env?: Record<string, string>; notes?: string } | null => {
|
||||
const modelInfo = modelsData.models[modelId];
|
||||
if (!modelInfo) {
|
||||
// Unknown model, no default config
|
||||
return null;
|
||||
}
|
||||
|
||||
// Extract GPU type from the first GPU name (e.g., "NVIDIA H200" -> "H200")
|
||||
const gpuType = gpus[0]?.name?.replace("NVIDIA", "")?.trim()?.split(" ")[0] || "";
|
||||
|
||||
// Find best matching config
|
||||
let bestConfig: ModelConfig | null = null;
|
||||
|
||||
for (const config of modelInfo.configs) {
|
||||
// Check GPU count
|
||||
if (config.gpuCount !== requestedGpuCount) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Check GPU type if specified
|
||||
if (config.gpuTypes && config.gpuTypes.length > 0) {
|
||||
const typeMatches = config.gpuTypes.some((type) => gpuType.includes(type) || type.includes(gpuType));
|
||||
if (!typeMatches) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// This config matches
|
||||
bestConfig = config;
|
||||
break;
|
||||
}
|
||||
|
||||
// If no exact match, try to find a config with just the right GPU count
|
||||
if (!bestConfig) {
|
||||
for (const config of modelInfo.configs) {
|
||||
if (config.gpuCount === requestedGpuCount) {
|
||||
bestConfig = config;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!bestConfig) {
|
||||
// No suitable config found
|
||||
return null;
|
||||
}
|
||||
|
||||
return {
|
||||
args: [...bestConfig.args],
|
||||
env: bestConfig.env ? { ...bestConfig.env } : undefined,
|
||||
notes: bestConfig.notes || modelInfo.notes,
|
||||
};
|
||||
};
|
||||
|
||||
/**
|
||||
* Check if a model is known
|
||||
*/
|
||||
export const isKnownModel = (modelId: string): boolean => {
|
||||
return modelId in modelsData.models;
|
||||
};
|
||||
|
||||
/**
|
||||
* Get all known models
|
||||
*/
|
||||
export const getKnownModels = (): string[] => {
|
||||
return Object.keys(modelsData.models);
|
||||
};
|
||||
|
||||
/**
|
||||
* Get model display name
|
||||
*/
|
||||
export const getModelName = (modelId: string): string => {
|
||||
return modelsData.models[modelId]?.name || modelId;
|
||||
};
|
||||
305
packages/pods/src/models.json
Normal file
305
packages/pods/src/models.json
Normal file
|
|
@ -0,0 +1,305 @@
|
|||
{
|
||||
"models": {
|
||||
"Qwen/Qwen2.5-Coder-32B-Instruct": {
|
||||
"name": "Qwen2.5-Coder-32B",
|
||||
"configs": [
|
||||
{
|
||||
"gpuCount": 1,
|
||||
"gpuTypes": ["H100", "H200"],
|
||||
"args": ["--tool-call-parser", "hermes", "--enable-auto-tool-choice"]
|
||||
},
|
||||
{
|
||||
"gpuCount": 2,
|
||||
"gpuTypes": ["H100", "H200"],
|
||||
"args": ["--tensor-parallel-size", "2", "--tool-call-parser", "hermes", "--enable-auto-tool-choice"]
|
||||
}
|
||||
]
|
||||
},
|
||||
"Qwen/Qwen3-Coder-30B-A3B-Instruct": {
|
||||
"name": "Qwen3-Coder-30B",
|
||||
"configs": [
|
||||
{
|
||||
"gpuCount": 1,
|
||||
"gpuTypes": ["H100", "H200"],
|
||||
"args": ["--enable-auto-tool-choice", "--tool-call-parser", "qwen3_coder"],
|
||||
"notes": "Fits comfortably on single GPU. ~60GB model weight."
|
||||
},
|
||||
{
|
||||
"gpuCount": 2,
|
||||
"gpuTypes": ["H100", "H200"],
|
||||
"args": [
|
||||
"--tensor-parallel-size",
|
||||
"2",
|
||||
"--enable-auto-tool-choice",
|
||||
"--tool-call-parser",
|
||||
"qwen3_coder"
|
||||
],
|
||||
"notes": "For higher throughput/longer context."
|
||||
}
|
||||
]
|
||||
},
|
||||
"Qwen/Qwen3-Coder-30B-A3B-Instruct-FP8": {
|
||||
"name": "Qwen3-Coder-30B-FP8",
|
||||
"configs": [
|
||||
{
|
||||
"gpuCount": 1,
|
||||
"gpuTypes": ["H100", "H200"],
|
||||
"args": ["--enable-auto-tool-choice", "--tool-call-parser", "qwen3_coder"],
|
||||
"env": {
|
||||
"VLLM_USE_DEEP_GEMM": "1"
|
||||
},
|
||||
"notes": "FP8 quantized, ~30GB model weight. Excellent for single GPU deployment."
|
||||
}
|
||||
]
|
||||
},
|
||||
"Qwen/Qwen3-Coder-480B-A35B-Instruct": {
|
||||
"name": "Qwen3-Coder-480B",
|
||||
"configs": [
|
||||
{
|
||||
"gpuCount": 8,
|
||||
"gpuTypes": ["H200", "H20"],
|
||||
"args": [
|
||||
"--tensor-parallel-size",
|
||||
"8",
|
||||
"--max-model-len",
|
||||
"32000",
|
||||
"--enable-auto-tool-choice",
|
||||
"--tool-call-parser",
|
||||
"qwen3_coder"
|
||||
],
|
||||
"notes": "Cannot serve full 262K context on single node. Reduce max-model-len or increase gpu-memory-utilization."
|
||||
}
|
||||
]
|
||||
},
|
||||
"Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8": {
|
||||
"name": "Qwen3-Coder-480B-FP8",
|
||||
"configs": [
|
||||
{
|
||||
"gpuCount": 8,
|
||||
"gpuTypes": ["H200", "H20"],
|
||||
"args": [
|
||||
"--max-model-len",
|
||||
"131072",
|
||||
"--enable-expert-parallel",
|
||||
"--data-parallel-size",
|
||||
"8",
|
||||
"--enable-auto-tool-choice",
|
||||
"--tool-call-parser",
|
||||
"qwen3_coder"
|
||||
],
|
||||
"env": {
|
||||
"VLLM_USE_DEEP_GEMM": "1"
|
||||
},
|
||||
"notes": "Use data-parallel mode (not tensor-parallel) to avoid weight quantization errors."
|
||||
}
|
||||
]
|
||||
},
|
||||
"openai/gpt-oss-20b": {
|
||||
"name": "GPT-OSS-20B",
|
||||
"configs": [
|
||||
{
|
||||
"gpuCount": 1,
|
||||
"gpuTypes": ["H100", "H200"],
|
||||
"args": ["--async-scheduling"]
|
||||
},
|
||||
{
|
||||
"gpuCount": 1,
|
||||
"gpuTypes": ["B200"],
|
||||
"args": ["--async-scheduling"],
|
||||
"env": {
|
||||
"VLLM_USE_TRTLLM_ATTENTION": "1",
|
||||
"VLLM_USE_TRTLLM_DECODE_ATTENTION": "1",
|
||||
"VLLM_USE_TRTLLM_CONTEXT_ATTENTION": "1",
|
||||
"VLLM_USE_FLASHINFER_MXFP4_MOE": "1"
|
||||
}
|
||||
}
|
||||
],
|
||||
"notes": "Requires vLLM 0.10.1+gptoss. Tools/functoin calls only via /v1/responses endpoint."
|
||||
},
|
||||
"openai/gpt-oss-120b": {
|
||||
"name": "GPT-OSS-120B",
|
||||
"configs": [
|
||||
{
|
||||
"gpuCount": 1,
|
||||
"gpuTypes": ["H100", "H200"],
|
||||
"args": ["--async-scheduling", "--gpu-memory-utilization", "0.95", "--max-num-batched-tokens", "1024"],
|
||||
"notes": "Single GPU deployment. Requires vLLM 0.10.1+gptoss. Tools/function calls only via /v1/responses endpoint."
|
||||
},
|
||||
{
|
||||
"gpuCount": 2,
|
||||
"gpuTypes": ["H100", "H200"],
|
||||
"args": ["--tensor-parallel-size", "2", "--async-scheduling", "--gpu-memory-utilization", "0.94"],
|
||||
"notes": "Recommended for H100/H200. Requires vLLM 0.10.1+gptoss. Tools/function calls only via /v1/responses endpoint."
|
||||
},
|
||||
{
|
||||
"gpuCount": 4,
|
||||
"gpuTypes": ["H100", "H200"],
|
||||
"args": ["--tensor-parallel-size", "4", "--async-scheduling"],
|
||||
"notes": "Higher throughput. Requires vLLM 0.10.1+gptoss. Tools/function calls only via /v1/responses endpoint."
|
||||
},
|
||||
{
|
||||
"gpuCount": 8,
|
||||
"gpuTypes": ["H100", "H200"],
|
||||
"args": ["--tensor-parallel-size", "8", "--async-scheduling"],
|
||||
"notes": "Maximum throughput for evaluation workloads. Requires vLLM 0.10.1+gptoss. Tools/function calls only via /v1/responses endpoint."
|
||||
}
|
||||
]
|
||||
},
|
||||
"zai-org/GLM-4.5": {
|
||||
"name": "GLM-4.5",
|
||||
"configs": [
|
||||
{
|
||||
"gpuCount": 16,
|
||||
"gpuTypes": ["H100"],
|
||||
"args": [
|
||||
"--tensor-parallel-size",
|
||||
"16",
|
||||
"--tool-call-parser",
|
||||
"glm4_moe",
|
||||
"--reasoning-parser",
|
||||
"glm4_moe",
|
||||
"--enable-auto-tool-choice"
|
||||
]
|
||||
},
|
||||
{
|
||||
"gpuCount": 8,
|
||||
"gpuTypes": ["H200"],
|
||||
"args": [
|
||||
"--tensor-parallel-size",
|
||||
"8",
|
||||
"--tool-call-parser",
|
||||
"glm4_moe",
|
||||
"--reasoning-parser",
|
||||
"glm4_moe",
|
||||
"--enable-auto-tool-choice"
|
||||
]
|
||||
}
|
||||
],
|
||||
"notes": "Models default to thinking mode. For full 128K context, double the GPU count."
|
||||
},
|
||||
"zai-org/GLM-4.5-FP8": {
|
||||
"name": "GLM-4.5-FP8",
|
||||
"configs": [
|
||||
{
|
||||
"gpuCount": 8,
|
||||
"gpuTypes": ["H100"],
|
||||
"args": [
|
||||
"--tensor-parallel-size",
|
||||
"8",
|
||||
"--tool-call-parser",
|
||||
"glm4_moe",
|
||||
"--reasoning-parser",
|
||||
"glm4_moe",
|
||||
"--enable-auto-tool-choice"
|
||||
]
|
||||
},
|
||||
{
|
||||
"gpuCount": 4,
|
||||
"gpuTypes": ["H200"],
|
||||
"args": [
|
||||
"--tensor-parallel-size",
|
||||
"4",
|
||||
"--tool-call-parser",
|
||||
"glm4_moe",
|
||||
"--reasoning-parser",
|
||||
"glm4_moe",
|
||||
"--enable-auto-tool-choice"
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"zai-org/GLM-4.5-Air-FP8": {
|
||||
"name": "GLM-4.5-Air-FP8",
|
||||
"configs": [
|
||||
{
|
||||
"gpuCount": 2,
|
||||
"gpuTypes": ["H100"],
|
||||
"args": [
|
||||
"--tensor-parallel-size",
|
||||
"2",
|
||||
"--tool-call-parser",
|
||||
"glm4_moe",
|
||||
"--reasoning-parser",
|
||||
"glm4_moe",
|
||||
"--enable-auto-tool-choice",
|
||||
"--quantization",
|
||||
"fp8"
|
||||
],
|
||||
"env": {
|
||||
"VLLM_ATTENTION_BACKEND": "XFORMERS"
|
||||
},
|
||||
"notes": "FP8 model requires vLLM with proper FP8 support or MTP module"
|
||||
},
|
||||
{
|
||||
"gpuCount": 1,
|
||||
"gpuTypes": ["H200"],
|
||||
"args": [
|
||||
"--tool-call-parser",
|
||||
"glm4_moe",
|
||||
"--reasoning-parser",
|
||||
"glm4_moe",
|
||||
"--enable-auto-tool-choice",
|
||||
"--quantization",
|
||||
"fp8"
|
||||
],
|
||||
"env": {
|
||||
"VLLM_ATTENTION_BACKEND": "XFORMERS"
|
||||
},
|
||||
"notes": "FP8 model requires vLLM with proper FP8 support or MTP module"
|
||||
}
|
||||
]
|
||||
},
|
||||
"zai-org/GLM-4.5-Air": {
|
||||
"name": "GLM-4.5-Air",
|
||||
"configs": [
|
||||
{
|
||||
"gpuCount": 2,
|
||||
"gpuTypes": ["H100", "H200"],
|
||||
"args": [
|
||||
"--tensor-parallel-size",
|
||||
"2",
|
||||
"--tool-call-parser",
|
||||
"glm4_moe",
|
||||
"--reasoning-parser",
|
||||
"glm4_moe",
|
||||
"--enable-auto-tool-choice"
|
||||
],
|
||||
"notes": "Non-quantized BF16 version, more compatible"
|
||||
},
|
||||
{
|
||||
"gpuCount": 1,
|
||||
"gpuTypes": ["H200"],
|
||||
"args": [
|
||||
"--tool-call-parser",
|
||||
"glm4_moe",
|
||||
"--reasoning-parser",
|
||||
"glm4_moe",
|
||||
"--enable-auto-tool-choice",
|
||||
"--gpu-memory-utilization",
|
||||
"0.95"
|
||||
],
|
||||
"notes": "Single H200 can fit the BF16 model with high memory utilization"
|
||||
}
|
||||
]
|
||||
},
|
||||
"moonshotai/Kimi-K2-Instruct": {
|
||||
"name": "Kimi-K2",
|
||||
"configs": [
|
||||
{
|
||||
"gpuCount": 16,
|
||||
"gpuTypes": ["H200", "H20"],
|
||||
"args": [
|
||||
"--tensor-parallel-size",
|
||||
"16",
|
||||
"--trust-remote-code",
|
||||
"--enable-auto-tool-choice",
|
||||
"--tool-call-parser",
|
||||
"kimi_k2"
|
||||
],
|
||||
"notes": "Pure TP mode. For >16 GPUs, combine with pipeline-parallelism."
|
||||
}
|
||||
],
|
||||
"notes": "Requires vLLM v0.10.0rc1+. Minimum 16 GPUs for FP8 with 128k context."
|
||||
}
|
||||
}
|
||||
}
|
||||
151
packages/pods/src/ssh.ts
Normal file
151
packages/pods/src/ssh.ts
Normal file
|
|
@ -0,0 +1,151 @@
|
|||
import { type SpawnOptions, spawn } from "child_process";
|
||||
|
||||
export interface SSHResult {
|
||||
stdout: string;
|
||||
stderr: string;
|
||||
exitCode: number;
|
||||
}
|
||||
|
||||
/**
|
||||
* Execute an SSH command and return the result
|
||||
*/
|
||||
export const sshExec = async (
|
||||
sshCmd: string,
|
||||
command: string,
|
||||
options?: { keepAlive?: boolean },
|
||||
): Promise<SSHResult> => {
|
||||
return new Promise((resolve) => {
|
||||
// Parse SSH command (e.g., "ssh root@1.2.3.4" or "ssh -p 22 root@1.2.3.4")
|
||||
const sshParts = sshCmd.split(" ").filter((p) => p);
|
||||
const sshBinary = sshParts[0];
|
||||
let sshArgs = [...sshParts.slice(1)];
|
||||
|
||||
// Add SSH keepalive options for long-running commands
|
||||
if (options?.keepAlive) {
|
||||
// ServerAliveInterval=30 sends keepalive every 30 seconds
|
||||
// ServerAliveCountMax=120 allows up to 120 failures (60 minutes total)
|
||||
sshArgs = ["-o", "ServerAliveInterval=30", "-o", "ServerAliveCountMax=120", ...sshArgs];
|
||||
}
|
||||
|
||||
sshArgs.push(command);
|
||||
|
||||
const proc = spawn(sshBinary, sshArgs, {
|
||||
stdio: ["ignore", "pipe", "pipe"],
|
||||
});
|
||||
|
||||
let stdout = "";
|
||||
let stderr = "";
|
||||
|
||||
proc.stdout.on("data", (data) => {
|
||||
stdout += data.toString();
|
||||
});
|
||||
|
||||
proc.stderr.on("data", (data) => {
|
||||
stderr += data.toString();
|
||||
});
|
||||
|
||||
proc.on("close", (code) => {
|
||||
resolve({
|
||||
stdout,
|
||||
stderr,
|
||||
exitCode: code || 0,
|
||||
});
|
||||
});
|
||||
|
||||
proc.on("error", (err) => {
|
||||
resolve({
|
||||
stdout,
|
||||
stderr: err.message,
|
||||
exitCode: 1,
|
||||
});
|
||||
});
|
||||
});
|
||||
};
|
||||
|
||||
/**
|
||||
* Execute an SSH command with streaming output to console
|
||||
*/
|
||||
export const sshExecStream = async (
|
||||
sshCmd: string,
|
||||
command: string,
|
||||
options?: { silent?: boolean; forceTTY?: boolean; keepAlive?: boolean },
|
||||
): Promise<number> => {
|
||||
return new Promise((resolve) => {
|
||||
const sshParts = sshCmd.split(" ").filter((p) => p);
|
||||
const sshBinary = sshParts[0];
|
||||
|
||||
// Build SSH args
|
||||
let sshArgs = [...sshParts.slice(1)];
|
||||
|
||||
// Add -t flag if requested and not already present
|
||||
if (options?.forceTTY && !sshParts.includes("-t")) {
|
||||
sshArgs = ["-t", ...sshArgs];
|
||||
}
|
||||
|
||||
// Add SSH keepalive options for long-running commands
|
||||
if (options?.keepAlive) {
|
||||
// ServerAliveInterval=30 sends keepalive every 30 seconds
|
||||
// ServerAliveCountMax=120 allows up to 120 failures (60 minutes total)
|
||||
sshArgs = ["-o", "ServerAliveInterval=30", "-o", "ServerAliveCountMax=120", ...sshArgs];
|
||||
}
|
||||
|
||||
sshArgs.push(command);
|
||||
|
||||
const spawnOptions: SpawnOptions = options?.silent
|
||||
? { stdio: ["ignore", "ignore", "ignore"] }
|
||||
: { stdio: "inherit" };
|
||||
|
||||
const proc = spawn(sshBinary, sshArgs, spawnOptions);
|
||||
|
||||
proc.on("close", (code) => {
|
||||
resolve(code || 0);
|
||||
});
|
||||
|
||||
proc.on("error", () => {
|
||||
resolve(1);
|
||||
});
|
||||
});
|
||||
};
|
||||
|
||||
/**
|
||||
* Copy a file to remote via SCP
|
||||
*/
|
||||
export const scpFile = async (sshCmd: string, localPath: string, remotePath: string): Promise<boolean> => {
|
||||
// Extract host from SSH command
|
||||
const sshParts = sshCmd.split(" ").filter((p) => p);
|
||||
let host = "";
|
||||
let port = "22";
|
||||
let i = 1; // Skip 'ssh'
|
||||
|
||||
while (i < sshParts.length) {
|
||||
if (sshParts[i] === "-p" && i + 1 < sshParts.length) {
|
||||
port = sshParts[i + 1];
|
||||
i += 2;
|
||||
} else if (!sshParts[i].startsWith("-")) {
|
||||
host = sshParts[i];
|
||||
break;
|
||||
} else {
|
||||
i++;
|
||||
}
|
||||
}
|
||||
|
||||
if (!host) {
|
||||
console.error("Could not parse host from SSH command");
|
||||
return false;
|
||||
}
|
||||
|
||||
// Build SCP command
|
||||
const scpArgs = ["-P", port, localPath, `${host}:${remotePath}`];
|
||||
|
||||
return new Promise((resolve) => {
|
||||
const proc = spawn("scp", scpArgs, { stdio: "inherit" });
|
||||
|
||||
proc.on("close", (code) => {
|
||||
resolve(code === 0);
|
||||
});
|
||||
|
||||
proc.on("error", () => {
|
||||
resolve(false);
|
||||
});
|
||||
});
|
||||
};
|
||||
27
packages/pods/src/types.ts
Normal file
27
packages/pods/src/types.ts
Normal file
|
|
@ -0,0 +1,27 @@
|
|||
// Core type definitions for pi
|
||||
|
||||
export interface GPU {
|
||||
id: number;
|
||||
name: string;
|
||||
memory: string;
|
||||
}
|
||||
|
||||
export interface Model {
|
||||
model: string;
|
||||
port: number;
|
||||
gpu: number[]; // Array of GPU IDs for multi-GPU deployment
|
||||
pid: number;
|
||||
}
|
||||
|
||||
export interface Pod {
|
||||
ssh: string;
|
||||
gpus: GPU[];
|
||||
models: Record<string, Model>;
|
||||
modelsPath?: string;
|
||||
vllmVersion?: "release" | "nightly" | "gpt-oss"; // Track which vLLM version is installed
|
||||
}
|
||||
|
||||
export interface Config {
|
||||
pods: Record<string, Pod>;
|
||||
active?: string;
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue