computer-host/internal/firecracker/launch.go
Hari b5c97aef07 host api alignment (#7)
* feat: add Firecracker API client methods for VM pause/resume and snapshots

Add PatchVm, GetVm, PutSnapshotCreate, and PutSnapshotLoad methods to the
API client, along with supporting types (VmState, SnapshotCreateParams,
SnapshotLoadParams, MemBackend).

* feat: add snapshot data layer - contract types, model, store, config

Add SnapshotID and snapshot contract types, SnapshotRecord model,
store interface CRUD methods with file store implementation,
snapshot paths helper, SnapshotsDir config, and directory creation.

* feat: add runtime methods for VM pause, resume, snapshot, and restore

Implement Pause, Resume, CreateSnapshot, and RestoreBoot on the
firecracker Runtime. RestoreBoot launches a jailer, stages snapshot
files into the chroot, loads the snapshot, and resumes the VM.

* feat: add daemon snapshot create, restore, and reconciliation logic

Implement CreateSnapshot (pause, snapshot, COW-copy disk, resume),
RestoreSnapshot (COW-copy disk, RestoreBoot, wait for guest),
GetSnapshot, ListSnapshots, DeleteSnapshotByID, and crash recovery
reconciliation for snapshot and restore operations.

* feat: add HTTP endpoints for snapshot create, get, list, delete, restore

Wire 5 snapshot routes: POST /machines/{id}/snapshots (create),
GET /machines/{id}/snapshots (list), GET /snapshots/{id} (get),
DELETE /snapshots/{id} (delete), POST /snapshots/{id}/restore (restore).

* fix: cross-device rename, restore network, and snapshot cleanup

- Replace os.Rename with copy+remove for moving snapshot files out of
  /proc/<pid>/root/ (cross-device link error on Linux)
- Reconfigure network interface after snapshot load so the restored VM
  uses its own tap device instead of the source VM's
- Clean partial snapshot dirs immediately on failure instead of only
  via reconcile
- Reject snapshot requests while a machine operation is already pending

* fix: test and modify snapshot runtime

* feat: snapshot lifecycle update, align runtime issues between host image
and daemon
2026-04-08 22:21:46 -04:00

282 lines
7.8 KiB
Go

package firecracker
import (
"context"
"fmt"
"os"
"os/exec"
"path"
"path/filepath"
"strconv"
"strings"
"time"
)
// Defaults used while launching and configuring a jailed Firecracker process.
const (
	// How long to wait for Firecracker start-up artifacts (API socket, PID
	// file) and how often to re-check for them.
	defaultFirecrackerInitTimeout  = 10 * time.Second
	defaultFirecrackerPollInterval = 10 * time.Millisecond

	// Value passed to the jailer's --cgroup-version flag.
	defaultCgroupVersion = "2"
	// Value passed to Firecracker's --level flag.
	defaultFirecrackerLogLevel = "Warning"

	// Drive ID used for the root filesystem drive request.
	defaultRootDriveID = "root_drive"
	// Directory, as seen from inside the jail, that holds the vsock socket.
	defaultVSockRunDir = "/run"
)
// configureMachine pushes the pre-boot configuration for one machine to the
// Firecracker API behind client and then issues the start action.
//
// Requests are sent in a fixed order: machine config, boot source, every
// additional drive from spec, the root drive, the network interface, the
// entropy device, the serial device (logging to paths.JailedSerialLogPath),
// the vsock device when spec.Vsock is set, and finally the start action. The
// first failing request aborts the sequence; its error is returned wrapped
// with the name of the failed step.
func configureMachine(ctx context.Context, client *apiClient, paths machinePaths, spec MachineSpec, network NetworkAllocation) error {
	// Machine shape and boot source.
	if err := client.PutMachineConfig(ctx, spec); err != nil {
		return fmt.Errorf("put machine config: %w", err)
	}
	if err := client.PutBootSource(ctx, spec); err != nil {
		return fmt.Errorf("put boot source: %w", err)
	}

	// Block devices: additional drives first, then the root drive.
	extraDrives := additionalDriveRequests(spec)
	for i := range extraDrives {
		request := extraDrives[i]
		if err := client.PutDrive(ctx, request); err != nil {
			return fmt.Errorf("put drive %q: %w", request.DriveID, err)
		}
	}
	rootDrive := rootDriveRequest(spec)
	if err := client.PutDrive(ctx, rootDrive); err != nil {
		return fmt.Errorf("put root drive: %w", err)
	}

	// Remaining devices.
	if err := client.PutNetworkInterface(ctx, network); err != nil {
		return fmt.Errorf("put network interface: %w", err)
	}
	if err := client.PutEntropy(ctx); err != nil {
		return fmt.Errorf("put entropy device: %w", err)
	}
	if err := client.PutSerial(ctx, paths.JailedSerialLogPath); err != nil {
		return fmt.Errorf("put serial device: %w", err)
	}
	if vsock := spec.Vsock; vsock != nil {
		if err := client.PutVsock(ctx, *vsock); err != nil {
			return fmt.Errorf("put vsock: %w", err)
		}
	}

	// Everything is configured; start the instance.
	if err := client.PutAction(ctx, defaultStartAction); err != nil {
		return fmt.Errorf("start instance: %w", err)
	}
	return nil
}
// launchJailedFirecracker starts the jailer binary at jailerBinaryPath so that
// it executes firecrackerBinaryPath for machineID, and returns the started
// command.
//
// The jailer is given the current process's uid and gid, the default cgroup
// version, and paths.JailerBaseDir as its chroot base directory. The arguments
// after the "--" separator configure the API socket path and logging.
//
// An error is returned only when the jailer process cannot be started. A
// background goroutine waits on the command and discards the result.
func launchJailedFirecracker(paths machinePaths, machineID MachineID, firecrackerBinaryPath string, jailerBinaryPath string) (*exec.Cmd, error) {
	uid := strconv.Itoa(os.Getuid())
	gid := strconv.Itoa(os.Getgid())

	args := []string{
		"--id", string(machineID),
		"--uid", uid,
		"--gid", gid,
		"--exec-file", firecrackerBinaryPath,
		"--cgroup-version", defaultCgroupVersion,
		"--chroot-base-dir", paths.JailerBaseDir,
		"--daemonize",
		"--new-pid-ns",
		"--",
		"--api-sock", defaultFirecrackerSocketPath,
		"--log-path", paths.JailedFirecrackerLogPath,
		"--level", defaultFirecrackerLogLevel,
		"--show-level",
		"--show-log-origin",
	}

	command := exec.Command(jailerBinaryPath, args...)
	if err := command.Start(); err != nil {
		return nil, fmt.Errorf("start jailer: %w", err)
	}

	// Wait in the background so the started process is collected when it
	// exits; its exit status is intentionally ignored.
	go func() {
		_ = command.Wait()
	}()

	return command, nil
}
// stageMachineFiles hard-links the kernel image, the root filesystem and every
// additional drive of spec into paths.ChrootRootDir, and returns a copy of spec
// whose file paths are rewritten to the staged file names.
//
// Each file is staged under the base name of its original path. When
// spec.Vsock is set, the returned spec carries a copy of it whose Path is the
// value of jailedVSockPath(spec). The input spec is not modified.
//
// On the first failure a zero MachineSpec and a wrapped error are returned;
// links created before the failure are left in place.
func stageMachineFiles(spec MachineSpec, paths machinePaths) (MachineSpec, error) {
	var none MachineSpec
	inJail := func(name string) string {
		return filepath.Join(paths.ChrootRootDir, name)
	}

	// Kernel image.
	kernelName, err := stagedFileName(spec.KernelImagePath)
	if err != nil {
		return none, fmt.Errorf("kernel image path: %w", err)
	}
	if err = linkMachineFile(spec.KernelImagePath, inJail(kernelName)); err != nil {
		return none, fmt.Errorf("link kernel image into jail: %w", err)
	}

	// Root filesystem.
	rootFSName, err := stagedFileName(spec.RootFSPath)
	if err != nil {
		return none, fmt.Errorf("rootfs path: %w", err)
	}
	if err = linkMachineFile(spec.RootFSPath, inJail(rootFSName)); err != nil {
		return none, fmt.Errorf("link rootfs into jail: %w", err)
	}

	// Additional drives, in their original order.
	drives := make([]DriveSpec, len(spec.Drives))
	for i := range spec.Drives {
		original := spec.Drives[i]
		driveName, nameErr := stagedFileName(original.Path)
		if nameErr != nil {
			return none, fmt.Errorf("drive %q path: %w", original.ID, nameErr)
		}
		if linkErr := linkMachineFile(original.Path, inJail(driveName)); linkErr != nil {
			return none, fmt.Errorf("link drive %q into jail: %w", original.ID, linkErr)
		}
		original.Path = driveName
		drives[i] = original
	}

	result := spec
	result.KernelImagePath = kernelName
	result.RootFSPath = rootFSName
	result.Drives = drives
	if spec.Vsock != nil {
		// Copy the vsock settings so the caller's spec is left untouched.
		jailedVsock := *spec.Vsock
		jailedVsock.Path = jailedVSockPath(spec)
		result.Vsock = &jailedVsock
	}
	return result, nil
}
// waitForSocket blocks until socketPath exists and client.Ping succeeds, or
// until ctx is done or defaultFirecrackerInitTimeout elapses, whichever comes
// first.
//
// The socket is checked once per defaultFirecrackerPollInterval. A missing
// socket or a failed ping is retried; any other stat error is returned
// immediately. On timeout or cancellation the context error is returned,
// annotated with the socket path and the most recent ping error (preferred) or
// stat error, if any.
func waitForSocket(ctx context.Context, client *apiClient, socketPath string) error {
	waitContext, cancel := context.WithTimeout(ctx, defaultFirecrackerInitTimeout)
	defer cancel()

	ticker := time.NewTicker(defaultFirecrackerPollInterval)
	defer ticker.Stop()

	var (
		statErr error // most recent "socket does not exist" error, nil once it exists
		pingErr error // most recent ping failure
	)

	for {
		select {
		case <-ticker.C:
			_, err := os.Stat(socketPath)
			if err != nil && !os.IsNotExist(err) {
				return fmt.Errorf("stat socket %q: %w", socketPath, err)
			}
			statErr = err
			if err != nil {
				// Socket has not been created yet; try again on the next tick.
				continue
			}
			if err := client.Ping(waitContext); err != nil {
				pingErr = err
				continue
			}
			return nil

		case <-waitContext.Done():
			if pingErr != nil {
				return fmt.Errorf("%w (socket=%q last_ping_err=%v)", waitContext.Err(), socketPath, pingErr)
			}
			if statErr != nil {
				return fmt.Errorf("%w (socket=%q last_stat_err=%v)", waitContext.Err(), socketPath, statErr)
			}
			return fmt.Errorf("%w (socket=%q)", waitContext.Err(), socketPath)
		}
	}
}
// additionalDriveRequests converts spec.Drives into drive requests, one per
// drive and in the same order. Every request is marked as a non-root device.
// The result is never nil, even when spec has no drives.
func additionalDriveRequests(spec MachineSpec) []driveRequest {
	requests := make([]driveRequest, len(spec.Drives))
	for i := range spec.Drives {
		source := spec.Drives[i]
		requests[i] = driveRequest{
			DriveID:      source.ID,
			IsReadOnly:   source.ReadOnly,
			IsRootDevice: false,
			PathOnHost:   source.Path,
		}
	}
	return requests
}
// cleanupStartedProcess kills the process behind command, if there is one.
// A nil command or a command without a process is a no-op. The result of the
// kill is ignored: this is best-effort cleanup.
func cleanupStartedProcess(command *exec.Cmd) {
	if command != nil && command.Process != nil {
		_ = command.Process.Kill()
	}
}
// emptyPIDFileError reports a PID file that exists but has no content yet,
// which is what a reader observes between the file's creation and the write
// of the PID.
type emptyPIDFileError struct {
	path string
}

// Error implements the error interface.
func (e emptyPIDFileError) Error() string {
	return fmt.Sprintf("pid file %q is empty", e.path)
}

// readPIDFile reads pidFilePath and returns the positive PID stored in it.
// Surrounding whitespace is ignored.
//
// Errors:
//   - the error from os.ReadFile, unwrapped, when the file cannot be read (so
//     callers can classify it with os.IsNotExist);
//   - emptyPIDFileError when the file is empty or contains only whitespace;
//   - a wrapped parse error when the content is not an integer;
//   - an error when the parsed value is less than 1.
func readPIDFile(pidFilePath string) (int, error) {
	payload, err := os.ReadFile(pidFilePath)
	if err != nil {
		return 0, err
	}
	text := strings.TrimSpace(string(payload))
	if text == "" {
		return 0, emptyPIDFileError{path: pidFilePath}
	}
	pid, err := strconv.Atoi(text)
	if err != nil {
		return 0, fmt.Errorf("parse pid file %q: %w", pidFilePath, err)
	}
	if pid < 1 {
		return 0, fmt.Errorf("pid file %q must contain a positive pid", pidFilePath)
	}
	return pid, nil
}

// waitForPIDFile polls pidFilePath every defaultFirecrackerPollInterval until
// it contains a valid PID, and returns that PID.
//
// A missing file and an empty file are both treated as "not written yet" and
// retried; any other read or parse error is returned immediately. Waiting
// stops when ctx is done or defaultFirecrackerInitTimeout elapses, in which
// case the context error is returned, annotated with the PID file path and
// the last read error, if any.
func waitForPIDFile(ctx context.Context, pidFilePath string) (int, error) {
	waitContext, cancel := context.WithTimeout(ctx, defaultFirecrackerInitTimeout)
	defer cancel()

	ticker := time.NewTicker(defaultFirecrackerPollInterval)
	defer ticker.Stop()

	var lastErr error
	for {
		select {
		case <-waitContext.Done():
			if lastErr != nil {
				return 0, fmt.Errorf("%w (pid_file=%q last_err=%v)", waitContext.Err(), pidFilePath, lastErr)
			}
			return 0, fmt.Errorf("%w (pid_file=%q)", waitContext.Err(), pidFilePath)
		case <-ticker.C:
			pid, err := readPIDFile(pidFilePath)
			if err == nil {
				return pid, nil
			}
			lastErr = err
			// The file may have been created but not yet filled in; an empty
			// file is as transient as a missing one.
			_, empty := err.(emptyPIDFileError)
			if empty || os.IsNotExist(err) {
				continue
			}
			return 0, err
		}
	}
}
// hostVSockPath returns the path of the machine's vsock socket as seen from
// outside the jail: paths.ChrootRootDir, then defaultFirecrackerSocketDir,
// then the base name of spec.Vsock.Path (whitespace-trimmed). It returns ""
// when spec has no vsock configured.
//
// NOTE(review): jailedVSockPath places the socket under defaultVSockRunDir;
// confirm defaultFirecrackerSocketDir names the same in-jail directory.
func hostVSockPath(paths machinePaths, spec MachineSpec) string {
	if spec.Vsock != nil {
		socketName := filepath.Base(strings.TrimSpace(spec.Vsock.Path))
		return filepath.Join(paths.ChrootRootDir, defaultFirecrackerSocketDir, socketName)
	}
	return ""
}
// jailedVSockPath returns the vsock socket path as seen from inside the jail:
// defaultVSockRunDir joined with the base name of spec.Vsock.Path
// (whitespace-trimmed). It returns "" when spec has no vsock configured.
func jailedVSockPath(spec MachineSpec) string {
	if spec.Vsock != nil {
		socketName := filepath.Base(strings.TrimSpace(spec.Vsock.Path))
		return path.Join(defaultVSockRunDir, socketName)
	}
	return ""
}
// linkMachineFile creates target as a hard link to source. Symbolic links in
// source are resolved first, so target refers to the underlying file rather
// than to a symlink. Errors from resolving or linking are returned unchanged.
func linkMachineFile(source string, target string) error {
	realSource, err := filepath.EvalSymlinks(source)
	if err == nil {
		err = os.Link(realSource, target)
	}
	return err
}
// rootDriveRequest builds the drive request for the machine's root
// filesystem: a writable root device with ID defaultRootDriveID, backed by
// spec.RootFSPath.
func rootDriveRequest(spec MachineSpec) driveRequest {
	var request driveRequest
	request.DriveID = defaultRootDriveID
	request.PathOnHost = spec.RootFSPath
	request.IsRootDevice = true
	request.IsReadOnly = false
	return request
}
// stagedFileName returns the name under which filePath is staged: the base
// name of the path with surrounding whitespace trimmed.
//
// An error is returned when the path does not name a file: an empty or blank
// path, ".", the root separator, or a path whose final element is "..". The
// last case matters because joining ".." onto a staging directory resolves to
// that directory's parent.
func stagedFileName(filePath string) (string, error) {
	name := filepath.Base(strings.TrimSpace(filePath))
	switch name {
	case "", ".", string(filepath.Separator):
		return "", fmt.Errorf("file path is required")
	case "..":
		return "", fmt.Errorf("file path %q has no file name", filePath)
	}
	return name, nil
}
// stageSnapshotFile hard-links sourcePath into chrootRootDir under name and
// returns name. If linking fails, it returns "" and the error from
// linkMachineFile unchanged.
func stageSnapshotFile(sourcePath string, chrootRootDir string, name string) (string, error) {
	err := linkMachineFile(sourcePath, filepath.Join(chrootRootDir, name))
	if err != nil {
		return "", err
	}
	return name, nil
}