host api alignment (#7)

* feat: add Firecracker API client methods for VM pause/resume and snapshots

Add PatchVm, GetVm, PutSnapshotCreate, and PutSnapshotLoad methods to the
API client, along with supporting types (VmState, SnapshotCreateParams,
SnapshotLoadParams, MemBackend).

* feat: add snapshot data layer - contract types, model, store, config

Add SnapshotID and snapshot contract types, SnapshotRecord model,
store interface CRUD methods with file store implementation,
snapshot paths helper, SnapshotsDir config, and directory creation.

* feat: add runtime methods for VM pause, resume, snapshot, and restore

Implement Pause, Resume, CreateSnapshot, and RestoreBoot on the
firecracker Runtime. RestoreBoot launches a jailer, stages snapshot
files into the chroot, loads the snapshot, and resumes the VM.

* feat: add daemon snapshot create, restore, and reconciliation logic

Implement CreateSnapshot (pause, snapshot, COW-copy disk, resume),
RestoreSnapshot (COW-copy disk, RestoreBoot, wait for guest),
GetSnapshot, ListSnapshots, DeleteSnapshotByID, and crash recovery
reconciliation for snapshot and restore operations.

* feat: add HTTP endpoints for snapshot create, get, list, delete, restore

Wire 5 snapshot routes: POST /machines/{id}/snapshots (create),
GET /machines/{id}/snapshots (list), GET /snapshots/{id} (get),
DELETE /snapshots/{id} (delete), POST /snapshots/{id}/restore (restore).

* fix: cross-device rename, restore network, and snapshot cleanup

- Replace os.Rename with copy+remove for moving snapshot files out of
  /proc/<pid>/root/ (cross-device link error on Linux)
- Reconfigure network interface after snapshot load so the restored VM
  uses its own tap device instead of the source VM's
- Clean partial snapshot dirs immediately on failure instead of only
  via reconcile
- Reject snapshot requests while a machine operation is already pending

* fix: test and modify snapshot runtime

* feat: snapshot lifecycle update, align runtime issues between host image
and daemon
This commit is contained in:
Hari 2026-04-08 22:21:46 -04:00 committed by GitHub
parent 9382de7eba
commit b5c97aef07
17 changed files with 1287 additions and 20 deletions

View file

@ -146,6 +146,69 @@ func (c *apiClient) PutVsock(ctx context.Context, spec VsockSpec) error {
return c.do(ctx, http.MethodPut, "/vsock", body, nil, http.StatusNoContent)
}
// VmState is the value carried in the "state" field of the body that
// PatchVm sends to PATCH /vm.
type VmState string
// States that callers pass to PatchVm.
const (
// VmStatePaused is the state Runtime.Pause requests.
VmStatePaused VmState = "Paused"
// VmStateResumed is the state Runtime.Resume requests.
VmStateResumed VmState = "Resumed"
)
// vmRequest is the JSON request body PatchVm passes to c.do for PATCH /vm.
type vmRequest struct {
// State is serialized as "state".
State VmState `json:"state"`
}
// vmResponse is the value GetVm hands to c.do as the response output.
// Only the JSON "state" field is mapped.
type vmResponse struct {
// State is read from "state"; it is kept as a plain string rather than VmState.
State string `json:"state"`
}
// SnapshotCreateParams is the JSON request body PutSnapshotCreate sends to
// PUT /snapshot/create.
type SnapshotCreateParams struct {
// MemFilePath is serialized as "mem_file_path"; Runtime.CreateSnapshot
// fills it from SnapshotPaths.MemFilePath.
MemFilePath string `json:"mem_file_path"`
// SnapshotPath is serialized as "snapshot_path"; Runtime.CreateSnapshot
// fills it from SnapshotPaths.StateFilePath.
SnapshotPath string `json:"snapshot_path"`
// SnapshotType is serialized as "snapshot_type"; Runtime.CreateSnapshot
// always sends "Full".
SnapshotType string `json:"snapshot_type"`
}
// SnapshotLoadParams is the JSON request body PutSnapshotLoad sends to
// PUT /snapshot/load.
type SnapshotLoadParams struct {
// SnapshotPath is serialized as "snapshot_path".
SnapshotPath string `json:"snapshot_path"`
// MemBackend is serialized as "mem_backend" and omitted when nil.
MemBackend *MemBackend `json:"mem_backend,omitempty"`
// ResumeVm is serialized as "resume_vm". It has no omitempty, so false is
// sent explicitly.
ResumeVm bool `json:"resume_vm"`
// NetworkOverrides is serialized as "network_overrides" and omitted when empty.
NetworkOverrides []NetworkOverride `json:"network_overrides,omitempty"`
// VsockOverride is serialized as "vsock_override" and omitted when nil.
VsockOverride *VsockOverride `json:"vsock_override,omitempty"`
}
// MemBackend is the "mem_backend" object of a snapshot load request.
type MemBackend struct {
// BackendType is serialized as "backend_type"; Runtime.RestoreBoot sends "File".
BackendType string `json:"backend_type"`
// BackendPath is serialized as "backend_path"; Runtime.RestoreBoot sends the
// staged memory file name.
BackendPath string `json:"backend_path"`
}
// NetworkOverride is one entry of "network_overrides" in a snapshot load request.
type NetworkOverride struct {
// IfaceID is serialized as "iface_id"; Runtime.RestoreBoot fills it from the
// network allocation's InterfaceID.
IfaceID string `json:"iface_id"`
// HostDevName is serialized as "host_dev_name"; Runtime.RestoreBoot fills it
// from the network allocation's TapName.
HostDevName string `json:"host_dev_name"`
}
// VsockOverride is the "vsock_override" object of a snapshot load request.
type VsockOverride struct {
// UDSPath is serialized as "uds_path".
// NOTE(review): presumably the Unix domain socket path for the restored
// VM's vsock device — confirm against the Firecracker version in use.
UDSPath string `json:"uds_path"`
}
// PatchVm sends PATCH /vm with the requested state in the body and expects
// 204 No Content in return.
func (c *apiClient) PatchVm(ctx context.Context, state VmState) error {
	payload := vmRequest{State: state}
	return c.do(ctx, http.MethodPatch, "/vm", payload, nil, http.StatusNoContent)
}
// GetVm fetches the current instance state from Firecracker.
//
// It returns the decoded response on 200 OK, or nil and the error reported
// by c.do otherwise.
func (c *apiClient) GetVm(ctx context.Context) (*vmResponse, error) {
	// Firecracker exposes instance information (including "state") on GET /.
	// The /vm resource only accepts PATCH, so a GET on /vm is rejected as an
	// invalid method/path and this call could never succeed.
	var response vmResponse
	if err := c.do(ctx, http.MethodGet, "/", nil, &response, http.StatusOK); err != nil {
		return nil, err
	}
	return &response, nil
}
// PutSnapshotCreate sends params as the body of PUT /snapshot/create and
// expects 204 No Content in return.
func (c *apiClient) PutSnapshotCreate(ctx context.Context, params SnapshotCreateParams) error {
	const endpoint = "/snapshot/create"
	return c.do(ctx, http.MethodPut, endpoint, params, nil, http.StatusNoContent)
}
// PutSnapshotLoad sends params as the body of PUT /snapshot/load and expects
// 204 No Content in return.
func (c *apiClient) PutSnapshotLoad(ctx context.Context, params SnapshotLoadParams) error {
	const endpoint = "/snapshot/load"
	return c.do(ctx, http.MethodPut, endpoint, params, nil, http.StatusNoContent)
}
func (c *apiClient) do(ctx context.Context, method string, endpoint string, input any, output any, wantStatus int) error {
var body io.Reader
if input != nil {

View file

@ -0,0 +1,54 @@
package firecracker
import (
"context"
"io"
"net/http"
"testing"
)
// TestPutSnapshotLoadIncludesNetworkOverrides checks that PutSnapshotLoad
// issues its request to /snapshot/load and that the JSON body carries the
// network overrides while omitting the unset vsock override.
func TestPutSnapshotLoadIncludesNetworkOverrides(t *testing.T) {
	var (
		gotPath string
		gotBody string
	)

	socketPath, shutdown := startUnixSocketServer(t, func(w http.ResponseWriter, r *http.Request) {
		body, err := io.ReadAll(r.Body)
		if err != nil {
			// HTTP handlers run on a server goroutine, and t.Fatalf (FailNow)
			// may only be called from the goroutine running the test. Record
			// the failure and reply with an error status so the client call
			// fails on the test goroutine instead.
			t.Errorf("read request body: %v", err)
			http.Error(w, "read request body", http.StatusInternalServerError)
			return
		}
		gotPath = r.URL.Path
		gotBody = string(body)
		w.WriteHeader(http.StatusNoContent)
	})
	defer shutdown()

	client := newAPIClient(socketPath)
	err := client.PutSnapshotLoad(context.Background(), SnapshotLoadParams{
		SnapshotPath: "vmstate.bin",
		MemBackend: &MemBackend{
			BackendType: "File",
			BackendPath: "memory.bin",
		},
		ResumeVm: false,
		NetworkOverrides: []NetworkOverride{
			{
				IfaceID:     "net0",
				HostDevName: "fctap7",
			},
		},
	})
	if err != nil {
		t.Fatalf("put snapshot load: %v", err)
	}

	if gotPath != "/snapshot/load" {
		t.Fatalf("request path mismatch: got %q want %q", gotPath, "/snapshot/load")
	}

	// The exact string also pins field order and the absence of "vsock_override".
	want := "{\"snapshot_path\":\"vmstate.bin\",\"mem_backend\":{\"backend_type\":\"File\",\"backend_path\":\"memory.bin\"},\"resume_vm\":false,\"network_overrides\":[{\"iface_id\":\"net0\",\"host_dev_name\":\"fctap7\"}]}"
	if gotBody != want {
		t.Fatalf("request body mismatch:\n got: %s\nwant: %s", gotBody, want)
	}
}

View file

@ -272,3 +272,11 @@ func stagedFileName(filePath string) (string, error) {
}
return name, nil
}
// stageSnapshotFile makes sourcePath available inside the jail by linking it
// to chrootRootDir/name via linkMachineFile. On success it returns name, the
// path of the file relative to the chroot root; on failure it returns the
// empty string and the linking error.
func stageSnapshotFile(sourcePath string, chrootRootDir string, name string) (string, error) {
	err := linkMachineFile(sourcePath, filepath.Join(chrootRootDir, name))
	if err != nil {
		return "", err
	}
	return name, nil
}

View file

@ -67,3 +67,18 @@ func buildMachinePaths(rootDir string, id MachineID, firecrackerBinaryPath strin
// procSocketPath returns the host-side path of the Firecracker API socket for
// the process pid, reached through that process's /proc/<pid>/root view.
func procSocketPath(pid int) string {
	procRoot := filepath.Join("/proc", strconv.Itoa(pid), "root")
	return filepath.Join(procRoot, defaultFirecrackerSocketDir, defaultFirecrackerSocketName)
}
// snapshotPaths groups the on-disk locations that belong to one snapshot.
type snapshotPaths struct {
	// BaseDir is the snapshot's own directory: <rootDir>/snapshots/<id>.
	BaseDir string
	// MemFilePath is BaseDir/memory.bin.
	MemFilePath string
	// StateFilePath is BaseDir/vmstate.bin.
	StateFilePath string
}

// buildSnapshotPaths derives the directory and file paths for snapshot id
// under rootDir. It only computes paths; nothing is created on disk.
func buildSnapshotPaths(rootDir string, id string) snapshotPaths {
	var paths snapshotPaths
	paths.BaseDir = filepath.Join(rootDir, "snapshots", id)
	paths.MemFilePath = filepath.Join(paths.BaseDir, "memory.bin")
	paths.StateFilePath = filepath.Join(paths.BaseDir, "vmstate.bin")
	return paths
}

View file

@ -220,6 +220,155 @@ func (r *Runtime) Delete(ctx context.Context, state MachineState) error {
return nil
}
// Pause asks the Firecracker instance behind state.SocketPath to enter the
// Paused state.
func (r *Runtime) Pause(ctx context.Context, state MachineState) error {
	api := newAPIClient(state.SocketPath)
	if err := api.PatchVm(ctx, VmStatePaused); err != nil {
		return err
	}
	return nil
}
// Resume asks the Firecracker instance behind state.SocketPath to enter the
// Resumed state.
func (r *Runtime) Resume(ctx context.Context, state MachineState) error {
	api := newAPIClient(state.SocketPath)
	if err := api.PatchVm(ctx, VmStateResumed); err != nil {
		return err
	}
	return nil
}
// CreateSnapshot requests a "Full" snapshot from the Firecracker instance
// behind state.SocketPath, writing guest memory to paths.MemFilePath and VM
// state to paths.StateFilePath.
func (r *Runtime) CreateSnapshot(ctx context.Context, state MachineState, paths SnapshotPaths) error {
	request := SnapshotCreateParams{
		MemFilePath:  paths.MemFilePath,
		SnapshotPath: paths.StateFilePath,
		SnapshotType: "Full",
	}
	api := newAPIClient(state.SocketPath)
	return api.PutSnapshotCreate(ctx, request)
}
// RestoreBoot starts a new jailed Firecracker process for loadSpec.ID and
// restores it from a snapshot.
//
// Steps, in order: choose the network (loadSpec.Network if set, otherwise a
// fresh allocation avoiding usedNetworks), create the log directory, ensure
// the network is provisioned, launch the jailer, wait for the PID file and
// API socket, link the snapshot files, root filesystem and extra drives into
// the chroot, load the snapshot with the tap device overridden, and resume.
//
// On any failure after the network has been chosen, everything acquired so
// far is torn down (unless preserveFailureArtifacts reports true) and the
// error is returned. On success the returned state is in PhaseRunning.
func (r *Runtime) RestoreBoot(ctx context.Context, loadSpec SnapshotLoadSpec, usedNetworks []NetworkAllocation) (*MachineState, error) {
	var network NetworkAllocation
	if loadSpec.Network == nil {
		allocated, err := r.networkAllocator.Allocate(usedNetworks)
		if err != nil {
			return nil, err
		}
		network = allocated
	} else {
		network = *loadSpec.Network
	}

	// Resources acquired so far. Each is assigned only once its step has
	// succeeded, so abort tears down exactly what exists at that point.
	var (
		jailPaths machinePaths
		jailer    *exec.Cmd
		vmPID     int
	)
	abort := func(cause error) (*MachineState, error) {
		if !preserveFailureArtifacts() {
			cleanupRunningProcess(vmPID)
			cleanupStartedProcess(jailer)
			// Cleanup must still run when ctx is already cancelled.
			_ = r.networkProvisioner.Remove(context.Background(), network)
			if jailPaths.BaseDir != "" {
				_ = os.RemoveAll(jailPaths.BaseDir)
			}
		}
		return nil, cause
	}

	builtPaths, err := buildMachinePaths(r.rootDir, loadSpec.ID, r.firecrackerBinaryPath)
	if err != nil {
		return abort(err)
	}
	jailPaths = builtPaths

	if err := os.MkdirAll(jailPaths.LogDir, 0o755); err != nil {
		return abort(fmt.Errorf("create machine log dir %q: %w", jailPaths.LogDir, err))
	}
	if err := r.networkProvisioner.Ensure(ctx, network); err != nil {
		return abort(err)
	}

	launched, err := launchJailedFirecracker(jailPaths, loadSpec.ID, r.firecrackerBinaryPath, r.jailerBinaryPath)
	if err != nil {
		return abort(err)
	}
	jailer = launched

	pid, err := waitForPIDFile(ctx, jailPaths.PIDFilePath)
	if err != nil {
		return abort(fmt.Errorf("wait for firecracker pid: %w", err))
	}
	vmPID = pid

	socketPath := procSocketPath(vmPID)
	client := newAPIClient(socketPath)
	if err := waitForSocket(ctx, client, socketPath); err != nil {
		return abort(fmt.Errorf("wait for firecracker socket: %w", err))
	}

	// Snapshot files go into the chroot under fixed names.
	memName, err := stageSnapshotFile(loadSpec.MemFilePath, jailPaths.ChrootRootDir, "memory.bin")
	if err != nil {
		return abort(fmt.Errorf("stage memory file: %w", err))
	}
	stateName, err := stageSnapshotFile(loadSpec.SnapshotPath, jailPaths.ChrootRootDir, "vmstate.bin")
	if err != nil {
		return abort(fmt.Errorf("stage vmstate file: %w", err))
	}

	// Root filesystem.
	rootFSName, err := stagedFileName(loadSpec.RootFSPath)
	if err != nil {
		return abort(fmt.Errorf("rootfs path: %w", err))
	}
	rootFSTarget := filepath.Join(jailPaths.ChrootRootDir, rootFSName)
	if err := linkMachineFile(loadSpec.RootFSPath, rootFSTarget); err != nil {
		return abort(fmt.Errorf("link rootfs into jail: %w", err))
	}

	// Additional drives.
	for driveID, drivePath := range loadSpec.DiskPaths {
		driveName, err := stagedFileName(drivePath)
		if err != nil {
			return abort(fmt.Errorf("drive %q path: %w", driveID, err))
		}
		driveTarget := filepath.Join(jailPaths.ChrootRootDir, driveName)
		if err := linkMachineFile(drivePath, driveTarget); err != nil {
			return abort(fmt.Errorf("link drive %q into jail: %w", driveID, err))
		}
	}

	// Loading the snapshot takes the place of the configure+start sequence.
	// The VM is loaded paused and its interface is pointed at this machine's
	// own tap device.
	loadParams := SnapshotLoadParams{
		SnapshotPath: stateName,
		MemBackend: &MemBackend{
			BackendType: "File",
			BackendPath: memName,
		},
		ResumeVm: false,
		NetworkOverrides: []NetworkOverride{
			{
				IfaceID:     network.InterfaceID,
				HostDevName: network.TapName,
			},
		},
	}
	if err := client.PutSnapshotLoad(ctx, loadParams); err != nil {
		return abort(fmt.Errorf("load snapshot: %w", err))
	}

	// Resume the restored VM explicitly.
	if err := client.PatchVm(ctx, VmStateResumed); err != nil {
		return abort(fmt.Errorf("resume restored vm: %w", err))
	}

	startedAt := time.Now().UTC()
	return &MachineState{
		ID:          loadSpec.ID,
		Phase:       PhaseRunning,
		PID:         vmPID,
		RuntimeHost: network.GuestIP().String(),
		SocketPath:  socketPath,
		TapName:     network.TapName,
		StartedAt:   &startedAt,
	}, nil
}
func processExists(pid int) bool {
if pid < 1 {
return false

View file

@ -5,6 +5,27 @@ import "time"
// Phase represents the lifecycle phase of a local microVM.
// It is a defined string type, so phase values are plain strings underneath.
type Phase string
// SnapshotPaths holds the file paths for a VM snapshot.
// Runtime.CreateSnapshot forwards both paths in its snapshot-create request.
type SnapshotPaths struct {
// MemFilePath is sent as the request's mem_file_path.
MemFilePath string
// StateFilePath is sent as the request's snapshot_path.
StateFilePath string
}
// SnapshotLoadSpec describes what is needed to restore a VM from a snapshot.
// It is the input to Runtime.RestoreBoot.
type SnapshotLoadSpec struct {
// ID identifies the machine being restored; RestoreBoot builds the machine
// paths and launches the jailer with it.
ID MachineID
// SnapshotPath is the host path of the VM state file; RestoreBoot stages it
// into the chroot as "vmstate.bin".
SnapshotPath string
// MemFilePath is the host path of the memory file; RestoreBoot stages it
// into the chroot as "memory.bin".
MemFilePath string
DiskPaths map[string]string // drive ID -> host path
// RootFSPath is the host path of the root filesystem image; RestoreBoot
// links it into the chroot.
RootFSPath string
// NOTE(review): KernelImagePath, VCPUs, MemoryMiB, KernelArgs and Vsock are
// not read by RestoreBoot as it appears alongside this type — confirm where
// (or whether) they are consumed.
KernelImagePath string
VCPUs int64
MemoryMiB int64
KernelArgs string
Vsock *VsockSpec
// Network, when non-nil, is used as the VM's network allocation; when nil,
// RestoreBoot allocates one itself.
Network *NetworkAllocation
}
// MachineState describes the current host local state for a machine.
type MachineState struct {
ID MachineID