host api alignment (#7)

* feat: add Firecracker API client methods for VM pause/resume and snapshots

Add PatchVm, GetVm, PutSnapshotCreate, and PutSnapshotLoad methods to the
API client, along with supporting types (VmState, SnapshotCreateParams,
SnapshotLoadParams, MemBackend).

* feat: add snapshot data layer - contract types, model, store, config

Add SnapshotID and snapshot contract types, SnapshotRecord model,
store interface CRUD methods with file store implementation,
snapshot paths helper, SnapshotsDir config, and directory creation.

* feat: add runtime methods for VM pause, resume, snapshot, and restore

Implement Pause, Resume, CreateSnapshot, and RestoreBoot on the
firecracker Runtime. RestoreBoot launches a jailer, stages snapshot
files into the chroot, loads the snapshot, and resumes the VM.

* feat: add daemon snapshot create, restore, and reconciliation logic

Implement CreateSnapshot (pause, snapshot, COW-copy disk, resume),
RestoreSnapshot (COW-copy disk, RestoreBoot, wait for guest),
GetSnapshot, ListSnapshots, DeleteSnapshotByID, and crash recovery
reconciliation for snapshot and restore operations.

* feat: add HTTP endpoints for snapshot create, get, list, delete, restore

Wire 5 snapshot routes: POST /machines/{id}/snapshots (create),
GET /machines/{id}/snapshots (list), GET /snapshots/{id} (get),
DELETE /snapshots/{id} (delete), POST /snapshots/{id}/restore (restore).

* fix: cross-device rename, restore network, and snapshot cleanup

- Replace os.Rename with copy+remove for moving snapshot files out of
  /proc/<pid>/root/ (cross-device link error on Linux)
- Reconfigure network interface after snapshot load so the restored VM
  uses its own tap device instead of the source VM's
- Clean partial snapshot dirs immediately on failure instead of only
  via reconcile
- Reject snapshot requests while a machine operation is already pending

* fix: test and modify snapshot runtime

* feat: snapshot lifecycle update; align runtime behavior between the host
image and the daemon
This commit is contained in:
Hari 2026-04-08 22:21:46 -04:00 committed by GitHub
parent 9382de7eba
commit b5c97aef07
17 changed files with 1287 additions and 20 deletions

View file

@ -29,6 +29,10 @@ type Runtime interface {
Boot(context.Context, firecracker.MachineSpec, []firecracker.NetworkAllocation) (*firecracker.MachineState, error)
Inspect(firecracker.MachineState) (*firecracker.MachineState, error)
Delete(context.Context, firecracker.MachineState) error
Pause(context.Context, firecracker.MachineState) error
Resume(context.Context, firecracker.MachineState) error
CreateSnapshot(context.Context, firecracker.MachineState, firecracker.SnapshotPaths) error
RestoreBoot(context.Context, firecracker.SnapshotLoadSpec, []firecracker.NetworkAllocation) (*firecracker.MachineState, error)
}
type Daemon struct {
@ -36,6 +40,8 @@ type Daemon struct {
store store.Store
runtime Runtime
reconfigureGuestIdentity func(context.Context, string, contracthost.MachineID) error
locksMu sync.Mutex
machineLocks map[contracthost.MachineID]*sync.Mutex
artifactLocks map[string]*sync.Mutex
@ -51,18 +57,20 @@ func New(cfg appconfig.Config, store store.Store, runtime Runtime) (*Daemon, err
if runtime == nil {
return nil, fmt.Errorf("runtime is required")
}
for _, dir := range []string{cfg.ArtifactsDir, cfg.MachineDisksDir, cfg.RuntimeDir} {
for _, dir := range []string{cfg.ArtifactsDir, cfg.MachineDisksDir, cfg.SnapshotsDir, cfg.RuntimeDir} {
if err := os.MkdirAll(dir, 0o755); err != nil {
return nil, fmt.Errorf("create daemon dir %q: %w", dir, err)
}
}
daemon := &Daemon{
config: cfg,
store: store,
runtime: runtime,
machineLocks: make(map[contracthost.MachineID]*sync.Mutex),
artifactLocks: make(map[string]*sync.Mutex),
config: cfg,
store: store,
runtime: runtime,
reconfigureGuestIdentity: nil,
machineLocks: make(map[contracthost.MachineID]*sync.Mutex),
artifactLocks: make(map[string]*sync.Mutex),
}
daemon.reconfigureGuestIdentity = daemon.reconfigureGuestIdentityOverSSH
if err := daemon.ensureBackendSSHKeyPair(); err != nil {
return nil, err
}

View file

@ -15,15 +15,18 @@ import (
appconfig "github.com/getcompanion-ai/computer-host/internal/config"
"github.com/getcompanion-ai/computer-host/internal/firecracker"
"github.com/getcompanion-ai/computer-host/internal/model"
"github.com/getcompanion-ai/computer-host/internal/store"
contracthost "github.com/getcompanion-ai/computer-host/contract"
)
type fakeRuntime struct {
bootState firecracker.MachineState
bootCalls int
deleteCalls []firecracker.MachineState
lastSpec firecracker.MachineSpec
bootState firecracker.MachineState
bootCalls int
restoreCalls int
deleteCalls []firecracker.MachineState
lastSpec firecracker.MachineSpec
lastLoadSpec firecracker.SnapshotLoadSpec
}
func (f *fakeRuntime) Boot(_ context.Context, spec firecracker.MachineSpec, _ []firecracker.NetworkAllocation) (*firecracker.MachineState, error) {
@ -43,6 +46,24 @@ func (f *fakeRuntime) Delete(_ context.Context, state firecracker.MachineState)
return nil
}
// Pause is a no-op stub satisfying the Runtime interface for tests.
func (f *fakeRuntime) Pause(_ context.Context, _ firecracker.MachineState) error {
	return nil
}
// Resume is a no-op stub satisfying the Runtime interface for tests.
func (f *fakeRuntime) Resume(_ context.Context, _ firecracker.MachineState) error {
	return nil
}
// CreateSnapshot is a no-op stub satisfying the Runtime interface for tests.
func (f *fakeRuntime) CreateSnapshot(_ context.Context, _ firecracker.MachineState, _ firecracker.SnapshotPaths) error {
	return nil
}
// RestoreBoot records the call count and load spec so tests can assert how
// the daemon invoked it, then returns the canned bootState.
func (f *fakeRuntime) RestoreBoot(_ context.Context, spec firecracker.SnapshotLoadSpec, _ []firecracker.NetworkAllocation) (*firecracker.MachineState, error) {
	f.restoreCalls++
	f.lastLoadSpec = spec
	return &f.bootState, nil
}
func TestCreateMachineStagesArtifactsAndPersistsState(t *testing.T) {
root := t.TempDir()
cfg := testConfig(root)
@ -223,6 +244,219 @@ func TestNewEnsuresBackendSSHKeyPair(t *testing.T) {
}
}
// TestRestoreSnapshotRejectsRunningSourceMachine verifies that restoring a
// snapshot is refused while the snapshot's source machine is still running,
// that the runtime's RestoreBoot is never invoked, and that the handled
// rejection leaves no entry behind in the operation journal.
func TestRestoreSnapshotRejectsRunningSourceMachine(t *testing.T) {
	root := t.TempDir()
	cfg := testConfig(root)
	fileStore, err := store.NewFileStore(cfg.StatePath, cfg.OperationsPath)
	if err != nil {
		t.Fatalf("create file store: %v", err)
	}
	runtime := &fakeRuntime{}
	hostDaemon, err := New(cfg, fileStore, runtime)
	if err != nil {
		t.Fatalf("create daemon: %v", err)
	}
	// Stub out SSH-based guest reconfiguration; it is irrelevant here.
	hostDaemon.reconfigureGuestIdentity = func(context.Context, string, contracthost.MachineID) error { return nil }
	artifactRef := contracthost.ArtifactRef{KernelImageURL: "kernel", RootFSURL: "rootfs"}
	kernelPath := filepath.Join(root, "artifact-kernel")
	if err := os.WriteFile(kernelPath, []byte("kernel"), 0o644); err != nil {
		t.Fatalf("write kernel: %v", err)
	}
	if err := fileStore.PutArtifact(context.Background(), model.ArtifactRecord{
		Ref:             artifactRef,
		LocalKey:        "artifact",
		LocalDir:        filepath.Join(root, "artifact"),
		KernelImagePath: kernelPath,
		RootFSPath:      filepath.Join(root, "artifact-rootfs"),
		CreatedAt:       time.Now().UTC(),
	}); err != nil {
		t.Fatalf("put artifact: %v", err)
	}
	// The source machine is recorded as running, which must block the restore.
	if err := fileStore.CreateMachine(context.Background(), model.MachineRecord{
		ID:             "source",
		Artifact:       artifactRef,
		SystemVolumeID: "source-system",
		RuntimeHost:    "172.16.0.2",
		TapDevice:      "fctap0",
		Phase:          contracthost.MachinePhaseRunning,
		CreatedAt:      time.Now().UTC(),
	}); err != nil {
		t.Fatalf("create source machine: %v", err)
	}
	snapDisk := filepath.Join(root, "snapshots", "snap1", "system.img")
	if err := os.MkdirAll(filepath.Dir(snapDisk), 0o755); err != nil {
		t.Fatalf("create snapshot dir: %v", err)
	}
	if err := os.WriteFile(snapDisk, []byte("disk"), 0o644); err != nil {
		t.Fatalf("write snapshot disk: %v", err)
	}
	if err := fileStore.CreateSnapshot(context.Background(), model.SnapshotRecord{
		ID:                "snap1",
		MachineID:         "source",
		Artifact:          artifactRef,
		MemFilePath:       filepath.Join(root, "snapshots", "snap1", "memory.bin"),
		StateFilePath:     filepath.Join(root, "snapshots", "snap1", "vmstate.bin"),
		DiskPaths:         []string{snapDisk},
		SourceRuntimeHost: "172.16.0.2",
		SourceTapDevice:   "fctap0",
		CreatedAt:         time.Now().UTC(),
	}); err != nil {
		t.Fatalf("create snapshot: %v", err)
	}
	_, err = hostDaemon.RestoreSnapshot(context.Background(), "snap1", contracthost.RestoreSnapshotRequest{
		MachineID: "restored",
	})
	if err == nil {
		t.Fatal("expected restore rejection while source is running")
	}
	if !strings.Contains(err.Error(), `source machine "source" is running`) {
		t.Fatalf("unexpected restore error: %v", err)
	}
	if runtime.restoreCalls != 0 {
		t.Fatalf("restore boot should not run when source machine is still running: got %d", runtime.restoreCalls)
	}
	// A handled rejection must clear its journal entry rather than leave
	// work for crash-recovery reconciliation.
	ops, err := fileStore.ListOperations(context.Background())
	if err != nil {
		t.Fatalf("list operations: %v", err)
	}
	if len(ops) != 0 {
		t.Fatalf("operation journal should be empty after handled restore rejection: got %d entries", len(ops))
	}
}
// TestRestoreSnapshotUsesSnapshotMetadataWithoutSourceMachine restores a
// snapshot whose source machine record no longer exists and asserts that the
// network allocation, kernel path, and guest-identity reconfiguration are all
// driven purely by the snapshot's stored metadata, and that the journal is
// empty after success.
func TestRestoreSnapshotUsesSnapshotMetadataWithoutSourceMachine(t *testing.T) {
	root := t.TempDir()
	cfg := testConfig(root)
	fileStore, err := store.NewFileStore(cfg.StatePath, cfg.OperationsPath)
	if err != nil {
		t.Fatalf("create file store: %v", err)
	}
	// Local listeners stand in for the guest's SSH/VNC ports so the
	// guest-readiness probe against 127.0.0.1 succeeds.
	sshListener := listenTestPort(t, int(defaultSSHPort))
	defer sshListener.Close()
	vncListener := listenTestPort(t, int(defaultVNCPort))
	defer vncListener.Close()
	startedAt := time.Unix(1700000099, 0).UTC()
	runtime := &fakeRuntime{
		bootState: firecracker.MachineState{
			ID:          "restored",
			Phase:       firecracker.PhaseRunning,
			PID:         1234,
			RuntimeHost: "127.0.0.1",
			SocketPath:  filepath.Join(cfg.RuntimeDir, "machines", "restored", "root", "run", "firecracker.sock"),
			TapName:     "fctap0",
			StartedAt:   &startedAt,
		},
	}
	hostDaemon, err := New(cfg, fileStore, runtime)
	if err != nil {
		t.Fatalf("create daemon: %v", err)
	}
	// Capture the arguments passed to the guest-identity reconfiguration hook.
	var reconfiguredHost string
	var reconfiguredMachine contracthost.MachineID
	hostDaemon.reconfigureGuestIdentity = func(_ context.Context, host string, machineID contracthost.MachineID) error {
		reconfiguredHost = host
		reconfiguredMachine = machineID
		return nil
	}
	artifactRef := contracthost.ArtifactRef{KernelImageURL: "kernel", RootFSURL: "rootfs"}
	kernelPath := filepath.Join(root, "artifact-kernel")
	rootFSPath := filepath.Join(root, "artifact-rootfs")
	if err := os.WriteFile(kernelPath, []byte("kernel"), 0o644); err != nil {
		t.Fatalf("write kernel: %v", err)
	}
	if err := os.WriteFile(rootFSPath, []byte("rootfs"), 0o644); err != nil {
		t.Fatalf("write rootfs: %v", err)
	}
	if err := fileStore.PutArtifact(context.Background(), model.ArtifactRecord{
		Ref:             artifactRef,
		LocalKey:        "artifact",
		LocalDir:        filepath.Join(root, "artifact"),
		KernelImagePath: kernelPath,
		RootFSPath:      rootFSPath,
		CreatedAt:       time.Now().UTC(),
	}); err != nil {
		t.Fatalf("put artifact: %v", err)
	}
	snapDir := filepath.Join(root, "snapshots", "snap1")
	if err := os.MkdirAll(snapDir, 0o755); err != nil {
		t.Fatalf("create snapshot dir: %v", err)
	}
	snapDisk := filepath.Join(snapDir, "system.img")
	if err := os.WriteFile(snapDisk, []byte("disk"), 0o644); err != nil {
		t.Fatalf("write snapshot disk: %v", err)
	}
	if err := os.WriteFile(filepath.Join(snapDir, "memory.bin"), []byte("mem"), 0o644); err != nil {
		t.Fatalf("write memory snapshot: %v", err)
	}
	if err := os.WriteFile(filepath.Join(snapDir, "vmstate.bin"), []byte("state"), 0o644); err != nil {
		t.Fatalf("write vmstate snapshot: %v", err)
	}
	// Only the snapshot record exists — no machine record for "source".
	if err := fileStore.CreateSnapshot(context.Background(), model.SnapshotRecord{
		ID:                "snap1",
		MachineID:         "source",
		Artifact:          artifactRef,
		MemFilePath:       filepath.Join(snapDir, "memory.bin"),
		StateFilePath:     filepath.Join(snapDir, "vmstate.bin"),
		DiskPaths:         []string{snapDisk},
		SourceRuntimeHost: "172.16.0.2",
		SourceTapDevice:   "fctap0",
		CreatedAt:         time.Now().UTC(),
	}); err != nil {
		t.Fatalf("create snapshot: %v", err)
	}
	response, err := hostDaemon.RestoreSnapshot(context.Background(), "snap1", contracthost.RestoreSnapshotRequest{
		MachineID: "restored",
	})
	if err != nil {
		t.Fatalf("restore snapshot: %v", err)
	}
	if response.Machine.ID != "restored" {
		t.Fatalf("restored machine id mismatch: got %q", response.Machine.ID)
	}
	if runtime.restoreCalls != 1 {
		t.Fatalf("restore boot call count mismatch: got %d want 1", runtime.restoreCalls)
	}
	if runtime.lastLoadSpec.Network == nil {
		t.Fatal("restore boot did not receive snapshot network")
	}
	// The restore network must come from the snapshot's recorded guest IP.
	if got := runtime.lastLoadSpec.Network.GuestIP().String(); got != "172.16.0.2" {
		t.Fatalf("restored guest network mismatch: got %q want %q", got, "172.16.0.2")
	}
	if runtime.lastLoadSpec.KernelImagePath != kernelPath {
		t.Fatalf("restore boot kernel path mismatch: got %q want %q", runtime.lastLoadSpec.KernelImagePath, kernelPath)
	}
	if reconfiguredHost != "127.0.0.1" || reconfiguredMachine != "restored" {
		t.Fatalf("guest identity reconfigure mismatch: host=%q machine=%q", reconfiguredHost, reconfiguredMachine)
	}
	machine, err := fileStore.GetMachine(context.Background(), "restored")
	if err != nil {
		t.Fatalf("get restored machine: %v", err)
	}
	if machine.Phase != contracthost.MachinePhaseRunning {
		t.Fatalf("restored machine phase mismatch: got %q", machine.Phase)
	}
	ops, err := fileStore.ListOperations(context.Background())
	if err != nil {
		t.Fatalf("list operations: %v", err)
	}
	if len(ops) != 0 {
		t.Fatalf("operation journal should be empty after successful restore: got %d entries", len(ops))
	}
}
func TestCreateMachineRejectsNonHTTPArtifactURLs(t *testing.T) {
t.Parallel()
@ -282,6 +516,7 @@ func testConfig(root string) appconfig.Config {
OperationsPath: filepath.Join(root, "state", "ops.json"),
ArtifactsDir: filepath.Join(root, "artifacts"),
MachineDisksDir: filepath.Join(root, "machine-disks"),
SnapshotsDir: filepath.Join(root, "snapshots"),
RuntimeDir: filepath.Join(root, "runtime"),
SocketPath: filepath.Join(root, "firecracker-host.sock"),
EgressInterface: "eth0",

View file

@ -0,0 +1,59 @@
package daemon
import (
"context"
"fmt"
"os/exec"
"strconv"
"strings"
contracthost "github.com/getcompanion-ai/computer-host/contract"
)
// reconfigureGuestIdentityOverSSH rewrites the guest's machine-name file,
// hostname, and /etc/hosts over SSH so a restored VM assumes its new identity
// instead of the snapshot source's.
//
// The machine name is embedded into the remote script with shellSingleQuote
// rather than strconv.Quote: Go double-quote syntax is still subject to bash
// `$`/backtick expansion inside the sudo'd script, whereas single-quoting
// makes the value inert to the shell.
func (d *Daemon) reconfigureGuestIdentityOverSSH(ctx context.Context, runtimeHost string, machineID contracthost.MachineID) error {
	runtimeHost = strings.TrimSpace(runtimeHost)
	machineName := strings.TrimSpace(string(machineID))
	if runtimeHost == "" {
		return fmt.Errorf("guest runtime host is required")
	}
	if machineName == "" {
		return fmt.Errorf("machine id is required")
	}
	privateKeyPath := d.backendSSHPrivateKeyPath()
	// $machine_name expands inside the remote script; it was assigned from a
	// single-quoted literal above, so no further expansion can occur.
	remoteScript := fmt.Sprintf(`set -euo pipefail
machine_name=%s
printf '%%s\n' "$machine_name" >/etc/microagent/machine-name
printf '%%s\n' "$machine_name" >/etc/hostname
cat >/etc/hosts <<EOF
127.0.0.1 localhost
127.0.1.1 $machine_name
::1 localhost ip6-localhost ip6-loopback
ff02::1 ip6-allnodes
ff02::2 ip6-allrouters
EOF
hostname "$machine_name" >/dev/null 2>&1 || true
`, shellSingleQuote(machineName))
	cmd := exec.CommandContext(
		ctx,
		"ssh",
		"-i", privateKeyPath,
		"-o", "StrictHostKeyChecking=no",
		"-o", "UserKnownHostsFile=/dev/null",
		"-o", "IdentitiesOnly=yes",
		"-o", "BatchMode=yes",
		"-p", strconv.Itoa(int(defaultSSHPort)),
		"node@"+runtimeHost,
		"sudo bash -lc "+shellSingleQuote(remoteScript),
	)
	output, err := cmd.CombinedOutput()
	if err != nil {
		return fmt.Errorf("reconfigure guest identity over ssh: %w: %s", err, strings.TrimSpace(string(output)))
	}
	return nil
}
// shellSingleQuote wraps value in single quotes so it is safe to pass as a
// single shell word, escaping embedded single quotes with the '"'"' idiom.
func shellSingleQuote(value string) string {
	var quoted strings.Builder
	quoted.WriteByte('\'')
	for _, r := range value {
		if r == '\'' {
			quoted.WriteString(`'"'"'`)
			continue
		}
		quoted.WriteRune(r)
	}
	quoted.WriteByte('\'')
	return quoted.String()
}

View file

@ -128,6 +128,14 @@ func (d *Daemon) Reconcile(ctx context.Context) error {
if err := d.reconcileDelete(ctx, operation.MachineID); err != nil {
return err
}
case model.MachineOperationSnapshot:
if err := d.reconcileSnapshot(ctx, operation); err != nil {
return err
}
case model.MachineOperationRestore:
if err := d.reconcileRestore(ctx, operation); err != nil {
return err
}
default:
return fmt.Errorf("unsupported operation type %q", operation.Type)
}
@ -325,3 +333,36 @@ func (d *Daemon) detachVolumesForMachine(ctx context.Context, machineID contract
}
return nil
}
// reconcileSnapshot finishes crash recovery for an interrupted snapshot
// operation. A snapshot whose record exists completed before the crash and
// only needs its journal entry cleared; otherwise the partial on-disk
// snapshot directory is removed and the source VM — which may have been left
// paused mid-snapshot — is resumed on a best-effort basis.
func (d *Daemon) reconcileSnapshot(ctx context.Context, operation model.OperationRecord) error {
	dropJournal := func() error { return d.store.DeleteOperation(ctx, operation.MachineID) }
	if operation.SnapshotID == nil {
		return dropJournal()
	}
	if _, err := d.store.GetSnapshot(ctx, *operation.SnapshotID); err == nil {
		// Record exists: the snapshot finished successfully.
		return dropJournal()
	}
	// Incomplete snapshot: scrub any partially written files.
	_ = os.RemoveAll(filepath.Join(d.config.SnapshotsDir, string(*operation.SnapshotID)))
	// Best-effort resume in case the source machine is still paused.
	if record, err := d.store.GetMachine(ctx, operation.MachineID); err == nil && record.Phase == contracthost.MachinePhaseRunning && record.PID > 0 {
		_ = d.runtime.Resume(ctx, machineToRuntimeState(*record))
	}
	return dropJournal()
}
// reconcileRestore finishes crash recovery for an interrupted restore. If the
// machine record exists the restore completed; otherwise the half-created
// disk and runtime directories are removed. The journal entry is cleared in
// either case.
func (d *Daemon) reconcileRestore(ctx context.Context, operation model.OperationRecord) error {
	if _, err := d.store.GetMachine(ctx, operation.MachineID); err != nil {
		// No machine record: the restore never completed, so scrub leftovers.
		_ = os.RemoveAll(filepath.Dir(d.systemVolumePath(operation.MachineID)))
		_ = os.RemoveAll(d.machineRuntimeBaseDir(operation.MachineID))
	}
	return d.store.DeleteOperation(ctx, operation.MachineID)
}

407
internal/daemon/snapshot.go Normal file
View file

@ -0,0 +1,407 @@
package daemon
import (
"context"
"crypto/rand"
"encoding/hex"
"fmt"
"io"
"os"
"os/exec"
"path/filepath"
"strings"
"time"
"github.com/getcompanion-ai/computer-host/internal/firecracker"
"github.com/getcompanion-ai/computer-host/internal/model"
"github.com/getcompanion-ai/computer-host/internal/store"
contracthost "github.com/getcompanion-ai/computer-host/contract"
)
// CreateSnapshot captures a consistent snapshot of a running machine:
// journal the operation, pause the VM, have Firecracker write memory/vmstate
// inside its chroot, COW-copy the system disk while paused, resume the VM,
// move the snapshot files out of the chroot, and persist the record.
// On any failure the partial snapshot directory is removed immediately and,
// where the VM was paused, a resume is attempted. The journal entry is only
// cleared (via clearOperation) once a terminal outcome is reached; otherwise
// reconcileSnapshot handles cleanup after a crash.
func (d *Daemon) CreateSnapshot(ctx context.Context, machineID contracthost.MachineID) (*contracthost.CreateSnapshotResponse, error) {
	unlock := d.lockMachine(machineID)
	defer unlock()
	record, err := d.store.GetMachine(ctx, machineID)
	if err != nil {
		return nil, err
	}
	if record.Phase != contracthost.MachinePhaseRunning {
		return nil, fmt.Errorf("machine %q is not running", machineID)
	}
	// Reject if an operation is already pending for this machine
	// NOTE(review): a ListOperations error is silently ignored here, skipping
	// the pending-operation check — confirm that is intended.
	if ops, err := d.store.ListOperations(ctx); err == nil {
		for _, op := range ops {
			if op.MachineID == machineID {
				return nil, fmt.Errorf("machine %q has a pending %q operation (started %s)", machineID, op.Type, op.StartedAt.Format(time.RFC3339))
			}
		}
	}
	snapshotID := contracthost.SnapshotID(generateID())
	// Journal the operation first so a crash mid-snapshot can be reconciled.
	if err := d.store.UpsertOperation(ctx, model.OperationRecord{
		MachineID:  machineID,
		Type:       model.MachineOperationSnapshot,
		StartedAt:  time.Now().UTC(),
		SnapshotID: &snapshotID,
	}); err != nil {
		return nil, err
	}
	clearOperation := false
	defer func() {
		if clearOperation {
			// Background context: the journal must be cleared even if ctx is done.
			_ = d.store.DeleteOperation(context.Background(), machineID)
		}
	}()
	snapshotDir := filepath.Join(d.config.SnapshotsDir, string(snapshotID))
	if err := os.MkdirAll(snapshotDir, 0o755); err != nil {
		return nil, fmt.Errorf("create snapshot dir: %w", err)
	}
	runtimeState := machineToRuntimeState(*record)
	// Pause the VM
	if err := d.runtime.Pause(ctx, runtimeState); err != nil {
		return nil, fmt.Errorf("pause machine %q: %w", machineID, err)
	}
	// Write snapshot inside the chroot (Firecracker can only write there)
	// Use jailed paths relative to the chroot root
	chrootMemPath := "memory.bin"
	chrootStatePath := "vmstate.bin"
	if err := d.runtime.CreateSnapshot(ctx, runtimeState, firecracker.SnapshotPaths{
		MemFilePath:   chrootMemPath,
		StateFilePath: chrootStatePath,
	}); err != nil {
		// VM is paused at this point: try to resume before bailing out.
		_ = d.runtime.Resume(ctx, runtimeState)
		_ = os.RemoveAll(snapshotDir)
		return nil, fmt.Errorf("create snapshot for %q: %w", machineID, err)
	}
	// COW-copy disk files while paused for consistency
	var diskPaths []string
	systemVolume, err := d.store.GetVolume(ctx, record.SystemVolumeID)
	if err != nil {
		_ = d.runtime.Resume(ctx, runtimeState)
		_ = os.RemoveAll(snapshotDir)
		return nil, fmt.Errorf("get system volume: %w", err)
	}
	systemDiskTarget := filepath.Join(snapshotDir, "system.img")
	if err := cowCopyFile(systemVolume.Path, systemDiskTarget); err != nil {
		_ = d.runtime.Resume(ctx, runtimeState)
		_ = os.RemoveAll(snapshotDir)
		return nil, fmt.Errorf("copy system disk: %w", err)
	}
	diskPaths = append(diskPaths, systemDiskTarget)
	// Resume the source VM
	if err := d.runtime.Resume(ctx, runtimeState); err != nil {
		_ = os.RemoveAll(snapshotDir)
		return nil, fmt.Errorf("resume machine %q: %w", machineID, err)
	}
	// Copy snapshot files from chroot to snapshot directory, then remove originals.
	// os.Rename fails across filesystem boundaries (/proc/<pid>/root/ is on procfs).
	chrootRoot := filepath.Dir(filepath.Dir(runtimeState.SocketPath)) // strip /run/firecracker.socket
	srcMemPath := filepath.Join(chrootRoot, chrootMemPath)
	srcStatePath := filepath.Join(chrootRoot, chrootStatePath)
	dstMemPath := filepath.Join(snapshotDir, "memory.bin")
	dstStatePath := filepath.Join(snapshotDir, "vmstate.bin")
	if err := moveFile(srcMemPath, dstMemPath); err != nil {
		_ = os.RemoveAll(snapshotDir)
		return nil, fmt.Errorf("move memory file: %w", err)
	}
	if err := moveFile(srcStatePath, dstStatePath); err != nil {
		_ = os.RemoveAll(snapshotDir)
		return nil, fmt.Errorf("move vmstate file: %w", err)
	}
	now := time.Now().UTC()
	snapshotRecord := model.SnapshotRecord{
		ID:                snapshotID,
		MachineID:         machineID,
		Artifact:          record.Artifact,
		MemFilePath:       dstMemPath,
		StateFilePath:     dstStatePath,
		DiskPaths:         diskPaths,
		// Source network metadata lets a later restore rebuild the allocation.
		SourceRuntimeHost: record.RuntimeHost,
		SourceTapDevice:   record.TapDevice,
		CreatedAt:         now,
	}
	if err := d.store.CreateSnapshot(ctx, snapshotRecord); err != nil {
		_ = os.RemoveAll(snapshotDir)
		return nil, err
	}
	clearOperation = true
	return &contracthost.CreateSnapshotResponse{
		Snapshot: snapshotToContract(snapshotRecord),
	}, nil
}
// RestoreSnapshot boots a brand-new machine (req.MachineID) from the files
// captured in snapshot snapshotID: COW-copy the snapshot disk, RestoreBoot
// via the runtime, wait for the guest, rewrite its identity, then persist
// volume and machine records. The operation is journaled up front;
// clearOperation flips to true only on a terminal outcome (success or a
// handled rejection), leaving crash cleanup to reconcileRestore otherwise.
func (d *Daemon) RestoreSnapshot(ctx context.Context, snapshotID contracthost.SnapshotID, req contracthost.RestoreSnapshotRequest) (*contracthost.RestoreSnapshotResponse, error) {
	if err := validateMachineID(req.MachineID); err != nil {
		return nil, err
	}
	unlock := d.lockMachine(req.MachineID)
	defer unlock()
	snap, err := d.store.GetSnapshot(ctx, snapshotID)
	if err != nil {
		return nil, err
	}
	if _, err := d.store.GetMachine(ctx, req.MachineID); err == nil {
		return nil, fmt.Errorf("machine %q already exists", req.MachineID)
	}
	if err := d.store.UpsertOperation(ctx, model.OperationRecord{
		MachineID:  req.MachineID,
		Type:       model.MachineOperationRestore,
		StartedAt:  time.Now().UTC(),
		SnapshotID: &snapshotID,
	}); err != nil {
		return nil, err
	}
	clearOperation := false
	defer func() {
		if clearOperation {
			// Background context: clear the journal even if ctx is done.
			_ = d.store.DeleteOperation(context.Background(), req.MachineID)
		}
	}()
	// Restoring while the source VM runs would reuse its tap/IP; reject.
	sourceMachine, err := d.store.GetMachine(ctx, snap.MachineID)
	switch {
	case err == nil && sourceMachine.Phase == contracthost.MachinePhaseRunning:
		clearOperation = true
		return nil, fmt.Errorf("restore from snapshot %q while source machine %q is running is not supported yet", snapshotID, snap.MachineID)
	case err != nil && err != store.ErrNotFound:
		return nil, fmt.Errorf("get source machine for restore: %w", err)
	}
	usedNetworks, err := d.listRunningNetworks(ctx, req.MachineID)
	if err != nil {
		return nil, err
	}
	// Rebuild the network allocation from the snapshot's recorded metadata.
	restoreNetwork, err := restoreNetworkFromSnapshot(snap)
	if err != nil {
		clearOperation = true
		return nil, err
	}
	if networkAllocationInUse(restoreNetwork, usedNetworks) {
		clearOperation = true
		return nil, fmt.Errorf("snapshot %q restore network %q (%s) is already in use", snapshotID, restoreNetwork.TapName, restoreNetwork.GuestIP())
	}
	artifact, err := d.store.GetArtifact(ctx, snap.Artifact)
	if err != nil {
		// NOTE(review): clearOperation stays false here, so cleanup falls to
		// reconcileRestore — confirm that is intentional for this path.
		return nil, fmt.Errorf("get artifact for restore: %w", err)
	}
	// COW-copy system disk from snapshot to new machine's disk dir.
	newSystemDiskPath := d.systemVolumePath(req.MachineID)
	if err := os.MkdirAll(filepath.Dir(newSystemDiskPath), 0o755); err != nil {
		return nil, fmt.Errorf("create machine disk dir: %w", err)
	}
	if len(snap.DiskPaths) < 1 {
		clearOperation = true
		return nil, fmt.Errorf("snapshot %q has no disk paths", snapshotID)
	}
	if err := cowCopyFile(snap.DiskPaths[0], newSystemDiskPath); err != nil {
		clearOperation = true
		return nil, fmt.Errorf("copy system disk for restore: %w", err)
	}
	loadSpec := firecracker.SnapshotLoadSpec{
		ID:              firecracker.MachineID(req.MachineID),
		SnapshotPath:    snap.StateFilePath,
		MemFilePath:     snap.MemFilePath,
		RootFSPath:      newSystemDiskPath,
		KernelImagePath: artifact.KernelImagePath,
		DiskPaths:       map[string]string{},
		Network:         &restoreNetwork,
	}
	machineState, err := d.runtime.RestoreBoot(ctx, loadSpec, usedNetworks)
	if err != nil {
		_ = os.RemoveAll(filepath.Dir(newSystemDiskPath))
		clearOperation = true
		return nil, fmt.Errorf("restore boot: %w", err)
	}
	// Wait for guest to become ready
	if err := waitForGuestReady(ctx, machineState.RuntimeHost, defaultMachinePorts()); err != nil {
		_ = d.runtime.Delete(ctx, *machineState)
		_ = os.RemoveAll(filepath.Dir(newSystemDiskPath))
		clearOperation = true
		return nil, fmt.Errorf("wait for restored guest ready: %w", err)
	}
	// Give the restored VM its own hostname/identity instead of the source's.
	if err := d.reconfigureGuestIdentity(ctx, machineState.RuntimeHost, req.MachineID); err != nil {
		_ = d.runtime.Delete(ctx, *machineState)
		_ = os.RemoveAll(filepath.Dir(newSystemDiskPath))
		clearOperation = true
		return nil, fmt.Errorf("reconfigure restored guest identity: %w", err)
	}
	systemVolumeID := d.systemVolumeID(req.MachineID)
	now := time.Now().UTC()
	if err := d.store.CreateVolume(ctx, model.VolumeRecord{
		ID:                systemVolumeID,
		Kind:              contracthost.VolumeKindSystem,
		AttachedMachineID: machineIDPtr(req.MachineID),
		SourceArtifact:    &snap.Artifact,
		Pool:              model.StoragePoolMachineDisks,
		Path:              newSystemDiskPath,
		CreatedAt:         now,
	}); err != nil {
		// NOTE(review): the booted VM is not torn down on this path; the
		// journal entry remains for reconcileRestore — verify intent.
		return nil, err
	}
	machineRecord := model.MachineRecord{
		ID:             req.MachineID,
		Artifact:       snap.Artifact,
		SystemVolumeID: systemVolumeID,
		RuntimeHost:    machineState.RuntimeHost,
		TapDevice:      machineState.TapName,
		Ports:          defaultMachinePorts(),
		Phase:          contracthost.MachinePhaseRunning,
		PID:            machineState.PID,
		SocketPath:     machineState.SocketPath,
		CreatedAt:      now,
		StartedAt:      machineState.StartedAt,
	}
	if err := d.store.CreateMachine(ctx, machineRecord); err != nil {
		return nil, err
	}
	clearOperation = true
	return &contracthost.RestoreSnapshotResponse{
		Machine: machineToContract(machineRecord),
	}, nil
}
// GetSnapshot looks up a snapshot record and returns its contract view.
func (d *Daemon) GetSnapshot(ctx context.Context, snapshotID contracthost.SnapshotID) (*contracthost.GetSnapshotResponse, error) {
	record, err := d.store.GetSnapshot(ctx, snapshotID)
	if err != nil {
		return nil, err
	}
	response := contracthost.GetSnapshotResponse{Snapshot: snapshotToContract(*record)}
	return &response, nil
}
// ListSnapshots returns the contract view of every snapshot recorded for the
// given machine. The result slice is non-nil even when there are no records.
func (d *Daemon) ListSnapshots(ctx context.Context, machineID contracthost.MachineID) (*contracthost.ListSnapshotsResponse, error) {
	records, err := d.store.ListSnapshotsByMachine(ctx, machineID)
	if err != nil {
		return nil, err
	}
	out := make([]contracthost.Snapshot, len(records))
	for i := range records {
		out[i] = snapshotToContract(records[i])
	}
	return &contracthost.ListSnapshotsResponse{Snapshots: out}, nil
}
// DeleteSnapshotByID removes a snapshot's on-disk directory and then its
// store record.
//
// The directory is derived from the memory file's parent. A record with an
// empty or relative MemFilePath would make filepath.Dir yield "." (or another
// relative path), and os.RemoveAll(".") would wipe the daemon's working
// directory — so non-absolute or root directories are rejected first.
func (d *Daemon) DeleteSnapshotByID(ctx context.Context, snapshotID contracthost.SnapshotID) error {
	snap, err := d.store.GetSnapshot(ctx, snapshotID)
	if err != nil {
		return err
	}
	snapshotDir := filepath.Dir(snap.MemFilePath)
	if !filepath.IsAbs(snapshotDir) || snapshotDir == string(filepath.Separator) {
		return fmt.Errorf("snapshot %q has invalid directory %q", snapshotID, snapshotDir)
	}
	if err := os.RemoveAll(snapshotDir); err != nil {
		return fmt.Errorf("remove snapshot dir %q: %w", snapshotDir, err)
	}
	return d.store.DeleteSnapshot(ctx, snapshotID)
}
// snapshotToContract maps an internal snapshot record to its public contract
// shape, exposing only the ID, owning machine, and creation time.
func snapshotToContract(record model.SnapshotRecord) contracthost.Snapshot {
	var snapshot contracthost.Snapshot
	snapshot.ID = record.ID
	snapshot.MachineID = record.MachineID
	snapshot.CreatedAt = record.CreatedAt
	return snapshot
}
// restoreNetworkFromSnapshot rebuilds the network allocation a restored VM
// should use from the guest IP and tap device captured at snapshot time.
// It fails when either piece of metadata is blank or cannot be parsed.
func restoreNetworkFromSnapshot(snap *model.SnapshotRecord) (firecracker.NetworkAllocation, error) {
	var none firecracker.NetworkAllocation
	if snap == nil {
		return none, fmt.Errorf("snapshot is required")
	}
	host := strings.TrimSpace(snap.SourceRuntimeHost)
	tap := strings.TrimSpace(snap.SourceTapDevice)
	if host == "" || tap == "" {
		return none, fmt.Errorf("snapshot %q is missing restore network metadata", snap.ID)
	}
	network, err := firecracker.AllocationFromGuestIP(snap.SourceRuntimeHost, snap.SourceTapDevice)
	if err != nil {
		return none, fmt.Errorf("reconstruct snapshot %q network: %w", snap.ID, err)
	}
	return network, nil
}
// networkAllocationInUse reports whether target collides with any allocation
// in used, matching either on guest IP or — when target has a non-blank tap
// name — on the tap device name.
func networkAllocationInUse(target firecracker.NetworkAllocation, used []firecracker.NetworkAllocation) bool {
	wantTap := strings.TrimSpace(target.TapName)
	for _, candidate := range used {
		sameIP := candidate.GuestIP() == target.GuestIP()
		sameTap := wantTap != "" && strings.TrimSpace(candidate.TapName) == wantTap
		if sameIP || sameTap {
			return true
		}
	}
	return false
}
// generateID returns a fresh 32-character lowercase-hex identifier backed by
// 16 bytes of cryptographically secure randomness. It panics only if the
// system's random source is unavailable, which is unrecoverable.
func generateID() string {
	var raw [16]byte
	if _, err := rand.Read(raw[:]); err != nil {
		panic(fmt.Sprintf("generate id: %v", err))
	}
	return hex.EncodeToString(raw[:])
}
// moveFile copies src to dst then removes src. Works across filesystem boundaries
// unlike os.Rename, which is needed when moving files out of /proc/<pid>/root/.
func moveFile(src, dst string) error {
in, err := os.Open(src)
if err != nil {
return err
}
defer in.Close()
out, err := os.Create(dst)
if err != nil {
return err
}
if _, err := io.Copy(out, in); err != nil {
out.Close()
_ = os.Remove(dst)
return err
}
if err := out.Close(); err != nil {
_ = os.Remove(dst)
return err
}
return os.Remove(src)
}
// cowCopyFile copies source to target, preferring a copy-on-write reflink
// (with sparse-file handling) via `cp`, and falling back to cloneFile when
// cp fails. Both failures are reported together if neither path succeeds.
func cowCopyFile(source string, target string) error {
	if err := os.MkdirAll(filepath.Dir(target), 0o755); err != nil {
		return fmt.Errorf("create target dir for %q: %w", target, err)
	}
	output, cpErr := exec.Command("cp", "--reflink=auto", "--sparse=always", source, target).CombinedOutput()
	if cpErr == nil {
		return nil
	}
	if cloneErr := cloneFile(source, target); cloneErr != nil {
		return fmt.Errorf("cow copy %q to %q: cp failed: %w: %s; clone fallback failed: %w", source, target, cpErr, string(output), cloneErr)
	}
	return nil
}