feat: vsock mmds snapshot

This commit is contained in:
Harivansh Rathi 2026-04-10 02:26:43 +00:00
parent 39f8882c30
commit 07975fb459
13 changed files with 390 additions and 148 deletions

View file

@ -12,10 +12,10 @@ import (
"strings"
"time"
contracthost "github.com/getcompanion-ai/computer-host/contract"
"github.com/getcompanion-ai/computer-host/internal/firecracker"
"github.com/getcompanion-ai/computer-host/internal/model"
"github.com/getcompanion-ai/computer-host/internal/store"
contracthost "github.com/getcompanion-ai/computer-host/contract"
)
func (d *Daemon) CreateSnapshot(ctx context.Context, machineID contracthost.MachineID, req contracthost.CreateSnapshotRequest) (*contracthost.CreateSnapshotResponse, error) {
@ -332,6 +332,9 @@ func (d *Daemon) RestoreSnapshot(ctx context.Context, snapshotID contracthost.Sn
restoredDrivePaths[driveID] = volumePath
}
// Do not force vsock_override on restore: Firecracker rejects it for old
// snapshots without a vsock device, and the jailed /run path already
// relocates safely for snapshots created with the new vsock-backed guest.
loadSpec := firecracker.SnapshotLoadSpec{
ID: firecracker.MachineID(req.MachineID),
SnapshotPath: vmstateArtifact.LocalPath,
@ -349,27 +352,6 @@ func (d *Daemon) RestoreSnapshot(ctx context.Context, snapshotID contracthost.Sn
return nil, fmt.Errorf("restore boot: %w", err)
}
// Wait for guest to become ready
if err := waitForGuestReady(ctx, machineState.RuntimeHost, defaultMachinePorts()); err != nil {
_ = d.runtime.Delete(ctx, *machineState)
_ = os.RemoveAll(filepath.Dir(newSystemDiskPath))
clearOperation = true
return nil, fmt.Errorf("wait for restored guest ready: %w", err)
}
if err := d.reconfigureGuestIdentity(ctx, machineState.RuntimeHost, req.MachineID, guestConfig); err != nil {
_ = d.runtime.Delete(ctx, *machineState)
_ = os.RemoveAll(filepath.Dir(newSystemDiskPath))
clearOperation = true
return nil, fmt.Errorf("reconfigure restored guest identity: %w", err)
}
guestSSHPublicKey, err := d.readGuestSSHPublicKey(ctx, machineState.RuntimeHost)
if err != nil {
_ = d.runtime.Delete(ctx, *machineState)
_ = os.RemoveAll(filepath.Dir(newSystemDiskPath))
clearOperation = true
return nil, fmt.Errorf("read restored guest ssh host key: %w", err)
}
systemVolumeID := d.systemVolumeID(req.MachineID)
now := time.Now().UTC()
@ -419,38 +401,13 @@ func (d *Daemon) RestoreSnapshot(ctx context.Context, snapshotID contracthost.Sn
RuntimeHost: machineState.RuntimeHost,
TapDevice: machineState.TapName,
Ports: defaultMachinePorts(),
GuestSSHPublicKey: guestSSHPublicKey,
Phase: contracthost.MachinePhaseRunning,
GuestSSHPublicKey: "",
Phase: contracthost.MachinePhaseStarting,
PID: machineState.PID,
SocketPath: machineState.SocketPath,
CreatedAt: now,
StartedAt: machineState.StartedAt,
}
d.relayAllocMu.Lock()
sshRelayPort, err := d.allocateMachineRelayProxy(ctx, machineRecord, contracthost.MachinePortNameSSH, machineRecord.RuntimeHost, defaultSSHPort, minMachineSSHRelayPort, maxMachineSSHRelayPort)
var vncRelayPort uint16
if err == nil {
vncRelayPort, err = d.allocateMachineRelayProxy(ctx, machineRecord, contracthost.MachinePortNameVNC, machineRecord.RuntimeHost, defaultVNCPort, minMachineVNCRelayPort, maxMachineVNCRelayPort)
}
d.relayAllocMu.Unlock()
if err != nil {
d.stopMachineRelays(machineRecord.ID)
for _, restoredVolumeID := range restoredUserVolumeIDs {
_ = d.store.DeleteVolume(context.Background(), restoredVolumeID)
}
_ = d.store.DeleteVolume(context.Background(), systemVolumeID)
_ = d.runtime.Delete(ctx, *machineState)
_ = os.RemoveAll(filepath.Dir(newSystemDiskPath))
clearOperation = true
return nil, fmt.Errorf("allocate relay ports for restored machine: %w", err)
}
machineRecord.Ports = buildMachinePorts(sshRelayPort, vncRelayPort)
startedRelays := true
defer func() {
if startedRelays {
d.stopMachineRelays(machineRecord.ID)
}
}()
if err := d.store.CreateMachine(ctx, machineRecord); err != nil {
for _, restoredVolumeID := range restoredUserVolumeIDs {
_ = d.store.DeleteVolume(context.Background(), restoredVolumeID)
@ -462,7 +419,6 @@ func (d *Daemon) RestoreSnapshot(ctx context.Context, snapshotID contracthost.Sn
return nil, err
}
startedRelays = false
clearOperation = true
return &contracthost.RestoreSnapshotResponse{
Machine: machineToContract(machineRecord),