feat: power-cycle

This commit is contained in:
Harivansh Rathi 2026-04-11 18:54:16 +00:00
parent 0e5e989192
commit d0f0530ca2
7 changed files with 242 additions and 18 deletions

View file

@ -6,6 +6,7 @@ import (
"path/filepath"
"strconv"
"strings"
"time"
"github.com/getcompanion-ai/computer-host/internal/firecracker"
)
@ -27,6 +28,7 @@ type Config struct {
SocketPath string
HTTPAddr string
EgressInterface string
ReconcileInterval time.Duration
FirecrackerBinaryPath string
JailerBinaryPath string
GuestLoginCAPublicKey string
@ -67,6 +69,10 @@ func Load() (Config, error) {
JailerBinaryPath: strings.TrimSpace(os.Getenv("JAILER_BINARY_PATH")),
GuestLoginCAPublicKey: strings.TrimSpace(os.Getenv("GUEST_LOGIN_CA_PUBLIC_KEY")),
}
cfg.ReconcileInterval, err = durationDefault("FIRECRACKER_HOST_RECONCILE_INTERVAL", 5*time.Second)
if err != nil {
return Config{}, err
}
if err := cfg.Validate(); err != nil {
return Config{}, err
}
@ -114,6 +120,9 @@ func (c Config) Validate() error {
if strings.TrimSpace(c.EgressInterface) == "" {
return fmt.Errorf("FIRECRACKER_HOST_EGRESS_INTERFACE is required")
}
if c.ReconcileInterval <= 0 {
return fmt.Errorf("FIRECRACKER_HOST_RECONCILE_INTERVAL must be greater than zero")
}
return nil
}
@ -174,3 +183,15 @@ func loadBool(name string) (bool, error) {
}
return parsed, nil
}
func durationDefault(name string, fallback time.Duration) (time.Duration, error) {
value := strings.TrimSpace(os.Getenv(name))
if value == "" {
return fallback, nil
}
parsed, err := time.ParseDuration(value)
if err != nil {
return 0, fmt.Errorf("%s must be a duration: %w", name, err)
}
return parsed, nil
}

View file

@ -72,6 +72,12 @@ func (d *Daemon) CreateMachine(ctx context.Context, req contracthost.CreateMachi
if err := os.Truncate(systemVolumePath, defaultGuestDiskSizeBytes); err != nil {
return nil, fmt.Errorf("expand system volume for %q: %w", req.MachineID, err)
}
if err := injectMachineIdentity(ctx, systemVolumePath, req.MachineID); err != nil {
return nil, fmt.Errorf("inject machine identity for %q: %w", req.MachineID, err)
}
if err := injectGuestConfig(ctx, systemVolumePath, guestConfig); err != nil {
return nil, fmt.Errorf("inject guest config for %q: %w", req.MachineID, err)
}
removeSystemVolumeOnFailure := true
defer func() {
if !removeSystemVolumeOnFailure {

View file

@ -236,6 +236,23 @@ func TestCreateMachineStagesArtifactsAndPersistsState(t *testing.T) {
if machine.GuestConfig == nil || len(machine.GuestConfig.AuthorizedKeys) == 0 {
t.Fatalf("stored guest config missing authorized keys: %#v", machine.GuestConfig)
}
machineName, err := readExt4File(runtime.lastSpec.RootFSPath, "/etc/microagent/machine-name")
if err != nil {
t.Fatalf("read injected machine-name: %v", err)
}
if machineName != "vm-1\n" {
t.Fatalf("machine-name mismatch: got %q want %q", machineName, "vm-1\n")
}
injectedAuthorizedKeys, err := readExt4File(runtime.lastSpec.RootFSPath, "/etc/microagent/authorized_keys")
if err != nil {
t.Fatalf("read injected authorized_keys: %v", err)
}
if !strings.Contains(injectedAuthorizedKeys, strings.TrimSpace(string(hostAuthorizedKeyBytes))) {
t.Fatalf("disk authorized_keys missing backend ssh key: %q", injectedAuthorizedKeys)
}
if !strings.Contains(injectedAuthorizedKeys, "daemon-test") {
t.Fatalf("disk authorized_keys missing request override key: %q", injectedAuthorizedKeys)
}
operations, err := fileStore.ListOperations(context.Background())
if err != nil {
@ -260,6 +277,11 @@ func TestStopMachineSyncsGuestFilesystemBeforeDelete(t *testing.T) {
t.Fatalf("create daemon: %v", err)
}
var syncedHost string
hostDaemon.syncGuestFilesystem = func(_ context.Context, runtimeHost string) error {
syncedHost = runtimeHost
return nil
}
var shutdownHost string
hostDaemon.shutdownGuest = func(_ context.Context, runtimeHost string) error {
shutdownHost = runtimeHost
@ -295,6 +317,9 @@ func TestStopMachineSyncsGuestFilesystemBeforeDelete(t *testing.T) {
if shutdownHost != "172.16.0.2" {
t.Fatalf("shutdown host mismatch: got %q want %q", shutdownHost, "172.16.0.2")
}
if syncedHost != "172.16.0.2" {
t.Fatalf("sync host mismatch: got %q want %q", syncedHost, "172.16.0.2")
}
// runtime.Delete is always called to clean up TAP device and runtime dir.
if len(runtime.deleteCalls) != 1 {
t.Fatalf("runtime delete call count mismatch: got %d want 1", len(runtime.deleteCalls))
@ -361,6 +386,10 @@ func TestReconcileStartingMachinePersonalizesBeforeRunning(t *testing.T) {
t.Fatalf("create machine: %v", err)
}
if err := hostDaemon.Reconcile(context.Background()); err != nil {
t.Fatalf("Reconcile returned error: %v", err)
}
response, err := hostDaemon.GetMachine(context.Background(), "vm-starting")
if err != nil {
t.Fatalf("GetMachine returned error: %v", err)
@ -376,6 +405,116 @@ func TestReconcileStartingMachinePersonalizesBeforeRunning(t *testing.T) {
}
}
func TestListMachinesDoesNotReconcileStartingMachines(t *testing.T) {
root := t.TempDir()
cfg := testConfig(root)
fileStore, err := store.NewFileStore(cfg.StatePath, cfg.OperationsPath)
if err != nil {
t.Fatalf("create file store: %v", err)
}
hostDaemon, err := New(cfg, fileStore, &fakeRuntime{})
if err != nil {
t.Fatalf("create daemon: %v", err)
}
hostDaemon.personalizeGuest = func(context.Context, *model.MachineRecord, firecracker.MachineState) error {
t.Fatalf("ListMachines should not reconcile guest personalization")
return nil
}
hostDaemon.readGuestSSHPublicKey = func(context.Context, string) (string, error) {
t.Fatalf("ListMachines should not read guest ssh public key")
return "", nil
}
now := time.Now().UTC()
if err := fileStore.CreateMachine(context.Background(), model.MachineRecord{
ID: "vm-list",
SystemVolumeID: "vm-list-system",
RuntimeHost: "127.0.0.1",
TapDevice: "fctap-list",
Ports: defaultMachinePorts(),
Phase: contracthost.MachinePhaseStarting,
PID: 4321,
SocketPath: filepath.Join(cfg.RuntimeDir, "machines", "vm-list", "root", "run", "firecracker.sock"),
CreatedAt: now,
StartedAt: &now,
}); err != nil {
t.Fatalf("create machine: %v", err)
}
response, err := hostDaemon.ListMachines(context.Background())
if err != nil {
t.Fatalf("ListMachines returned error: %v", err)
}
if len(response.Machines) != 1 {
t.Fatalf("machine count = %d, want 1", len(response.Machines))
}
if response.Machines[0].Phase != contracthost.MachinePhaseStarting {
t.Fatalf("machine phase = %q, want %q", response.Machines[0].Phase, contracthost.MachinePhaseStarting)
}
}
func TestReconcileStartingMachineIgnoresPersonalizationFailures(t *testing.T) {
root := t.TempDir()
cfg := testConfig(root)
fileStore, err := store.NewFileStore(cfg.StatePath, cfg.OperationsPath)
if err != nil {
t.Fatalf("create file store: %v", err)
}
sshListener := listenTestPort(t, int(defaultSSHPort))
defer func() { _ = sshListener.Close() }()
vncListener := listenTestPort(t, int(defaultVNCPort))
defer func() { _ = vncListener.Close() }()
startedAt := time.Unix(1700000201, 0).UTC()
runtime := &fakeRuntime{}
hostDaemon, err := New(cfg, fileStore, runtime)
if err != nil {
t.Fatalf("create daemon: %v", err)
}
hostDaemon.personalizeGuest = func(context.Context, *model.MachineRecord, firecracker.MachineState) error {
return errors.New("vsock EOF")
}
hostDaemon.readGuestSSHPublicKey = func(context.Context, string) (string, error) {
return "", errors.New("Permission denied")
}
if err := fileStore.CreateMachine(context.Background(), model.MachineRecord{
ID: "vm-best-effort",
SystemVolumeID: "vm-best-effort-system",
RuntimeHost: "127.0.0.1",
TapDevice: "fctap-best-effort",
Ports: defaultMachinePorts(),
GuestSSHPublicKey: "ssh-ed25519 AAAAExistingHostKey",
Phase: contracthost.MachinePhaseStarting,
PID: 4322,
SocketPath: filepath.Join(cfg.RuntimeDir, "machines", "vm-best-effort", "root", "run", "firecracker.sock"),
CreatedAt: time.Now().UTC(),
StartedAt: &startedAt,
}); err != nil {
t.Fatalf("create machine: %v", err)
}
if err := hostDaemon.Reconcile(context.Background()); err != nil {
t.Fatalf("Reconcile returned error: %v", err)
}
record, err := fileStore.GetMachine(context.Background(), "vm-best-effort")
if err != nil {
t.Fatalf("get machine: %v", err)
}
if record.Phase != contracthost.MachinePhaseRunning {
t.Fatalf("machine phase = %q, want %q", record.Phase, contracthost.MachinePhaseRunning)
}
if record.GuestSSHPublicKey != "ssh-ed25519 AAAAExistingHostKey" {
t.Fatalf("guest ssh public key = %q, want preserved value", record.GuestSSHPublicKey)
}
if len(runtime.deleteCalls) != 0 {
t.Fatalf("runtime delete calls = %d, want 0", len(runtime.deleteCalls))
}
}
func TestNewEnsuresBackendSSHKeyPair(t *testing.T) {
root := t.TempDir()
cfg := testConfig(root)
@ -436,7 +575,7 @@ func TestRestoreSnapshotFallsBackToLocalSnapshotNetwork(t *testing.T) {
server := newRestoreArtifactServer(t, map[string][]byte{
"/kernel": []byte("kernel"),
"/rootfs": []byte("rootfs"),
"/system": []byte("disk"),
"/system": buildTestExt4ImageBytes(t),
})
defer server.Close()
@ -582,7 +721,7 @@ func TestRestoreSnapshotUsesLocalSnapshotArtifacts(t *testing.T) {
t.Fatalf("create snapshot dir: %v", err)
}
systemPath := filepath.Join(snapshotDir, "system.img")
if err := os.WriteFile(systemPath, []byte("disk"), 0o644); err != nil {
if err := os.WriteFile(systemPath, buildTestExt4ImageBytes(t), 0o644); err != nil {
t.Fatalf("write system disk: %v", err)
}
if err := fileStore.CreateSnapshot(context.Background(), model.SnapshotRecord{
@ -620,6 +759,13 @@ func TestRestoreSnapshotUsesLocalSnapshotArtifacts(t *testing.T) {
if runtime.restoreCalls != 0 {
t.Fatalf("restore boot call count mismatch: got %d want 0", runtime.restoreCalls)
}
machineName, err := readExt4File(runtime.lastSpec.RootFSPath, "/etc/microagent/machine-name")
if err != nil {
t.Fatalf("read restored machine-name: %v", err)
}
if machineName != "restored-local\n" {
t.Fatalf("restored machine-name mismatch: got %q want %q", machineName, "restored-local\n")
}
}
func TestGetSnapshotArtifactReturnsLocalArtifactPath(t *testing.T) {
@ -713,7 +859,7 @@ func TestRestoreSnapshotUsesDurableSnapshotSpec(t *testing.T) {
server := newRestoreArtifactServer(t, map[string][]byte{
"/kernel": []byte("kernel"),
"/rootfs": []byte("rootfs"),
"/system": []byte("disk"),
"/system": buildTestExt4ImageBytes(t),
"/user-0": []byte("user-disk"),
})
defer server.Close()
@ -820,7 +966,7 @@ func TestRestoreSnapshotBootsWithFreshNetworkWhenSourceNetworkInUseOnHost(t *tes
server := newRestoreArtifactServer(t, map[string][]byte{
"/kernel": []byte("kernel"),
"/rootfs": []byte("rootfs"),
"/system": []byte("disk"),
"/system": buildTestExt4ImageBytes(t),
})
defer server.Close()
@ -946,11 +1092,27 @@ func testConfig(root string) appconfig.Config {
DriveIOEngine: firecracker.DriveIOEngineSync,
SocketPath: filepath.Join(root, "firecracker-host.sock"),
EgressInterface: "eth0",
ReconcileInterval: time.Second,
FirecrackerBinaryPath: "/usr/bin/firecracker",
JailerBinaryPath: "/usr/bin/jailer",
}
}
func buildTestExt4ImageBytes(t *testing.T) []byte {
t.Helper()
root := t.TempDir()
imagePath := filepath.Join(root, "rootfs.ext4")
if err := buildTestExt4Image(root, imagePath); err != nil {
t.Fatalf("build ext4 image: %v", err)
}
payload, err := os.ReadFile(imagePath)
if err != nil {
t.Fatalf("read ext4 image: %v", err)
}
return payload
}
func TestGuestKernelArgsDisablesPCIByDefault(t *testing.T) {
t.Parallel()

View file

@ -17,7 +17,7 @@ import (
)
func (d *Daemon) GetMachine(ctx context.Context, id contracthost.MachineID) (*contracthost.GetMachineResponse, error) {
record, err := d.reconcileMachine(ctx, id)
record, err := d.store.GetMachine(ctx, id)
if err != nil {
return nil, err
}
@ -32,11 +32,7 @@ func (d *Daemon) ListMachines(ctx context.Context) (*contracthost.ListMachinesRe
machines := make([]contracthost.Machine, 0, len(records))
for _, record := range records {
reconciled, err := d.reconcileMachine(ctx, record.ID)
if err != nil {
return nil, err
}
machines = append(machines, machineToContract(*reconciled))
machines = append(machines, machineToContract(record))
}
return &contracthost.ListMachinesResponse{Machines: machines}, nil
}
@ -391,11 +387,12 @@ func (d *Daemon) reconcileMachine(ctx context.Context, machineID contracthost.Ma
return record, nil
}
if err := d.personalizeGuest(ctx, record, *state); err != nil {
return d.failMachineStartup(ctx, record, err.Error())
fmt.Fprintf(os.Stderr, "warning: guest personalization for %q failed: %v\n", record.ID, err)
}
guestSSHPublicKey, err := d.readGuestSSHPublicKey(ctx, state.RuntimeHost)
if err != nil {
return d.failMachineStartup(ctx, record, err.Error())
fmt.Fprintf(os.Stderr, "warning: read guest ssh public key for %q failed: %v\n", record.ID, err)
guestSSHPublicKey = record.GuestSSHPublicKey
}
record.RuntimeHost = state.RuntimeHost
record.TapDevice = state.TapName
@ -501,6 +498,9 @@ func (d *Daemon) stopMachineRecord(ctx context.Context, record *model.MachineRec
d.stopPublishedPortsForMachine(record.ID)
if record.Phase == contracthost.MachinePhaseRunning && strings.TrimSpace(record.RuntimeHost) != "" {
if err := d.syncGuestFilesystem(ctx, record.RuntimeHost); err != nil {
fmt.Fprintf(os.Stderr, "warning: guest filesystem sync for %q failed: %v\n", record.ID, err)
}
d.shutdownGuestClean(ctx, record)
}
// Always call runtime.Delete: it cleans up the TAP device, runtime

View file

@ -468,8 +468,9 @@ func TestRestoreSnapshotTransitionsToStartingWithoutRelayAllocation(t *testing.T
if err := os.MkdirAll(snapDir, 0o755); err != nil {
t.Fatalf("create snapshot dir: %v", err)
}
systemDisk := buildTestExt4ImageBytes(t)
snapDisk := filepath.Join(snapDir, "system.img")
if err := os.WriteFile(snapDisk, []byte("disk"), 0o644); err != nil {
if err := os.WriteFile(snapDisk, systemDisk, 0o644); err != nil {
t.Fatalf("write snapshot disk: %v", err)
}
if err := baseStore.CreateSnapshot(context.Background(), model.SnapshotRecord{
@ -487,7 +488,7 @@ func TestRestoreSnapshotTransitionsToStartingWithoutRelayAllocation(t *testing.T
server := newRestoreArtifactServer(t, map[string][]byte{
"/kernel": []byte("kernel"),
"/rootfs": []byte("rootfs"),
"/system": []byte("disk"),
"/system": systemDisk,
})
defer server.Close()
@ -594,8 +595,16 @@ func TestStopMachineContinuesWhenGuestSyncFails(t *testing.T) {
t.Fatalf("create daemon: %v", err)
}
stubGuestSSHPublicKeyReader(hostDaemon)
hostDaemon.syncGuestFilesystem = func(context.Context, string) error {
return errors.New("guest sync failed")
}
hostDaemon.shutdownGuest = func(context.Context, string) error {
return errors.New("guest shutdown failed")
runtime.inspectOverride = func(state firecracker.MachineState) (*firecracker.MachineState, error) {
state.Phase = firecracker.PhaseStopped
state.PID = 0
return &state, nil
}
return nil
}
now := time.Now().UTC()
@ -678,11 +687,12 @@ func TestRestoreSnapshotCleansStagingArtifactsAfterSuccess(t *testing.T) {
}
stubGuestSSHPublicKeyReader(hostDaemon)
hostDaemon.reconfigureGuestIdentity = func(context.Context, string, contracthost.MachineID, *contracthost.GuestConfig) error { return nil }
systemDisk := buildTestExt4ImageBytes(t)
server := newRestoreArtifactServer(t, map[string][]byte{
"/kernel": []byte("kernel"),
"/rootfs": []byte("rootfs"),
"/system": []byte("disk"),
"/system": systemDisk,
})
defer server.Close()
@ -699,7 +709,7 @@ func TestRestoreSnapshotCleansStagingArtifactsAfterSuccess(t *testing.T) {
SourceRuntimeHost: "172.16.0.2",
SourceTapDevice: "fctap0",
Artifacts: []contracthost.SnapshotArtifact{
{ID: "disk-system", Kind: contracthost.SnapshotArtifactKindDisk, Name: "system.img", DownloadURL: server.URL + "/system", SHA256Hex: mustSHA256Hex(t, []byte("disk"))},
{ID: "disk-system", Kind: contracthost.SnapshotArtifactKindDisk, Name: "system.img", DownloadURL: server.URL + "/system", SHA256Hex: mustSHA256Hex(t, systemDisk)},
},
},
})
@ -727,11 +737,12 @@ func TestRestoreSnapshotCleansStagingArtifactsAfterDownloadFailure(t *testing.T)
t.Fatalf("create daemon: %v", err)
}
stubGuestSSHPublicKeyReader(hostDaemon)
systemDisk := buildTestExt4ImageBytes(t)
server := newRestoreArtifactServer(t, map[string][]byte{
"/kernel": []byte("kernel"),
"/rootfs": []byte("rootfs"),
"/system": []byte("disk"),
"/system": systemDisk,
})
defer server.Close()

View file

@ -265,6 +265,14 @@ func (d *Daemon) RestoreSnapshot(ctx context.Context, snapshotID contracthost.Sn
clearOperation = true
return nil, fmt.Errorf("copy system disk for restore: %w", err)
}
if err := injectMachineIdentity(ctx, newSystemDiskPath, req.MachineID); err != nil {
clearOperation = true
return nil, fmt.Errorf("inject machine identity for restore: %w", err)
}
if err := injectGuestConfig(ctx, newSystemDiskPath, guestConfig); err != nil {
clearOperation = true
return nil, fmt.Errorf("inject guest config for restore: %w", err)
}
type restoredUserVolume struct {
ID contracthost.VolumeID